| 110 109 56 29 26 22 51 51 38 29 48 1 47 47 1 46 47 47 47 2 2 2 2 2 1 1 48 64 56 29 40 19 46 1 1802 110 29 3 24 32 24 56 56 1948 1953 22 1946 1911 51 110 48 6 104 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 
486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1991-1998 Linus Torvalds * Re-organised Feb 1998 Russell King * Copyright (C) 2020 Christoph Hellwig */ #include <linux/fs.h> #include <linux/major.h> #include <linux/slab.h> #include <linux/ctype.h> #include <linux/vmalloc.h> #include <linux/raid/detect.h> #include "check.h" static int (*const check_part[])(struct parsed_partitions *) = { /* * Probe partition formats with tables at disk address 0 * that also have an ADFS boot block at 0xdc0. */ #ifdef CONFIG_ACORN_PARTITION_ICS adfspart_check_ICS, #endif #ifdef CONFIG_ACORN_PARTITION_POWERTEC adfspart_check_POWERTEC, #endif #ifdef CONFIG_ACORN_PARTITION_EESOX adfspart_check_EESOX, #endif /* * Now move on to formats that only have partition info at * disk address 0xdc0. Since these may also have stale * PC/BIOS partition tables, they need to come before * the msdos entry. 
*/ #ifdef CONFIG_ACORN_PARTITION_CUMANA adfspart_check_CUMANA, #endif #ifdef CONFIG_ACORN_PARTITION_ADFS adfspart_check_ADFS, #endif #ifdef CONFIG_CMDLINE_PARTITION cmdline_partition, #endif #ifdef CONFIG_OF_PARTITION of_partition, /* cmdline have priority to OF */ #endif #ifdef CONFIG_EFI_PARTITION efi_partition, /* this must come before msdos */ #endif #ifdef CONFIG_SGI_PARTITION sgi_partition, #endif #ifdef CONFIG_LDM_PARTITION ldm_partition, /* this must come before msdos */ #endif #ifdef CONFIG_MSDOS_PARTITION msdos_partition, #endif #ifdef CONFIG_OSF_PARTITION osf_partition, #endif #ifdef CONFIG_SUN_PARTITION sun_partition, #endif #ifdef CONFIG_AMIGA_PARTITION amiga_partition, #endif #ifdef CONFIG_ATARI_PARTITION atari_partition, #endif #ifdef CONFIG_MAC_PARTITION mac_partition, #endif #ifdef CONFIG_ULTRIX_PARTITION ultrix_partition, #endif #ifdef CONFIG_IBM_PARTITION ibm_partition, #endif #ifdef CONFIG_KARMA_PARTITION karma_partition, #endif #ifdef CONFIG_SYSV68_PARTITION sysv68_partition, #endif NULL }; static struct parsed_partitions *allocate_partitions(struct gendisk *hd) { struct parsed_partitions *state; int nr = DISK_MAX_PARTS; state = kzalloc(sizeof(*state), GFP_KERNEL); if (!state) return NULL; state->parts = vzalloc(array_size(nr, sizeof(state->parts[0]))); if (!state->parts) { kfree(state); return NULL; } state->limit = nr; return state; } static void free_partitions(struct parsed_partitions *state) { vfree(state->parts); kfree(state); } static struct parsed_partitions *check_partition(struct gendisk *hd) { struct parsed_partitions *state; int i, res, err; state = allocate_partitions(hd); if (!state) return NULL; state->pp_buf = (char *)__get_free_page(GFP_KERNEL); if (!state->pp_buf) { free_partitions(state); return NULL; } state->pp_buf[0] = '\0'; state->disk = hd; snprintf(state->name, BDEVNAME_SIZE, "%s", hd->disk_name); snprintf(state->pp_buf, PAGE_SIZE, " %s:", state->name); if (isdigit(state->name[strlen(state->name)-1])) 
sprintf(state->name, "p"); i = res = err = 0; while (!res && check_part[i]) { memset(state->parts, 0, state->limit * sizeof(state->parts[0])); res = check_part[i++](state); if (res < 0) { /* * We have hit an I/O error which we don't report now. * But record it, and let the others do their job. */ err = res; res = 0; } } if (res > 0) { printk(KERN_INFO "%s", state->pp_buf); free_page((unsigned long)state->pp_buf); return state; } if (state->access_beyond_eod) err = -ENOSPC; /* * The partition is unrecognized. So report I/O errors if there were any */ if (err) res = err; if (res) { strlcat(state->pp_buf, " unable to read partition table\n", PAGE_SIZE); printk(KERN_INFO "%s", state->pp_buf); } free_page((unsigned long)state->pp_buf); free_partitions(state); return ERR_PTR(res); } static ssize_t part_partition_show(struct device *dev, struct device_attribute *attr, char *buf) { return sprintf(buf, "%d\n", bdev_partno(dev_to_bdev(dev))); } static ssize_t part_start_show(struct device *dev, struct device_attribute *attr, char *buf) { return sprintf(buf, "%llu\n", dev_to_bdev(dev)->bd_start_sect); } static ssize_t part_ro_show(struct device *dev, struct device_attribute *attr, char *buf) { return sprintf(buf, "%d\n", bdev_read_only(dev_to_bdev(dev))); } static ssize_t part_alignment_offset_show(struct device *dev, struct device_attribute *attr, char *buf) { return sprintf(buf, "%u\n", bdev_alignment_offset(dev_to_bdev(dev))); } static ssize_t part_discard_alignment_show(struct device *dev, struct device_attribute *attr, char *buf) { return sprintf(buf, "%u\n", bdev_discard_alignment(dev_to_bdev(dev))); } static DEVICE_ATTR(partition, 0444, part_partition_show, NULL); static DEVICE_ATTR(start, 0444, part_start_show, NULL); static DEVICE_ATTR(size, 0444, part_size_show, NULL); static DEVICE_ATTR(ro, 0444, part_ro_show, NULL); static DEVICE_ATTR(alignment_offset, 0444, part_alignment_offset_show, NULL); static DEVICE_ATTR(discard_alignment, 0444, part_discard_alignment_show, 
NULL); static DEVICE_ATTR(stat, 0444, part_stat_show, NULL); static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL); #ifdef CONFIG_FAIL_MAKE_REQUEST static struct device_attribute dev_attr_fail = __ATTR(make-it-fail, 0644, part_fail_show, part_fail_store); #endif static struct attribute *part_attrs[] = { &dev_attr_partition.attr, &dev_attr_start.attr, &dev_attr_size.attr, &dev_attr_ro.attr, &dev_attr_alignment_offset.attr, &dev_attr_discard_alignment.attr, &dev_attr_stat.attr, &dev_attr_inflight.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, #endif NULL }; static const struct attribute_group part_attr_group = { .attrs = part_attrs, }; static const struct attribute_group *part_attr_groups[] = { &part_attr_group, #ifdef CONFIG_BLK_DEV_IO_TRACE &blk_trace_attr_group, #endif NULL }; static void part_release(struct device *dev) { put_disk(dev_to_bdev(dev)->bd_disk); bdev_drop(dev_to_bdev(dev)); } static int part_uevent(const struct device *dev, struct kobj_uevent_env *env) { const struct block_device *part = dev_to_bdev(dev); add_uevent_var(env, "PARTN=%u", bdev_partno(part)); if (part->bd_meta_info && part->bd_meta_info->volname[0]) add_uevent_var(env, "PARTNAME=%s", part->bd_meta_info->volname); if (part->bd_meta_info && part->bd_meta_info->uuid[0]) add_uevent_var(env, "PARTUUID=%s", part->bd_meta_info->uuid); return 0; } const struct device_type part_type = { .name = "partition", .groups = part_attr_groups, .release = part_release, .uevent = part_uevent, }; void drop_partition(struct block_device *part) { lockdep_assert_held(&part->bd_disk->open_mutex); xa_erase(&part->bd_disk->part_tbl, bdev_partno(part)); kobject_put(part->bd_holder_dir); device_del(&part->bd_device); put_device(&part->bd_device); } static ssize_t whole_disk_show(struct device *dev, struct device_attribute *attr, char *buf) { return 0; } static const DEVICE_ATTR(whole_disk, 0444, whole_disk_show, NULL); /* * Must be called either with open_mutex held, before a disk can be opened 
or * after all disk users are gone. */ static struct block_device *add_partition(struct gendisk *disk, int partno, sector_t start, sector_t len, int flags, struct partition_meta_info *info) { dev_t devt = MKDEV(0, 0); struct device *ddev = disk_to_dev(disk); struct device *pdev; struct block_device *bdev; const char *dname; int err; lockdep_assert_held(&disk->open_mutex); if (partno >= DISK_MAX_PARTS) return ERR_PTR(-EINVAL); /* * Partitions are not supported on zoned block devices that are used as * such. */ if (bdev_is_zoned(disk->part0)) { pr_warn("%s: partitions not supported on host managed zoned block device\n", disk->disk_name); return ERR_PTR(-ENXIO); } if (xa_load(&disk->part_tbl, partno)) return ERR_PTR(-EBUSY); /* ensure we always have a reference to the whole disk */ get_device(disk_to_dev(disk)); err = -ENOMEM; bdev = bdev_alloc(disk, partno); if (!bdev) goto out_put_disk; bdev->bd_start_sect = start; bdev_set_nr_sectors(bdev, len); pdev = &bdev->bd_device; dname = dev_name(ddev); if (isdigit(dname[strlen(dname) - 1])) dev_set_name(pdev, "%sp%d", dname, partno); else dev_set_name(pdev, "%s%d", dname, partno); device_initialize(pdev); pdev->class = &block_class; pdev->type = &part_type; pdev->parent = ddev; /* in consecutive minor range? 
*/ if (bdev_partno(bdev) < disk->minors) { devt = MKDEV(disk->major, disk->first_minor + bdev_partno(bdev)); } else { err = blk_alloc_ext_minor(); if (err < 0) goto out_put; devt = MKDEV(BLOCK_EXT_MAJOR, err); } pdev->devt = devt; if (info) { err = -ENOMEM; bdev->bd_meta_info = kmemdup(info, sizeof(*info), GFP_KERNEL); if (!bdev->bd_meta_info) goto out_put; } /* delay uevent until 'holders' subdir is created */ dev_set_uevent_suppress(pdev, 1); err = device_add(pdev); if (err) goto out_put; err = -ENOMEM; bdev->bd_holder_dir = kobject_create_and_add("holders", &pdev->kobj); if (!bdev->bd_holder_dir) goto out_del; dev_set_uevent_suppress(pdev, 0); if (flags & ADDPART_FLAG_WHOLEDISK) { err = device_create_file(pdev, &dev_attr_whole_disk); if (err) goto out_del; } if (flags & ADDPART_FLAG_READONLY) bdev_set_flag(bdev, BD_READ_ONLY); /* everything is up and running, commence */ err = xa_insert(&disk->part_tbl, partno, bdev, GFP_KERNEL); if (err) goto out_del; bdev_add(bdev, devt); /* suppress uevent if the disk suppresses it */ if (!dev_get_uevent_suppress(ddev)) kobject_uevent(&pdev->kobj, KOBJ_ADD); return bdev; out_del: kobject_put(bdev->bd_holder_dir); device_del(pdev); out_put: put_device(pdev); return ERR_PTR(err); out_put_disk: put_disk(disk); return ERR_PTR(err); } static bool partition_overlaps(struct gendisk *disk, sector_t start, sector_t length, int skip_partno) { struct block_device *part; bool overlap = false; unsigned long idx; rcu_read_lock(); xa_for_each_start(&disk->part_tbl, idx, part, 1) { if (bdev_partno(part) != skip_partno && start < part->bd_start_sect + bdev_nr_sectors(part) && start + length > part->bd_start_sect) { overlap = true; break; } } rcu_read_unlock(); return overlap; } int bdev_add_partition(struct gendisk *disk, int partno, sector_t start, sector_t length) { struct block_device *part; int ret; mutex_lock(&disk->open_mutex); if (!disk_live(disk)) { ret = -ENXIO; goto out; } if (disk->flags & GENHD_FL_NO_PART) { ret = -EINVAL; goto 
out; } if (partition_overlaps(disk, start, length, -1)) { ret = -EBUSY; goto out; } part = add_partition(disk, partno, start, length, ADDPART_FLAG_NONE, NULL); ret = PTR_ERR_OR_ZERO(part); out: mutex_unlock(&disk->open_mutex); return ret; } int bdev_del_partition(struct gendisk *disk, int partno) { struct block_device *part = NULL; int ret = -ENXIO; mutex_lock(&disk->open_mutex); part = xa_load(&disk->part_tbl, partno); if (!part) goto out_unlock; ret = -EBUSY; if (atomic_read(&part->bd_openers)) goto out_unlock; /* * We verified that @part->bd_openers is zero above and so * @part->bd_holder{_ops} can't be set. And since we hold * @disk->open_mutex the device can't be claimed by anyone. * * So no need to call @part->bd_holder_ops->mark_dead() here. * Just delete the partition and invalidate it. */ bdev_unhash(part); invalidate_bdev(part); drop_partition(part); ret = 0; out_unlock: mutex_unlock(&disk->open_mutex); return ret; } int bdev_resize_partition(struct gendisk *disk, int partno, sector_t start, sector_t length) { struct block_device *part = NULL; int ret = -ENXIO; mutex_lock(&disk->open_mutex); part = xa_load(&disk->part_tbl, partno); if (!part) goto out_unlock; ret = -EINVAL; if (start != part->bd_start_sect) goto out_unlock; ret = -EBUSY; if (partition_overlaps(disk, start, length, partno)) goto out_unlock; bdev_set_nr_sectors(part, length); ret = 0; out_unlock: mutex_unlock(&disk->open_mutex); return ret; } static bool disk_unlock_native_capacity(struct gendisk *disk) { if (!disk->fops->unlock_native_capacity || test_and_set_bit(GD_NATIVE_CAPACITY, &disk->state)) { printk(KERN_CONT "truncated\n"); return false; } printk(KERN_CONT "enabling native capacity\n"); disk->fops->unlock_native_capacity(disk); return true; } static bool blk_add_partition(struct gendisk *disk, struct parsed_partitions *state, int p) { sector_t size = state->parts[p].size; sector_t from = state->parts[p].from; struct block_device *part; if (!size) return true; if (from >= 
get_capacity(disk)) { printk(KERN_WARNING "%s: p%d start %llu is beyond EOD, ", disk->disk_name, p, (unsigned long long) from); if (disk_unlock_native_capacity(disk)) return false; return true; } if (from + size > get_capacity(disk)) { printk(KERN_WARNING "%s: p%d size %llu extends beyond EOD, ", disk->disk_name, p, (unsigned long long) size); if (disk_unlock_native_capacity(disk)) return false; /* * We can not ignore partitions of broken tables created by for * example camera firmware, but we limit them to the end of the * disk to avoid creating invalid block devices. */ size = get_capacity(disk) - from; } part = add_partition(disk, p, from, size, state->parts[p].flags, &state->parts[p].info); if (IS_ERR(part)) { if (PTR_ERR(part) != -ENXIO) { printk(KERN_ERR " %s: p%d could not be added: %pe\n", disk->disk_name, p, part); } return true; } if (IS_BUILTIN(CONFIG_BLK_DEV_MD) && (state->parts[p].flags & ADDPART_FLAG_RAID)) md_autodetect_dev(part->bd_dev); return true; } static int blk_add_partitions(struct gendisk *disk) { struct parsed_partitions *state; int ret = -EAGAIN, p; if (!disk_has_partscan(disk)) return 0; state = check_partition(disk); if (!state) return 0; if (IS_ERR(state)) { /* * I/O error reading the partition table. If we tried to read * beyond EOD, retry after unlocking the native capacity. */ if (PTR_ERR(state) == -ENOSPC) { printk(KERN_WARNING "%s: partition table beyond EOD, ", disk->disk_name); if (disk_unlock_native_capacity(disk)) return -EAGAIN; } return -EIO; } /* * Partitions are not supported on host managed zoned block devices. */ if (bdev_is_zoned(disk->part0)) { pr_warn("%s: ignoring partition table on host managed zoned block device\n", disk->disk_name); ret = 0; goto out_free_state; } /* * If we read beyond EOD, try unlocking native capacity even if the * partition table was successfully read as we could be missing some * partitions. 
*/ if (state->access_beyond_eod) { printk(KERN_WARNING "%s: partition table partially beyond EOD, ", disk->disk_name); if (disk_unlock_native_capacity(disk)) goto out_free_state; } /* tell userspace that the media / partition table may have changed */ kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); for (p = 1; p < state->limit; p++) if (!blk_add_partition(disk, state, p)) goto out_free_state; ret = 0; out_free_state: free_partitions(state); return ret; } int bdev_disk_changed(struct gendisk *disk, bool invalidate) { struct block_device *part; unsigned long idx; int ret = 0; lockdep_assert_held(&disk->open_mutex); if (!disk_live(disk)) return -ENXIO; rescan: if (disk->open_partitions) return -EBUSY; sync_blockdev(disk->part0); invalidate_bdev(disk->part0); xa_for_each_start(&disk->part_tbl, idx, part, 1) { /* * Remove the block device from the inode hash, so that * it cannot be looked up any more even when openers * still hold references. */ bdev_unhash(part); /* * If @disk->open_partitions isn't elevated but there's * still an active holder of that block device things * are broken. */ WARN_ON_ONCE(atomic_read(&part->bd_openers)); invalidate_bdev(part); drop_partition(part); } clear_bit(GD_NEED_PART_SCAN, &disk->state); /* * Historically we only set the capacity to zero for devices that * support partitions (independ of actually having partitions created). * Doing that is rather inconsistent, but changing it broke legacy * udisks polling for legacy ide-cdrom devices. Use the crude check * below to get the sane behavior for most device while not breaking * userspace for this particular setup. */ if (invalidate) { if (!(disk->flags & GENHD_FL_NO_PART) || !(disk->flags & GENHD_FL_REMOVABLE)) set_capacity(disk, 0); } if (get_capacity(disk)) { ret = blk_add_partitions(disk); if (ret == -EAGAIN) goto rescan; } else if (invalidate) { /* * Tell userspace that the media / partition table may have * changed. 
*/ kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); } return ret; } /* * Only exported for loop and dasd for historic reasons. Don't use in new * code! */ EXPORT_SYMBOL_GPL(bdev_disk_changed); void *read_part_sector(struct parsed_partitions *state, sector_t n, Sector *p) { struct address_space *mapping = state->disk->part0->bd_mapping; struct folio *folio; if (n >= get_capacity(state->disk)) { state->access_beyond_eod = true; goto out; } folio = read_mapping_folio(mapping, n >> PAGE_SECTORS_SHIFT, NULL); if (IS_ERR(folio)) goto out; p->v = folio; return folio_address(folio) + offset_in_folio(folio, n * SECTOR_SIZE); out: p->v = NULL; return NULL; } |
| 10 29 20 43 4 3 21 6 6 6 22 24 24 2 4 4 1 1 1 21 21 21 20 20 20 19 19 19 4 4 4 4 4 1 1 1 6 6 1 6 6 4 4 1 1 2 2 2 6 36 1 2 8 1 1 2 1 2 2 2 18 18 2 6 20 18 1 1 33 34 33 1 1 1 1 1 18 18 19 19 10 18 18 18 18 18 18 18 19 11 11 11 19 19 19 19 19 19 27 27 29 29 25 25 2 2 2 28 1 2 26 27 27 3 25 25 4 5 20 20 20 3 20 9 17 2 6 10 10 19 10 19 19 2 19 3 19 12 11 19 19 19 19 19 5 3 3 4 1 4 1 7 4 9 6 34 35 1 1 1 35 1 1 1 36 2 1 1 36 2 1 1 2 2 2 2 2 6 6 6 6 6 10 32 32 6 6 6 6 6 28 28 28 21 11 25 8 83 84 5 5 5 3 3 3 3 2 1 2 195 197 20 20 20 19 35 35 35 33 33 34 10 13 55 55 4 2 2 2 49 49 8 40 40 18 2 2 2 2 2 2 2 2 1 1 42 42 2 11 3 35 35 2 2 53 53 1 48 6 3 36 36 53 1 52 3 66 3 53 53 53 53 3 53 3 3 1 1 1 34 10 7 2 2 15 5 4 3 3 23 15 6 22 1 20 22 18 20 19 12 1 3 1 2 2 3 18 5 33 46 1 45 35 19 34 21 46 46 46 45 1 46 4 3 1 63 64 36 36 34 36 34 4 44 45 34 6 14 43 4 10 27 51 22 18 17 32 18 18 19 31 31 32 1852 1854 63 3 16 16 3 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 
297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 
797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 
1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 
1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 
2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 
2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 
2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 | // SPDX-License-Identifier: GPL-2.0-or-later /* * drivers/net/team/team.c - Network team device driver * Copyright (c) 2011 Jiri Pirko <jpirko@redhat.com> */ #include <linux/ethtool.h> #include <linux/kernel.h> #include <linux/types.h> #include <linux/module.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/rcupdate.h> #include <linux/errno.h> #include <linux/ctype.h> #include <linux/notifier.h> #include <linux/netdevice.h> #include <linux/netpoll.h> #include <linux/if_vlan.h> #include <linux/if_arp.h> #include <linux/socket.h> #include <linux/etherdevice.h> #include <linux/rtnetlink.h> #include <net/rtnetlink.h> #include <net/genetlink.h> #include <net/netdev_lock.h> #include <net/netlink.h> #include <net/sch_generic.h> #include <linux/if_team.h> #include "team_nl.h" #define DRV_NAME "team" /********** * Helpers **********/ static struct team_port *team_port_get_rtnl(const struct net_device *dev) { struct team_port *port = rtnl_dereference(dev->rx_handler_data); return netif_is_team_port(dev) ? 
port : NULL;
}

/*
 * Build a sockaddr_storage from dev_addr (port_dev->addr_len bytes, family
 * set to port_dev->type) and pass it to dev_set_mac_address().
 *
 * Since the ability to change device address for open port device is tested in
 * team_port_add, this function can be called without control of return value
 */
static int __set_port_dev_addr(struct net_device *port_dev,
			       const unsigned char *dev_addr)
{
	struct sockaddr_storage addr;

	memcpy(addr.__data, dev_addr, port_dev->addr_len);
	addr.ss_family = port_dev->type;
	return dev_set_mac_address(port_dev, &addr, NULL);
}

/* Set the port device address to the one saved in port->orig.dev_addr. */
static int team_port_set_orig_dev_addr(struct team_port *port)
{
	return __set_port_dev_addr(port->dev, port->orig.dev_addr);
}

/* Set the port device address to the team device's current address. */
static int team_port_set_team_dev_addr(struct team *team,
				       struct team_port *port)
{
	return __set_port_dev_addr(port->dev, team->dev->dev_addr);
}

/*
 * Exported helper: applies the team device address to the port and returns
 * the result of that address change.
 */
int team_modeop_port_enter(struct team *team, struct team_port *port)
{
	return team_port_set_team_dev_addr(team, port);
}
EXPORT_SYMBOL(team_modeop_port_enter);

/*
 * Exported helper: applies the team device address to the port; the result
 * of the address change is ignored.
 */
void team_modeop_port_change_dev_addr(struct team *team,
				      struct team_port *port)
{
	team_port_set_team_dev_addr(team, port);
}
EXPORT_SYMBOL(team_modeop_port_change_dev_addr);

/*
 * Report the port's current link state (port->linkup) and whether it is
 * enabled for tx via netdev_lower_state_changed().
 */
static void team_lower_state_changed(struct team_port *port)
{
	struct netdev_lag_lower_state_info info;

	info.link_up = port->linkup;
	info.tx_enabled = team_port_enabled(port);
	netdev_lower_state_changed(port->dev, &info);
}

/*
 * Recompute port->linkup: the user-supplied value wins when
 * user.linkup_enabled is set, otherwise state.linkup is used. A change is
 * propagated through team_lower_state_changed().
 */
static void team_refresh_port_linkup(struct team_port *port)
{
	bool new_linkup = port->user.linkup_enabled ?
port->user.linkup : port->state.linkup;

	if (port->linkup != new_linkup) {
		port->linkup = new_linkup;
		team_lower_state_changed(port);
	}
}


/*******************
 * Options handling
 *******************/

struct team_option_inst { /* One for each option instance */
	struct list_head list;		/* entry in team->option_inst_list */
	struct list_head tmp_list;
	struct team_option *option;	/* option this instance belongs to */
	struct team_option_inst_info info; /* port and array index */
	bool changed;
	bool removed;
};

/* Look up a registered option by name; returns NULL when not found. */
static struct team_option *__team_find_option(struct team *team,
					      const char *opt_name)
{
	struct team_option *option;

	list_for_each_entry(option, &team->option_list, list) {
		if (strcmp(option->name, opt_name) == 0)
			return option;
	}
	return NULL;
}

/* Unlink one option instance from its list and free it. */
static void __team_option_inst_del(struct team_option_inst *opt_inst)
{
	list_del(&opt_inst->list);
	kfree(opt_inst);
}

/* Delete every instance on team->option_inst_list that belongs to option. */
static void __team_option_inst_del_option(struct team *team,
					  struct team_option *option)
{
	struct team_option_inst *opt_inst, *tmp;

	list_for_each_entry_safe(opt_inst, tmp, &team->option_inst_list, list) {
		if (opt_inst->option == option)
			__team_option_inst_del(opt_inst);
	}
}

/*
 * Allocate and queue the instances of option for the given port (NULL for
 * options that are not per-port): one per array element, or a single one
 * when array_size is zero. New instances start out marked as changed and
 * option->init, when provided, is called on each.
 *
 * Returns 0 or -ENOMEM. Instances added before a failed allocation stay on
 * the list; the callers in this file remove them in their error paths.
 */
static int __team_option_inst_add(struct team *team, struct team_option *option,
				  struct team_port *port)
{
	struct team_option_inst *opt_inst;
	unsigned int array_size;
	unsigned int i;

	array_size = option->array_size;
	if (!array_size)
		array_size = 1; /* No array but still need one instance */

	for (i = 0; i < array_size; i++) {
		opt_inst = kmalloc(sizeof(*opt_inst), GFP_KERNEL);
		if (!opt_inst)
			return -ENOMEM;
		opt_inst->option = option;
		opt_inst->info.port = port;
		opt_inst->info.array_index = i;
		opt_inst->changed = true;
		opt_inst->removed = false;
		list_add_tail(&opt_inst->list, &team->option_inst_list);
		if (option->init)
			option->init(team, &opt_inst->info);

	}
	return 0;
}

/*
 * Create the instances of a non-per-port option; per-port options get none
 * here. On failure all instances of the option are removed again.
 */
static int __team_option_inst_add_option(struct team *team,
					 struct team_option *option)
{
	int err;

	if (!option->per_port) {
		err = __team_option_inst_add(team, option, NULL);
		if (err)
			goto inst_del_option;
	}
	return 0;

inst_del_option:
	__team_option_inst_del_option(team, option);
return err;
}

/* Flag every instance of option as changed and removed. */
static void __team_option_inst_mark_removed_option(struct team *team,
						   struct team_option *option)
{
	struct team_option_inst *opt_inst;

	list_for_each_entry(opt_inst, &team->option_inst_list, list) {
		if (opt_inst->option == option) {
			opt_inst->changed = true;
			opt_inst->removed = true;
		}
	}
}

/* Delete all per-port option instances that belong to port. */
static void __team_option_inst_del_port(struct team *team,
					struct team_port *port)
{
	struct team_option_inst *opt_inst, *tmp;

	list_for_each_entry_safe(opt_inst, tmp, &team->option_inst_list, list) {
		if (opt_inst->option->per_port &&
		    opt_inst->info.port == port)
			__team_option_inst_del(opt_inst);
	}
}

/*
 * Create instances of every registered per-port option for port. On
 * failure the per-port instances of this port are deleted again and the
 * error is returned.
 */
static int __team_option_inst_add_port(struct team *team,
				       struct team_port *port)
{
	struct team_option *option;
	int err;

	list_for_each_entry(option, &team->option_list, list) {
		if (!option->per_port)
			continue;
		err = __team_option_inst_add(team, option, port);
		if (err)
			goto inst_del_port;
	}
	return 0;

inst_del_port:
	__team_option_inst_del_port(team, port);
	return err;
}

/* Flag every option instance attached to port as changed and removed. */
static void __team_option_inst_mark_removed_port(struct team *team,
						 struct team_port *port)
{
	struct team_option_inst *opt_inst;

	list_for_each_entry(opt_inst, &team->option_inst_list, list) {
		if (opt_inst->info.port == port) {
			opt_inst->changed = true;
			opt_inst->removed = true;
		}
	}
}

/*
 * Register option_count options: each one is duplicated with kmemdup(),
 * gets its instances created and is appended to team->option_list.
 * Returns -EEXIST when an option of the same name is already registered
 * and -ENOMEM on allocation failure; both unwind through the rollback
 * labels below.
 */
static int __team_options_register(struct team *team,
				   const struct team_option *option,
				   size_t option_count)
{
	int i;
	struct team_option **dst_opts;
	int err;

	/* temporary array holding the duplicated options */
	dst_opts = kcalloc(option_count, sizeof(struct team_option *),
			   GFP_KERNEL);
	if (!dst_opts)
		return -ENOMEM;
	for (i = 0; i < option_count; i++, option++) {
		if (__team_find_option(team, option->name)) {
			err = -EEXIST;
			goto alloc_rollback;
		}
		dst_opts[i] = kmemdup(option, sizeof(*option), GFP_KERNEL);
		if (!dst_opts[i]) {
			err = -ENOMEM;
			goto alloc_rollback;
		}
	}

	for (i = 0; i < option_count; i++) {
		err = __team_option_inst_add_option(team, dst_opts[i]);
		if (err)
			goto inst_rollback;
		list_add_tail(&dst_opts[i]->list, &team->option_list);
	}

	kfree(dst_opts);
	return 0;

inst_rollback:
for (i--; i >= 0; i--) {
		/* undo the options that were already fully added */
		__team_option_inst_del_option(team, dst_opts[i]);
		list_del(&dst_opts[i]->list);
	}

	/* all duplicates were allocated, so free every one of them below */
	i = option_count;
alloc_rollback:
	for (i--; i >= 0; i--)
		kfree(dst_opts[i]);

	kfree(dst_opts);
	return err;
}

/*
 * For each listed option that is currently registered, mark its instances
 * as changed and removed.
 */
static void __team_options_mark_removed(struct team *team,
					const struct team_option *option,
					size_t option_count)
{
	int i;

	for (i = 0; i < option_count; i++, option++) {
		struct team_option *del_opt;

		del_opt = __team_find_option(team, option->name);
		if (del_opt)
			__team_option_inst_mark_removed_option(team, del_opt);
	}
}

/*
 * For each listed option that is currently registered, delete its
 * instances, unlink it from the option list and free it.
 */
static void __team_options_unregister(struct team *team,
				      const struct team_option *option,
				      size_t option_count)
{
	int i;

	for (i = 0; i < option_count; i++, option++) {
		struct team_option *del_opt;

		del_opt = __team_find_option(team, option->name);
		if (del_opt) {
			__team_option_inst_del_option(team, del_opt);
			list_del(&del_opt->list);
			kfree(del_opt);
		}
	}
}

static void __team_options_change_check(struct team *team);

/*
 * Exported: register options and, on success, run the change check.
 * Returns 0 or the error from __team_options_register().
 */
int team_options_register(struct team *team,
			  const struct team_option *option,
			  size_t option_count)
{
	int err;

	err = __team_options_register(team, option, option_count);
	if (err)
		return err;
	__team_options_change_check(team);
	return 0;
}
EXPORT_SYMBOL(team_options_register);

/*
 * Exported: mark the options' instances as removed, run the change check
 * while they still exist, then delete them.
 */
void team_options_unregister(struct team *team,
			     const struct team_option *option,
			     size_t option_count)
{
	__team_options_mark_removed(team, option, option_count);
	__team_options_change_check(team);
	__team_options_unregister(team, option, option_count);
}
EXPORT_SYMBOL(team_options_unregister);

/* Call the option's getter; -EOPNOTSUPP when the option has none. */
static int team_option_get(struct team *team,
			   struct team_option_inst *opt_inst,
			   struct team_gsetter_ctx *ctx)
{
	if (!opt_inst->option->getter)
		return -EOPNOTSUPP;

	opt_inst->option->getter(team, ctx);
	return 0;
}

/*
 * Call the option's setter and return its result; -EOPNOTSUPP when the
 * option has none.
 */
static int team_option_set(struct team *team,
			   struct team_option_inst *opt_inst,
			   struct team_gsetter_ctx *ctx)
{
	if (!opt_inst->option->setter)
		return -EOPNOTSUPP;

	return opt_inst->option->setter(team, ctx);
}

void team_option_inst_set_change(struct
team_option_inst_info *opt_inst_info)
{
	struct team_option_inst *opt_inst;

	/* opt_inst_info is the 'info' member embedded in the instance */
	opt_inst = container_of(opt_inst_info, struct team_option_inst, info);
	opt_inst->changed = true;
}
EXPORT_SYMBOL(team_option_inst_set_change);

/* Exported wrapper around __team_options_change_check(). */
void team_options_change_check(struct team *team)
{
	__team_options_change_check(team);
}
EXPORT_SYMBOL(team_options_change_check);


/****************
 * Mode handling
 ****************/

/* Registered modes; the list is accessed under mode_list_lock. */
static LIST_HEAD(mode_list);
static DEFINE_SPINLOCK(mode_list_lock);

struct team_mode_item {
	struct list_head list;		/* entry in mode_list */
	const struct team_mode *mode;
};

/*
 * Find a registered mode by its kind string; returns NULL when not found.
 * Every caller in this file holds mode_list_lock.
 */
static struct team_mode_item *__find_mode(const char *kind)
{
	struct team_mode_item *mitem;

	list_for_each_entry(mitem, &mode_list, list) {
		if (strcmp(mitem->mode->kind, kind) == 0)
			return mitem;
	}
	return NULL;
}

/* A mode name may consist of letters, digits and '_' only. */
static bool is_good_mode_name(const char *name)
{
	while (*name != '\0') {
		if (!isalpha(*name) && !isdigit(*name) && *name != '_')
			return false;
		name++;
	}
	return true;
}

/*
 * Exported: add mode to the list of registered modes.
 *
 * Returns -EINVAL for a bad kind name or a priv_size larger than
 * TEAM_MODE_PRIV_SIZE, -ENOMEM on allocation failure and -EEXIST when a
 * mode of the same kind is already registered.
 */
int team_mode_register(const struct team_mode *mode)
{
	int err = 0;
	struct team_mode_item *mitem;

	if (!is_good_mode_name(mode->kind) ||
	    mode->priv_size > TEAM_MODE_PRIV_SIZE)
		return -EINVAL;

	/* allocate before taking the spinlock (GFP_KERNEL allocation) */
	mitem = kmalloc(sizeof(*mitem), GFP_KERNEL);
	if (!mitem)
		return -ENOMEM;

	spin_lock(&mode_list_lock);
	if (__find_mode(mode->kind)) {
		err = -EEXIST;
		kfree(mitem);
		goto unlock;
	}
	mitem->mode = mode;
	list_add_tail(&mitem->list, &mode_list);
unlock:
	spin_unlock(&mode_list_lock);
	return err;
}
EXPORT_SYMBOL(team_mode_register);

/* Exported: remove mode from the registered list, if it is there. */
void team_mode_unregister(const struct team_mode *mode)
{
	struct team_mode_item *mitem;

	spin_lock(&mode_list_lock);
	mitem = __find_mode(mode->kind);
	if (mitem) {
		list_del_init(&mitem->list);
		kfree(mitem);
	}
	spin_unlock(&mode_list_lock);
}
EXPORT_SYMBOL(team_mode_unregister);

/*
 * Look up a mode by kind and take a reference on its owner module.
 * When the kind is not registered, the lock is dropped to request the
 * "team-mode-<kind>" module and the lookup is retried. Returns NULL when
 * the mode cannot be found or a module reference cannot be taken.
 */
static const struct team_mode *team_mode_get(const char *kind)
{
	struct team_mode_item *mitem;
	const struct team_mode *mode = NULL;

	if (!try_module_get(THIS_MODULE))
		return NULL;

	spin_lock(&mode_list_lock);
	mitem = __find_mode(kind);
	if (!mitem) {
		spin_unlock(&mode_list_lock);
request_module("team-mode-%s", kind);
		spin_lock(&mode_list_lock);
		mitem = __find_mode(kind);
	}
	if (mitem) {
		mode = mitem->mode;
		if (!try_module_get(mode->owner))
			mode = NULL;
	}

	spin_unlock(&mode_list_lock);
	module_put(THIS_MODULE);
	return mode;
}

/* Drop the module reference taken by team_mode_get(). */
static void team_mode_put(const struct team_mode *mode)
{
	module_put(mode->owner);
}

/* Fallback transmit op: frees the skb and reports it as not sent. */
static bool team_dummy_transmit(struct team *team, struct sk_buff *skb)
{
	dev_kfree_skb_any(skb);
	return false;
}

/* Fallback receive op: always returns RX_HANDLER_ANOTHER. */
static rx_handler_result_t team_dummy_receive(struct team *team,
					      struct team_port *port,
					      struct sk_buff *skb)
{
	return RX_HANDLER_ANOTHER;
}

/* Placeholder that team->mode points to while no real mode is set. */
static const struct team_mode __team_no_mode = {
	.kind		= "*NOMODE*",
};

static bool team_is_mode_set(struct team *team)
{
	return team->mode != &__team_no_mode;
}

/* Point the team at the placeholder mode and clear user_carrier_enabled. */
static void team_set_no_mode(struct team *team)
{
	team->user_carrier_enabled = false;
	team->mode = &__team_no_mode;
}

/*
 * Choose the transmit/receive callbacks stored in team->ops: the mode's
 * own op when there is an enabled port, a mode is set and the mode
 * provides that op; the dummy op otherwise.
 */
static void team_adjust_ops(struct team *team)
{
	/*
	 * To avoid checks in rx/tx skb paths, ensure here that non-null and
	 * correct ops are always set.
	 */

	if (!team->en_port_count || !team_is_mode_set(team) ||
	    !team->mode->ops->transmit)
		team->ops.transmit = team_dummy_transmit;
	else
		team->ops.transmit = team->mode->ops->transmit;

	if (!team->en_port_count || !team_is_mode_set(team) ||
	    !team->mode->ops->receive)
		team->ops.receive = team_dummy_receive;
	else
		team->ops.receive = team->mode->ops->receive;
}

/*
 * We can benefit from the fact that it's ensured no port is present
 * at the time of mode change. Therefore no packets are in fly so there's no
 * need to set mode operations in any special way.
 */
static int __team_change_mode(struct team *team,
			      const struct team_mode *new_mode)
{
	/* Check if mode was previously set and do cleanup if so */
	if (team_is_mode_set(team)) {
		/* saved first: team->ops is wiped just below */
		void (*exit_op)(struct team *team) = team->ops.exit;

		/* Clear ops area so no callback is called any longer */
		memset(&team->ops, 0, sizeof(struct team_mode_ops));
		team_adjust_ops(team);

		if (exit_op)
			exit_op(team);
		team_mode_put(team->mode);
		team_set_no_mode(team);
		/* zero private data area */
		memset(&team->mode_priv, 0,
		       sizeof(struct team) - offsetof(struct team, mode_priv));
	}

	if (!new_mode)
		return 0;

	/* if init fails the team is left with no mode set */
	if (new_mode->ops->init) {
		int err;

		err = new_mode->ops->init(team);
		if (err)
			return err;
	}

	team->mode = new_mode;
	memcpy(&team->ops, new_mode->ops, sizeof(struct team_mode_ops));
	team_adjust_ops(team);

	return 0;
}

/*
 * Switch the team to the mode named by kind.
 *
 * Returns -EBUSY while any port is present, -EINVAL when kind is the
 * current mode or cannot be found, or the error from __team_change_mode()
 * (in which case the reference on the new mode is dropped again).
 */
static int team_change_mode(struct team *team, const char *kind)
{
	const struct team_mode *new_mode;
	struct net_device *dev = team->dev;
	int err;

	if (!list_empty(&team->port_list)) {
		netdev_err(dev, "No ports can be present during mode change\n");
		return -EBUSY;
	}

	if (team_is_mode_set(team) && strcmp(team->mode->kind, kind) == 0) {
		netdev_err(dev, "Unable to change to the same mode the team is in\n");
		return -EINVAL;
	}

	new_mode = team_mode_get(kind);
	if (!new_mode) {
		netdev_err(dev, "Mode \"%s\" not found\n", kind);
		return -EINVAL;
	}

	err = __team_change_mode(team, new_mode);
	if (err) {
		netdev_err(dev, "Failed to change to mode \"%s\"\n", kind);
		team_mode_put(new_mode);
		return err;
	}

	netdev_info(dev, "Mode changed to \"%s\"\n", kind);
	return 0;
}


/*********************
 * Peers notification
 *********************/

/*
 * Delayed work: consume one pending notification and raise
 * NETDEV_NOTIFY_PEERS for the team device under RTNL.
 */
static void team_notify_peers_work(struct work_struct *work)
{
	struct team *team;
	int val;

	team = container_of(work, struct team, notify_peers.dw.work);

	/* RTNL is not waited for; the work is re-queued and tried again */
	if (!rtnl_trylock()) {
		schedule_delayed_work(&team->notify_peers.dw, 0);
		return;
	}
	val = atomic_dec_if_positive(&team->notify_peers.count_pending);
	if (val < 0) {
		rtnl_unlock();
		return;
	}
call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, team->dev);
	rtnl_unlock();
	/* more notifications pending: run again after the configured interval */
	if (val)
		schedule_delayed_work(&team->notify_peers.dw,
				      msecs_to_jiffies(team->notify_peers.interval));
}

/*
 * Queue notify_peers.count peer notifications and kick the worker.
 * Does nothing when the count is zero or the team device is not running.
 */
static void team_notify_peers(struct team *team)
{
	if (!team->notify_peers.count || !netif_running(team->dev))
		return;
	atomic_add(team->notify_peers.count, &team->notify_peers.count_pending);
	schedule_delayed_work(&team->notify_peers.dw, 0);
}

static void team_notify_peers_init(struct team *team)
{
	INIT_DELAYED_WORK(&team->notify_peers.dw, team_notify_peers_work);
}

static void team_notify_peers_fini(struct team *team)
{
	cancel_delayed_work_sync(&team->notify_peers.dw);
}


/*******************************
 * Send multicast group rejoins
 *******************************/

/*
 * Delayed work: consume one pending rejoin and raise NETDEV_RESEND_IGMP
 * for the team device under RTNL.
 */
static void team_mcast_rejoin_work(struct work_struct *work)
{
	struct team *team;
	int val;

	team = container_of(work, struct team, mcast_rejoin.dw.work);

	/* RTNL is not waited for; the work is re-queued and tried again */
	if (!rtnl_trylock()) {
		schedule_delayed_work(&team->mcast_rejoin.dw, 0);
		return;
	}
	val = atomic_dec_if_positive(&team->mcast_rejoin.count_pending);
	if (val < 0) {
		rtnl_unlock();
		return;
	}
	call_netdevice_notifiers(NETDEV_RESEND_IGMP, team->dev);
	rtnl_unlock();
	/* more rejoins pending: run again after the configured interval */
	if (val)
		schedule_delayed_work(&team->mcast_rejoin.dw,
				      msecs_to_jiffies(team->mcast_rejoin.interval));
}

/*
 * Queue mcast_rejoin.count rejoins and kick the worker. Does nothing when
 * the count is zero or the team device is not running.
 */
static void team_mcast_rejoin(struct team *team)
{
	if (!team->mcast_rejoin.count || !netif_running(team->dev))
		return;
	atomic_add(team->mcast_rejoin.count, &team->mcast_rejoin.count_pending);
	schedule_delayed_work(&team->mcast_rejoin.dw, 0);
}

static void team_mcast_rejoin_init(struct team *team)
{
	INIT_DELAYED_WORK(&team->mcast_rejoin.dw, team_mcast_rejoin_work);
}

static void team_mcast_rejoin_fini(struct team *team)
{
	cancel_delayed_work_sync(&team->mcast_rejoin.dw);
}


/************************
 * Rx path frame handler
 ************************/

/* note: already called with rcu_read_lock */
static rx_handler_result_t team_handle_frame(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;
	struct team_port *port;
	struct
team *team;
	rx_handler_result_t res;

	skb = skb_share_check(skb, GFP_ATOMIC);
	if (!skb)
		return RX_HANDLER_CONSUMED;

	/* hand the (possibly replaced) skb back to the caller */
	*pskb = skb;

	port = team_port_get_rcu(skb->dev);
	team = port->team;
	if (!team_port_enabled(port)) {
		if (is_link_local_ether_addr(eth_hdr(skb)->h_dest))
			/* link-local packets are mostly useful when stack receives them
			 * with the link they arrive on.
			 */
			return RX_HANDLER_PASS;
		/* allow exact match delivery for disabled ports */
		res = RX_HANDLER_EXACT;
	} else {
		res = team->ops.receive(team, port, skb);
	}
	if (res == RX_HANDLER_ANOTHER) {
		struct team_pcpu_stats *pcpu_stats;

		/* count the frame and redirect it to the team device */
		pcpu_stats = this_cpu_ptr(team->pcpu_stats);
		u64_stats_update_begin(&pcpu_stats->syncp);
		u64_stats_inc(&pcpu_stats->rx_packets);
		u64_stats_add(&pcpu_stats->rx_bytes, skb->len);
		if (skb->pkt_type == PACKET_MULTICAST)
			u64_stats_inc(&pcpu_stats->rx_multicast);
		u64_stats_update_end(&pcpu_stats->syncp);

		skb->dev = team->dev;
	} else if (res == RX_HANDLER_EXACT) {
		this_cpu_inc(team->pcpu_stats->rx_nohandler);
	} else {
		this_cpu_inc(team->pcpu_stats->rx_dropped);
	}

	return res;
}


/*************************************
 * Multiqueue Tx port select override
 *************************************/

/*
 * Allocate and initialise team->qom_lists: one list head for each tx queue
 * of the team device except one (num_tx_queues - 1 entries). Nothing is
 * allocated when that count is zero. Returns 0 or -ENOMEM.
 */
static int team_queue_override_init(struct team *team)
{
	struct list_head *listarr;
	unsigned int queue_cnt = team->dev->num_tx_queues - 1;
	unsigned int i;

	if (!queue_cnt)
		return 0;
	listarr = kmalloc_array(queue_cnt, sizeof(struct list_head),
				GFP_KERNEL);
	if (!listarr)
		return -ENOMEM;
	team->qom_lists = listarr;
	for (i = 0; i < queue_cnt; i++)
		INIT_LIST_HEAD(listarr++);
	return 0;
}

static void team_queue_override_fini(struct team *team)
{
	kfree(team->qom_lists);
}

/* The list for queue_id lives at index queue_id - 1 of team->qom_lists. */
static struct list_head *__team_get_qom_list(struct team *team, u16 queue_id)
{
	return &team->qom_lists[queue_id - 1];
}

/*
 * note: already called with rcu_read_lock
 */
static bool team_queue_override_transmit(struct team *team, struct sk_buff *skb)
{
	struct list_head *qom_list;
	struct team_port *port;

	if (!team->queue_override_enabled ||
!skb->queue_mapping) return false; qom_list = __team_get_qom_list(team, skb->queue_mapping); list_for_each_entry_rcu(port, qom_list, qom_list) { if (!team_dev_queue_xmit(team, port, skb)) return true; } return false; } static void __team_queue_override_port_del(struct team *team, struct team_port *port) { if (!port->queue_id) return; list_del_rcu(&port->qom_list); } static bool team_queue_override_port_has_gt_prio_than(struct team_port *port, struct team_port *cur) { if (port->priority < cur->priority) return true; if (port->priority > cur->priority) return false; if (port->index < cur->index) return true; return false; } static void __team_queue_override_port_add(struct team *team, struct team_port *port) { struct team_port *cur; struct list_head *qom_list; struct list_head *node; if (!port->queue_id) return; qom_list = __team_get_qom_list(team, port->queue_id); node = qom_list; list_for_each_entry(cur, qom_list, qom_list) { if (team_queue_override_port_has_gt_prio_than(port, cur)) break; node = &cur->qom_list; } list_add_tail_rcu(&port->qom_list, node); } static void __team_queue_override_enabled_check(struct team *team) { struct team_port *port; bool enabled = false; list_for_each_entry(port, &team->port_list, list) { if (port->queue_id) { enabled = true; break; } } if (enabled == team->queue_override_enabled) return; netdev_dbg(team->dev, "%s queue override\n", enabled ? 
"Enabling" : "Disabling"); team->queue_override_enabled = enabled; } static void team_queue_override_port_prio_changed(struct team *team, struct team_port *port) { if (!port->queue_id || team_port_enabled(port)) return; __team_queue_override_port_del(team, port); __team_queue_override_port_add(team, port); __team_queue_override_enabled_check(team); } static void team_queue_override_port_change_queue_id(struct team *team, struct team_port *port, u16 new_queue_id) { if (team_port_enabled(port)) { __team_queue_override_port_del(team, port); port->queue_id = new_queue_id; __team_queue_override_port_add(team, port); __team_queue_override_enabled_check(team); } else { port->queue_id = new_queue_id; } } static void team_queue_override_port_add(struct team *team, struct team_port *port) { __team_queue_override_port_add(team, port); __team_queue_override_enabled_check(team); } static void team_queue_override_port_del(struct team *team, struct team_port *port) { __team_queue_override_port_del(team, port); __team_queue_override_enabled_check(team); } /**************** * Port handling ****************/ static bool team_port_find(const struct team *team, const struct team_port *port) { struct team_port *cur; list_for_each_entry(cur, &team->port_list, list) if (cur == port) return true; return false; } /* * Enable/disable port by adding to enabled port hashlist and setting * port->index (Might be racy so reader could see incorrect ifindex when * processing a flying packet, but that is not a problem). Write guarded * by RTNL. 
*/ static void team_port_enable(struct team *team, struct team_port *port) { if (team_port_enabled(port)) return; port->index = team->en_port_count++; hlist_add_head_rcu(&port->hlist, team_port_index_hash(team, port->index)); team_adjust_ops(team); team_queue_override_port_add(team, port); if (team->ops.port_enabled) team->ops.port_enabled(team, port); team_notify_peers(team); team_mcast_rejoin(team); team_lower_state_changed(port); } static void __reconstruct_port_hlist(struct team *team, int rm_index) { int i; struct team_port *port; for (i = rm_index + 1; i < team->en_port_count; i++) { port = team_get_port_by_index(team, i); hlist_del_rcu(&port->hlist); port->index--; hlist_add_head_rcu(&port->hlist, team_port_index_hash(team, port->index)); } } static void team_port_disable(struct team *team, struct team_port *port) { if (!team_port_enabled(port)) return; if (team->ops.port_disabled) team->ops.port_disabled(team, port); hlist_del_rcu(&port->hlist); __reconstruct_port_hlist(team, port->index); port->index = -1; team->en_port_count--; team_queue_override_port_del(team, port); team_adjust_ops(team); team_lower_state_changed(port); } static int team_port_enter(struct team *team, struct team_port *port) { int err = 0; dev_hold(team->dev); if (team->ops.port_enter) { err = team->ops.port_enter(team, port); if (err) { netdev_err(team->dev, "Device %s failed to enter team mode\n", port->dev->name); goto err_port_enter; } } return 0; err_port_enter: dev_put(team->dev); return err; } static void team_port_leave(struct team *team, struct team_port *port) { if (team->ops.port_leave) team->ops.port_leave(team, port); dev_put(team->dev); } #ifdef CONFIG_NET_POLL_CONTROLLER static int __team_port_enable_netpoll(struct team_port *port) { struct netpoll *np; int err; np = kzalloc(sizeof(*np), GFP_KERNEL); if (!np) return -ENOMEM; err = __netpoll_setup(np, port->dev); if (err) { kfree(np); return err; } port->np = np; return err; } static int team_port_enable_netpoll(struct 
team_port *port)
{
	/* nothing to do unless the team device itself has netpoll info */
	if (!port->team->dev->npinfo)
		return 0;

	return __team_port_enable_netpoll(port);
}

/* Detach and free the port's netpoll instance, if it has one. */
static void team_port_disable_netpoll(struct team_port *port)
{
	struct netpoll *np = port->np;

	if (!np)
		return;
	port->np = NULL;

	__netpoll_free(np);
}
#else
/* Stubs used when CONFIG_NET_POLL_CONTROLLER is not defined. */
static int team_port_enable_netpoll(struct team_port *port)
{
	return 0;
}
static void team_port_disable_netpoll(struct team_port *port)
{
}
#endif

/*
 * Link the port device below the team device as its master upper device,
 * passing the mode's lag_tx_type, and set IFF_TEAM_PORT on success.
 */
static int team_upper_dev_link(struct team *team, struct team_port *port,
			       struct netlink_ext_ack *extack)
{
	struct netdev_lag_upper_info lag_upper_info;
	int err;

	lag_upper_info.tx_type = team->mode->lag_tx_type;
	lag_upper_info.hash_type = NETDEV_LAG_HASH_UNKNOWN;
	err = netdev_master_upper_dev_link(port->dev, team->dev, NULL,
					   &lag_upper_info, extack);
	if (err)
		return err;
	port->dev->priv_flags |= IFF_TEAM_PORT;
	return 0;
}

/* Undo team_upper_dev_link(). */
static void team_upper_dev_unlink(struct team *team, struct team_port *port)
{
	netdev_upper_dev_unlink(port->dev, team->dev);
	port->dev->priv_flags &= ~IFF_TEAM_PORT;
}

static void __team_port_change_port_added(struct team_port *port, bool linkup);
static int team_dev_type_check_change(struct net_device *dev,
				      struct net_device *port_dev);

/*
 * Add port_dev to the team as a new port.
 *
 * The device is rejected up front when it is a loopback device, already a
 * team port, the team device itself, already an upper or lower device of
 * the team, VLAN challenged while the team has VLANs, or currently up.
 * Setup then proceeds step by step; each failure jumps to the matching
 * err_* label, which unwinds the steps taken so far.
 */
static int team_port_add(struct team *team, struct net_device *port_dev,
			 struct netlink_ext_ack *extack)
{
	struct net_device *dev = team->dev;
	struct team_port *port;
	char *portname = port_dev->name;
	int err;

	if (port_dev->flags & IFF_LOOPBACK) {
		NL_SET_ERR_MSG(extack, "Loopback device can't be added as a team port");
		netdev_err(dev, "Device %s is loopback device. Loopback devices can't be added as a team port\n",
			   portname);
		return -EINVAL;
	}

	if (netif_is_team_port(port_dev)) {
		NL_SET_ERR_MSG(extack, "Device is already a port of a team device");
		netdev_err(dev, "Device %s is already a port "
				"of a team device\n", portname);
		return -EBUSY;
	}

	if (dev == port_dev) {
		NL_SET_ERR_MSG(extack, "Cannot enslave team device to itself");
		netdev_err(dev, "Cannot enslave team device to itself\n");
		return -EINVAL;
	}

	if (netdev_has_upper_dev(dev, port_dev)) {
		NL_SET_ERR_MSG(extack, "Device is already an upper device of the team interface");
		netdev_err(dev, "Device %s is already an upper device of the team interface\n",
			   portname);
		return -EBUSY;
	}

	if (netdev_has_upper_dev(port_dev, dev)) {
		NL_SET_ERR_MSG(extack, "Device is already a lower device of the team interface");
		netdev_err(dev, "Device %s is already a lower device of the team interface\n",
			   portname);
		return -EBUSY;
	}

	if (port_dev->features & NETIF_F_VLAN_CHALLENGED &&
	    vlan_uses_dev(dev)) {
		NL_SET_ERR_MSG(extack, "Device is VLAN challenged and team device has VLAN set up");
		netdev_err(dev, "Device %s is VLAN challenged and team device has VLAN set up\n",
			   portname);
		return -EPERM;
	}

	if (port_dev->flags & IFF_UP) {
		NL_SET_ERR_MSG(extack, "Device is up. Set it down before adding it as a team port");
		netdev_err(dev, "Device %s is up. Set it down before adding it as a team port\n",
			   portname);
		return -EBUSY;
	}

	/* port structure plus the mode's per-port private area */
	port = kzalloc(sizeof(struct team_port) + team->mode->port_priv_size,
		       GFP_KERNEL);
	if (!port)
		return -ENOMEM;

	port->dev = port_dev;
	port->team = team;
	INIT_LIST_HEAD(&port->qom_list);

	/* remember the original MTU so it can be restored later */
	port->orig.mtu = port_dev->mtu;
	/*
	 * MTU assignment will be handled in team_dev_type_check_change
	 * if dev and port_dev are of different types
	 */
	if (dev->type == port_dev->type) {
		err = dev_set_mtu(port_dev, dev->mtu);
		if (err) {
			netdev_dbg(dev, "Error %d calling dev_set_mtu\n", err);
			goto err_set_mtu;
		}
	}

	/* remember the original address so it can be restored later */
	memcpy(port->orig.dev_addr, port_dev->dev_addr, port_dev->addr_len);

	err = team_port_enter(team, port);
	if (err) {
		netdev_err(dev, "Device %s failed to enter team mode\n",
			   portname);
		goto err_port_enter;
	}

	err = dev_open(port_dev, extack);
	if (err) {
		netdev_dbg(dev, "Device %s opening failed\n",
			   portname);
		goto err_dev_open;
	}

	err = vlan_vids_add_by_dev(port_dev, dev);
	if (err) {
		netdev_err(dev, "Failed to add vlan ids to device %s\n",
				portname);
		goto err_vids_add;
	}

	err = team_port_enable_netpoll(port);
	if (err) {
		netdev_err(dev, "Failed to enable netpoll on device %s\n",
			   portname);
		goto err_enable_netpoll;
	}

	if (!(dev->features & NETIF_F_LRO))
		dev_disable_lro(port_dev);

	err = netdev_rx_handler_register(port_dev, team_handle_frame,
					 port);
	if (err) {
		netdev_err(dev, "Device %s failed to register rx_handler\n",
			   portname);
		goto err_handler_register;
	}

	err = team_upper_dev_link(team, port, extack);
	if (err) {
		netdev_err(dev, "Device %s failed to set upper link\n",
			   portname);
		goto err_set_upper_link;
	}

	err = __team_option_inst_add_port(team, port);
	if (err) {
		netdev_err(dev, "Device %s failed to add per-port options\n",
			   portname);
		goto err_option_port_add;
	}

	/* set promiscuity level to new slave */
	if (dev->flags & IFF_PROMISC) {
		err = dev_set_promiscuity(port_dev, 1);
		if (err)
			goto err_set_slave_promisc;
	}

	/* set allmulti level to new slave */
	if (dev->flags & IFF_ALLMULTI) {
		err = dev_set_allmulti(port_dev,
1);
		if (err) {
			/* drop the promiscuity reference taken just above */
			if (dev->flags & IFF_PROMISC)
				dev_set_promiscuity(port_dev, -1);
			goto err_set_slave_allmulti;
		}
	}

	err = team_dev_type_check_change(dev, port_dev);
	if (err)
		goto err_set_dev_type;

	if (dev->flags & IFF_UP) {
		netif_addr_lock_bh(dev);
		dev_uc_sync_multiple(port_dev, dev);
		dev_mc_sync_multiple(port_dev, dev);
		netif_addr_unlock_bh(dev);
	}

	/* -1 is the "not enabled" index; team_port_enable() assigns a real one */
	port->index = -1;
	list_add_tail_rcu(&port->list, &team->port_list);
	team_port_enable(team, port);
	netdev_compute_master_upper_features(team->dev, true);
	__team_port_change_port_added(port, !!netif_oper_up(port_dev));
	__team_options_change_check(team);

	netdev_info(dev, "Port device %s added\n", portname);

	return 0;

	/*
	 * Error unwinding: each label undoes the steps that succeeded before
	 * the failing one, in reverse order.
	 *
	 * NOTE(review): when team_dev_type_check_change() fails, the
	 * promiscuity/allmulti increments taken earlier in this function do
	 * not appear to be dropped on this path - confirm.
	 */
err_set_dev_type:
err_set_slave_allmulti:
err_set_slave_promisc:
	__team_option_inst_del_port(team, port);

err_option_port_add:
	team_upper_dev_unlink(team, port);

err_set_upper_link:
	netdev_rx_handler_unregister(port_dev);

err_handler_register:
	team_port_disable_netpoll(port);

err_enable_netpoll:
	vlan_vids_del_by_dev(port_dev, dev);

err_vids_add:
	dev_close(port_dev);

err_dev_open:
	team_port_leave(team, port);
	team_port_set_orig_dev_addr(port);

err_port_enter:
	dev_set_mtu(port_dev, port->orig.mtu);

err_set_mtu:
	kfree(port);

	return err;
}

static void __team_port_change_port_removed(struct team_port *port);

/*
 * Remove port_dev from the team. Returns -ENOENT when the device is not a
 * port of this team.
 */
static int team_port_del(struct team *team, struct net_device *port_dev)
{
	struct net_device *dev = team->dev;
	struct team_port *port;
	char *portname = port_dev->name;

	port = team_port_get_rtnl(port_dev);
	if (!port || !team_port_find(team, port)) {
		netdev_err(dev, "Device %s does not act as a port of this team\n",
			   portname);
		return -ENOENT;
	}

	team_port_disable(team, port);
	list_del_rcu(&port->list);

	/* drop the promiscuity/allmulti references held for the team device */
	if (dev->flags & IFF_PROMISC)
		dev_set_promiscuity(port_dev, -1);
	if (dev->flags & IFF_ALLMULTI)
		dev_set_allmulti(port_dev, -1);

	team_upper_dev_unlink(team, port);
	netdev_rx_handler_unregister(port_dev);
	team_port_disable_netpoll(port);
	vlan_vids_del_by_dev(port_dev, dev);
	if (dev->flags & IFF_UP) {
dev_uc_unsync(port_dev, dev);
		dev_mc_unsync(port_dev, dev);
	}
	dev_close(port_dev);
	team_port_leave(team, port);

	/*
	 * Mark the port's option instances as removed and run the change
	 * check while they still exist, then delete them.
	 */
	__team_option_inst_mark_removed_port(team, port);
	__team_options_change_check(team);
	__team_option_inst_del_port(team, port);
	__team_port_change_port_removed(port);

	/* restore the address and MTU saved when the port was added */
	team_port_set_orig_dev_addr(port);
	dev_set_mtu(port_dev, port->orig.mtu);
	kfree_rcu(port, rcu);
	netdev_info(dev, "Port device %s removed\n", portname);
	netdev_compute_master_upper_features(team->dev, true);

	return 0;
}


/*****************
 * Net device ops
 *****************/

/* "mode" option: the getter reports the kind string of the current mode. */
static void team_mode_option_get(struct team *team, struct team_gsetter_ctx *ctx)
{
	ctx->data.str_val = team->mode->kind;
}

/* "mode" option: the setter switches to the mode named by the string. */
static int team_mode_option_set(struct team *team, struct team_gsetter_ctx *ctx)
{
	return team_change_mode(team, ctx->data.str_val);
}

/* Getters/setters below copy a u32 between ctx->data and the team fields. */
static void team_notify_peers_count_get(struct team *team,
					struct team_gsetter_ctx *ctx)
{
	ctx->data.u32_val = team->notify_peers.count;
}

static int team_notify_peers_count_set(struct team *team,
				       struct team_gsetter_ctx *ctx)
{
	team->notify_peers.count = ctx->data.u32_val;
	return 0;
}

static void team_notify_peers_interval_get(struct team *team,
					   struct team_gsetter_ctx *ctx)
{
	ctx->data.u32_val = team->notify_peers.interval;
}

static int team_notify_peers_interval_set(struct team *team,
					  struct team_gsetter_ctx *ctx)
{
	team->notify_peers.interval = ctx->data.u32_val;
	return 0;
}

static void team_mcast_rejoin_count_get(struct team *team,
					struct team_gsetter_ctx *ctx)
{
	ctx->data.u32_val = team->mcast_rejoin.count;
}

static int team_mcast_rejoin_count_set(struct team *team,
				       struct team_gsetter_ctx *ctx)
{
	team->mcast_rejoin.count = ctx->data.u32_val;
	return 0;
}

static void team_mcast_rejoin_interval_get(struct team *team,
					   struct team_gsetter_ctx *ctx)
{
	ctx->data.u32_val = team->mcast_rejoin.interval;
}

static int team_mcast_rejoin_interval_set(struct team *team,
					  struct team_gsetter_ctx *ctx)
{
	team->mcast_rejoin.interval = ctx->data.u32_val;
	return 0;
}

static void
team_port_en_option_get(struct team *team,
			struct team_gsetter_ctx *ctx)
{
	struct team_port *port = ctx->info->port;

	ctx->data.bool_val = team_port_enabled(port);
}

/* Per-port "enabled" setter: enable or disable the port; always returns 0. */
static int team_port_en_option_set(struct team *team,
				   struct team_gsetter_ctx *ctx)
{
	struct team_port *port = ctx->info->port;

	if (ctx->data.bool_val)
		team_port_enable(team, port);
	else
		team_port_disable(team, port);
	return 0;
}

/* Per-port "user_linkup" getter: report port->user.linkup. */
static void team_user_linkup_option_get(struct team *team,
					struct team_gsetter_ctx *ctx)
{
	struct team_port *port = ctx->info->port;

	ctx->data.bool_val = port->user.linkup;
}

static void __team_carrier_check(struct team *team);

/*
 * Per-port "user_linkup" setter: store the value, then refresh the port's
 * link state and re-evaluate the team carrier. Always returns 0.
 */
static int team_user_linkup_option_set(struct team *team,
				       struct team_gsetter_ctx *ctx)
{
	struct team_port *port = ctx->info->port;

	port->user.linkup = ctx->data.bool_val;
	team_refresh_port_linkup(port);
	__team_carrier_check(port->team);
	return 0;
}

/* Per-port "user_linkup_enabled" getter: report port->user.linkup_enabled. */
static void team_user_linkup_en_option_get(struct team *team,
					   struct team_gsetter_ctx *ctx)
{
	struct team_port *port = ctx->info->port;

	ctx->data.bool_val = port->user.linkup_enabled;
}

/*
 * Per-port "user_linkup_enabled" setter: store the value, then refresh the
 * port's link state and re-evaluate the team carrier. Always returns 0.
 */
static int team_user_linkup_en_option_set(struct team *team,
					  struct team_gsetter_ctx *ctx)
{
	struct team_port *port = ctx->info->port;

	port->user.linkup_enabled = ctx->data.bool_val;
	team_refresh_port_linkup(port);
	__team_carrier_check(port->team);
	return 0;
}

/* Per-port "priority" getter: report port->priority. */
static void team_priority_option_get(struct team *team,
				     struct team_gsetter_ctx *ctx)
{
	struct team_port *port = ctx->info->port;

	ctx->data.s32_val = port->priority;
}

/*
 * Per-port "priority" setter. An unchanged value is a no-op; otherwise the
 * new priority is stored and the queue-override code is told about it.
 * Always returns 0.
 */
static int team_priority_option_set(struct team *team,
				    struct team_gsetter_ctx *ctx)
{
	struct team_port *port = ctx->info->port;
	s32 priority = ctx->data.s32_val;

	if (port->priority == priority)
		return 0;
	port->priority = priority;
	team_queue_override_port_prio_changed(team, port);
	return 0;
}

/* Per-port "queue_id" getter: report port->queue_id. */
static void team_queue_id_option_get(struct team *team,
				     struct team_gsetter_ctx *ctx)
{
	struct team_port *port = ctx->info->port;

	ctx->data.u32_val = port->queue_id;
}

/*
 * Per-port "queue_id" setter. An unchanged id is a no-op (returns 0), an id
 * not below the team device's real_num_tx_queues is rejected with -EINVAL,
 * anything else is handed to the queue-override code.
 */
static int team_queue_id_option_set(struct team
*team,
				    struct team_gsetter_ctx *ctx)
{
	struct team_port *port = ctx->info->port;
	/*
	 * NOTE(review): the u32 option value is narrowed to u16 here, before
	 * the range check below, so values above 0xffff are checked modulo
	 * 65536 - confirm this is intended.
	 */
	u16 new_queue_id = ctx->data.u32_val;

	if (port->queue_id == new_queue_id)
		return 0;
	if (new_queue_id >= team->dev->real_num_tx_queues)
		return -EINVAL;
	team_queue_override_port_change_queue_id(team, port, new_queue_id);
	return 0;
}

/*
 * Options provided by the team core itself. The first five are team-wide;
 * the entries with .per_port set are handled per port through
 * ctx->info->port in their getters/setters.
 */
static const struct team_option team_options[] = {
	{
		.name = "mode",
		.type = TEAM_OPTION_TYPE_STRING,
		.getter = team_mode_option_get,
		.setter = team_mode_option_set,
	},
	{
		.name = "notify_peers_count",
		.type = TEAM_OPTION_TYPE_U32,
		.getter = team_notify_peers_count_get,
		.setter = team_notify_peers_count_set,
	},
	{
		.name = "notify_peers_interval",
		.type = TEAM_OPTION_TYPE_U32,
		.getter = team_notify_peers_interval_get,
		.setter = team_notify_peers_interval_set,
	},
	{
		.name = "mcast_rejoin_count",
		.type = TEAM_OPTION_TYPE_U32,
		.getter = team_mcast_rejoin_count_get,
		.setter = team_mcast_rejoin_count_set,
	},
	{
		.name = "mcast_rejoin_interval",
		.type = TEAM_OPTION_TYPE_U32,
		.getter = team_mcast_rejoin_interval_get,
		.setter = team_mcast_rejoin_interval_set,
	},
	{
		.name = "enabled",
		.type = TEAM_OPTION_TYPE_BOOL,
		.per_port = true,
		.getter = team_port_en_option_get,
		.setter = team_port_en_option_set,
	},
	{
		.name = "user_linkup",
		.type = TEAM_OPTION_TYPE_BOOL,
		.per_port = true,
		.getter = team_user_linkup_option_get,
		.setter = team_user_linkup_option_set,
	},
	{
		.name = "user_linkup_enabled",
		.type = TEAM_OPTION_TYPE_BOOL,
		.per_port = true,
		.getter = team_user_linkup_en_option_get,
		.setter = team_user_linkup_en_option_set,
	},
	{
		.name = "priority",
		.type = TEAM_OPTION_TYPE_S32,
		.per_port = true,
		.getter = team_priority_option_get,
		.setter = team_priority_option_set,
	},
	{
		.name = "queue_id",
		.type = TEAM_OPTION_TYPE_U32,
		.per_port = true,
		.getter = team_queue_id_option_get,
		.setter = team_queue_id_option_set,
	},
};


/*
 * ndo_init callback: set up the private team state of a new team device
 * (per-cpu stats, port lists, queue override, option lists, core options).
 * Returns 0 or a negative errno; on failure everything set up so far is
 * torn down again.
 */
static int team_init(struct net_device *dev)
{
	struct team *team = netdev_priv(dev);
	int i;
	int err;

	team->dev = dev;
	team_set_no_mode(team);
	team->notifier_ctx =
false;

	team->pcpu_stats = netdev_alloc_pcpu_stats(struct team_pcpu_stats);
	if (!team->pcpu_stats)
		return -ENOMEM;

	for (i = 0; i < TEAM_PORT_HASHENTRIES; i++)
		INIT_HLIST_HEAD(&team->en_port_hlist[i]);
	INIT_LIST_HEAD(&team->port_list);
	err = team_queue_override_init(team);
	if (err)
		goto err_team_queue_override_init;

	team_adjust_ops(team);

	INIT_LIST_HEAD(&team->option_list);
	INIT_LIST_HEAD(&team->option_inst_list);

	team_notify_peers_init(team);
	team_mcast_rejoin_init(team);

	err = team_options_register(team, team_options, ARRAY_SIZE(team_options));
	if (err)
		goto err_options_register;
	/* A freshly created team device starts with carrier off. */
	netif_carrier_off(dev);

	netdev_lockdep_set_classes(dev);

	return 0;

err_options_register:
	team_mcast_rejoin_fini(team);
	team_notify_peers_fini(team);
	team_queue_override_fini(team);
err_team_queue_override_init:
	free_percpu(team->pcpu_stats);

	return err;
}

/*
 * ndo_uninit callback (RTNL asserted): remove every port, drop the current
 * mode, unregister the core options and tear down the notify-peers,
 * mcast-rejoin and queue-override state.
 */
static void team_uninit(struct net_device *dev)
{
	struct team *team = netdev_priv(dev);
	struct team_port *port;
	struct team_port *tmp;

	ASSERT_RTNL();

	/* _safe iteration: team_port_del() unlinks the entry being visited. */
	list_for_each_entry_safe(port, tmp, &team->port_list, list)
		team_port_del(team, port->dev);

	__team_change_mode(team, NULL); /* cleanup */
	__team_options_unregister(team, team_options, ARRAY_SIZE(team_options));
	team_mcast_rejoin_fini(team);
	team_notify_peers_fini(team);
	team_queue_override_fini(team);
	netdev_change_features(dev);
}

/* priv_destructor: free the per-cpu statistics allocated in team_init(). */
static void team_destructor(struct net_device *dev)
{
	struct team *team = netdev_priv(dev);

	free_percpu(team->pcpu_stats);
}

/* ndo_open callback: nothing to do; always returns 0. */
static int team_open(struct net_device *dev)
{
	return 0;
}

/*
 * ndo_stop callback: unsync the team device's unicast and multicast address
 * lists from every port. Always returns 0.
 */
static int team_close(struct net_device *dev)
{
	struct team *team = netdev_priv(dev);
	struct team_port *port;

	list_for_each_entry(port, &team->port_list, list) {
		dev_uc_unsync(port->dev, dev);
		dev_mc_unsync(port->dev, dev);
	}

	return 0;
}

/*
 * note: already called with rcu_read_lock
 */
static netdev_tx_t team_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct team *team = netdev_priv(dev);
	bool tx_success;
	/* Sampled up front; the skb is handed to the transmit hooks below. */
	unsigned int len = skb->len;

	tx_success =
team_queue_override_transmit(team, skb);
	/* Fall back to the mode's transmit hook when queue override did not send. */
	if (!tx_success)
		tx_success = team->ops.transmit(team, skb);
	if (tx_success) {
		struct team_pcpu_stats *pcpu_stats;

		pcpu_stats = this_cpu_ptr(team->pcpu_stats);
		u64_stats_update_begin(&pcpu_stats->syncp);
		u64_stats_inc(&pcpu_stats->tx_packets);
		u64_stats_add(&pcpu_stats->tx_bytes, len);
		u64_stats_update_end(&pcpu_stats->syncp);
	} else {
		this_cpu_inc(team->pcpu_stats->tx_dropped);
	}

	/* NETDEV_TX_OK is returned in both cases; failures only bump tx_dropped. */
	return NETDEV_TX_OK;
}

/*
 * ndo_select_queue callback: pick the tx queue from the skb's recorded rx
 * queue (0 when none is recorded), folded into the range of
 * dev->real_num_tx_queues.
 */
static u16 team_select_queue(struct net_device *dev, struct sk_buff *skb,
			     struct net_device *sb_dev)
{
	/*
	 * This helper function exists to help dev_pick_tx get the correct
	 * destination queue. Using a helper function skips a call to
	 * skb_tx_hash and will put the skbs in the queue we expect on their
	 * way down to the team driver.
	 */
	u16 txq = skb_rx_queue_recorded(skb) ? skb_get_rx_queue(skb) : 0;

	/*
	 * Save the original txq to restore before passing to the driver
	 */
	qdisc_skb_cb(skb)->slave_dev_queue_mapping = skb->queue_mapping;

	/* Reduce txq by repeated subtraction until it is a valid queue index. */
	if (unlikely(txq >= dev->real_num_tx_queues)) {
		do {
			txq -= dev->real_num_tx_queues;
		} while (txq >= dev->real_num_tx_queues);
	}
	return txq;
}

/*
 * ndo_change_rx_flags callback (RTNL asserted): for each flag named in
 * @change, add or drop one promiscuity/allmulti reference on every port,
 * depending on whether the team device now has that flag set.
 */
static void team_change_rx_flags(struct net_device *dev, int change)
{
	struct team *team = netdev_priv(dev);
	struct team_port *port;
	int inc;

	ASSERT_RTNL();

	list_for_each_entry(port, &team->port_list, list) {
		if (change & IFF_PROMISC) {
			inc = dev->flags & IFF_PROMISC ? 1 : -1;
			dev_set_promiscuity(port->dev, inc);
		}
		if (change & IFF_ALLMULTI) {
			inc = dev->flags & IFF_ALLMULTI ?
1 : -1; dev_set_allmulti(port->dev, inc); } } } static void team_set_rx_mode(struct net_device *dev) { struct team *team = netdev_priv(dev); struct team_port *port; rcu_read_lock(); list_for_each_entry_rcu(port, &team->port_list, list) { dev_uc_sync_multiple(port->dev, dev); dev_mc_sync_multiple(port->dev, dev); } rcu_read_unlock(); } static int team_set_mac_address(struct net_device *dev, void *p) { struct sockaddr *addr = p; struct team *team = netdev_priv(dev); struct team_port *port; ASSERT_RTNL(); if (dev->type == ARPHRD_ETHER && !is_valid_ether_addr(addr->sa_data)) return -EADDRNOTAVAIL; dev_addr_set(dev, addr->sa_data); list_for_each_entry(port, &team->port_list, list) if (team->ops.port_change_dev_addr) team->ops.port_change_dev_addr(team, port); return 0; } static int team_change_mtu(struct net_device *dev, int new_mtu) { struct team *team = netdev_priv(dev); struct team_port *port; int err; ASSERT_RTNL(); team->port_mtu_change_allowed = true; list_for_each_entry(port, &team->port_list, list) { err = dev_set_mtu(port->dev, new_mtu); if (err) { netdev_err(dev, "Device %s failed to change mtu", port->dev->name); goto unwind; } } team->port_mtu_change_allowed = false; WRITE_ONCE(dev->mtu, new_mtu); return 0; unwind: list_for_each_entry_continue_reverse(port, &team->port_list, list) dev_set_mtu(port->dev, dev->mtu); team->port_mtu_change_allowed = false; return err; } static void team_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) { struct team *team = netdev_priv(dev); struct team_pcpu_stats *p; u64 rx_packets, rx_bytes, rx_multicast, tx_packets, tx_bytes; u32 rx_dropped = 0, tx_dropped = 0, rx_nohandler = 0; unsigned int start; int i; for_each_possible_cpu(i) { p = per_cpu_ptr(team->pcpu_stats, i); do { start = u64_stats_fetch_begin(&p->syncp); rx_packets = u64_stats_read(&p->rx_packets); rx_bytes = u64_stats_read(&p->rx_bytes); rx_multicast = u64_stats_read(&p->rx_multicast); tx_packets = u64_stats_read(&p->tx_packets); tx_bytes = 
u64_stats_read(&p->tx_bytes);
		} while (u64_stats_fetch_retry(&p->syncp, start));

		stats->rx_packets	+= rx_packets;
		stats->rx_bytes		+= rx_bytes;
		stats->multicast	+= rx_multicast;
		stats->tx_packets	+= tx_packets;
		stats->tx_bytes		+= tx_bytes;
		/*
		 * rx_dropped, tx_dropped & rx_nohandler are u32,
		 * updated without syncp protection.
		 */
		rx_dropped	+= READ_ONCE(p->rx_dropped);
		tx_dropped	+= READ_ONCE(p->tx_dropped);
		rx_nohandler	+= READ_ONCE(p->rx_nohandler);
	}
	stats->rx_dropped	= rx_dropped;
	stats->tx_dropped	= tx_dropped;
	stats->rx_nohandler	= rx_nohandler;
}

/*
 * ndo_vlan_rx_add_vid callback (RTNL asserted): add the VLAN id to every
 * port. If a port fails, the id is removed again from the ports already
 * handled and that port's error is returned.
 */
static int team_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid)
{
	struct team *team = netdev_priv(dev);
	struct team_port *port;
	int err;

	ASSERT_RTNL();

	list_for_each_entry(port, &team->port_list, list) {
		err = vlan_vid_add(port->dev, proto, vid);
		if (err)
			goto unwind;
	}

	return 0;

unwind:
	list_for_each_entry_continue_reverse(port, &team->port_list, list)
		vlan_vid_del(port->dev, proto, vid);

	return err;
}

/*
 * ndo_vlan_rx_kill_vid callback (RTNL asserted): remove the VLAN id from
 * every port. Always returns 0.
 */
static int team_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, u16 vid)
{
	struct team *team = netdev_priv(dev);
	struct team_port *port;

	ASSERT_RTNL();

	list_for_each_entry(port, &team->port_list, list)
		vlan_vid_del(port->dev, proto, vid);

	return 0;
}

#ifdef CONFIG_NET_POLL_CONTROLLER
/* ndo_poll_controller callback: intentionally empty. */
static void team_poll_controller(struct net_device *dev)
{
}

/* Disable netpoll on every port of the team. */
static void __team_netpoll_cleanup(struct team *team)
{
	struct team_port *port;

	list_for_each_entry(port, &team->port_list, list)
		team_port_disable_netpoll(port);
}

/* ndo_netpoll_cleanup callback (RTNL asserted): disable netpoll on all ports. */
static void team_netpoll_cleanup(struct net_device *dev)
{
	struct team *team = netdev_priv(dev);

	ASSERT_RTNL();

	__team_netpoll_cleanup(team);
}

/*
 * ndo_netpoll_setup callback (RTNL asserted): enable netpoll on every port.
 * On the first failure netpoll is disabled again on all ports and the error
 * is returned.
 */
static int team_netpoll_setup(struct net_device *dev)
{
	struct team *team = netdev_priv(dev);
	struct team_port *port;
	int err = 0;

	ASSERT_RTNL();

	list_for_each_entry(port, &team->port_list, list) {
		err = __team_port_enable_netpoll(port);
		if (err) {
			__team_netpoll_cleanup(team);
			break;
		}
	}
	return err;
}
#endif

/* ndo_add_slave callback (RTNL asserted): thin wrapper around team_port_add(). */
static int team_add_slave(struct net_device
*dev, struct net_device *port_dev,
			  struct netlink_ext_ack *extack)
{
	struct team *team = netdev_priv(dev);

	ASSERT_RTNL();

	return team_port_add(team, port_dev, extack);
}

/* ndo_del_slave callback (RTNL asserted): thin wrapper around team_port_del(). */
static int team_del_slave(struct net_device *dev, struct net_device *port_dev)
{
	struct team *team = netdev_priv(dev);

	ASSERT_RTNL();

	return team_port_del(team, port_dev);
}

/*
 * ndo_fix_features callback: starting from netdev_base_features(), fold in
 * each port's features with netdev_increment_features() (using the
 * requested set as mask), then apply netdev_add_tso_features().
 */
static netdev_features_t team_fix_features(struct net_device *dev,
					   netdev_features_t features)
{
	struct team_port *port;
	struct team *team = netdev_priv(dev);
	netdev_features_t mask;

	/* Keep the requested set; 'features' is rebuilt from the base below. */
	mask = features;
	features = netdev_base_features(features);

	rcu_read_lock();
	list_for_each_entry_rcu(port, &team->port_list, list) {
		features = netdev_increment_features(features,
						     port->dev->features,
						     mask);
	}
	rcu_read_unlock();

	features = netdev_add_tso_features(features, mask);

	return features;
}

/*
 * ndo_change_carrier callback: set the carrier as requested and mark it as
 * user controlled. __team_carrier_check() returns early once
 * user_carrier_enabled is set. Always returns 0.
 */
static int team_change_carrier(struct net_device *dev, bool new_carrier)
{
	struct team *team = netdev_priv(dev);

	team->user_carrier_enabled = true;

	if (new_carrier)
		netif_carrier_on(dev);
	else
		netif_carrier_off(dev);
	return 0;
}

/* net_device_ops installed on every team device by team_setup(). */
static const struct net_device_ops team_netdev_ops = {
	.ndo_init		= team_init,
	.ndo_uninit		= team_uninit,
	.ndo_open		= team_open,
	.ndo_stop		= team_close,
	.ndo_start_xmit		= team_xmit,
	.ndo_select_queue	= team_select_queue,
	.ndo_change_rx_flags	= team_change_rx_flags,
	.ndo_set_rx_mode	= team_set_rx_mode,
	.ndo_set_mac_address	= team_set_mac_address,
	.ndo_change_mtu		= team_change_mtu,
	.ndo_get_stats64	= team_get_stats64,
	.ndo_vlan_rx_add_vid	= team_vlan_rx_add_vid,
	.ndo_vlan_rx_kill_vid	= team_vlan_rx_kill_vid,
#ifdef CONFIG_NET_POLL_CONTROLLER
	.ndo_poll_controller	= team_poll_controller,
	.ndo_netpoll_setup	= team_netpoll_setup,
	.ndo_netpoll_cleanup	= team_netpoll_cleanup,
#endif
	.ndo_add_slave		= team_add_slave,
	.ndo_del_slave		= team_del_slave,
	.ndo_fix_features	= team_fix_features,
	.ndo_change_carrier     = team_change_carrier,
	.ndo_features_check	= passthru_features_check,
};

/***********************
 * ethtool interface
***********************/

/* ethtool get_drvinfo: report only the driver name. */
static void team_ethtool_get_drvinfo(struct net_device *dev,
				     struct ethtool_drvinfo *drvinfo)
{
	strscpy(drvinfo->driver, DRV_NAME, sizeof(drvinfo->driver));
}

/*
 * ethtool get_link_ksettings: report the sum of the known speeds of all
 * tx-able ports (SPEED_UNKNOWN when that sum is 0) and the first known
 * duplex found among them. Always returns 0.
 */
static int team_ethtool_get_link_ksettings(struct net_device *dev,
					   struct ethtool_link_ksettings *cmd)
{
	struct team *team= netdev_priv(dev);
	unsigned long speed = 0;
	struct team_port *port;

	cmd->base.duplex = DUPLEX_UNKNOWN;
	cmd->base.port = PORT_OTHER;

	rcu_read_lock();
	list_for_each_entry_rcu(port, &team->port_list, list) {
		if (team_port_txable(port)) {
			if (port->state.speed != SPEED_UNKNOWN)
				speed += port->state.speed;
			if (cmd->base.duplex == DUPLEX_UNKNOWN &&
			    port->state.duplex != DUPLEX_UNKNOWN)
				cmd->base.duplex = port->state.duplex;
		}
	}
	rcu_read_unlock();

	cmd->base.speed = speed ? : SPEED_UNKNOWN;

	return 0;
}

/* ethtool operations installed on every team device by team_setup(). */
static const struct ethtool_ops team_ethtool_ops = {
	.get_drvinfo		= team_ethtool_get_drvinfo,
	.get_link		= ethtool_op_get_link,
	.get_link_ksettings	= team_ethtool_get_link_ksettings,
};

/***********************
 * rt netlink interface
 ***********************/

/*
 * Make the team device take over the link-layer properties of port_dev:
 * header ops, type, header length, headroom, address length, MTU, broadcast
 * address and hardware address, plus the point-to-point vs
 * broadcast/multicast interface flags.
 */
static void team_setup_by_port(struct net_device *dev,
			       struct net_device *port_dev)
{
	struct team *team = netdev_priv(dev);

	/* For Ethernet ports reuse the header ops cached in team_setup(). */
	if (port_dev->type == ARPHRD_ETHER)
		dev->header_ops	= team->header_ops_cache;
	else
		dev->header_ops	= port_dev->header_ops;
	dev->type = port_dev->type;
	dev->hard_header_len = port_dev->hard_header_len;
	dev->needed_headroom = port_dev->needed_headroom;
	dev->addr_len = port_dev->addr_len;
	dev->mtu = port_dev->mtu;
	memcpy(dev->broadcast, port_dev->broadcast, port_dev->addr_len);
	eth_hw_addr_inherit(dev, port_dev);

	if (port_dev->flags & IFF_POINTOPOINT) {
		dev->flags &= ~(IFF_BROADCAST | IFF_MULTICAST);
		dev->flags |= (IFF_POINTOPOINT | IFF_NOARP);
	} else if ((port_dev->flags & (IFF_BROADCAST | IFF_MULTICAST)) ==
		    (IFF_BROADCAST | IFF_MULTICAST)) {
		dev->flags |= (IFF_BROADCAST | IFF_MULTICAST);
		dev->flags &= ~(IFF_POINTOPOINT | IFF_NOARP);
	}
}

/*
 * Make sure the team device's type matches port_dev's, changing the team
 * device's type when necessary and possible.
 *
 * Returns 0 when the types already match or the change succeeded, -EBUSY
 * when the types differ but the team already has ports, or the errno a
 * NETDEV_PRE_TYPE_CHANGE notifier answered with.
 */
static int team_dev_type_check_change(struct net_device *dev,
struct net_device *port_dev)
{
	struct team *team = netdev_priv(dev);
	char *portname = port_dev->name;
	int err;

	if (dev->type == port_dev->type)
		return 0;
	/* The type may only change while the team has no ports yet. */
	if (!list_empty(&team->port_list)) {
		netdev_err(dev, "Device %s is of different type\n", portname);
		return -EBUSY;
	}
	err = call_netdevice_notifiers(NETDEV_PRE_TYPE_CHANGE, dev);
	err = notifier_to_errno(err);
	if (err) {
		netdev_err(dev, "Refused to change device type\n");
		return err;
	}
	/* Flush the address lists before taking over the port's link-layer setup. */
	dev_uc_flush(dev);
	dev_mc_flush(dev);
	team_setup_by_port(dev, port_dev);
	call_netdevice_notifiers(NETDEV_POST_TYPE_CHANGE, dev);
	return 0;
}

/*
 * rtnl_link_ops .setup callback: initialise a new team net_device as an
 * Ethernet-like device and install the team ops, destructor, private flags
 * and feature sets.
 */
static void team_setup(struct net_device *dev)
{
	struct team *team = netdev_priv(dev);

	ether_setup(dev);
	dev->max_mtu = ETH_MAX_MTU;
	/* Remember the Ethernet header ops; team_setup_by_port() restores them. */
	team->header_ops_cache = dev->header_ops;

	dev->netdev_ops = &team_netdev_ops;
	dev->ethtool_ops = &team_ethtool_ops;
	dev->needs_free_netdev = true;
	dev->priv_destructor = team_destructor;
	dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING);
	dev->priv_flags |= IFF_NO_QUEUE;
	dev->priv_flags |= IFF_TEAM;

	/*
	 * Indicate we support unicast address filtering. That way core won't
	 * bring us to promisc mode in case a unicast addr is added.
	 * Let this up to underlay drivers.
	 */
	dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE;
	dev->lltx = true;

	/* Don't allow team devices to change network namespaces.
*/
	dev->netns_immutable = true;

	dev->features |= NETIF_F_GRO;

	dev->hw_features = MASTER_UPPER_DEV_VLAN_FEATURES |
			   NETIF_F_HW_VLAN_CTAG_RX |
			   NETIF_F_HW_VLAN_CTAG_FILTER |
			   NETIF_F_HW_VLAN_STAG_RX |
			   NETIF_F_HW_VLAN_STAG_FILTER;

	dev->hw_features |= NETIF_F_GSO_ENCAP_ALL;
	dev->features |= dev->hw_features;
	dev->features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX;
}

/*
 * rtnl_link_ops .newlink callback: give the device a random Ethernet
 * address unless IFLA_ADDRESS was supplied, then register it. Returns
 * register_netdevice()'s result.
 */
static int team_newlink(struct net_device *dev,
			struct rtnl_newlink_params *params,
			struct netlink_ext_ack *extack)
{
	struct nlattr **tb = params->tb;

	if (tb[IFLA_ADDRESS] == NULL)
		eth_hw_addr_random(dev);

	return register_netdevice(dev);
}

/*
 * rtnl_link_ops .validate callback: an IFLA_ADDRESS attribute, when present,
 * must be ETH_ALEN bytes long (-EINVAL otherwise) and a valid Ethernet
 * address (-EADDRNOTAVAIL otherwise).
 */
static int team_validate(struct nlattr *tb[], struct nlattr *data[],
			 struct netlink_ext_ack *extack)
{
	if (tb[IFLA_ADDRESS]) {
		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
			return -EINVAL;
		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
			return -EADDRNOTAVAIL;
	}
	return 0;
}

/* rtnl_link_ops callback: default number of tx queues for a team device. */
static unsigned int team_get_num_tx_queues(void)
{
	return TEAM_DEFAULT_NUM_TX_QUEUES;
}

/* rtnl_link_ops callback: default number of rx queues for a team device. */
static unsigned int team_get_num_rx_queues(void)
{
	return TEAM_DEFAULT_NUM_RX_QUEUES;
}

/* rtnetlink link type description for DRV_NAME devices. */
static struct rtnl_link_ops team_link_ops __read_mostly = {
	.kind			= DRV_NAME,
	.priv_size		= sizeof(struct team),
	.setup			= team_setup,
	.newlink		= team_newlink,
	.validate		= team_validate,
	.get_num_tx_queues	= team_get_num_tx_queues,
	.get_num_rx_queues	= team_get_num_rx_queues,
};


/***********************************
 * Generic netlink custom interface
 ***********************************/

static struct genl_family team_nl_family;

/*
 * TEAM_CMD_NOOP handler: unicast an empty TEAM_CMD_NOOP message back to the
 * sender. Returns -ENOMEM or -EMSGSIZE when the reply cannot be built,
 * otherwise genlmsg_unicast()'s result.
 */
int team_nl_noop_doit(struct sk_buff *skb, struct genl_info *info)
{
	struct sk_buff *msg;
	void *hdr;
	int err;

	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
	if (!msg)
		return -ENOMEM;

	hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq,
			  &team_nl_family, 0, TEAM_CMD_NOOP);
	if (!hdr) {
		err = -EMSGSIZE;
		goto err_msg_put;
	}

	genlmsg_end(msg, hdr);

	return genlmsg_unicast(genl_info_net(info), msg, info->snd_portid);

err_msg_put:
	nlmsg_free(msg);

	return err;
}

/*
 * Netlink cmd functions should be locked
by following two functions.
 * Since dev gets held here, that ensures dev won't disappear in between.
 */
/*
 * Look up the team named by TEAM_ATTR_TEAM_IFINDEX in the request's netns
 * (RTNL asserted). Returns the team with a reference held on its net_device,
 * or NULL when the attribute is missing, no such device exists, or the
 * device is not a team device. Pair with team_nl_team_put().
 */
static struct team *team_nl_team_get(struct genl_info *info)
{
	struct net *net = genl_info_net(info);
	struct net_device *dev;
	int ifindex;

	ASSERT_RTNL();

	if (!info->attrs[TEAM_ATTR_TEAM_IFINDEX])
		return NULL;

	ifindex = nla_get_u32(info->attrs[TEAM_ATTR_TEAM_IFINDEX]);
	dev = dev_get_by_index(net, ifindex);
	/* Only devices using team_netdev_ops are team devices. */
	if (!dev || dev->netdev_ops != &team_netdev_ops) {
		dev_put(dev);
		return NULL;
	}

	return netdev_priv(dev);
}

/* Drop the device reference taken by team_nl_team_get(). */
static void team_nl_team_put(struct team *team)
{
	dev_put(team->dev);
}

/* Signature shared by the unicast and multicast message senders. */
typedef int team_nl_send_func_t(struct sk_buff *skb,
				struct team *team, u32 portid);

/* Send skb to @portid in the team device's network namespace. */
static int team_nl_send_unicast(struct sk_buff *skb, struct team *team, u32 portid)
{
	return genlmsg_unicast(dev_net(team->dev), skb, portid);
}

/*
 * Append one TEAM_ATTR_ITEM_OPTION nest describing opt_inst to skb: name,
 * optional port ifindex and array index, type, current value and the
 * removed/changed flags. The instance's 'changed' flag is cleared once it
 * has been reported.
 *
 * Returns 0, the error from team_option_get(), or -EMSGSIZE when the
 * attributes do not fit (the partial nest is cancelled).
 */
static int team_nl_fill_one_option_get(struct sk_buff *skb, struct team *team,
				       struct team_option_inst *opt_inst)
{
	struct nlattr *option_item;
	struct team_option *option = opt_inst->option;
	struct team_option_inst_info *opt_inst_info = &opt_inst->info;
	struct team_gsetter_ctx ctx;
	int err;

	ctx.info = opt_inst_info;
	err = team_option_get(team, opt_inst, &ctx);
	if (err)
		return err;

	option_item = nla_nest_start_noflag(skb, TEAM_ATTR_ITEM_OPTION);
	if (!option_item)
		return -EMSGSIZE;

	if (nla_put_string(skb, TEAM_ATTR_OPTION_NAME, option->name))
		goto nest_cancel;
	if (opt_inst_info->port &&
	    nla_put_u32(skb, TEAM_ATTR_OPTION_PORT_IFINDEX,
			opt_inst_info->port->dev->ifindex))
		goto nest_cancel;
	if (opt_inst->option->array_size &&
	    nla_put_u32(skb, TEAM_ATTR_OPTION_ARRAY_INDEX,
			opt_inst_info->array_index))
		goto nest_cancel;

	switch (option->type) {
	case TEAM_OPTION_TYPE_U32:
		if (nla_put_u8(skb, TEAM_ATTR_OPTION_TYPE, NLA_U32))
			goto nest_cancel;
		if (nla_put_u32(skb, TEAM_ATTR_OPTION_DATA, ctx.data.u32_val))
			goto nest_cancel;
		break;
	case TEAM_OPTION_TYPE_STRING:
		if (nla_put_u8(skb, TEAM_ATTR_OPTION_TYPE, NLA_STRING))
			goto nest_cancel;
		if (nla_put_string(skb,
TEAM_ATTR_OPTION_DATA,
				   ctx.data.str_val))
			goto nest_cancel;
		break;
	case TEAM_OPTION_TYPE_BINARY:
		if (nla_put_u8(skb, TEAM_ATTR_OPTION_TYPE, NLA_BINARY))
			goto nest_cancel;
		if (nla_put(skb, TEAM_ATTR_OPTION_DATA, ctx.data.bin_val.len,
			    ctx.data.bin_val.ptr))
			goto nest_cancel;
		break;
	case TEAM_OPTION_TYPE_BOOL:
		if (nla_put_u8(skb, TEAM_ATTR_OPTION_TYPE, NLA_FLAG))
			goto nest_cancel;
		/* A boolean is encoded by presence of the DATA flag attribute. */
		if (ctx.data.bool_val &&
		    nla_put_flag(skb, TEAM_ATTR_OPTION_DATA))
			goto nest_cancel;
		break;
	case TEAM_OPTION_TYPE_S32:
		if (nla_put_u8(skb, TEAM_ATTR_OPTION_TYPE, NLA_S32))
			goto nest_cancel;
		if (nla_put_s32(skb, TEAM_ATTR_OPTION_DATA, ctx.data.s32_val))
			goto nest_cancel;
		break;
	default:
		BUG();
	}
	if (opt_inst->removed && nla_put_flag(skb, TEAM_ATTR_OPTION_REMOVED))
		goto nest_cancel;
	if (opt_inst->changed) {
		if (nla_put_flag(skb, TEAM_ATTR_OPTION_CHANGED))
			goto nest_cancel;
		opt_inst->changed = false;
	}
	nla_nest_end(skb, option_item);
	return 0;

nest_cancel:
	nla_nest_cancel(skb, option_item);
	return -EMSGSIZE;
}

/*
 * If *pskb holds a message, send it with send_func; then allocate a fresh
 * message into *pskb. Returns the send error, -ENOMEM when the allocation
 * fails, or 0.
 */
static int __send_and_alloc_skb(struct sk_buff **pskb,
				struct team *team, u32 portid,
				team_nl_send_func_t *send_func)
{
	int err;

	if (*pskb) {
		err = send_func(*pskb, team, portid);
		if (err)
			return err;
	}
	*pskb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
	if (!*pskb)
		return -ENOMEM;
	return 0;
}

/*
 * Send the option instances linked on sel_opt_inst_list (via tmp_list) as
 * one or more TEAM_CMD_OPTIONS_GET messages through send_func, followed by
 * an NLMSG_DONE message. When a message fills up, it is sent and the
 * remaining instances continue in a new one.
 */
static int team_nl_send_options_get(struct team *team, u32 portid, u32 seq,
				    int flags, team_nl_send_func_t *send_func,
				    struct list_head *sel_opt_inst_list)
{
	struct nlattr *option_list;
	struct nlmsghdr *nlh;
	void *hdr;
	struct team_option_inst *opt_inst;
	int err;
	struct sk_buff *skb = NULL;
	bool incomplete;
	int i;

	opt_inst = list_first_entry(sel_opt_inst_list,
				    struct team_option_inst, tmp_list);

start_again:
	err = __send_and_alloc_skb(&skb, team, portid, send_func);
	if (err)
		return err;

	hdr = genlmsg_put(skb, portid, seq, &team_nl_family, flags | NLM_F_MULTI,
			  TEAM_CMD_OPTIONS_GET);
	if (!hdr) {
		nlmsg_free(skb);
		return -EMSGSIZE;
	}

	if (nla_put_u32(skb, TEAM_ATTR_TEAM_IFINDEX, team->dev->ifindex))
goto nla_put_failure; option_list = nla_nest_start_noflag(skb, TEAM_ATTR_LIST_OPTION); if (!option_list) goto nla_put_failure; i = 0; incomplete = false; list_for_each_entry_from(opt_inst, sel_opt_inst_list, tmp_list) { err = team_nl_fill_one_option_get(skb, team, opt_inst); if (err) { if (err == -EMSGSIZE) { if (!i) goto errout; incomplete = true; break; } goto errout; } i++; } nla_nest_end(skb, option_list); genlmsg_end(skb, hdr); if (incomplete) goto start_again; send_done: nlh = nlmsg_put(skb, portid, seq, NLMSG_DONE, 0, flags | NLM_F_MULTI); if (!nlh) { err = __send_and_alloc_skb(&skb, team, portid, send_func); if (err) return err; goto send_done; } return send_func(skb, team, portid); nla_put_failure: err = -EMSGSIZE; errout: nlmsg_free(skb); return err; } int team_nl_options_get_doit(struct sk_buff *skb, struct genl_info *info) { struct team *team; struct team_option_inst *opt_inst; int err; LIST_HEAD(sel_opt_inst_list); rtnl_lock(); team = team_nl_team_get(info); if (!team) { err = -EINVAL; goto rtnl_unlock; } list_for_each_entry(opt_inst, &team->option_inst_list, list) list_add_tail(&opt_inst->tmp_list, &sel_opt_inst_list); err = team_nl_send_options_get(team, info->snd_portid, info->snd_seq, NLM_F_ACK, team_nl_send_unicast, &sel_opt_inst_list); team_nl_team_put(team); rtnl_unlock: rtnl_unlock(); return err; } static int team_nl_send_event_options_get(struct team *team, struct list_head *sel_opt_inst_list); int team_nl_options_set_doit(struct sk_buff *skb, struct genl_info *info) { struct team *team; int err = 0; int i; struct nlattr *nl_option; rtnl_lock(); team = team_nl_team_get(info); if (!team) { err = -EINVAL; goto rtnl_unlock; } err = -EINVAL; if (!info->attrs[TEAM_ATTR_LIST_OPTION]) { err = -EINVAL; goto team_put; } nla_for_each_nested(nl_option, info->attrs[TEAM_ATTR_LIST_OPTION], i) { struct nlattr *opt_attrs[TEAM_ATTR_OPTION_MAX + 1]; struct nlattr *attr; struct nlattr *attr_data; LIST_HEAD(opt_inst_list); enum team_option_type opt_type; int 
opt_port_ifindex = 0; /* != 0 for per-port options */ u32 opt_array_index = 0; bool opt_is_array = false; struct team_option_inst *opt_inst; char *opt_name; bool opt_found = false; if (nla_type(nl_option) != TEAM_ATTR_ITEM_OPTION) { err = -EINVAL; goto team_put; } err = nla_parse_nested_deprecated(opt_attrs, TEAM_ATTR_OPTION_MAX, nl_option, team_attr_option_nl_policy, info->extack); if (err) goto team_put; if (!opt_attrs[TEAM_ATTR_OPTION_NAME] || !opt_attrs[TEAM_ATTR_OPTION_TYPE]) { err = -EINVAL; goto team_put; } switch (nla_get_u8(opt_attrs[TEAM_ATTR_OPTION_TYPE])) { case NLA_U32: opt_type = TEAM_OPTION_TYPE_U32; break; case NLA_STRING: opt_type = TEAM_OPTION_TYPE_STRING; break; case NLA_BINARY: opt_type = TEAM_OPTION_TYPE_BINARY; break; case NLA_FLAG: opt_type = TEAM_OPTION_TYPE_BOOL; break; case NLA_S32: opt_type = TEAM_OPTION_TYPE_S32; break; default: goto team_put; } attr_data = opt_attrs[TEAM_ATTR_OPTION_DATA]; if (opt_type != TEAM_OPTION_TYPE_BOOL && !attr_data) { err = -EINVAL; goto team_put; } opt_name = nla_data(opt_attrs[TEAM_ATTR_OPTION_NAME]); attr = opt_attrs[TEAM_ATTR_OPTION_PORT_IFINDEX]; if (attr) opt_port_ifindex = nla_get_u32(attr); attr = opt_attrs[TEAM_ATTR_OPTION_ARRAY_INDEX]; if (attr) { opt_is_array = true; opt_array_index = nla_get_u32(attr); } list_for_each_entry(opt_inst, &team->option_inst_list, list) { struct team_option *option = opt_inst->option; struct team_gsetter_ctx ctx; struct team_option_inst_info *opt_inst_info; int tmp_ifindex; opt_inst_info = &opt_inst->info; tmp_ifindex = opt_inst_info->port ? 
opt_inst_info->port->dev->ifindex : 0; if (option->type != opt_type || strcmp(option->name, opt_name) || tmp_ifindex != opt_port_ifindex || (option->array_size && !opt_is_array) || opt_inst_info->array_index != opt_array_index) continue; opt_found = true; ctx.info = opt_inst_info; switch (opt_type) { case TEAM_OPTION_TYPE_U32: ctx.data.u32_val = nla_get_u32(attr_data); break; case TEAM_OPTION_TYPE_STRING: if (nla_len(attr_data) > TEAM_STRING_MAX_LEN || !memchr(nla_data(attr_data), '\0', nla_len(attr_data))) { err = -EINVAL; goto team_put; } ctx.data.str_val = nla_data(attr_data); break; case TEAM_OPTION_TYPE_BINARY: ctx.data.bin_val.len = nla_len(attr_data); ctx.data.bin_val.ptr = nla_data(attr_data); break; case TEAM_OPTION_TYPE_BOOL: ctx.data.bool_val = attr_data ? true : false; break; case TEAM_OPTION_TYPE_S32: ctx.data.s32_val = nla_get_s32(attr_data); break; default: BUG(); } err = team_option_set(team, opt_inst, &ctx); if (err) goto team_put; opt_inst->changed = true; list_add(&opt_inst->tmp_list, &opt_inst_list); } if (!opt_found) { err = -ENOENT; goto team_put; } err = team_nl_send_event_options_get(team, &opt_inst_list); if (err) break; } team_put: team_nl_team_put(team); rtnl_unlock: rtnl_unlock(); return err; } static int team_nl_fill_one_port_get(struct sk_buff *skb, struct team_port *port) { struct nlattr *port_item; port_item = nla_nest_start_noflag(skb, TEAM_ATTR_ITEM_PORT); if (!port_item) goto nest_cancel; if (nla_put_u32(skb, TEAM_ATTR_PORT_IFINDEX, port->dev->ifindex)) goto nest_cancel; if (port->changed) { if (nla_put_flag(skb, TEAM_ATTR_PORT_CHANGED)) goto nest_cancel; port->changed = false; } if ((port->removed && nla_put_flag(skb, TEAM_ATTR_PORT_REMOVED)) || (port->state.linkup && nla_put_flag(skb, TEAM_ATTR_PORT_LINKUP)) || nla_put_u32(skb, TEAM_ATTR_PORT_SPEED, port->state.speed) || nla_put_u8(skb, TEAM_ATTR_PORT_DUPLEX, port->state.duplex)) goto nest_cancel; nla_nest_end(skb, port_item); return 0; nest_cancel: nla_nest_cancel(skb, 
port_item);
	return -EMSGSIZE;
}

/*
 * Send the team's port list as one or more TEAM_CMD_PORT_LIST_GET messages
 * through send_func, followed by an NLMSG_DONE message. With one_port set,
 * only that port is reported; otherwise every port on team->port_list is,
 * continuing in a new message whenever the current one fills up.
 */
static int team_nl_send_port_list_get(struct team *team, u32 portid, u32 seq,
				      int flags, team_nl_send_func_t *send_func,
				      struct team_port *one_port)
{
	struct nlattr *port_list;
	struct nlmsghdr *nlh;
	void *hdr;
	struct team_port *port;
	int err;
	struct sk_buff *skb = NULL;
	bool incomplete;
	int i;

	port = list_first_entry_or_null(&team->port_list,
					struct team_port, list);

start_again:
	err = __send_and_alloc_skb(&skb, team, portid, send_func);
	if (err)
		return err;

	hdr = genlmsg_put(skb, portid, seq, &team_nl_family, flags | NLM_F_MULTI,
			  TEAM_CMD_PORT_LIST_GET);
	if (!hdr) {
		nlmsg_free(skb);
		return -EMSGSIZE;
	}

	if (nla_put_u32(skb, TEAM_ATTR_TEAM_IFINDEX, team->dev->ifindex))
		goto nla_put_failure;
	port_list = nla_nest_start_noflag(skb, TEAM_ATTR_LIST_PORT);
	if (!port_list)
		goto nla_put_failure;

	i = 0;
	incomplete = false;

	/* If one port is selected, called wants to send port list containing
	 * only this port. Otherwise go through all listed ports and send all
	 */
	if (one_port) {
		err = team_nl_fill_one_port_get(skb, one_port);
		if (err)
			goto errout;
	} else if (port) {
		list_for_each_entry_from(port, &team->port_list, list) {
			err = team_nl_fill_one_port_get(skb, port);
			if (err) {
				if (err == -EMSGSIZE) {
					/* Not even one port fits: give up. */
					if (!i)
						goto errout;
					incomplete = true;
					break;
				}
				goto errout;
			}
			i++;
		}
	}

	nla_nest_end(skb, port_list);
	genlmsg_end(skb, hdr);
	if (incomplete)
		goto start_again;

send_done:
	nlh = nlmsg_put(skb, portid, seq, NLMSG_DONE, 0, flags | NLM_F_MULTI);
	if (!nlh) {
		err = __send_and_alloc_skb(&skb, team, portid, send_func);
		if (err)
			return err;
		goto send_done;
	}

	return send_func(skb, team, portid);

nla_put_failure:
	err = -EMSGSIZE;
errout:
	nlmsg_free(skb);
	return err;
}

/*
 * TEAM_CMD_PORT_LIST_GET handler: under RTNL, unicast the full port list of
 * the addressed team back to the requester. Returns -EINVAL when the team
 * cannot be resolved, otherwise team_nl_send_port_list_get()'s result.
 */
int team_nl_port_list_get_doit(struct sk_buff *skb,
			       struct genl_info *info)
{
	struct team *team;
	int err;

	rtnl_lock();

	team = team_nl_team_get(info);
	if (!team) {
		err = -EINVAL;
		goto rtnl_unlock;
	}

	err = team_nl_send_port_list_get(team, info->snd_portid, info->snd_seq,
					 NLM_F_ACK,
team_nl_send_unicast, NULL);

	team_nl_team_put(team);

rtnl_unlock:
	rtnl_unlock();

	return err;
}

/* The single multicast group on which change events are published. */
static const struct genl_multicast_group team_nl_mcgrps[] = {
	{ .name = TEAM_GENL_CHANGE_EVENT_MC_GRP_NAME, },
};

/* Generic netlink family description of the team driver. */
static struct genl_family team_nl_family __ro_after_init = {
	.name		= TEAM_GENL_NAME,
	.version	= TEAM_GENL_VERSION,
	.maxattr	= ARRAY_SIZE(team_nl_policy) - 1,
	.policy = team_nl_policy,
	.netnsok	= true,
	.module		= THIS_MODULE,
	.small_ops	= team_nl_ops,
	.n_small_ops	= ARRAY_SIZE(team_nl_ops),
	.resv_start_op	= TEAM_CMD_PORT_LIST_GET + 1,
	.mcgrps		= team_nl_mcgrps,
	.n_mcgrps	= ARRAY_SIZE(team_nl_mcgrps),
};

/*
 * Multicast skb to group 0 of the team family in the team device's netns.
 * @portid is unused; it is there to match team_nl_send_func_t.
 */
static int team_nl_send_multicast(struct sk_buff *skb,
				  struct team *team, u32 portid)
{
	return genlmsg_multicast_netns(&team_nl_family, dev_net(team->dev),
				       skb, 0, 0, GFP_KERNEL);
}

/* Multicast an options-get event for the selected option instances. */
static int team_nl_send_event_options_get(struct team *team,
					  struct list_head *sel_opt_inst_list)
{
	return team_nl_send_options_get(team, 0, 0, 0, team_nl_send_multicast,
					sel_opt_inst_list);
}

/* Multicast a port-list event that describes only @port. */
static int team_nl_send_event_port_get(struct team *team,
				       struct team_port *port)
{
	return team_nl_send_port_list_get(team, 0, 0, 0, team_nl_send_multicast,
					  port);
}

/* Register the team generic netlink family. */
static int __init team_nl_init(void)
{
	return genl_register_family(&team_nl_family);
}

/* Unregister the team generic netlink family. */
static void __exit team_nl_fini(void)
{
	genl_unregister_family(&team_nl_family);
}


/******************
 * Change checkers
 ******************/

/*
 * Collect every option instance flagged 'changed' and multicast them as one
 * event. A send failure other than -ESRCH is logged as a warning.
 */
static void __team_options_change_check(struct team *team)
{
	int err;
	struct team_option_inst *opt_inst;
	LIST_HEAD(sel_opt_inst_list);

	list_for_each_entry(opt_inst, &team->option_inst_list, list) {
		if (opt_inst->changed)
			list_add_tail(&opt_inst->tmp_list, &sel_opt_inst_list);
	}
	err = team_nl_send_event_options_get(team, &sel_opt_inst_list);
	if (err && err != -ESRCH)
		netdev_warn(team->dev, "Failed to send options change via netlink (err %d)\n",
			    err);
}

/* rtnl lock is held */

/*
 * Record the new link state of @port, refresh its speed/duplex (both set to
 * 0 when the link is down or the ethtool query fails) and multicast a port
 * event. A send failure other than -ESRCH is logged as a warning.
 */
static void __team_port_change_send(struct team_port *port, bool linkup)
{
	int err;

	port->changed = true;
	port->state.linkup = linkup;
team_refresh_port_linkup(port); if (linkup) { struct ethtool_link_ksettings ecmd; err = __ethtool_get_link_ksettings(port->dev, &ecmd); if (!err) { port->state.speed = ecmd.base.speed; port->state.duplex = ecmd.base.duplex; goto send_event; } } port->state.speed = 0; port->state.duplex = 0; send_event: err = team_nl_send_event_port_get(port->team, port); if (err && err != -ESRCH) netdev_warn(port->team->dev, "Failed to send port change of device %s via netlink (err %d)\n", port->dev->name, err); } static void __team_carrier_check(struct team *team) { struct team_port *port; bool team_linkup; if (team->user_carrier_enabled) return; team_linkup = false; list_for_each_entry(port, &team->port_list, list) { if (port->linkup) { team_linkup = true; break; } } if (team_linkup) netif_carrier_on(team->dev); else netif_carrier_off(team->dev); } static void __team_port_change_check(struct team_port *port, bool linkup) { if (port->state.linkup != linkup) __team_port_change_send(port, linkup); __team_carrier_check(port->team); } static void __team_port_change_port_added(struct team_port *port, bool linkup) { __team_port_change_send(port, linkup); __team_carrier_check(port->team); } static void __team_port_change_port_removed(struct team_port *port) { port->removed = true; __team_port_change_send(port, false); __team_carrier_check(port->team); } static void team_port_change_check(struct team_port *port, bool linkup) { ASSERT_RTNL(); __team_port_change_check(port, linkup); } /************************************ * Net device notifier event handler ************************************/ static int team_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct team_port *port; port = team_port_get_rtnl(dev); if (!port) return NOTIFY_DONE; switch (event) { case NETDEV_UP: if (netif_oper_up(dev)) team_port_change_check(port, true); break; case NETDEV_DOWN: team_port_change_check(port, false); break; 
case NETDEV_CHANGE: if (netif_running(port->dev)) team_port_change_check(port, !!netif_oper_up(port->dev)); break; case NETDEV_UNREGISTER: team_del_slave(port->team->dev, dev); break; case NETDEV_FEAT_CHANGE: if (!port->team->notifier_ctx) { port->team->notifier_ctx = true; netdev_compute_master_upper_features(port->team->dev, true); port->team->notifier_ctx = false; } break; case NETDEV_PRECHANGEMTU: /* Forbid to change mtu of underlaying device */ if (!port->team->port_mtu_change_allowed) return NOTIFY_BAD; break; case NETDEV_PRE_TYPE_CHANGE: /* Forbid to change type of underlaying device */ return NOTIFY_BAD; case NETDEV_RESEND_IGMP: /* Propagate to master device */ call_netdevice_notifiers(event, port->team->dev); break; } return NOTIFY_DONE; } static struct notifier_block team_notifier_block __read_mostly = { .notifier_call = team_device_event, }; /*********************** * Module init and exit ***********************/ static int __init team_module_init(void) { int err; register_netdevice_notifier(&team_notifier_block); err = rtnl_link_register(&team_link_ops); if (err) goto err_rtnl_reg; err = team_nl_init(); if (err) goto err_nl_init; return 0; err_nl_init: rtnl_link_unregister(&team_link_ops); err_rtnl_reg: unregister_netdevice_notifier(&team_notifier_block); return err; } static void __exit team_module_exit(void) { team_nl_fini(); rtnl_link_unregister(&team_link_ops); unregister_netdevice_notifier(&team_notifier_block); } module_init(team_module_init); module_exit(team_module_exit); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Jiri Pirko <jpirko@redhat.com>"); MODULE_DESCRIPTION("Ethernet team device driver"); MODULE_ALIAS_RTNL_LINK(DRV_NAME); |
| 1 1 1 2 3 1 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 
523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Roccat Kova[+] driver for Linux * * Copyright (c) 2011 Stefan Achatz <erazor_de@users.sourceforge.net> */ /* */ /* * Roccat Kova[+] is a bigger version of the Pyra with two more side buttons. */ #include <linux/device.h> #include <linux/input.h> #include <linux/hid.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/hid-roccat.h> #include "hid-ids.h" #include "hid-roccat-common.h" #include "hid-roccat-kovaplus.h" static uint profile_numbers[5] = {0, 1, 2, 3, 4}; static uint kovaplus_convert_event_cpi(uint value) { return (value == 7 ? 4 : (value == 4 ? 
3 : value)); } static void kovaplus_profile_activated(struct kovaplus_device *kovaplus, uint new_profile_index) { if (new_profile_index >= ARRAY_SIZE(kovaplus->profile_settings)) return; kovaplus->actual_profile = new_profile_index; kovaplus->actual_cpi = kovaplus->profile_settings[new_profile_index].cpi_startup_level; kovaplus->actual_x_sensitivity = kovaplus->profile_settings[new_profile_index].sensitivity_x; kovaplus->actual_y_sensitivity = kovaplus->profile_settings[new_profile_index].sensitivity_y; } static int kovaplus_send_control(struct usb_device *usb_dev, uint value, enum kovaplus_control_requests request) { int retval; struct roccat_common2_control control; if ((request == KOVAPLUS_CONTROL_REQUEST_PROFILE_SETTINGS || request == KOVAPLUS_CONTROL_REQUEST_PROFILE_BUTTONS) && value > 4) return -EINVAL; control.command = ROCCAT_COMMON_COMMAND_CONTROL; control.value = value; control.request = request; retval = roccat_common2_send(usb_dev, ROCCAT_COMMON_COMMAND_CONTROL, &control, sizeof(struct roccat_common2_control)); return retval; } static int kovaplus_select_profile(struct usb_device *usb_dev, uint number, enum kovaplus_control_requests request) { return kovaplus_send_control(usb_dev, number, request); } static int kovaplus_get_profile_settings(struct usb_device *usb_dev, struct kovaplus_profile_settings *buf, uint number) { int retval; retval = kovaplus_select_profile(usb_dev, number, KOVAPLUS_CONTROL_REQUEST_PROFILE_SETTINGS); if (retval) return retval; return roccat_common2_receive(usb_dev, KOVAPLUS_COMMAND_PROFILE_SETTINGS, buf, KOVAPLUS_SIZE_PROFILE_SETTINGS); } static int kovaplus_get_profile_buttons(struct usb_device *usb_dev, struct kovaplus_profile_buttons *buf, int number) { int retval; retval = kovaplus_select_profile(usb_dev, number, KOVAPLUS_CONTROL_REQUEST_PROFILE_BUTTONS); if (retval) return retval; return roccat_common2_receive(usb_dev, KOVAPLUS_COMMAND_PROFILE_BUTTONS, buf, KOVAPLUS_SIZE_PROFILE_BUTTONS); } /* retval is 0-4 on success, < 0 
on error */ static int kovaplus_get_actual_profile(struct usb_device *usb_dev) { struct kovaplus_actual_profile buf; int retval; retval = roccat_common2_receive(usb_dev, KOVAPLUS_COMMAND_ACTUAL_PROFILE, &buf, sizeof(struct kovaplus_actual_profile)); return retval ? retval : buf.actual_profile; } static int kovaplus_set_actual_profile(struct usb_device *usb_dev, int new_profile) { struct kovaplus_actual_profile buf; buf.command = KOVAPLUS_COMMAND_ACTUAL_PROFILE; buf.size = sizeof(struct kovaplus_actual_profile); buf.actual_profile = new_profile; return roccat_common2_send_with_status(usb_dev, KOVAPLUS_COMMAND_ACTUAL_PROFILE, &buf, sizeof(struct kovaplus_actual_profile)); } static ssize_t kovaplus_sysfs_read(struct file *fp, struct kobject *kobj, char *buf, loff_t off, size_t count, size_t real_size, uint command) { struct device *dev = kobj_to_dev(kobj)->parent->parent; struct kovaplus_device *kovaplus = hid_get_drvdata(dev_get_drvdata(dev)); struct usb_device *usb_dev = interface_to_usbdev(to_usb_interface(dev)); int retval; if (off >= real_size) return 0; if (off != 0 || count != real_size) return -EINVAL; mutex_lock(&kovaplus->kovaplus_lock); retval = roccat_common2_receive(usb_dev, command, buf, real_size); mutex_unlock(&kovaplus->kovaplus_lock); if (retval) return retval; return real_size; } static ssize_t kovaplus_sysfs_write(struct file *fp, struct kobject *kobj, void const *buf, loff_t off, size_t count, size_t real_size, uint command) { struct device *dev = kobj_to_dev(kobj)->parent->parent; struct kovaplus_device *kovaplus = hid_get_drvdata(dev_get_drvdata(dev)); struct usb_device *usb_dev = interface_to_usbdev(to_usb_interface(dev)); int retval; if (off != 0 || count != real_size) return -EINVAL; mutex_lock(&kovaplus->kovaplus_lock); retval = roccat_common2_send_with_status(usb_dev, command, buf, real_size); mutex_unlock(&kovaplus->kovaplus_lock); if (retval) return retval; return real_size; } #define KOVAPLUS_SYSFS_W(thingy, THINGY) \ static ssize_t 
kovaplus_sysfs_write_ ## thingy(struct file *fp, \ struct kobject *kobj, const struct bin_attribute *attr, \ char *buf, loff_t off, size_t count) \ { \ return kovaplus_sysfs_write(fp, kobj, buf, off, count, \ KOVAPLUS_SIZE_ ## THINGY, KOVAPLUS_COMMAND_ ## THINGY); \ } #define KOVAPLUS_SYSFS_R(thingy, THINGY) \ static ssize_t kovaplus_sysfs_read_ ## thingy(struct file *fp, \ struct kobject *kobj, const struct bin_attribute *attr, \ char *buf, loff_t off, size_t count) \ { \ return kovaplus_sysfs_read(fp, kobj, buf, off, count, \ KOVAPLUS_SIZE_ ## THINGY, KOVAPLUS_COMMAND_ ## THINGY); \ } #define KOVAPLUS_SYSFS_RW(thingy, THINGY) \ KOVAPLUS_SYSFS_W(thingy, THINGY) \ KOVAPLUS_SYSFS_R(thingy, THINGY) #define KOVAPLUS_BIN_ATTRIBUTE_RW(thingy, THINGY) \ KOVAPLUS_SYSFS_RW(thingy, THINGY); \ static const struct bin_attribute bin_attr_##thingy = { \ .attr = { .name = #thingy, .mode = 0660 }, \ .size = KOVAPLUS_SIZE_ ## THINGY, \ .read = kovaplus_sysfs_read_ ## thingy, \ .write = kovaplus_sysfs_write_ ## thingy \ } #define KOVAPLUS_BIN_ATTRIBUTE_W(thingy, THINGY) \ KOVAPLUS_SYSFS_W(thingy, THINGY); \ static const struct bin_attribute bin_attr_##thingy = { \ .attr = { .name = #thingy, .mode = 0220 }, \ .size = KOVAPLUS_SIZE_ ## THINGY, \ .write = kovaplus_sysfs_write_ ## thingy \ } KOVAPLUS_BIN_ATTRIBUTE_W(control, CONTROL); KOVAPLUS_BIN_ATTRIBUTE_RW(info, INFO); KOVAPLUS_BIN_ATTRIBUTE_RW(profile_settings, PROFILE_SETTINGS); KOVAPLUS_BIN_ATTRIBUTE_RW(profile_buttons, PROFILE_BUTTONS); static ssize_t kovaplus_sysfs_read_profilex_settings(struct file *fp, struct kobject *kobj, const struct bin_attribute *attr, char *buf, loff_t off, size_t count) { struct device *dev = kobj_to_dev(kobj)->parent->parent; struct usb_device *usb_dev = interface_to_usbdev(to_usb_interface(dev)); ssize_t retval; retval = kovaplus_select_profile(usb_dev, *(uint *)(attr->private), KOVAPLUS_CONTROL_REQUEST_PROFILE_SETTINGS); if (retval) return retval; return kovaplus_sysfs_read(fp, kobj, buf, off, 
count, KOVAPLUS_SIZE_PROFILE_SETTINGS, KOVAPLUS_COMMAND_PROFILE_SETTINGS); } static ssize_t kovaplus_sysfs_read_profilex_buttons(struct file *fp, struct kobject *kobj, const struct bin_attribute *attr, char *buf, loff_t off, size_t count) { struct device *dev = kobj_to_dev(kobj)->parent->parent; struct usb_device *usb_dev = interface_to_usbdev(to_usb_interface(dev)); ssize_t retval; retval = kovaplus_select_profile(usb_dev, *(uint *)(attr->private), KOVAPLUS_CONTROL_REQUEST_PROFILE_BUTTONS); if (retval) return retval; return kovaplus_sysfs_read(fp, kobj, buf, off, count, KOVAPLUS_SIZE_PROFILE_BUTTONS, KOVAPLUS_COMMAND_PROFILE_BUTTONS); } #define PROFILE_ATTR(number) \ static const struct bin_attribute bin_attr_profile##number##_settings = { \ .attr = { .name = "profile" #number "_settings", .mode = 0440 }, \ .size = KOVAPLUS_SIZE_PROFILE_SETTINGS, \ .read = kovaplus_sysfs_read_profilex_settings, \ .private = &profile_numbers[number-1], \ }; \ static const struct bin_attribute bin_attr_profile##number##_buttons = { \ .attr = { .name = "profile" #number "_buttons", .mode = 0440 }, \ .size = KOVAPLUS_SIZE_PROFILE_BUTTONS, \ .read = kovaplus_sysfs_read_profilex_buttons, \ .private = &profile_numbers[number-1], \ }; PROFILE_ATTR(1); PROFILE_ATTR(2); PROFILE_ATTR(3); PROFILE_ATTR(4); PROFILE_ATTR(5); static ssize_t kovaplus_sysfs_show_actual_profile(struct device *dev, struct device_attribute *attr, char *buf) { struct kovaplus_device *kovaplus = hid_get_drvdata(dev_get_drvdata(dev->parent->parent)); return sysfs_emit(buf, "%d\n", kovaplus->actual_profile); } static ssize_t kovaplus_sysfs_set_actual_profile(struct device *dev, struct device_attribute *attr, char const *buf, size_t size) { struct kovaplus_device *kovaplus; struct usb_device *usb_dev; unsigned long profile; int retval; struct kovaplus_roccat_report roccat_report; dev = dev->parent->parent; kovaplus = hid_get_drvdata(dev_get_drvdata(dev)); usb_dev = interface_to_usbdev(to_usb_interface(dev)); retval = 
kstrtoul(buf, 10, &profile); if (retval) return retval; if (profile >= 5) return -EINVAL; mutex_lock(&kovaplus->kovaplus_lock); retval = kovaplus_set_actual_profile(usb_dev, profile); if (retval) { mutex_unlock(&kovaplus->kovaplus_lock); return retval; } kovaplus_profile_activated(kovaplus, profile); roccat_report.type = KOVAPLUS_MOUSE_REPORT_BUTTON_TYPE_PROFILE_1; roccat_report.profile = profile + 1; roccat_report.button = 0; roccat_report.data1 = profile + 1; roccat_report.data2 = 0; roccat_report_event(kovaplus->chrdev_minor, (uint8_t const *)&roccat_report); mutex_unlock(&kovaplus->kovaplus_lock); return size; } static DEVICE_ATTR(actual_profile, 0660, kovaplus_sysfs_show_actual_profile, kovaplus_sysfs_set_actual_profile); static ssize_t kovaplus_sysfs_show_actual_cpi(struct device *dev, struct device_attribute *attr, char *buf) { struct kovaplus_device *kovaplus = hid_get_drvdata(dev_get_drvdata(dev->parent->parent)); return sysfs_emit(buf, "%d\n", kovaplus->actual_cpi); } static DEVICE_ATTR(actual_cpi, 0440, kovaplus_sysfs_show_actual_cpi, NULL); static ssize_t kovaplus_sysfs_show_actual_sensitivity_x(struct device *dev, struct device_attribute *attr, char *buf) { struct kovaplus_device *kovaplus = hid_get_drvdata(dev_get_drvdata(dev->parent->parent)); return sysfs_emit(buf, "%d\n", kovaplus->actual_x_sensitivity); } static DEVICE_ATTR(actual_sensitivity_x, 0440, kovaplus_sysfs_show_actual_sensitivity_x, NULL); static ssize_t kovaplus_sysfs_show_actual_sensitivity_y(struct device *dev, struct device_attribute *attr, char *buf) { struct kovaplus_device *kovaplus = hid_get_drvdata(dev_get_drvdata(dev->parent->parent)); return sysfs_emit(buf, "%d\n", kovaplus->actual_y_sensitivity); } static DEVICE_ATTR(actual_sensitivity_y, 0440, kovaplus_sysfs_show_actual_sensitivity_y, NULL); static ssize_t kovaplus_sysfs_show_firmware_version(struct device *dev, struct device_attribute *attr, char *buf) { struct kovaplus_device *kovaplus; struct usb_device *usb_dev; struct 
kovaplus_info info; dev = dev->parent->parent; kovaplus = hid_get_drvdata(dev_get_drvdata(dev)); usb_dev = interface_to_usbdev(to_usb_interface(dev)); mutex_lock(&kovaplus->kovaplus_lock); roccat_common2_receive(usb_dev, KOVAPLUS_COMMAND_INFO, &info, KOVAPLUS_SIZE_INFO); mutex_unlock(&kovaplus->kovaplus_lock); return sysfs_emit(buf, "%d\n", info.firmware_version); } static DEVICE_ATTR(firmware_version, 0440, kovaplus_sysfs_show_firmware_version, NULL); static struct attribute *kovaplus_attrs[] = { &dev_attr_actual_cpi.attr, &dev_attr_firmware_version.attr, &dev_attr_actual_profile.attr, &dev_attr_actual_sensitivity_x.attr, &dev_attr_actual_sensitivity_y.attr, NULL, }; static const struct bin_attribute *const kovaplus_bin_attributes[] = { &bin_attr_control, &bin_attr_info, &bin_attr_profile_settings, &bin_attr_profile_buttons, &bin_attr_profile1_settings, &bin_attr_profile2_settings, &bin_attr_profile3_settings, &bin_attr_profile4_settings, &bin_attr_profile5_settings, &bin_attr_profile1_buttons, &bin_attr_profile2_buttons, &bin_attr_profile3_buttons, &bin_attr_profile4_buttons, &bin_attr_profile5_buttons, NULL, }; static const struct attribute_group kovaplus_group = { .attrs = kovaplus_attrs, .bin_attrs = kovaplus_bin_attributes, }; static const struct attribute_group *kovaplus_groups[] = { &kovaplus_group, NULL, }; static const struct class kovaplus_class = { .name = "kovaplus", .dev_groups = kovaplus_groups, }; static int kovaplus_init_kovaplus_device_struct(struct usb_device *usb_dev, struct kovaplus_device *kovaplus) { int retval, i; static uint wait = 70; /* device will freeze with just 60 */ mutex_init(&kovaplus->kovaplus_lock); for (i = 0; i < 5; ++i) { msleep(wait); retval = kovaplus_get_profile_settings(usb_dev, &kovaplus->profile_settings[i], i); if (retval) return retval; msleep(wait); retval = kovaplus_get_profile_buttons(usb_dev, &kovaplus->profile_buttons[i], i); if (retval) return retval; } msleep(wait); retval = kovaplus_get_actual_profile(usb_dev); 
if (retval < 0) return retval; kovaplus_profile_activated(kovaplus, retval); return 0; } static int kovaplus_init_specials(struct hid_device *hdev) { struct usb_interface *intf = to_usb_interface(hdev->dev.parent); struct usb_device *usb_dev = interface_to_usbdev(intf); struct kovaplus_device *kovaplus; int retval; if (intf->cur_altsetting->desc.bInterfaceProtocol == USB_INTERFACE_PROTOCOL_MOUSE) { kovaplus = kzalloc(sizeof(*kovaplus), GFP_KERNEL); if (!kovaplus) { hid_err(hdev, "can't alloc device descriptor\n"); return -ENOMEM; } hid_set_drvdata(hdev, kovaplus); retval = kovaplus_init_kovaplus_device_struct(usb_dev, kovaplus); if (retval) { hid_err(hdev, "couldn't init struct kovaplus_device\n"); goto exit_free; } retval = roccat_connect(&kovaplus_class, hdev, sizeof(struct kovaplus_roccat_report)); if (retval < 0) { hid_err(hdev, "couldn't init char dev\n"); } else { kovaplus->chrdev_minor = retval; kovaplus->roccat_claimed = 1; } } else { hid_set_drvdata(hdev, NULL); } return 0; exit_free: kfree(kovaplus); return retval; } static void kovaplus_remove_specials(struct hid_device *hdev) { struct usb_interface *intf = to_usb_interface(hdev->dev.parent); struct kovaplus_device *kovaplus; if (intf->cur_altsetting->desc.bInterfaceProtocol == USB_INTERFACE_PROTOCOL_MOUSE) { kovaplus = hid_get_drvdata(hdev); if (kovaplus->roccat_claimed) roccat_disconnect(kovaplus->chrdev_minor); kfree(kovaplus); } } static int kovaplus_probe(struct hid_device *hdev, const struct hid_device_id *id) { int retval; if (!hid_is_usb(hdev)) return -EINVAL; retval = hid_parse(hdev); if (retval) { hid_err(hdev, "parse failed\n"); goto exit; } retval = hid_hw_start(hdev, HID_CONNECT_DEFAULT); if (retval) { hid_err(hdev, "hw start failed\n"); goto exit; } retval = kovaplus_init_specials(hdev); if (retval) { hid_err(hdev, "couldn't install mouse\n"); goto exit_stop; } return 0; exit_stop: hid_hw_stop(hdev); exit: return retval; } static void kovaplus_remove(struct hid_device *hdev) { 
kovaplus_remove_specials(hdev); hid_hw_stop(hdev); } static void kovaplus_keep_values_up_to_date(struct kovaplus_device *kovaplus, u8 const *data) { struct kovaplus_mouse_report_button const *button_report; if (data[0] != KOVAPLUS_MOUSE_REPORT_NUMBER_BUTTON) return; button_report = (struct kovaplus_mouse_report_button const *)data; switch (button_report->type) { case KOVAPLUS_MOUSE_REPORT_BUTTON_TYPE_PROFILE_1: kovaplus_profile_activated(kovaplus, button_report->data1 - 1); break; case KOVAPLUS_MOUSE_REPORT_BUTTON_TYPE_CPI: kovaplus->actual_cpi = kovaplus_convert_event_cpi(button_report->data1); break; case KOVAPLUS_MOUSE_REPORT_BUTTON_TYPE_SENSITIVITY: kovaplus->actual_x_sensitivity = button_report->data1; kovaplus->actual_y_sensitivity = button_report->data2; break; default: break; } } static void kovaplus_report_to_chrdev(struct kovaplus_device const *kovaplus, u8 const *data) { struct kovaplus_roccat_report roccat_report; struct kovaplus_mouse_report_button const *button_report; if (data[0] != KOVAPLUS_MOUSE_REPORT_NUMBER_BUTTON) return; button_report = (struct kovaplus_mouse_report_button const *)data; if (button_report->type == KOVAPLUS_MOUSE_REPORT_BUTTON_TYPE_PROFILE_2) return; roccat_report.type = button_report->type; roccat_report.profile = kovaplus->actual_profile + 1; if (roccat_report.type == KOVAPLUS_MOUSE_REPORT_BUTTON_TYPE_MACRO || roccat_report.type == KOVAPLUS_MOUSE_REPORT_BUTTON_TYPE_SHORTCUT || roccat_report.type == KOVAPLUS_MOUSE_REPORT_BUTTON_TYPE_QUICKLAUNCH || roccat_report.type == KOVAPLUS_MOUSE_REPORT_BUTTON_TYPE_TIMER) roccat_report.button = button_report->data1; else roccat_report.button = 0; if (roccat_report.type == KOVAPLUS_MOUSE_REPORT_BUTTON_TYPE_CPI) roccat_report.data1 = kovaplus_convert_event_cpi(button_report->data1); else roccat_report.data1 = button_report->data1; roccat_report.data2 = button_report->data2; roccat_report_event(kovaplus->chrdev_minor, (uint8_t const *)&roccat_report); } static int kovaplus_raw_event(struct 
hid_device *hdev, struct hid_report *report, u8 *data, int size) { struct usb_interface *intf = to_usb_interface(hdev->dev.parent); struct kovaplus_device *kovaplus = hid_get_drvdata(hdev); if (intf->cur_altsetting->desc.bInterfaceProtocol != USB_INTERFACE_PROTOCOL_MOUSE) return 0; if (kovaplus == NULL) return 0; kovaplus_keep_values_up_to_date(kovaplus, data); if (kovaplus->roccat_claimed) kovaplus_report_to_chrdev(kovaplus, data); return 0; } static const struct hid_device_id kovaplus_devices[] = { { HID_USB_DEVICE(USB_VENDOR_ID_ROCCAT, USB_DEVICE_ID_ROCCAT_KOVAPLUS) }, { } }; MODULE_DEVICE_TABLE(hid, kovaplus_devices); static struct hid_driver kovaplus_driver = { .name = "kovaplus", .id_table = kovaplus_devices, .probe = kovaplus_probe, .remove = kovaplus_remove, .raw_event = kovaplus_raw_event }; static int __init kovaplus_init(void) { int retval; retval = class_register(&kovaplus_class); if (retval) return retval; retval = hid_register_driver(&kovaplus_driver); if (retval) class_unregister(&kovaplus_class); return retval; } static void __exit kovaplus_exit(void) { hid_unregister_driver(&kovaplus_driver); class_unregister(&kovaplus_class); } module_init(kovaplus_init); module_exit(kovaplus_exit); MODULE_AUTHOR("Stefan Achatz"); MODULE_DESCRIPTION("USB Roccat Kova[+] driver"); MODULE_LICENSE("GPL v2"); |
| 12 6 15 23 11 12 19 170 5 5 7 4 4 4 1 12 1 1 8 3 10 16 1 2 2 8 4 7 5 8 1 8 30 13 17 18 5 12 19 12 7 39 46 4 36 2 4 23 12 4 1 6 42 1 1 1 1 1 11 23 3 2 2 1 4 4 2 2 1 1 1 2 2 2 2 1 2 169 12 5 159 9 9 9 9 9 40 40 30 13 3 25 6 33 2 2 19 17 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 
467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 | // SPDX-License-Identifier: GPL-2.0 /* * Encryption policy functions for per-file encryption support. * * Copyright (C) 2015, Google, Inc. * Copyright (C) 2015, Motorola Mobility. * * Originally written by Michael Halcrow, 2015. * Modified by Jaegeuk Kim, 2015. 
* Modified by Eric Biggers, 2019 for v2 policy support. */ #include <linux/export.h> #include <linux/fs_context.h> #include <linux/mount.h> #include <linux/random.h> #include <linux/seq_file.h> #include <linux/string.h> #include "fscrypt_private.h" /** * fscrypt_policies_equal() - check whether two encryption policies are the same * @policy1: the first policy * @policy2: the second policy * * Return: %true if equal, else %false */ bool fscrypt_policies_equal(const union fscrypt_policy *policy1, const union fscrypt_policy *policy2) { if (policy1->version != policy2->version) return false; return !memcmp(policy1, policy2, fscrypt_policy_size(policy1)); } int fscrypt_policy_to_key_spec(const union fscrypt_policy *policy, struct fscrypt_key_specifier *key_spec) { switch (policy->version) { case FSCRYPT_POLICY_V1: key_spec->type = FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR; memcpy(key_spec->u.descriptor, policy->v1.master_key_descriptor, FSCRYPT_KEY_DESCRIPTOR_SIZE); return 0; case FSCRYPT_POLICY_V2: key_spec->type = FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER; memcpy(key_spec->u.identifier, policy->v2.master_key_identifier, FSCRYPT_KEY_IDENTIFIER_SIZE); return 0; default: WARN_ON_ONCE(1); return -EINVAL; } } const union fscrypt_policy *fscrypt_get_dummy_policy(struct super_block *sb) { if (!sb->s_cop->get_dummy_policy) return NULL; return sb->s_cop->get_dummy_policy(sb); } /* * Return %true if the given combination of encryption modes is supported for v1 * (and later) encryption policies. * * Do *not* add anything new here, since v1 encryption policies are deprecated. * New combinations of modes should go in fscrypt_valid_enc_modes_v2() only. 
*/
static bool fscrypt_valid_enc_modes_v1(u32 contents_mode, u32 filenames_mode)
{
	/* Each accepted (contents, filenames) pair is listed explicitly. */
	if (contents_mode == FSCRYPT_MODE_AES_256_XTS &&
	    filenames_mode == FSCRYPT_MODE_AES_256_CTS)
		return true;

	if (contents_mode == FSCRYPT_MODE_AES_128_CBC &&
	    filenames_mode == FSCRYPT_MODE_AES_128_CTS)
		return true;

	if (contents_mode == FSCRYPT_MODE_ADIANTUM &&
	    filenames_mode == FSCRYPT_MODE_ADIANTUM)
		return true;

	return false;
}

/*
 * Return %true if the given pair of encryption modes is accepted for v2
 * policies: the two pairs checked here, plus every pair accepted by
 * fscrypt_valid_enc_modes_v1().
 */
static bool fscrypt_valid_enc_modes_v2(u32 contents_mode, u32 filenames_mode)
{
	if (contents_mode == FSCRYPT_MODE_AES_256_XTS &&
	    filenames_mode == FSCRYPT_MODE_AES_256_HCTR2)
		return true;

	if (contents_mode == FSCRYPT_MODE_SM4_XTS &&
	    filenames_mode == FSCRYPT_MODE_SM4_CTS)
		return true;

	return fscrypt_valid_enc_modes_v1(contents_mode, filenames_mode);
}

/*
 * Check the mode requirements of the DIRECT_KEY flag: the contents and
 * filenames modes must be identical, and that mode's IV size must be at least
 * offsetofend(union fscrypt_iv, nonce).  A warning is logged against @inode
 * and %false is returned when either requirement is not met.
 */
static bool supported_direct_key_modes(const struct inode *inode,
				       u32 contents_mode, u32 filenames_mode)
{
	const struct fscrypt_mode *mode;

	if (contents_mode != filenames_mode) {
		fscrypt_warn(inode,
			     "Direct key flag not allowed with different contents and filenames modes");
		return false;
	}
	mode = &fscrypt_modes[contents_mode];

	if (mode->ivsize < offsetofend(union fscrypt_iv, nonce)) {
		fscrypt_warn(inode, "Direct key flag not allowed with %s",
			     mode->friendly_name);
		return false;
	}
	return true;
}

/*
 * Check the extra requirements of the IV_INO_LBLK_64 / IV_INO_LBLK_32 flags
 * for @policy on @inode's filesystem.  Each failed check logs a warning that
 * names the flag in question and returns %false.
 */
static bool supported_iv_ino_lblk_policy(const struct fscrypt_policy_v2 *policy,
					 const struct inode *inode)
{
	/* Flag name used in the warnings below. */
	const char *type = (policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64)
				? "IV_INO_LBLK_64" : "IV_INO_LBLK_32";
	struct super_block *sb = inode->i_sb;

	/*
	 * IV_INO_LBLK_* exist only because of hardware limitations, and
	 * currently the only known use case for them involves AES-256-XTS.
	 * That's also all we test currently.  For these reasons, for now only
	 * allow AES-256-XTS here.  This can be relaxed later if a use case for
	 * IV_INO_LBLK_* with other encryption modes arises.
	 */
	if (policy->contents_encryption_mode != FSCRYPT_MODE_AES_256_XTS) {
		fscrypt_warn(inode,
			     "Can't use %s policy with contents mode other than AES-256-XTS",
			     type);
		return false;
	}

	/*
	 * It's unsafe to include inode numbers in the IVs if the filesystem can
	 * potentially renumber inodes, e.g. via filesystem shrinking.
	 */
	if (!sb->s_cop->has_stable_inodes ||
	    !sb->s_cop->has_stable_inodes(sb)) {
		fscrypt_warn(inode,
			     "Can't use %s policy on filesystem '%s' because it doesn't have stable inode numbers",
			     type, sb->s_id);
		return false;
	}

	/*
	 * IV_INO_LBLK_64 and IV_INO_LBLK_32 both require that inode numbers fit
	 * in 32 bits.  In principle, IV_INO_LBLK_32 could support longer inode
	 * numbers because it hashes the inode number; however, currently the
	 * inode number is gotten from inode::i_ino which is 'unsigned long'.
	 * So for now the implementation limit is 32 bits.
	 */
	if (!sb->s_cop->has_32bit_inodes) {
		fscrypt_warn(inode,
			     "Can't use %s policy on filesystem '%s' because its inode numbers are too long",
			     type, sb->s_id);
		return false;
	}

	/*
	 * IV_INO_LBLK_64 and IV_INO_LBLK_32 both require that file data unit
	 * indices fit in 32 bits.
	 */
	if (fscrypt_max_file_dun_bits(sb,
			fscrypt_policy_v2_du_bits(policy, inode)) > 32) {
		fscrypt_warn(inode,
			     "Can't use %s policy on filesystem '%s' because its maximum file size is too large",
			     type, sb->s_id);
		return false;
	}
	return true;
}

/*
 * Validate a v1 policy for use on @inode: the mode pair, the flag bits, the
 * DIRECT_KEY requirements (when that flag is set), and the casefold
 * restriction.  Every rejection logs a warning and returns %false.
 */
static bool fscrypt_supported_v1_policy(const struct fscrypt_policy_v1 *policy,
					const struct inode *inode)
{
	if (!fscrypt_valid_enc_modes_v1(policy->contents_encryption_mode,
				     policy->filenames_encryption_mode)) {
		fscrypt_warn(inode,
			     "Unsupported encryption modes (contents %d, filenames %d)",
			     policy->contents_encryption_mode,
			     policy->filenames_encryption_mode);
		return false;
	}

	/* Only the padding bits and DIRECT_KEY are accepted for v1. */
	if (policy->flags & ~(FSCRYPT_POLICY_FLAGS_PAD_MASK |
			      FSCRYPT_POLICY_FLAG_DIRECT_KEY)) {
		fscrypt_warn(inode, "Unsupported encryption flags (0x%02x)",
			     policy->flags);
		return false;
	}

	if ((policy->flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) &&
	    !supported_direct_key_modes(inode, policy->contents_encryption_mode,
					policy->filenames_encryption_mode))
		return false;

	if (IS_CASEFOLDED(inode)) {
		/* With v1, there's no way to derive dirhash keys. */
		fscrypt_warn(inode,
			     "v1 policies can't be used on casefolded directories");
		return false;
	}

	return true;
}

/*
 * Validate a v2 policy for use on @inode: the mode pair, the flag bits and
 * their mutual exclusion, the optional data unit size, the per-flag
 * requirements, and the reserved bytes.  Every rejection except those
 * delegated to the helper functions logs its own warning; all return %false.
 */
static bool fscrypt_supported_v2_policy(const struct fscrypt_policy_v2 *policy,
					const struct inode *inode)
{
	int count = 0;

	if (!fscrypt_valid_enc_modes_v2(policy->contents_encryption_mode,
				     policy->filenames_encryption_mode)) {
		fscrypt_warn(inode,
			     "Unsupported encryption modes (contents %d, filenames %d)",
			     policy->contents_encryption_mode,
			     policy->filenames_encryption_mode);
		return false;
	}

	if (policy->flags & ~(FSCRYPT_POLICY_FLAGS_PAD_MASK |
			      FSCRYPT_POLICY_FLAG_DIRECT_KEY |
			      FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64 |
			      FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)) {
		fscrypt_warn(inode, "Unsupported encryption flags (0x%02x)",
			     policy->flags);
		return false;
	}

	/* At most one of the three IV/key-selection flags may be set. */
	count += !!(policy->flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY);
	count += !!(policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64);
	count += !!(policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32);
	if (count > 1) {
		fscrypt_warn(inode, "Mutually exclusive encryption flags (0x%02x)",
			     policy->flags);
		return false;
	}

	/* A zero log2_data_unit_size skips all data unit size checks. */
	if (policy->log2_data_unit_size) {
		if (!inode->i_sb->s_cop->supports_subblock_data_units) {
			fscrypt_warn(inode,
				     "Filesystem does not support configuring crypto data unit size");
			return false;
		}
		if (policy->log2_data_unit_size > inode->i_blkbits ||
		    policy->log2_data_unit_size < SECTOR_SHIFT /* 9 */) {
			fscrypt_warn(inode,
				     "Unsupported log2_data_unit_size in encryption policy: %d",
				     policy->log2_data_unit_size);
			return false;
		}
		if (policy->log2_data_unit_size != inode->i_blkbits &&
		    (policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)) {
			/*
			 * Not safe to enable yet, as we need to ensure that DUN
			 * wraparound can only occur on a FS block boundary.
			 */
			fscrypt_warn(inode,
				     "Sub-block data units not yet supported with IV_INO_LBLK_32");
			return false;
		}
	}

	if ((policy->flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) &&
	    !supported_direct_key_modes(inode, policy->contents_encryption_mode,
					policy->filenames_encryption_mode))
		return false;

	if ((policy->flags & (FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64 |
			      FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)) &&
	    !supported_iv_ino_lblk_policy(policy, inode))
		return false;

	/* memchr_inv() returns non-NULL if any reserved byte is nonzero. */
	if (memchr_inv(policy->__reserved, 0, sizeof(policy->__reserved))) {
		fscrypt_warn(inode, "Reserved bits set in encryption policy");
		return false;
	}

	return true;
}

/**
 * fscrypt_supported_policy() - check whether an encryption policy is supported
 * @policy_u: the encryption policy
 * @inode: the inode on which the policy will be used
 *
 * Given an encryption policy, check whether all its encryption modes and other
 * settings are supported by this kernel on the given inode.  (But we don't
 * currently check for crypto API support here, so attempting to use an
 * algorithm not configured into the crypto API will still fail later.)
 *
 * Return: %true if supported, else %false
 */
bool fscrypt_supported_policy(const union fscrypt_policy *policy_u,
			      const struct inode *inode)
{
	switch (policy_u->version) {
	case FSCRYPT_POLICY_V1:
		return fscrypt_supported_v1_policy(&policy_u->v1, inode);
	case FSCRYPT_POLICY_V2:
		return fscrypt_supported_v2_policy(&policy_u->v2, inode);
	}
	/* Any other version value is unsupported. */
	return false;
}

/**
 * fscrypt_new_context() - create a new fscrypt_context
 * @ctx_u: output context
 * @policy_u: input policy
 * @nonce: nonce to use
 *
 * Create an fscrypt_context for an inode that is being assigned the given
 * encryption policy.  @nonce must be a new random nonce.
 *
 * Return: the size of the new context in bytes.
*/
static int fscrypt_new_context(union fscrypt_context *ctx_u,
			       const union fscrypt_policy *policy_u,
			       const u8 nonce[FSCRYPT_FILE_NONCE_SIZE])
{
	/* Start from an all-zero context so unassigned bytes stay zero. */
	memset(ctx_u, 0, sizeof(*ctx_u));

	if (policy_u->version == FSCRYPT_POLICY_V1) {
		const struct fscrypt_policy_v1 *src = &policy_u->v1;
		struct fscrypt_context_v1 *dst = &ctx_u->v1;

		dst->version = FSCRYPT_CONTEXT_V1;
		dst->contents_encryption_mode = src->contents_encryption_mode;
		dst->filenames_encryption_mode = src->filenames_encryption_mode;
		dst->flags = src->flags;
		memcpy(dst->master_key_descriptor, src->master_key_descriptor,
		       sizeof(dst->master_key_descriptor));
		memcpy(dst->nonce, nonce, FSCRYPT_FILE_NONCE_SIZE);
		return sizeof(*dst);
	}

	if (policy_u->version == FSCRYPT_POLICY_V2) {
		const struct fscrypt_policy_v2 *src = &policy_u->v2;
		struct fscrypt_context_v2 *dst = &ctx_u->v2;

		dst->version = FSCRYPT_CONTEXT_V2;
		dst->contents_encryption_mode = src->contents_encryption_mode;
		dst->filenames_encryption_mode = src->filenames_encryption_mode;
		dst->flags = src->flags;
		dst->log2_data_unit_size = src->log2_data_unit_size;
		memcpy(dst->master_key_identifier, src->master_key_identifier,
		       sizeof(dst->master_key_identifier));
		memcpy(dst->nonce, nonce, FSCRYPT_FILE_NONCE_SIZE);
		return sizeof(*dst);
	}

	/* Neither known policy version matched. */
	BUG();
}

/**
 * fscrypt_policy_from_context() - convert an fscrypt_context to
 *				   an fscrypt_policy
 * @policy_u: output policy
 * @ctx_u: input context
 * @ctx_size: size of input context in bytes
 *
 * Given an fscrypt_context, build the corresponding fscrypt_policy.
 *
 * Return: 0 on success, or -EINVAL if the fscrypt_context has an unrecognized
 * version number or size.
 *
 * This does *not* validate the settings within the policy itself, e.g. the
 * modes, flags, and reserved bits.  Use fscrypt_supported_policy() for that.
*/
int fscrypt_policy_from_context(union fscrypt_policy *policy_u,
				const union fscrypt_context *ctx_u,
				int ctx_size)
{
	/* The output is zeroed first, so it is also zeroed on failure. */
	memset(policy_u, 0, sizeof(*policy_u));

	if (!fscrypt_context_is_valid(ctx_u, ctx_size))
		return -EINVAL;

	switch (ctx_u->version) {
	case FSCRYPT_CONTEXT_V1: {
		const struct fscrypt_context_v1 *ctx = &ctx_u->v1;
		struct fscrypt_policy_v1 *policy = &policy_u->v1;

		policy->version = FSCRYPT_POLICY_V1;
		policy->contents_encryption_mode =
			ctx->contents_encryption_mode;
		policy->filenames_encryption_mode =
			ctx->filenames_encryption_mode;
		policy->flags = ctx->flags;
		memcpy(policy->master_key_descriptor,
		       ctx->master_key_descriptor,
		       sizeof(policy->master_key_descriptor));
		return 0;
	}
	case FSCRYPT_CONTEXT_V2: {
		const struct fscrypt_context_v2 *ctx = &ctx_u->v2;
		struct fscrypt_policy_v2 *policy = &policy_u->v2;

		policy->version = FSCRYPT_POLICY_V2;
		policy->contents_encryption_mode =
			ctx->contents_encryption_mode;
		policy->filenames_encryption_mode =
			ctx->filenames_encryption_mode;
		policy->flags = ctx->flags;
		policy->log2_data_unit_size = ctx->log2_data_unit_size;
		/* Reserved bytes are copied as-is, not validated here. */
		memcpy(policy->__reserved, ctx->__reserved,
		       sizeof(policy->__reserved));
		memcpy(policy->master_key_identifier,
		       ctx->master_key_identifier,
		       sizeof(policy->master_key_identifier));
		return 0;
	}
	}
	/* unreachable */
	return -EINVAL;
}

/*
 * Retrieve an inode's encryption policy.
 *
 * The cached policy is used when the inode already has fscrypt info.
 * Otherwise the context is read through ->get_context() and converted with
 * fscrypt_policy_from_context().
 *
 * Return: 0 on success, -ENODATA if the inode is not encrypted, -EINVAL if
 * ->get_context() reports -ERANGE or the context cannot be converted, or
 * another negative errno from ->get_context().
 */
static int fscrypt_get_policy(struct inode *inode, union fscrypt_policy *policy)
{
	const struct fscrypt_inode_info *ci;
	union fscrypt_context ctx;
	int ret;

	ci = fscrypt_get_inode_info(inode);
	if (ci) {
		/* key available, use the cached policy */
		*policy = ci->ci_policy;
		return 0;
	}

	if (!IS_ENCRYPTED(inode))
		return -ENODATA;

	ret = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
	if (ret < 0)
		return (ret == -ERANGE) ? -EINVAL : ret;

	/* On success, ret is passed on as the context size. */
	return fscrypt_policy_from_context(policy, &ctx, ret);
}

/*
 * Validate @policy for @inode, generate a random nonce, build the
 * corresponding context, and store it through ->set_context().
 *
 * Return: 0 on success, -EINVAL if the policy is unsupported or has an
 * unknown version, or a negative errno from fscrypt_verify_key_added() or
 * ->set_context().
 */
static int set_encryption_policy(struct inode *inode,
				 const union fscrypt_policy *policy)
{
	u8 nonce[FSCRYPT_FILE_NONCE_SIZE];
	union fscrypt_context ctx;
	int ctxsize;
	int err;

	if (!fscrypt_supported_policy(policy, inode))
		return -EINVAL;

	switch (policy->version) {
	case FSCRYPT_POLICY_V1:
		/*
		 * The original encryption policy version provided no way of
		 * verifying that the correct master key was supplied, which was
		 * insecure in scenarios where multiple users have access to the
		 * same encrypted files (even just read-only access).  The new
		 * encryption policy version fixes this and also implies use of
		 * an improved key derivation function and allows non-root users
		 * to securely remove keys.  So as long as compatibility with
		 * old kernels isn't required, it is recommended to use the new
		 * policy version for all new encrypted directories.
		 */
		pr_warn_once("%s (pid %d) is setting deprecated v1 encryption policy; recommend upgrading to v2.\n",
			     current->comm, current->pid);
		break;
	case FSCRYPT_POLICY_V2:
		err = fscrypt_verify_key_added(inode->i_sb,
					       policy->v2.master_key_identifier);
		if (err)
			return err;
		if (policy->v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)
			pr_warn_once("%s (pid %d) is setting an IV_INO_LBLK_32 encryption policy. This should only be used if there are certain hardware limitations.\n",
				     current->comm, current->pid);
		break;
	default:
		WARN_ON_ONCE(1);
		return -EINVAL;
	}

	get_random_bytes(nonce, FSCRYPT_FILE_NONCE_SIZE);
	ctxsize = fscrypt_new_context(&ctx, policy, nonce);

	return inode->i_sb->s_cop->set_context(inode, &ctx, ctxsize, NULL);
}

/*
 * ioctl handler that sets an encryption policy on the file's inode.
 *
 * The policy is copied in from userspace, with its size chosen from the
 * version byte.  If the inode has no policy yet, it must be a live, empty
 * directory for the policy to be set.  If it already has one, the call
 * succeeds only when the existing policy equals the requested one.
 *
 * Return: 0 on success; -EFAULT, -EINVAL, -EACCES, -ENOTDIR, -ENOENT,
 * -ENOTEMPTY or -EEXIST as set below; or an error from
 * mnt_want_write_file(), fscrypt_get_policy() or set_encryption_policy().
 */
int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg)
{
	union fscrypt_policy policy;
	union fscrypt_policy existing_policy;
	struct inode *inode = file_inode(filp);
	u8 version;
	int size;
	int ret;

	/* Read just the version byte first; it determines the policy size. */
	if (get_user(policy.version, (const u8 __user *)arg))
		return -EFAULT;

	size = fscrypt_policy_size(&policy);
	if (size <= 0)
		return -EINVAL;

	/*
	 * We should just copy the remaining 'size - 1' bytes here, but a
	 * bizarre bug in gcc 7 and earlier (fixed by gcc r255731) causes gcc to
	 * think that size can be 0 here (despite the check above!) *and* that
	 * it's a compile-time constant.  Thus it would think copy_from_user()
	 * is passed compile-time constant ULONG_MAX, causing the compile-time
	 * buffer overflow check to fail, breaking the build. This only occurred
	 * when building an i386 kernel with -Os and branch profiling enabled.
	 *
	 * Work around it by just copying the first byte again...
	 */
	version = policy.version;
	if (copy_from_user(&policy, arg, size))
		return -EFAULT;
	/* Restore the version that 'size' was derived from. */
	policy.version = version;

	if (!inode_owner_or_capable(&nop_mnt_idmap, inode))
		return -EACCES;

	ret = mnt_want_write_file(filp);
	if (ret)
		return ret;

	inode_lock(inode);

	ret = fscrypt_get_policy(inode, &existing_policy);
	if (ret == -ENODATA) {
		/* No existing policy: only a live, empty directory may get one. */
		if (!S_ISDIR(inode->i_mode))
			ret = -ENOTDIR;
		else if (IS_DEADDIR(inode))
			ret = -ENOENT;
		else if (!inode->i_sb->s_cop->empty_dir(inode))
			ret = -ENOTEMPTY;
		else
			ret = set_encryption_policy(inode, &policy);
	} else if (ret == -EINVAL ||
		   (ret == 0 && !fscrypt_policies_equal(&policy,
							&existing_policy))) {
		/* The file already uses a different encryption policy. */
		ret = -EEXIST;
	}

	inode_unlock(inode);

	mnt_drop_write_file(filp);
	return ret;
}
EXPORT_SYMBOL(fscrypt_ioctl_set_policy);

/* Original ioctl version; can only get the original policy version */
int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg)
{
	union fscrypt_policy policy;
	int err;

	err = fscrypt_get_policy(file_inode(filp), &policy);
	if (err)
		return err;

	/* Policies of any other version are reported as -EINVAL here. */
	if (policy.version != FSCRYPT_POLICY_V1)
		return -EINVAL;

	if (copy_to_user(arg, &policy, sizeof(policy.v1)))
		return -EFAULT;
	return 0;
}
EXPORT_SYMBOL(fscrypt_ioctl_get_policy);

/* Extended ioctl version; can get policies of any version */
int fscrypt_ioctl_get_policy_ex(struct file *filp, void __user *uarg)
{
	struct fscrypt_get_policy_ex_arg arg;
	/* The policy is built directly inside the argument struct. */
	union fscrypt_policy *policy = (union fscrypt_policy *)&arg.policy;
	size_t policy_size;
	int err;

	/* arg is policy_size, then policy */
	BUILD_BUG_ON(offsetof(typeof(arg), policy_size) != 0);
	BUILD_BUG_ON(offsetofend(typeof(arg), policy_size) !=
		     offsetof(typeof(arg), policy));
	BUILD_BUG_ON(sizeof(arg.policy) != sizeof(*policy));

	err = fscrypt_get_policy(file_inode(filp), policy);
	if (err)
		return err;
	policy_size = fscrypt_policy_size(policy);

	/* Only the policy_size field is read from userspace. */
	if (copy_from_user(&arg, uarg, sizeof(arg.policy_size)))
		return -EFAULT;

	/* The size supplied by the caller is too small for this policy. */
	if (policy_size > arg.policy_size)
		return -EOVERFLOW;
	arg.policy_size = policy_size;

	if (copy_to_user(uarg, &arg, sizeof(arg.policy_size) + policy_size))
		return -EFAULT;
	return 0;
}
EXPORT_SYMBOL_GPL(fscrypt_ioctl_get_policy_ex);

/* FS_IOC_GET_ENCRYPTION_NONCE: retrieve file's encryption nonce for testing */
int fscrypt_ioctl_get_nonce(struct file *filp, void __user *arg)
{
	struct inode *inode = file_inode(filp);
	union fscrypt_context ctx;
	int ret;

	ret = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
	if (ret < 0)
		return ret;
	/* On success, ret is used as the context size. */
	if (!fscrypt_context_is_valid(&ctx, ret))
		return -EINVAL;
	if (copy_to_user(arg, fscrypt_context_nonce(&ctx),
			 FSCRYPT_FILE_NONCE_SIZE))
		return -EFAULT;
	return 0;
}
EXPORT_SYMBOL_GPL(fscrypt_ioctl_get_nonce);

/**
 *
fscrypt_has_permitted_context() - is a file's encryption policy permitted
 *				     within its directory?
 *
 * @parent: inode for parent directory
 * @child: inode for file being looked up, opened, or linked into @parent
 *
 * Filesystems must call this before permitting access to an inode in a
 * situation where the parent directory is encrypted (either before allowing
 * ->lookup() to succeed, or for a regular file before allowing it to be opened)
 * and before any operation that involves linking an inode into an encrypted
 * directory, including link, rename, and cross rename.  It enforces the
 * constraint that within a given encrypted directory tree, all files use the
 * same encryption policy.  The pre-access check is needed to detect potentially
 * malicious offline violations of this constraint, while the link and rename
 * checks are needed to prevent online violations of this constraint.
 *
 * Return: 1 if permitted, 0 if forbidden.
 */
int fscrypt_has_permitted_context(struct inode *parent, struct inode *child)
{
	union fscrypt_policy parent_policy, child_policy;
	int err, err1, err2;

	/* No restrictions on file types which are never encrypted */
	if (!S_ISREG(child->i_mode) && !S_ISDIR(child->i_mode) &&
	    !S_ISLNK(child->i_mode))
		return 1;

	/* No restrictions if the parent directory is unencrypted */
	if (!IS_ENCRYPTED(parent))
		return 1;

	/* Encrypted directories must not contain unencrypted files */
	if (!IS_ENCRYPTED(child))
		return 0;

	/*
	 * Both parent and child are encrypted, so verify they use the same
	 * encryption policy.  Compare the cached policies if the keys are
	 * available, otherwise retrieve and compare the fscrypt_contexts.
	 *
	 * Note that the fscrypt_context retrieval will be required frequently
	 * when accessing an encrypted directory tree without the key.
	 * Performance-wise this is not a big deal because we already don't
	 * really optimize for file access without the key (to the extent that
	 * such access is even possible), given that any attempted access
	 * already causes a fscrypt_context retrieval and keyring search.
	 *
	 * In any case, if an unexpected error occurs, fall back to "forbidden".
	 */

	err = fscrypt_get_encryption_info(parent, true);
	if (err)
		return 0;
	err = fscrypt_get_encryption_info(child, true);
	if (err)
		return 0;

	/* Both lookups always run; their results are judged together below. */
	err1 = fscrypt_get_policy(parent, &parent_policy);
	err2 = fscrypt_get_policy(child, &child_policy);

	/*
	 * Allow the case where the parent and child both have an unrecognized
	 * encryption policy, so that files with an unrecognized encryption
	 * policy can be deleted.
	 */
	if (err1 == -EINVAL && err2 == -EINVAL)
		return 1;

	if (err1 || err2)
		return 0;

	return fscrypt_policies_equal(&parent_policy, &child_policy);
}
EXPORT_SYMBOL(fscrypt_has_permitted_context);

/*
 * Return the encryption policy that new files in the directory will inherit, or
 * NULL if none, or an ERR_PTR() on error.  If the directory is encrypted, also
 * ensure that its key is set up, so that the new filename can be encrypted.
 */
const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir)
{
	int err;

	if (IS_ENCRYPTED(dir)) {
		err = fscrypt_require_key(dir);
		if (err)
			return ERR_PTR(err);
		return &fscrypt_get_inode_info_raw(dir)->ci_policy;
	}

	/* Unencrypted directory: use the filesystem's dummy policy, if any. */
	return fscrypt_get_dummy_policy(dir->i_sb);
}

/**
 * fscrypt_context_for_new_inode() - create an encryption context for a new inode
 * @ctx: where context should be written
 * @inode: inode from which to fetch policy and nonce
 *
 * Given an in-core "prepared" (via fscrypt_prepare_new_inode) inode,
 * generate a new context and write it to ctx. ctx _must_ be at least
 * FSCRYPT_SET_CONTEXT_MAX_SIZE bytes.
 *
 * Return: size of the resulting context or a negative error code.
*/ int fscrypt_context_for_new_inode(void *ctx, struct inode *inode) { struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode); BUILD_BUG_ON(sizeof(union fscrypt_context) != FSCRYPT_SET_CONTEXT_MAX_SIZE); /* fscrypt_prepare_new_inode() should have set up the key already. */ if (WARN_ON_ONCE(!ci)) return -ENOKEY; return fscrypt_new_context(ctx, &ci->ci_policy, ci->ci_nonce); } EXPORT_SYMBOL_GPL(fscrypt_context_for_new_inode); /** * fscrypt_set_context() - Set the fscrypt context of a new inode * @inode: a new inode * @fs_data: private data given by FS and passed to ->set_context() * * This should be called after fscrypt_prepare_new_inode(), generally during a * filesystem transaction. Everything here must be %GFP_NOFS-safe. * * Return: 0 on success, -errno on failure */ int fscrypt_set_context(struct inode *inode, void *fs_data) { struct fscrypt_inode_info *ci; union fscrypt_context ctx; int ctxsize; ctxsize = fscrypt_context_for_new_inode(&ctx, inode); if (ctxsize < 0) return ctxsize; /* * This may be the first time the inode number is available, so do any * delayed key setup that requires the inode number. */ ci = fscrypt_get_inode_info_raw(inode); if (ci->ci_policy.version == FSCRYPT_POLICY_V2 && (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)) fscrypt_hash_inode_number(ci, ci->ci_master_key); return inode->i_sb->s_cop->set_context(inode, &ctx, ctxsize, fs_data); } EXPORT_SYMBOL_GPL(fscrypt_set_context); /** * fscrypt_parse_test_dummy_encryption() - parse the test_dummy_encryption mount option * @param: the mount option * @dummy_policy: (input/output) the place to write the dummy policy that will * result from parsing the option. Zero-initialize this. If a policy is * already set here (due to test_dummy_encryption being given multiple * times), then this function will verify that the policies are the same. 
*
 * Return: 0 on success; -EINVAL if the argument is invalid; -EEXIST if the
 *	   argument conflicts with one already specified; or -ENOMEM.
 */
int fscrypt_parse_test_dummy_encryption(const struct fs_parameter *param,
				struct fscrypt_dummy_policy *dummy_policy)
{
	/* Used when the option has no (or an empty) string value. */
	const char *arg = "v2";
	union fscrypt_policy *policy;
	int err;

	if (param->type == fs_value_is_string && *param->string)
		arg = param->string;

	policy = kzalloc(sizeof(*policy), GFP_KERNEL);
	if (!policy)
		return -ENOMEM;

	if (!strcmp(arg, "v1")) {
		policy->version = FSCRYPT_POLICY_V1;
		policy->v1.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS;
		policy->v1.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS;
		memset(policy->v1.master_key_descriptor, 0x42,
		       FSCRYPT_KEY_DESCRIPTOR_SIZE);
	} else if (!strcmp(arg, "v2")) {
		policy->version = FSCRYPT_POLICY_V2;
		policy->v2.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS;
		policy->v2.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS;
		fscrypt_get_test_dummy_key_identifier(
				policy->v2.master_key_identifier);
	} else {
		err = -EINVAL;
		goto out;
	}

	if (dummy_policy->policy) {
		/* Already set: the new policy is only compared, then freed. */
		if (fscrypt_policies_equal(policy, dummy_policy->policy))
			err = 0;
		else
			err = -EEXIST;
		goto out;
	}
	/* Hand ownership to @dummy_policy; NULL makes the kfree() a no-op. */
	dummy_policy->policy = policy;
	policy = NULL;
	err = 0;
out:
	kfree(policy);
	return err;
}
EXPORT_SYMBOL_GPL(fscrypt_parse_test_dummy_encryption);

/**
 * fscrypt_dummy_policies_equal() - check whether two dummy policies are equal
 * @p1: the first test dummy policy (may be unset)
 * @p2: the second test dummy policy (may be unset)
 *
 * Return: %true if the dummy policies are both set and equal, or both unset.
*/ bool fscrypt_dummy_policies_equal(const struct fscrypt_dummy_policy *p1, const struct fscrypt_dummy_policy *p2) { if (!p1->policy && !p2->policy) return true; if (!p1->policy || !p2->policy) return false; return fscrypt_policies_equal(p1->policy, p2->policy); } EXPORT_SYMBOL_GPL(fscrypt_dummy_policies_equal); /** * fscrypt_show_test_dummy_encryption() - show '-o test_dummy_encryption' * @seq: the seq_file to print the option to * @sep: the separator character to use * @sb: the filesystem whose options are being shown * * Show the test_dummy_encryption mount option, if it was specified. * This is mainly used for /proc/mounts. */ void fscrypt_show_test_dummy_encryption(struct seq_file *seq, char sep, struct super_block *sb) { const union fscrypt_policy *policy = fscrypt_get_dummy_policy(sb); int vers; if (!policy) return; vers = policy->version; if (vers == FSCRYPT_POLICY_V1) /* Handle numbering quirk */ vers = 1; seq_printf(seq, "%ctest_dummy_encryption=v%d", sep, vers); } EXPORT_SYMBOL_GPL(fscrypt_show_test_dummy_encryption); |
| 6 6 3 12 1 4 3 1 3 4 4 2 6 3 6 6 6 12 3 9 6 4 5 1 4 4 2 1 1 8 2 6 5 4 5 1 4 4 4 3 1 1 4 12 12 1 9 1 10 2 5 3 1 2 17 1 1 2 1 6 5 1 1 1 4 1 1 3 1 1 2 1 1 1 16 5 2 3 3 1 1 1 2 1 7 7 1 6 4 2 6 9 4 6 5 1 3 3 1 1 10 10 9 9 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 
471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 
971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 
1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 | /* BlueZ - Bluetooth protocol stack for Linux Copyright (C) 2000-2001 Qualcomm Incorporated Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License version 2 as published by the Free Software Foundation; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
   CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
   WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
   ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
   OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

   ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
   COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
   SOFTWARE IS DISCLAIMED.
*/

/* Bluetooth SCO sockets. */

#include <linux/module.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/sched/signal.h>

#include <net/bluetooth/bluetooth.h>
#include <net/bluetooth/hci_core.h>
#include <net/bluetooth/sco.h>

/* When true, sco_connect() requests SCO_LINK even on eSCO capable devices. */
static bool disable_esco;

static const struct proto_ops sco_sock_ops;

/* List of SCO sockets; walked under sco_sk_list.lock by the lookup helpers. */
static struct bt_sock_list sco_sk_list = {
	.lock = __RW_LOCK_UNLOCKED(sco_sk_list.lock)
};

/* ---- SCO connections ---- */
/*
 * Per-link state tying an hci_conn to (at most) one socket.
 *
 * @hcon:         HCI connection this object is attached to
 * @lock:         spinlock taken around updates of @sk / @hcon
 * @sk:           socket currently attached to the connection, or NULL
 * @timeout_work: delayed work running sco_sock_timeout()
 * @mtu:          maximum frame length accepted by sco_send_frame()
 * @ref:          reference count; sco_conn_free() runs on the last put
 */
struct sco_conn {
	struct hci_conn	*hcon;

	spinlock_t	lock;
	struct sock	*sk;

	struct delayed_work	timeout_work;

	unsigned int    mtu;
	struct kref	ref;
};

#define sco_conn_lock(c)	spin_lock(&c->lock)
#define sco_conn_unlock(c)	spin_unlock(&c->lock)

static void sco_sock_close(struct sock *sk);
static void sco_sock_kill(struct sock *sk);

/* ----- SCO socket info ----- */
/* Reinterpret a struct sock pointer as the SCO private socket structure. */
#define sco_pi(sk) ((struct sco_pinfo *) sk)

/*
 * SCO private socket data; 'bt' is the first member so that sco_pi() can
 * cast a struct sock pointer directly.
 */
struct sco_pinfo {
	struct bt_sock	bt;
	bdaddr_t	src;
	bdaddr_t	dst;
	__u32		flags;
	__u16		setting;
	struct bt_codec codec;
	struct sco_conn	*conn;
};

/* ---- SCO timers ---- */
#define SCO_CONN_TIMEOUT	(HZ * 40)
#define SCO_DISCONN_TIMEOUT	(HZ * 2)

/*
 * kref release callback for struct sco_conn.
 *
 * Clears the back pointers held by the attached socket and hci_conn, drops
 * the hci_conn, disables the timeout work and finally frees the object.
 */
static void sco_conn_free(struct kref *ref)
{
	struct sco_conn *conn = container_of(ref, struct sco_conn, ref);

	BT_DBG("conn %p", conn);

	if (conn->sk)
		sco_pi(conn->sk)->conn = NULL;

	if (conn->hcon) {
		conn->hcon->sco_data = NULL;
		hci_conn_drop(conn->hcon);
	}

	/* Ensure no more work items will run since hci_conn has been dropped */
	disable_delayed_work_sync(&conn->timeout_work);

	kfree(conn);
}

/* Drop one reference on conn; a NULL conn is silently ignored. */
static void sco_conn_put(struct sco_conn *conn)
{
	if (!conn)
		return;

	BT_DBG("conn %p refcnt %d", conn, kref_read(&conn->ref));

	kref_put(&conn->ref, sco_conn_free);
}

/*
 * Take an additional reference on conn and return it.
 * conn is dereferenced unconditionally, so it must not be NULL.
 */
static struct sco_conn *sco_conn_hold(struct sco_conn *conn)
{
	BT_DBG("conn %p refcnt %u", conn, kref_read(&conn->ref));

	kref_get(&conn->ref);
	return conn;
}

/*
 * Take a reference on conn only if its refcount has not already hit zero.
 * Returns conn on success, NULL if conn is NULL or is already being released.
 */
static struct sco_conn *sco_conn_hold_unless_zero(struct sco_conn *conn)
{
	if (!conn)
		return NULL;

	BT_DBG("conn %p refcnt %u", conn, kref_read(&conn->ref));

	if (!kref_get_unless_zero(&conn->ref))
		return NULL;

	return conn;
}

/*
 * Return conn->sk with an extra socket reference, or NULL when conn is NULL
 * or bt_sock_linked() does not find conn->sk on sco_sk_list.
 * The callers in this file invoke it with the conn lock held.
 */
static struct sock *sco_sock_hold(struct sco_conn *conn)
{
	if (!conn || !bt_sock_linked(&sco_sk_list, conn->sk))
		return NULL;

	sock_hold(conn->sk);

	return conn->sk;
}

/*
 * Delayed-work handler for conn->timeout_work.
 *
 * Reports ETIMEDOUT on the socket attached to the connection. Nothing is
 * done if the connection is already being released, has no hci_conn, or has
 * no linked socket.
 */
static void sco_sock_timeout(struct work_struct *work)
{
	struct sco_conn *conn = container_of(work, struct sco_conn,
					     timeout_work.work);
	struct sock *sk;

	conn = sco_conn_hold_unless_zero(conn);
	if (!conn)
		return;

	sco_conn_lock(conn);
	if (!conn->hcon) {
		sco_conn_unlock(conn);
		sco_conn_put(conn);
		return;
	}
	sk = sco_sock_hold(conn);
	sco_conn_unlock(conn);
	/* The socket reference (if any) is enough from here on. */
	sco_conn_put(conn);

	if (!sk)
		return;

	BT_DBG("sock %p state %d", sk, sk->sk_state);

	lock_sock(sk);
	sk->sk_err = ETIMEDOUT;
	sk->sk_state_change(sk);
	release_sock(sk);
	sock_put(sk);
}

/*
 * (Re)arm the connection timeout of sk: any pending timeout work is
 * cancelled and a new one is scheduled after 'timeout'.
 * No-op when the socket has no connection attached.
 */
static void sco_sock_set_timer(struct sock *sk, long timeout)
{
	if (!sco_pi(sk)->conn)
		return;

	BT_DBG("sock %p state %d timeout %ld", sk, sk->sk_state, timeout);
	cancel_delayed_work(&sco_pi(sk)->conn->timeout_work);
	schedule_delayed_work(&sco_pi(sk)->conn->timeout_work, timeout);
}

/* Cancel the pending connection timeout of sk, if it has a connection. */
static void sco_sock_clear_timer(struct sock *sk)
{
	if (!sco_pi(sk)->conn)
		return;

	BT_DBG("sock %p state %d", sk, sk->sk_state);
	cancel_delayed_work(&sco_pi(sk)->conn->timeout_work);
}

/* ---- SCO connections ---- */
/*
 * Get the sco_conn attached to hcon, creating one if necessary.
 *
 * If hcon->sco_data refers to a connection that is still alive, a new
 * reference to it is returned (and its hcon pointer is restored if it was
 * NULL). Otherwise a fresh object is allocated with GFP_KERNEL, initialised
 * with one reference and stored in hcon->sco_data.
 *
 * Returns the connection, or NULL if the allocation fails.
 */
static struct sco_conn *sco_conn_add(struct hci_conn *hcon)
{
	struct sco_conn *conn = hcon->sco_data;

	conn = sco_conn_hold_unless_zero(conn);
	if (conn) {
		if (!conn->hcon) {
			sco_conn_lock(conn);
			conn->hcon = hcon;
			sco_conn_unlock(conn);
		}
		return conn;
	}

	conn = kzalloc(sizeof(struct sco_conn), GFP_KERNEL);
	if (!conn)
		return NULL;

	kref_init(&conn->ref);
	spin_lock_init(&conn->lock);
	INIT_DELAYED_WORK(&conn->timeout_work, sco_sock_timeout);

	hcon->sco_data = conn;
	conn->hcon = hcon;
	conn->mtu = hcon->mtu;

	/* Fall back to an MTU of 60 when the hci_conn reports none. */
	if (hcon->mtu > 0)
		conn->mtu = hcon->mtu;
	else
		conn->mtu = 60;

	BT_DBG("hcon %p conn %p", hcon, conn);

	return conn;
}

/* Delete channel.
 * Must be called on the locked socket.
 *
 * Detaches sk from its connection (dropping the reference the socket held
 * through sco_pi(sk)->conn), then marks the socket BT_CLOSED with sk_err set
 * to err, notifies state-change waiters and sets SOCK_ZAPPED.
 */
static void sco_chan_del(struct sock *sk, int err)
{
	struct sco_conn *conn;

	conn = sco_pi(sk)->conn;
	sco_pi(sk)->conn = NULL;

	BT_DBG("sk %p, conn %p, err %d", sk, conn, err);

	if (conn) {
		sco_conn_lock(conn);
		conn->sk = NULL;
		sco_conn_unlock(conn);
		sco_conn_put(conn);
	}

	sk->sk_state = BT_CLOSED;
	sk->sk_err   = err;
	sk->sk_state_change(sk);

	sock_set_flag(sk, SOCK_ZAPPED);
}

/*
 * Tear down the SCO connection attached to hcon, reporting err to the
 * socket that owns it (if any).
 */
static void sco_conn_del(struct hci_conn *hcon, int err)
{
	struct sco_conn *conn = hcon->sco_data;
	struct sock *sk;

	conn = sco_conn_hold_unless_zero(conn);
	if (!conn)
		return;

	BT_DBG("hcon %p conn %p, err %d", hcon, conn, err);

	sco_conn_lock(conn);
	sk = sco_sock_hold(conn);
	sco_conn_unlock(conn);
	/* Balances the sco_conn_hold_unless_zero() above. */
	sco_conn_put(conn);

	if (!sk) {
		/*
		 * NOTE(review): this second put has no matching hold in this
		 * function; presumably it releases the reference that would
		 * otherwise be dropped via sco_chan_del() - confirm.
		 */
		sco_conn_put(conn);
		return;
	}

	/* Kill socket */
	lock_sock(sk);
	sco_sock_clear_timer(sk);
	sco_chan_del(sk, err);
	release_sock(sk);
	sock_put(sk);
}

/*
 * Link sk and conn to each other and, when a parent (listening) socket is
 * given, enqueue sk on the parent's accept queue.
 * The callers in this file hold the conn lock.
 */
static void __sco_chan_add(struct sco_conn *conn, struct sock *sk,
			   struct sock *parent)
{
	BT_DBG("conn %p", conn);

	sco_pi(sk)->conn = conn;
	conn->sk = sk;

	if (parent)
		bt_accept_enqueue(parent, sk, true);
}

/*
 * Locked wrapper around __sco_chan_add().
 * Returns 0 on success or -EBUSY if conn already has a socket attached.
 */
static int sco_chan_add(struct sco_conn *conn, struct sock *sk,
			struct sock *parent)
{
	int err = 0;

	sco_conn_lock(conn);
	if (conn->sk)
		err = -EBUSY;
	else
		__sco_chan_add(conn, sk, parent);

	sco_conn_unlock(conn);
	return err;
}

/*
 * Start an outgoing (e)SCO connection from sco_pi(sk)->src to
 * sco_pi(sk)->dst and attach sk to it.
 *
 * Returns 0 on success or a negative errno: -EHOSTUNREACH when no route
 * (hci_dev) is found, -EOPNOTSUPP when transparent air mode is requested on
 * a device lacking the capability, the error from hci_connect_sco(),
 * -ENOMEM when the sco_conn cannot be allocated, or the error from
 * sco_chan_add().
 */
static int sco_connect(struct sock *sk)
{
	struct sco_conn *conn;
	struct hci_conn *hcon;
	struct hci_dev  *hdev;
	int err, type;

	BT_DBG("%pMR -> %pMR", &sco_pi(sk)->src, &sco_pi(sk)->dst);

	hdev = hci_get_route(&sco_pi(sk)->dst, &sco_pi(sk)->src, BDADDR_BREDR);
	if (!hdev)
		return -EHOSTUNREACH;

	hci_dev_lock(hdev);

	if (lmp_esco_capable(hdev) && !disable_esco)
		type = ESCO_LINK;
	else
		type = SCO_LINK;

	switch (sco_pi(sk)->setting & SCO_AIRMODE_MASK) {
	case SCO_AIRMODE_TRANSP:
		if (!lmp_transp_capable(hdev) || !lmp_esco_capable(hdev)) {
			err = -EOPNOTSUPP;
			goto unlock;
		}
		break;
	}

	hcon = hci_connect_sco(hdev, type, &sco_pi(sk)->dst,
			       sco_pi(sk)->setting, &sco_pi(sk)->codec,
			       READ_ONCE(sk->sk_sndtimeo));
	if (IS_ERR(hcon)) {
		err = PTR_ERR(hcon);
		goto unlock;
	}

	conn = sco_conn_add(hcon);
	if (!conn) {
		hci_conn_drop(hcon);
		err = -ENOMEM;
		goto unlock;
	}

	lock_sock(sk);

	err = sco_chan_add(conn, sk, NULL);
	if (err) {
		release_sock(sk);
		goto unlock;
	}

	/* Update source addr of the socket */
	bacpy(&sco_pi(sk)->src, &hcon->src);

	if (hcon->state == BT_CONNECTED) {
		sco_sock_clear_timer(sk);
		sk->sk_state = BT_CONNECTED;
	} else {
		sk->sk_state = BT_CONNECT;
		sco_sock_set_timer(sk, READ_ONCE(sk->sk_sndtimeo));
	}

	release_sock(sk);

unlock:
	hci_dev_unlock(hdev);
	hci_dev_put(hdev);
	return err;
}

/*
 * Hand skb to the HCI layer for transmission on sk's connection.
 *
 * Returns the frame length on success or -EINVAL when the frame is larger
 * than the connection MTU (in which case skb is left untouched for the
 * caller). sco_pi(sk)->conn is dereferenced unconditionally.
 */
static int sco_send_frame(struct sock *sk, struct sk_buff *skb,
			  const struct sockcm_cookie *sockc)
{
	struct sco_conn *conn = sco_pi(sk)->conn;
	int len = skb->len;

	/* Check outgoing MTU */
	if (len > conn->mtu)
		return -EINVAL;

	BT_DBG("sk %p len %d", sk, len);

	hci_setup_tx_timestamp(skb, 1, sockc);
	hci_send_sco(conn->hcon, skb);

	return len;
}

/*
 * Deliver an incoming frame to the socket attached to conn.
 * The skb is freed here if there is no socket, the socket is not in
 * BT_CONNECTED state, or queueing onto the receive queue fails.
 */
static void sco_recv_frame(struct sco_conn *conn, struct sk_buff *skb)
{
	struct sock *sk;

	sco_conn_lock(conn);
	sk = conn->sk;
	sco_conn_unlock(conn);

	if (!sk)
		goto drop;

	BT_DBG("sk %p len %u", sk, skb->len);

	if (sk->sk_state != BT_CONNECTED)
		goto drop;

	if (!sock_queue_rcv_skb(sk, skb))
		return;

drop:
	kfree_skb(skb);
}

/* -------- Socket interface ---------- */
/*
 * Return the socket in BT_LISTEN state whose source address equals *ba, or
 * NULL if there is none. Takes no lock itself; sco_sock_listen() calls it
 * with sco_sk_list.lock write-held.
 */
static struct sock *__sco_get_sock_listen_by_addr(bdaddr_t *ba)
{
	struct sock *sk;

	sk_for_each(sk, &sco_sk_list.head) {
		if (sk->sk_state != BT_LISTEN)
			continue;

		if (!bacmp(&sco_pi(sk)->src, ba))
			return sk;
	}

	return NULL;
}

/* Find socket listening on source bdaddr.
 * Returns closest match.
*/
/*
 * A socket bound exactly to *src wins; otherwise the last listening socket
 * bound to BDADDR_ANY seen during the walk is returned, or NULL if there is
 * neither. No reference is taken on the returned socket.
 */
static struct sock *sco_get_sock_listen(bdaddr_t *src)
{
	struct sock *sk = NULL, *sk1 = NULL;

	read_lock(&sco_sk_list.lock);

	sk_for_each(sk, &sco_sk_list.head) {
		if (sk->sk_state != BT_LISTEN)
			continue;

		/* Exact match. */
		if (!bacmp(&sco_pi(sk)->src, src))
			break;

		/* Closest match */
		if (!bacmp(&sco_pi(sk)->src, BDADDR_ANY))
			sk1 = sk;
	}

	read_unlock(&sco_sk_list.lock);

	/*
	 * sk is non-NULL only when the loop ended through the exact-match
	 * break; this relies on sk_for_each() leaving sk NULL once the list
	 * is exhausted (presumably guaranteed by the iterator - confirm).
	 */
	return sk ? sk : sk1;
}

/*
 * sk_destruct callback (installed by sco_sock_alloc()): drops the socket's
 * connection reference, if any, and purges the receive and write queues.
 */
static void sco_sock_destruct(struct sock *sk)
{
	BT_DBG("sk %p", sk);

	sco_conn_put(sco_pi(sk)->conn);

	skb_queue_purge(&sk->sk_receive_queue);
	skb_queue_purge(&sk->sk_write_queue);
}

/*
 * Close and kill every child still waiting on parent's accept queue, then
 * mark the listening socket itself BT_CLOSED and SOCK_ZAPPED.
 */
static void sco_sock_cleanup_listen(struct sock *parent)
{
	struct sock *sk;

	BT_DBG("parent %p", parent);

	/* Close not yet accepted channels */
	while ((sk = bt_accept_dequeue(parent, NULL))) {
		sco_sock_close(sk);
		sco_sock_kill(sk);
	}

	parent->sk_state  = BT_CLOSED;
	sock_set_flag(parent, SOCK_ZAPPED);
}

/* Kill socket (only if zapped and orphan)
 * Must be called on unlocked socket.
 *
 * Returns without doing anything unless SOCK_ZAPPED is set and sk_socket is
 * NULL. Otherwise the socket is unlinked from sco_sk_list, flagged
 * SOCK_DEAD and one socket reference is dropped.
 */
static void sco_sock_kill(struct sock *sk)
{
	if (!sock_flag(sk, SOCK_ZAPPED) || sk->sk_socket)
		return;

	BT_DBG("sk %p state %d", sk, sk->sk_state);

	/* Sock is dead, so set conn->sk to NULL to avoid possible UAF */
	if (sco_pi(sk)->conn) {
		sco_conn_lock(sco_pi(sk)->conn);
		sco_pi(sk)->conn->sk = NULL;
		sco_conn_unlock(sco_pi(sk)->conn);
	}

	/* Kill poor orphan */
	bt_sock_unlink(&sco_sk_list, sk);
	sock_set_flag(sk, SOCK_DEAD);
	sock_put(sk);
}

/*
 * State-dependent close; sco_sock_close() and sco_sock_shutdown() call it
 * with the socket locked.
 *
 *  - BT_LISTEN: clean up the accept queue via sco_sock_cleanup_listen().
 *  - BT_CONNECTED / BT_CONFIG / BT_CONNECT2 / BT_CONNECT / BT_DISCONN:
 *    delete the channel with ECONNRESET.
 *  - anything else: only flag the socket SOCK_ZAPPED.
 */
static void __sco_sock_close(struct sock *sk)
{
	BT_DBG("sk %p state %d socket %p", sk, sk->sk_state, sk->sk_socket);

	switch (sk->sk_state) {
	case BT_LISTEN:
		sco_sock_cleanup_listen(sk);
		break;

	case BT_CONNECTED:
	case BT_CONFIG:
	case BT_CONNECT2:
	case BT_CONNECT:
	case BT_DISCONN:
		sco_chan_del(sk, ECONNRESET);
		break;

	default:
		sock_set_flag(sk, SOCK_ZAPPED);
		break;
	}

}

/* Must be called on unlocked socket.
*/ static void sco_sock_close(struct sock *sk) { lock_sock(sk); sco_sock_clear_timer(sk); __sco_sock_close(sk); release_sock(sk); } static void sco_sock_init(struct sock *sk, struct sock *parent) { BT_DBG("sk %p", sk); if (parent) { sk->sk_type = parent->sk_type; bt_sk(sk)->flags = bt_sk(parent)->flags; security_sk_clone(parent, sk); } } static struct proto sco_proto = { .name = "SCO", .owner = THIS_MODULE, .obj_size = sizeof(struct sco_pinfo) }; static struct sock *sco_sock_alloc(struct net *net, struct socket *sock, int proto, gfp_t prio, int kern) { struct sock *sk; sk = bt_sock_alloc(net, sock, &sco_proto, proto, prio, kern); if (!sk) return NULL; sk->sk_destruct = sco_sock_destruct; sk->sk_sndtimeo = SCO_CONN_TIMEOUT; sco_pi(sk)->setting = BT_VOICE_CVSD_16BIT; sco_pi(sk)->codec.id = BT_CODEC_CVSD; sco_pi(sk)->codec.cid = 0xffff; sco_pi(sk)->codec.vid = 0xffff; sco_pi(sk)->codec.data_path = 0x00; bt_sock_link(&sco_sk_list, sk); return sk; } static int sco_sock_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; BT_DBG("sock %p", sock); sock->state = SS_UNCONNECTED; if (sock->type != SOCK_SEQPACKET) return -ESOCKTNOSUPPORT; sock->ops = &sco_sock_ops; sk = sco_sock_alloc(net, sock, protocol, GFP_ATOMIC, kern); if (!sk) return -ENOMEM; sco_sock_init(sk, NULL); return 0; } static int sco_sock_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr_len) { struct sockaddr_sco *sa = (struct sockaddr_sco *) addr; struct sock *sk = sock->sk; int err = 0; if (!addr || addr_len < sizeof(struct sockaddr_sco) || addr->sa_family != AF_BLUETOOTH) return -EINVAL; BT_DBG("sk %p %pMR", sk, &sa->sco_bdaddr); lock_sock(sk); if (sk->sk_state != BT_OPEN) { err = -EBADFD; goto done; } if (sk->sk_type != SOCK_SEQPACKET) { err = -EINVAL; goto done; } bacpy(&sco_pi(sk)->src, &sa->sco_bdaddr); sk->sk_state = BT_BOUND; done: release_sock(sk); return err; } static int sco_sock_connect(struct socket *sock, struct sockaddr_unsized *addr, int 
alen, int flags) { struct sockaddr_sco *sa = (struct sockaddr_sco *) addr; struct sock *sk = sock->sk; int err; BT_DBG("sk %p", sk); if (alen < sizeof(struct sockaddr_sco) || addr->sa_family != AF_BLUETOOTH) return -EINVAL; if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND) return -EBADFD; if (sk->sk_type != SOCK_SEQPACKET) err = -EINVAL; lock_sock(sk); /* Set destination address and psm */ bacpy(&sco_pi(sk)->dst, &sa->sco_bdaddr); release_sock(sk); err = sco_connect(sk); if (err) return err; lock_sock(sk); err = bt_sock_wait_state(sk, BT_CONNECTED, sock_sndtimeo(sk, flags & O_NONBLOCK)); release_sock(sk); return err; } static int sco_sock_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; bdaddr_t *src = &sco_pi(sk)->src; int err = 0; BT_DBG("sk %p backlog %d", sk, backlog); lock_sock(sk); if (sk->sk_state != BT_BOUND) { err = -EBADFD; goto done; } if (sk->sk_type != SOCK_SEQPACKET) { err = -EINVAL; goto done; } write_lock(&sco_sk_list.lock); if (__sco_get_sock_listen_by_addr(src)) { err = -EADDRINUSE; goto unlock; } sk->sk_max_ack_backlog = backlog; sk->sk_ack_backlog = 0; sk->sk_state = BT_LISTEN; unlock: write_unlock(&sco_sk_list.lock); done: release_sock(sk); return err; } static int sco_sock_accept(struct socket *sock, struct socket *newsock, struct proto_accept_arg *arg) { DEFINE_WAIT_FUNC(wait, woken_wake_function); struct sock *sk = sock->sk, *ch; long timeo; int err = 0; lock_sock(sk); timeo = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK); BT_DBG("sk %p timeo %ld", sk, timeo); /* Wait for an incoming connection. (wake-one). 
*/ add_wait_queue_exclusive(sk_sleep(sk), &wait); while (1) { if (sk->sk_state != BT_LISTEN) { err = -EBADFD; break; } ch = bt_accept_dequeue(sk, newsock); if (ch) break; if (!timeo) { err = -EAGAIN; break; } if (signal_pending(current)) { err = sock_intr_errno(timeo); break; } release_sock(sk); timeo = wait_woken(&wait, TASK_INTERRUPTIBLE, timeo); lock_sock(sk); } remove_wait_queue(sk_sleep(sk), &wait); if (err) goto done; newsock->state = SS_CONNECTED; BT_DBG("new socket %p", ch); done: release_sock(sk); return err; } static int sco_sock_getname(struct socket *sock, struct sockaddr *addr, int peer) { struct sockaddr_sco *sa = (struct sockaddr_sco *) addr; struct sock *sk = sock->sk; BT_DBG("sock %p, sk %p", sock, sk); addr->sa_family = AF_BLUETOOTH; if (peer) bacpy(&sa->sco_bdaddr, &sco_pi(sk)->dst); else bacpy(&sa->sco_bdaddr, &sco_pi(sk)->src); return sizeof(struct sockaddr_sco); } static int sco_sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct sk_buff *skb; struct sockcm_cookie sockc; int err; BT_DBG("sock %p, sk %p", sock, sk); err = sock_error(sk); if (err) return err; if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; hci_sockcm_init(&sockc, sk); if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); if (err) return err; } skb = bt_skb_sendmsg(sk, msg, len, len, 0, 0); if (IS_ERR(skb)) return PTR_ERR(skb); lock_sock(sk); if (sk->sk_state == BT_CONNECTED) err = sco_send_frame(sk, skb, &sockc); else err = -ENOTCONN; release_sock(sk); if (err < 0) kfree_skb(skb); return err; } static void sco_conn_defer_accept(struct hci_conn *conn, u16 setting) { struct hci_dev *hdev = conn->hdev; BT_DBG("conn %p", conn); conn->state = BT_CONFIG; if (!lmp_esco_capable(hdev)) { struct hci_cp_accept_conn_req cp; bacpy(&cp.bdaddr, &conn->dst); cp.role = 0x00; /* Ignored */ hci_send_cmd(hdev, HCI_OP_ACCEPT_CONN_REQ, sizeof(cp), &cp); } else { struct hci_cp_accept_sync_conn_req cp; bacpy(&cp.bdaddr, &conn->dst); 
cp.pkt_type = cpu_to_le16(conn->pkt_type); cp.tx_bandwidth = cpu_to_le32(0x00001f40); cp.rx_bandwidth = cpu_to_le32(0x00001f40); cp.content_format = cpu_to_le16(setting); switch (setting & SCO_AIRMODE_MASK) { case SCO_AIRMODE_TRANSP: if (conn->pkt_type & ESCO_2EV3) cp.max_latency = cpu_to_le16(0x0008); else cp.max_latency = cpu_to_le16(0x000D); cp.retrans_effort = 0x02; break; case SCO_AIRMODE_CVSD: cp.max_latency = cpu_to_le16(0xffff); cp.retrans_effort = 0xff; break; default: /* use CVSD settings as fallback */ cp.max_latency = cpu_to_le16(0xffff); cp.retrans_effort = 0xff; break; } hci_send_cmd(hdev, HCI_OP_ACCEPT_SYNC_CONN_REQ, sizeof(cp), &cp); } } static int sco_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct sock *sk = sock->sk; struct sco_pinfo *pi = sco_pi(sk); if (unlikely(flags & MSG_ERRQUEUE)) return sock_recv_errqueue(sk, msg, len, SOL_BLUETOOTH, BT_SCM_ERROR); lock_sock(sk); if (sk->sk_state == BT_CONNECT2 && test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) { sco_conn_defer_accept(pi->conn->hcon, pi->setting); sk->sk_state = BT_CONFIG; release_sock(sk); return 0; } release_sock(sk); return bt_sock_recvmsg(sock, msg, len, flags); } static int sco_sock_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; int err = 0; struct bt_voice voice; u32 opt; struct bt_codecs *codecs; struct hci_dev *hdev; __u8 buffer[255]; BT_DBG("sk %p", sk); lock_sock(sk); switch (optname) { case BT_DEFER_SETUP: if (sk->sk_state != BT_BOUND && sk->sk_state != BT_LISTEN) { err = -EINVAL; break; } err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen); if (err) break; if (opt) set_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags); else clear_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags); break; case BT_VOICE: if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND && sk->sk_state != BT_CONNECT2) { err = -EINVAL; break; } voice.setting = sco_pi(sk)->setting; err = 
copy_safe_from_sockptr(&voice, sizeof(voice), optval, optlen); if (err) break; sco_pi(sk)->setting = voice.setting; hdev = hci_get_route(&sco_pi(sk)->dst, &sco_pi(sk)->src, BDADDR_BREDR); if (!hdev) { err = -EBADFD; break; } switch (sco_pi(sk)->setting & SCO_AIRMODE_MASK) { case SCO_AIRMODE_TRANSP: if (enhanced_sync_conn_capable(hdev)) sco_pi(sk)->codec.id = BT_CODEC_TRANSPARENT; break; } hci_dev_put(hdev); break; case BT_PKT_STATUS: err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen); if (err) break; if (opt) set_bit(BT_SK_PKT_STATUS, &bt_sk(sk)->flags); else clear_bit(BT_SK_PKT_STATUS, &bt_sk(sk)->flags); break; case BT_CODEC: if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND && sk->sk_state != BT_CONNECT2) { err = -EINVAL; break; } hdev = hci_get_route(&sco_pi(sk)->dst, &sco_pi(sk)->src, BDADDR_BREDR); if (!hdev) { err = -EBADFD; break; } if (!hci_dev_test_flag(hdev, HCI_OFFLOAD_CODECS_ENABLED)) { hci_dev_put(hdev); err = -EOPNOTSUPP; break; } if (!hdev->get_data_path_id) { hci_dev_put(hdev); err = -EOPNOTSUPP; break; } if (optlen < sizeof(struct bt_codecs) || optlen > sizeof(buffer)) { hci_dev_put(hdev); err = -EINVAL; break; } err = copy_struct_from_sockptr(buffer, sizeof(buffer), optval, optlen); if (err) { hci_dev_put(hdev); break; } codecs = (void *)buffer; if (codecs->num_codecs > 1) { hci_dev_put(hdev); err = -EINVAL; break; } sco_pi(sk)->codec = codecs->codecs[0]; hci_dev_put(hdev); break; default: err = -ENOPROTOOPT; break; } release_sock(sk); return err; } static int sco_sock_getsockopt_old(struct socket *sock, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; struct sco_options opts; struct sco_conninfo cinfo; int err = 0; size_t len; BT_DBG("sk %p", sk); if (get_user(len, optlen)) return -EFAULT; lock_sock(sk); switch (optname) { case SCO_OPTIONS: if (sk->sk_state != BT_CONNECTED && !(sk->sk_state == BT_CONNECT2 && test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags))) { err = -ENOTCONN; break; } opts.mtu 
= sco_pi(sk)->conn->mtu; BT_DBG("mtu %u", opts.mtu); len = min(len, sizeof(opts)); if (copy_to_user(optval, (char *)&opts, len)) err = -EFAULT; break; case SCO_CONNINFO: if (sk->sk_state != BT_CONNECTED && !(sk->sk_state == BT_CONNECT2 && test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags))) { err = -ENOTCONN; break; } memset(&cinfo, 0, sizeof(cinfo)); cinfo.hci_handle = sco_pi(sk)->conn->hcon->handle; memcpy(cinfo.dev_class, sco_pi(sk)->conn->hcon->dev_class, 3); len = min(len, sizeof(cinfo)); if (copy_to_user(optval, (char *)&cinfo, len)) err = -EFAULT; break; default: err = -ENOPROTOOPT; break; } release_sock(sk); return err; } static int sco_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; int len, err = 0; struct bt_voice voice; u32 phys; int buf_len; struct codec_list *c; u8 num_codecs, i, __user *ptr; struct hci_dev *hdev; struct hci_codec_caps *caps; struct bt_codec codec; BT_DBG("sk %p", sk); if (level == SOL_SCO) return sco_sock_getsockopt_old(sock, optname, optval, optlen); if (get_user(len, optlen)) return -EFAULT; lock_sock(sk); switch (optname) { case BT_DEFER_SETUP: if (sk->sk_state != BT_BOUND && sk->sk_state != BT_LISTEN) { err = -EINVAL; break; } if (put_user(test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags), (u32 __user *)optval)) err = -EFAULT; break; case BT_VOICE: voice.setting = sco_pi(sk)->setting; len = min_t(unsigned int, len, sizeof(voice)); if (copy_to_user(optval, (char *)&voice, len)) err = -EFAULT; break; case BT_PHY: if (sk->sk_state != BT_CONNECTED) { err = -ENOTCONN; break; } phys = hci_conn_get_phy(sco_pi(sk)->conn->hcon); if (put_user(phys, (u32 __user *) optval)) err = -EFAULT; break; case BT_PKT_STATUS: if (put_user(test_bit(BT_SK_PKT_STATUS, &bt_sk(sk)->flags), (int __user *)optval)) err = -EFAULT; break; case BT_SNDMTU: case BT_RCVMTU: if (sk->sk_state != BT_CONNECTED) { err = -ENOTCONN; break; } if (put_user(sco_pi(sk)->conn->mtu, (u32 __user 
*)optval)) err = -EFAULT; break; case BT_CODEC: num_codecs = 0; buf_len = 0; hdev = hci_get_route(&sco_pi(sk)->dst, &sco_pi(sk)->src, BDADDR_BREDR); if (!hdev) { err = -EBADFD; break; } if (!hci_dev_test_flag(hdev, HCI_OFFLOAD_CODECS_ENABLED)) { hci_dev_put(hdev); err = -EOPNOTSUPP; break; } if (!hdev->get_data_path_id) { hci_dev_put(hdev); err = -EOPNOTSUPP; break; } release_sock(sk); /* find total buffer size required to copy codec + caps */ hci_dev_lock(hdev); list_for_each_entry(c, &hdev->local_codecs, list) { if (c->transport != HCI_TRANSPORT_SCO_ESCO) continue; num_codecs++; for (i = 0, caps = c->caps; i < c->num_caps; i++) { buf_len += 1 + caps->len; caps = (void *)&caps->data[caps->len]; } buf_len += sizeof(struct bt_codec); } hci_dev_unlock(hdev); buf_len += sizeof(struct bt_codecs); if (buf_len > len) { hci_dev_put(hdev); return -ENOBUFS; } ptr = optval; if (put_user(num_codecs, ptr)) { hci_dev_put(hdev); return -EFAULT; } ptr += sizeof(num_codecs); /* Iterate all the codecs supported over SCO and populate * codec data */ hci_dev_lock(hdev); list_for_each_entry(c, &hdev->local_codecs, list) { if (c->transport != HCI_TRANSPORT_SCO_ESCO) continue; codec.id = c->id; codec.cid = c->cid; codec.vid = c->vid; err = hdev->get_data_path_id(hdev, &codec.data_path); if (err < 0) break; codec.num_caps = c->num_caps; if (copy_to_user(ptr, &codec, sizeof(codec))) { err = -EFAULT; break; } ptr += sizeof(codec); /* find codec capabilities data length */ len = 0; for (i = 0, caps = c->caps; i < c->num_caps; i++) { len += 1 + caps->len; caps = (void *)&caps->data[caps->len]; } /* copy codec capabilities data */ if (len && copy_to_user(ptr, c->caps, len)) { err = -EFAULT; break; } ptr += len; } hci_dev_unlock(hdev); hci_dev_put(hdev); lock_sock(sk); if (!err && put_user(buf_len, optlen)) err = -EFAULT; break; default: err = -ENOPROTOOPT; break; } release_sock(sk); return err; } static int sco_sock_shutdown(struct socket *sock, int how) { struct sock *sk = sock->sk; int err 
= 0; BT_DBG("sock %p, sk %p", sock, sk); if (!sk) return 0; sock_hold(sk); lock_sock(sk); if (!sk->sk_shutdown) { sk->sk_shutdown = SHUTDOWN_MASK; sco_sock_clear_timer(sk); __sco_sock_close(sk); if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime && !(current->flags & PF_EXITING)) err = bt_sock_wait_state(sk, BT_CLOSED, sk->sk_lingertime); } release_sock(sk); sock_put(sk); return err; } static int sco_sock_release(struct socket *sock) { struct sock *sk = sock->sk; int err = 0; BT_DBG("sock %p, sk %p", sock, sk); if (!sk) return 0; sco_sock_close(sk); if (sock_flag(sk, SOCK_LINGER) && READ_ONCE(sk->sk_lingertime) && !(current->flags & PF_EXITING)) { lock_sock(sk); err = bt_sock_wait_state(sk, BT_CLOSED, sk->sk_lingertime); release_sock(sk); } sock_orphan(sk); sco_sock_kill(sk); return err; } static void sco_conn_ready(struct sco_conn *conn) { struct sock *parent; struct sock *sk = conn->sk; BT_DBG("conn %p", conn); if (sk) { lock_sock(sk); sco_sock_clear_timer(sk); sk->sk_state = BT_CONNECTED; sk->sk_state_change(sk); release_sock(sk); } else { sco_conn_lock(conn); if (!conn->hcon) { sco_conn_unlock(conn); return; } parent = sco_get_sock_listen(&conn->hcon->src); if (!parent) { sco_conn_unlock(conn); return; } lock_sock(parent); sk = sco_sock_alloc(sock_net(parent), NULL, BTPROTO_SCO, GFP_ATOMIC, 0); if (!sk) { release_sock(parent); sco_conn_unlock(conn); return; } sco_sock_init(sk, parent); bacpy(&sco_pi(sk)->src, &conn->hcon->src); bacpy(&sco_pi(sk)->dst, &conn->hcon->dst); sco_conn_hold(conn); hci_conn_hold(conn->hcon); __sco_chan_add(conn, sk, parent); if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(parent)->flags)) sk->sk_state = BT_CONNECT2; else sk->sk_state = BT_CONNECTED; /* Wake up parent */ parent->sk_data_ready(parent); release_sock(parent); sco_conn_unlock(conn); } } /* ----- SCO interface with lower layer (HCI) ----- */ int sco_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) { struct sock *sk; int lm = 0; BT_DBG("hdev %s, bdaddr %pMR", 
hdev->name, bdaddr); /* Find listening sockets */ read_lock(&sco_sk_list.lock); sk_for_each(sk, &sco_sk_list.head) { if (sk->sk_state != BT_LISTEN) continue; if (!bacmp(&sco_pi(sk)->src, &hdev->bdaddr) || !bacmp(&sco_pi(sk)->src, BDADDR_ANY)) { lm |= HCI_LM_ACCEPT; if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) *flags |= HCI_PROTO_DEFER; break; } } read_unlock(&sco_sk_list.lock); return lm; } static void sco_connect_cfm(struct hci_conn *hcon, __u8 status) { if (hcon->type != SCO_LINK && hcon->type != ESCO_LINK) return; BT_DBG("hcon %p bdaddr %pMR status %u", hcon, &hcon->dst, status); if (!status) { struct sco_conn *conn; conn = sco_conn_add(hcon); if (conn) { sco_conn_ready(conn); sco_conn_put(conn); } } else sco_conn_del(hcon, bt_to_errno(status)); } static void sco_disconn_cfm(struct hci_conn *hcon, __u8 reason) { if (hcon->type != SCO_LINK && hcon->type != ESCO_LINK) return; BT_DBG("hcon %p reason %d", hcon, reason); sco_conn_del(hcon, bt_to_errno(reason)); } int sco_recv_scodata(struct hci_dev *hdev, u16 handle, struct sk_buff *skb) { struct hci_conn *hcon; struct sco_conn *conn; hci_dev_lock(hdev); hcon = hci_conn_hash_lookup_handle(hdev, handle); if (!hcon) { hci_dev_unlock(hdev); kfree_skb(skb); return -ENOENT; } conn = sco_conn_hold_unless_zero(hcon->sco_data); hcon = NULL; hci_dev_unlock(hdev); if (!conn) { kfree_skb(skb); return -EINVAL; } BT_DBG("conn %p len %u", conn, skb->len); if (skb->len) sco_recv_frame(conn, skb); else kfree_skb(skb); sco_conn_put(conn); return 0; } static struct hci_cb sco_cb = { .name = "SCO", .connect_cfm = sco_connect_cfm, .disconn_cfm = sco_disconn_cfm, }; static int sco_debugfs_show(struct seq_file *f, void *p) { struct sock *sk; read_lock(&sco_sk_list.lock); sk_for_each(sk, &sco_sk_list.head) { seq_printf(f, "%pMR %pMR %d\n", &sco_pi(sk)->src, &sco_pi(sk)->dst, sk->sk_state); } read_unlock(&sco_sk_list.lock); return 0; } DEFINE_SHOW_ATTRIBUTE(sco_debugfs); static struct dentry *sco_debugfs; static const struct 
proto_ops sco_sock_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .release = sco_sock_release, .bind = sco_sock_bind, .connect = sco_sock_connect, .listen = sco_sock_listen, .accept = sco_sock_accept, .getname = sco_sock_getname, .sendmsg = sco_sock_sendmsg, .recvmsg = sco_sock_recvmsg, .poll = bt_sock_poll, .ioctl = bt_sock_ioctl, .gettstamp = sock_gettstamp, .mmap = sock_no_mmap, .socketpair = sock_no_socketpair, .shutdown = sco_sock_shutdown, .setsockopt = sco_sock_setsockopt, .getsockopt = sco_sock_getsockopt }; static const struct net_proto_family sco_sock_family_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .create = sco_sock_create, }; int __init sco_init(void) { int err; BUILD_BUG_ON(sizeof(struct sockaddr_sco) > sizeof(struct sockaddr)); err = proto_register(&sco_proto, 0); if (err < 0) return err; err = bt_sock_register(BTPROTO_SCO, &sco_sock_family_ops); if (err < 0) { BT_ERR("SCO socket registration failed"); goto error; } err = bt_procfs_init(&init_net, "sco", &sco_sk_list, NULL); if (err < 0) { BT_ERR("Failed to create SCO proc file"); bt_sock_unregister(BTPROTO_SCO); goto error; } BT_INFO("SCO socket layer initialized"); hci_register_cb(&sco_cb); if (IS_ERR_OR_NULL(bt_debugfs)) return 0; sco_debugfs = debugfs_create_file("sco", 0444, bt_debugfs, NULL, &sco_debugfs_fops); return 0; error: proto_unregister(&sco_proto); return err; } void sco_exit(void) { bt_procfs_cleanup(&init_net, "sco"); debugfs_remove(sco_debugfs); hci_unregister_cb(&sco_cb); bt_sock_unregister(BTPROTO_SCO); proto_unregister(&sco_proto); } module_param(disable_esco, bool, 0644); MODULE_PARM_DESC(disable_esco, "Disable eSCO connection creation"); |
| 131 133 2 7 1 6 23 2 1 127 118 10 216 119 4 5 1 8 58 55 65 65 10 56 57 2 58 51 5 5 7 8 58 28 7 2 10 10 2 1 1 1 2 1 1 1 47 193 192 192 194 191 3 194 143 53 72 101 45 192 12 193 193 191 1 1 8 3 8 19 19 19 4 3 1 3 4 8 2 1 1 1 3 2 4 1 8 417 400 145 74 65 159 93 135 25 220 154 149 5 4 9 1 13 4 32 195 133 262 182 186 147 43 1 183 173 8 2 16 171 356 338 22 2 25 18 1 15 7 17 321 356 1 134 233 25 113 259 239 137 106 176 44 20 2 192 195 5 119 165 100 175 172 172 168 59 169 151 16 147 148 1 148 1 148 148 25 143 6 11 11 2 2 2 13 14 14 2 14 14 88 167 88 15 71 47 45 51 7 73 25 49 28 128 181 12 189 126 229 12 230 1 230 342 2 1 1 1 10 7 4 2 2 1 3 1 10 10 2 9 86 83 24 4 4 14 82 30 35 84 48 87 54 47 20 41 41 3 19 1 6 1 6 3 9 2 4 2 2 15 6 9 1 8 1 7 1 4 3 1 1 14 14 3 1 2 2 7 3 8 2 6 6 3 7 6 2 2 1 2 1 2 4 1 1 3 7 4 5 5 3 3 2 7 1 1 1 1 40 40 6 1 33 7 25 9 2 6 1 1 1 7 7 5 4 1 2 6 1 7 7 5 1 3 12 13 6 1 3 2 2 10 13 13 1 1 36 44 43 107 1 101 8 108 4 10 7 65 73 72 6 6 63 59 2 7 63 2 28 1 2 9 34 3 29 1 35 2 35 1 39 15 5 5 30 25 24 2 8 47 54 3 44 14 23 1 7 33 1 3 60 66 12 10 113 10 103 3 1 90 6 11 11 7 1 6 453 449 348 145 3 274 99 193 453 70 6 23 60 1 1 8 8 8 132 15 128 10 98 18 3 29 2 9 21 133 133 129 126 8 8 8 127 114 109 7 114 3 474 154 154 498 500 10 502 181 110 60 19 19 8 108 36 181 180 180 180 179 181 180 3 2 7 2 1 4 1 1 1 1 12 12 10 5 3 3 1 1 1 1 3 5 3 2 3 2 25 3 22 28 53 23 48 48 4 1 3 2 1 4 2 28 23 5 2 23 23 4 23 23 4 63 1 1 3 8 50 22 4 24 281 1 2 31 2 1 11 12 9 6 218 3 6 4 4 2 3 1 9 2 2 2 2 4 1 6 36 4 2 2 1 2 1 1 19 1 2 2 2 2 3 2 2 3 5 12 2 4 2 2 5 4 2 3 4 2 1 4 2 1 2 2 2 7 9 2 1 1 5 859 582 280 18 8 51 38 17 5 17 5 16 6 17 3 18 1 18 17 1 18 18 18 18 2 17 5 5 5 5 129 131 1 5 4 2 3 3 1 1 2 1 1 1 1 1 2 7 1 1 5 2 6 1 7 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1 3 1 1 1 1 1 1 2 1 1 1 1 2 1 40 4 2 3 1 1 25 15 2 25 11 27 40 2 1 1 1 44 41 288 156 132 21 21 21 5 21 21 5 2 1 3 1 1 2 189 189 8 172 3 2 5 169 169 169 169 156 13 10 10 2 8 9 1 10 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 
24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 
543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 
1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 
1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 
1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 
2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 
2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 
3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 
3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 
3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 
4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 
4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 
5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 | // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Implementation of the Transmission Control Protocol(TCP). * * Authors: Ross Biro * Fred N. 
van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Mark Evans, <evansmp@uhura.aston.ac.uk> * Corey Minyard <wf-rch!minyard@relay.EU.net> * Florian La Roche, <flla@stud.uni-sb.de> * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> * Linus Torvalds, <torvalds@cs.helsinki.fi> * Alan Cox, <gw4pts@gw4pts.ampr.org> * Matthew Dillon, <dillon@apollo.west.oic.com> * Arnt Gulbrandsen, <agulbra@nvg.unit.no> * Jorge Cwik, <jorge@laser.satlink.net> * * Fixes: * Alan Cox : Numerous verify_area() calls * Alan Cox : Set the ACK bit on a reset * Alan Cox : Stopped it crashing if it closed while * sk->inuse=1 and was trying to connect * (tcp_err()). * Alan Cox : All icmp error handling was broken * pointers passed where wrong and the * socket was looked up backwards. Nobody * tested any icmp error code obviously. * Alan Cox : tcp_err() now handled properly. It * wakes people on errors. poll * behaves and the icmp error race * has gone by moving it into sock.c * Alan Cox : tcp_send_reset() fixed to work for * everything not just packets for * unknown sockets. * Alan Cox : tcp option processing. * Alan Cox : Reset tweaked (still not 100%) [Had * syn rule wrong] * Herp Rosmanith : More reset fixes * Alan Cox : No longer acks invalid rst frames. * Acking any kind of RST is right out. * Alan Cox : Sets an ignore me flag on an rst * receive otherwise odd bits of prattle * escape still * Alan Cox : Fixed another acking RST frame bug. * Should stop LAN workplace lockups. * Alan Cox : Some tidyups using the new skb list * facilities * Alan Cox : sk->keepopen now seems to work * Alan Cox : Pulls options out correctly on accepts * Alan Cox : Fixed assorted sk->rqueue->next errors * Alan Cox : PSH doesn't end a TCP read. Switched a * bit to skb ops. * Alan Cox : Tidied tcp_data to avoid a potential * nasty. * Alan Cox : Added some better commenting, as the * tcp is hard to follow * Alan Cox : Removed incorrect check for 20 * psh * Michael O'Reilly : ack < copied bug fix. 
* Johannes Stille : Misc tcp fixes (not all in yet). * Alan Cox : FIN with no memory -> CRASH * Alan Cox : Added socket option proto entries. * Also added awareness of them to accept. * Alan Cox : Added TCP options (SOL_TCP) * Alan Cox : Switched wakeup calls to callbacks, * so the kernel can layer network * sockets. * Alan Cox : Use ip_tos/ip_ttl settings. * Alan Cox : Handle FIN (more) properly (we hope). * Alan Cox : RST frames sent on unsynchronised * state ack error. * Alan Cox : Put in missing check for SYN bit. * Alan Cox : Added tcp_select_window() aka NET2E * window non shrink trick. * Alan Cox : Added a couple of small NET2E timer * fixes * Charles Hedrick : TCP fixes * Toomas Tamm : TCP window fixes * Alan Cox : Small URG fix to rlogin ^C ack fight * Charles Hedrick : Rewrote most of it to actually work * Linus : Rewrote tcp_read() and URG handling * completely * Gerhard Koerting: Fixed some missing timer handling * Matthew Dillon : Reworked TCP machine states as per RFC * Gerhard Koerting: PC/TCP workarounds * Adam Caldwell : Assorted timer/timing errors * Matthew Dillon : Fixed another RST bug * Alan Cox : Move to kernel side addressing changes. * Alan Cox : Beginning work on TCP fastpathing * (not yet usable) * Arnt Gulbrandsen: Turbocharged tcp_check() routine. * Alan Cox : TCP fast path debugging * Alan Cox : Window clamping * Michael Riepe : Bug in tcp_check() * Matt Dillon : More TCP improvements and RST bug fixes * Matt Dillon : Yet more small nasties remove from the * TCP code (Be very nice to this man if * tcp finally works 100%) 8) * Alan Cox : BSD accept semantics. * Alan Cox : Reset on closedown bug. * Peter De Schrijver : ENOTCONN check missing in tcp_sendto(). * Michael Pall : Handle poll() after URG properly in * all cases. * Michael Pall : Undo the last fix in tcp_read_urg() * (multi URG PUSH broke rlogin). * Michael Pall : Fix the multi URG PUSH problem in * tcp_readable(), poll() after URG * works now. 
* Michael Pall : recv(...,MSG_OOB) never blocks in the * BSD api. * Alan Cox : Changed the semantics of sk->socket to * fix a race and a signal problem with * accept() and async I/O. * Alan Cox : Relaxed the rules on tcp_sendto(). * Yury Shevchuk : Really fixed accept() blocking problem. * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for * clients/servers which listen in on * fixed ports. * Alan Cox : Cleaned the above up and shrank it to * a sensible code size. * Alan Cox : Self connect lockup fix. * Alan Cox : No connect to multicast. * Ross Biro : Close unaccepted children on master * socket close. * Alan Cox : Reset tracing code. * Alan Cox : Spurious resets on shutdown. * Alan Cox : Giant 15 minute/60 second timer error * Alan Cox : Small whoops in polling before an * accept. * Alan Cox : Kept the state trace facility since * it's handy for debugging. * Alan Cox : More reset handler fixes. * Alan Cox : Started rewriting the code based on * the RFC's for other useful protocol * references see: Comer, KA9Q NOS, and * for a reference on the difference * between specifications and how BSD * works see the 4.4lite source. * A.N.Kuznetsov : Don't time wait on completion of tidy * close. * Linus Torvalds : Fin/Shutdown & copied_seq changes. * Linus Torvalds : Fixed BSD port reuse to work first syn * Alan Cox : Reimplemented timers as per the RFC * and using multiple timers for sanity. * Alan Cox : Small bug fixes, and a lot of new * comments. * Alan Cox : Fixed dual reader crash by locking * the buffers (much like datagram.c) * Alan Cox : Fixed stuck sockets in probe. A probe * now gets fed up of retrying without * (even a no space) answer. * Alan Cox : Extracted closing code better * Alan Cox : Fixed the closing state machine to * resemble the RFC. * Alan Cox : More 'per spec' fixes. * Jorge Cwik : Even faster checksumming. * Alan Cox : tcp_data() doesn't ack illegal PSH * only frames. At least one pc tcp stack * generates them. * Alan Cox : Cache last socket. 
* Alan Cox : Per route irtt. * Matt Day : poll()->select() match BSD precisely on error * Alan Cox : New buffers * Marc Tamsky : Various sk->prot->retransmits and * sk->retransmits misupdating fixed. * Fixed tcp_write_timeout: stuck close, * and TCP syn retries gets used now. * Mark Yarvis : In tcp_read_wakeup(), don't send an * ack if state is TCP_CLOSED. * Alan Cox : Look up device on a retransmit - routes may * change. Doesn't yet cope with MSS shrink right * but it's a start! * Marc Tamsky : Closing in closing fixes. * Mike Shaver : RFC1122 verifications. * Alan Cox : rcv_saddr errors. * Alan Cox : Block double connect(). * Alan Cox : Small hooks for enSKIP. * Alexey Kuznetsov: Path MTU discovery. * Alan Cox : Support soft errors. * Alan Cox : Fix MTU discovery pathological case * when the remote claims no mtu! * Marc Tamsky : TCP_CLOSE fix. * Colin (G3TNE) : Send a reset on syn ack replies in * window but wrong (fixes NT lpd problems) * Pedro Roque : Better TCP window handling, delayed ack. * Joerg Reuter : No modification of locked buffers in * tcp_do_retransmit() * Eric Schenk : Changed receiver side silly window * avoidance algorithm to BSD style * algorithm. This doubles throughput * against machines running Solaris, * and seems to result in general * improvement. * Stefan Magdalinski : adjusted tcp_readable() to fix FIONREAD * Willy Konynenberg : Transparent proxying support. * Mike McLagan : Routing by source * Keith Owens : Do proper merging with partial SKB's in * tcp_do_sendmsg to avoid burstiness. * Eric Schenk : Fix fast close down bug with * shutdown() followed by close(). * Andi Kleen : Make poll agree with SIGIO * Salvatore Sanfilippo : Support SO_LINGER with linger == 1 and * lingertime == 0 (RFC 793 ABORT Call) * Hirokazu Takahashi : Use copy_from_user() instead of * csum_and_copy_from_user() if possible. 
* * Description of States: * * TCP_SYN_SENT sent a connection request, waiting for ack * * TCP_SYN_RECV received a connection request, sent ack, * waiting for final ack in three-way handshake. * * TCP_ESTABLISHED connection established * * TCP_FIN_WAIT1 our side has shutdown, waiting to complete * transmission of remaining buffered data * * TCP_FIN_WAIT2 all buffered data sent, waiting for remote * to shutdown * * TCP_CLOSING both sides have shutdown but we still have * data we have to finish sending * * TCP_TIME_WAIT timeout to catch resent junk before entering * closed, can only be entered from FIN_WAIT2 * or CLOSING. Required because the other end * may not have gotten our last ACK causing it * to retransmit the data packet (which we ignore) * * TCP_CLOSE_WAIT remote side has shutdown and is waiting for * us to finish writing our data and to shutdown * (we have to close() to move on to LAST_ACK) * * TCP_LAST_ACK out side has shutdown after remote has * shutdown. There may still be data in our * buffer that we have to finish sending * * TCP_CLOSE socket is finished */ #define pr_fmt(fmt) "TCP: " fmt #include <crypto/md5.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/types.h> #include <linux/fcntl.h> #include <linux/poll.h> #include <linux/inet_diag.h> #include <linux/init.h> #include <linux/fs.h> #include <linux/skbuff.h> #include <linux/splice.h> #include <linux/net.h> #include <linux/socket.h> #include <linux/random.h> #include <linux/memblock.h> #include <linux/highmem.h> #include <linux/cache.h> #include <linux/err.h> #include <linux/time.h> #include <linux/slab.h> #include <linux/errqueue.h> #include <linux/static_key.h> #include <linux/btf.h> #include <net/icmp.h> #include <net/inet_common.h> #include <net/inet_ecn.h> #include <net/tcp.h> #include <net/tcp_ecn.h> #include <net/mptcp.h> #include <net/proto_memory.h> #include <net/xfrm.h> #include <net/ip.h> #include <net/psp.h> #include <net/sock.h> #include <net/rstreason.h> #include 
<linux/uaccess.h>
#include <asm/ioctls.h>
#include <net/busy_poll.h>
#include <net/hotdata.h>
#include <trace/events/tcp.h>
#include <net/rps.h>
#include "../core/devmem.h"

/* Track pending CMSGs.
 *
 * The two values are distinct single bits (1 and 2), so they can be OR-ed
 * into one mask -- presumably that is how the users of these constants
 * combine them; confirm at the call sites.
 */
enum {
	TCP_CMSG_INQ = 1,
	TCP_CMSG_TS = 2
};

/* Per-CPU unsigned counter, exported (GPL-only) to modules.
 * NOTE(review): by its name this counts orphaned TCP sockets -- confirm
 * against the code that updates it.
 */
DEFINE_PER_CPU(unsigned int, tcp_orphan_count);
EXPORT_PER_CPU_SYMBOL_GPL(tcp_orphan_count);

/* Per-CPU u32, exported (GPL-only) to modules.
 * NOTE(review): presumably an initial sequence number handed over from a
 * TIME_WAIT socket -- the name suggests it, nothing here shows it.
 */
DEFINE_PER_CPU(u32, tcp_tw_isn);
EXPORT_PER_CPU_SYMBOL_GPL(tcp_tw_isn);

/* Three-element TCP memory sysctl vector.
 * NOTE(review): the meaning of each slot is not visible in this file
 * section; see the tcp_mem sysctl documentation before relying on it.
 */
long sysctl_tcp_mem[3] __read_mostly;
EXPORT_IPV6_MOD(sysctl_tcp_mem);

/* Per-CPU int, exported (GPL-only) to modules.
 * NOTE(review): by its name a per-CPU forward-allocation reserve for TCP
 * memory accounting -- confirm against its users.
 */
DEFINE_PER_CPU(int, tcp_memory_per_cpu_fw_alloc);
EXPORT_PER_CPU_SYMBOL_GPL(tcp_memory_per_cpu_fw_alloc);

/* The tcp_have_smc static key only exists when CONFIG_SMC is enabled
 * (built-in or module); it is declared with the "initially false" macro.
 */
#if IS_ENABLED(CONFIG_SMC)
DEFINE_STATIC_KEY_FALSE(tcp_have_smc);
EXPORT_SYMBOL(tcp_have_smc);
#endif

/*
 * Current number of TCP sockets.
 *
 * Kept in a percpu_counter that is cacheline aligned on SMP builds.
 */
struct percpu_counter tcp_sockets_allocated ____cacheline_aligned_in_smp;
EXPORT_IPV6_MOD(tcp_sockets_allocated);

/*
 * TCP splice context
 *
 * Bundles the three values a splice operation carries around.
 * NOTE(review): field meanings below are inferred from names and types
 * only -- verify against the splice read path.
 */
struct tcp_splice_state {
	struct pipe_inode_info *pipe;	/* presumably the destination pipe */
	size_t len;			/* presumably the byte budget */
	unsigned int flags;		/* presumably splice flags */
};

/*
 * Pressure flag: try to collapse.
 * Technical note: it is used by multiple contexts non atomically.
 * All the __sk_mem_schedule() is of this nature: accounting
 * is strict, actions are advisory and have some latency.
*/
unsigned long tcp_memory_pressure __read_mostly;
EXPORT_SYMBOL_GPL(tcp_memory_pressure);

/* Mark the start of global TCP memory pressure.
 *
 * tcp_memory_pressure holds the jiffies value at which pressure started;
 * zero means "no pressure".  The MIB counter is only bumped by the caller
 * whose cmpxchg() actually performed the 0 -> non-zero transition.
 */
void tcp_enter_memory_pressure(struct sock *sk)
{
	unsigned long val;

	if (READ_ONCE(tcp_memory_pressure))
		return;
	val = jiffies;

	/* 0 is reserved for "not under pressure": wrap it to ULONG_MAX. */
	if (!val)
		val--;
	if (!cmpxchg(&tcp_memory_pressure, 0, val))
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
}
EXPORT_IPV6_MOD_GPL(tcp_enter_memory_pressure);

/* Clear the global memory pressure flag.
 *
 * If the flag was set, the time elapsed since it was set is added
 * (in milliseconds) to LINUX_MIB_TCPMEMORYPRESSURESCHRONO.
 */
void tcp_leave_memory_pressure(struct sock *sk)
{
	unsigned long val;

	if (!READ_ONCE(tcp_memory_pressure))
		return;
	/* xchg() makes sure only one caller sees the non-zero start time. */
	val = xchg(&tcp_memory_pressure, 0);
	if (val)
		NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURESCHRONO,
			      jiffies_to_msecs(jiffies - val));
}
EXPORT_IPV6_MOD_GPL(tcp_leave_memory_pressure);

/* Convert seconds to retransmits based on initial and max timeout
 *
 * The timeout doubles after every retransmit and is capped at @rto_max.
 * Returns 0 when @seconds <= 0; the result saturates at 255.
 */
static u8 secs_to_retrans(int seconds, int timeout, int rto_max)
{
	u8 res = 0;

	if (seconds > 0) {
		int period = timeout;

		res = 1;
		while (seconds > period && res < 255) {
			res++;
			timeout <<= 1;
			if (timeout > rto_max)
				timeout = rto_max;
			period += timeout;
		}
	}
	return res;
}

/* Convert retransmits to seconds based on initial and max timeout
 *
 * Sums the per-attempt timeouts (doubling, capped at @rto_max) for
 * @retrans attempts.  Returns 0 when @retrans is 0.
 */
static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
{
	int period = 0;

	if (retrans > 0) {
		period = timeout;
		while (--retrans) {
			timeout <<= 1;
			if (timeout > rto_max)
				timeout = rto_max;
			period += timeout;
		}
	}
	return period;
}

/* Compute rate_delivered * mss_cache * USEC_PER_SEC / rate_interval_us.
 *
 * Returns 0 when either rate_delivered or rate_interval_us is zero.
 * NOTE(review): presumably a bytes-per-second figure, assuming
 * rate_delivered counts packets - confirm against struct tcp_sock.
 */
static u64 tcp_compute_delivery_rate(const struct tcp_sock *tp)
{
	u32 rate = READ_ONCE(tp->rate_delivered);
	u32 intv = READ_ONCE(tp->rate_interval_us);
	u64 rate64 = 0;

	if (rate && intv) {
		/* Widen before multiplying so the product is computed in 64 bits. */
		rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC;
		do_div(rate64, intv);
	}
	return rate64;
}

#ifdef CONFIG_TCP_MD5SIG
/* Release the MD5 signature state attached to @sk, if any: clear the key
 * list, free md5sig_info (after replacing the pointer with NULL) and drop
 * one reference on the tcp_md5_needed static branch.
 */
void tcp_md5_destruct_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (tp->md5sig_info) {

		tcp_clear_md5_list(sk);
		kfree(rcu_replace_pointer(tp->md5sig_info, NULL, 1));
		static_branch_slow_dec_deferred(&tcp_md5_needed);
	}
}
EXPORT_IPV6_MOD_GPL(tcp_md5_destruct_sock);
#endif

/* Address-family independent initialization for a tcp_sock.
*
 * NOTE: A lot of things set to zero explicitly by call to
 *       sk_alloc() so need not be done here.
 */
void tcp_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	int rto_min_us, rto_max_ms;

	/* Start with empty out-of-order and retransmit trees. */
	tp->out_of_order_queue = RB_ROOT;
	sk->tcp_rtx_queue = RB_ROOT;
	tcp_init_xmit_timers(sk);
	INIT_LIST_HEAD(&tp->tsq_node);
	INIT_LIST_HEAD(&tp->tsorted_sent_queue);

	icsk->icsk_rto = TCP_TIMEOUT_INIT;

	/* RTO bounds come from the per-netns sysctls, converted to jiffies. */
	rto_max_ms = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rto_max_ms);
	icsk->icsk_rto_max = msecs_to_jiffies(rto_max_ms);

	rto_min_us = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rto_min_us);
	icsk->icsk_rto_min = usecs_to_jiffies(rto_min_us);
	icsk->icsk_delack_max = TCP_DELACK_MAX;
	tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
	minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U);

	/* So many TCP implementations out there (incorrectly) count the
	 * initial SYN frame in their delayed-ACK and congestion control
	 * algorithms that we must have the following bandaid to talk
	 * efficiently to them.  -DaveM
	 */
	tcp_snd_cwnd_set(tp, TCP_INIT_CWND);

	/* There's a bubble in the pipe until at least the first ACK. */
	tp->app_limited = ~0U;
	tp->rate_app_limited = 1;

	/* See draft-stevens-tcpca-spec-01 for discussion of the
	 * initialization of these values.
*/
	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
	tp->snd_cwnd_clamp = ~0;
	tp->mss_cache = TCP_MSS_DEFAULT;

	tp->reordering = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reordering);
	tcp_assign_congestion_control(sk);

	tp->tsoffset = 0;
	tp->rack.reo_wnd_steps = 1;

	sk->sk_write_space = sk_stream_write_space;
	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

	icsk->icsk_sync_mss = tcp_sync_mss;

	/* Initial buffer sizes are the middle values of tcp_wmem/tcp_rmem. */
	WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1]));
	WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1]));
	tcp_scaling_ratio_init(sk);

	set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
	sk_sockets_allocated_inc(sk);
	xa_init_flags(&sk->sk_user_frags, XA_FLAGS_ALLOC1);
}
EXPORT_IPV6_MOD(tcp_init_sock);

/* Apply the TX timestamping request carried in @sockc to the skb at the
 * tail of the write queue.  Does nothing when the write queue is empty.
 */
static void tcp_tx_timestamp(struct sock *sk, struct sockcm_cookie *sockc)
{
	struct sk_buff *skb = tcp_write_queue_tail(sk);
	u32 tsflags = sockc->tsflags;

	if (tsflags && skb) {
		struct skb_shared_info *shinfo = skb_shinfo(skb);
		struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);

		sock_tx_timestamp(sk, sockc, &shinfo->tx_flags);
		if (tsflags & SOF_TIMESTAMPING_TX_ACK)
			tcb->txstamp_ack |= TSTAMP_ACK_SK;
		/* The key is the sequence number of the last byte in the skb. */
		if (tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK)
			shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;
	}

	/* BPF timestamping hook; independent of the socket's own tsflags. */
	if (cgroup_bpf_enabled(CGROUP_SOCK_OPS) &&
	    SK_BPF_CB_FLAG_TEST(sk, SK_BPF_CB_TX_TIMESTAMPING) && skb)
		bpf_skops_tx_timestamping(sk, skb, BPF_SOCK_OPS_TSTAMP_SENDMSG_CB);
}

/* True if tcp_epollin_ready() accepts @target, or if sk_is_readable()
 * reports the socket as readable.
 */
static bool tcp_stream_is_readable(struct sock *sk, int target)
{
	if (tcp_epollin_ready(sk, target))
		return true;
	return sk_is_readable(sk);
}

/*
 *	Wait for a TCP event.
 *
 *	Note that we don't need to lock the socket, as the upper poll layers
 *	take care of normal races (between the test and the event) and we don't
 *	go look at any of the socket buffers directly.
*/
__poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
{
	__poll_t mask;
	struct sock *sk = sock->sk;
	const struct tcp_sock *tp = tcp_sk(sk);
	u8 shutdown;
	int state;

	sock_poll_wait(file, sock, wait);

	/* Listening sockets are handled entirely by inet_csk_listen_poll(). */
	state = inet_sk_state_load(sk);
	if (state == TCP_LISTEN)
		return inet_csk_listen_poll(sk);

	/* Socket is not locked. We are protected from async events
	 * by poll logic and correct handling of state changes
	 * made by other threads is impossible in any case.
	 */

	mask = 0;

	/*
	 * EPOLLHUP is certainly not done right. But poll() doesn't
	 * have a notion of HUP in just one direction, and for a
	 * socket the read side is more interesting.
	 *
	 * Some poll() documentation says that EPOLLHUP is incompatible
	 * with the EPOLLOUT/POLLWR flags, so somebody should check this
	 * all. But careful, it tends to be safer to return too many
	 * bits than too few, and you can easily break real applications
	 * if you don't tell them that something has hung up!
	 *
	 * Check-me.
	 *
	 * Check number 1. EPOLLHUP is _UNMASKABLE_ event (see UNIX98 and
	 * our fs/select.c). It means that after we received EOF,
	 * poll always returns immediately, making impossible poll() on write()
	 * in state CLOSE_WAIT. One solution is evident --- to set EPOLLHUP
	 * if and only if shutdown has been made in both directions.
	 * Actually, it is interesting to look how Solaris and DUX
	 * solve this dilemma. I would prefer, if EPOLLHUP were maskable,
	 * then we could set it on SND_SHUTDOWN. BTW examples given
	 * in Stevens' books assume exactly this behaviour, it explains
	 * why EPOLLHUP is incompatible with EPOLLOUT.	--ANK
	 *
	 * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
	 * blocking on fresh not-connected or disconnected socket. --ANK
	 */
	shutdown = READ_ONCE(sk->sk_shutdown);
	if (shutdown == SHUTDOWN_MASK || state == TCP_CLOSE)
		mask |= EPOLLHUP;
	if (shutdown & RCV_SHUTDOWN)
		mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;

	/* Connected or passive Fast Open socket?
*/
	if (state != TCP_SYN_SENT &&
	    (state != TCP_SYN_RECV || rcu_access_pointer(tp->fastopen_rsk))) {
		int target = sock_rcvlowat(sk, 0, INT_MAX);
		u16 urg_data = READ_ONCE(tp->urg_data);

		/* Out-of-band urgent byte sitting at the read position:
		 * require one more byte before reporting the socket readable.
		 */
		if (unlikely(urg_data) &&
		    READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq) &&
		    !sock_flag(sk, SOCK_URGINLINE))
			target++;

		if (tcp_stream_is_readable(sk, target))
			mask |= EPOLLIN | EPOLLRDNORM;

		if (!(shutdown & SEND_SHUTDOWN)) {
			if (__sk_stream_is_writeable(sk, 1)) {
				mask |= EPOLLOUT | EPOLLWRNORM;
			} else {  /* send SIGIO later */
				sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);

				/* Race breaker. If space is freed after
				 * wspace test but before the flags are set,
				 * IO signal will be lost. Memory barrier
				 * pairs with the input side.
				 */
				smp_mb__after_atomic();
				if (__sk_stream_is_writeable(sk, 1))
					mask |= EPOLLOUT | EPOLLWRNORM;
			}
		} else
			mask |= EPOLLOUT | EPOLLWRNORM;

		if (urg_data & TCP_URG_VALID)
			mask |= EPOLLPRI;
	} else if (state == TCP_SYN_SENT &&
		   inet_test_bit(DEFER_CONNECT, sk)) {
		/* Active TCP fastopen socket with defer_connect
		 * Return EPOLLOUT so application can call write()
		 * in order for kernel to generate SYN+data
		 */
		mask |= EPOLLOUT | EPOLLWRNORM;
	}
	/* This barrier is coupled with smp_wmb() in tcp_done_with_error() */
	smp_rmb();
	if (READ_ONCE(sk->sk_err) ||
	    !skb_queue_empty_lockless(&sk->sk_error_queue))
		mask |= EPOLLERR;

	return mask;
}
EXPORT_SYMBOL(tcp_poll);

/* Handle the TCP queue-size and urgent-mark ioctls.
 *
 * On success the answer is stored in *@karg and 0 is returned.
 * Returns -EINVAL for the queue-size queries on a listening socket and
 * -ENOIOCTLCMD for any command not handled here.
 */
int tcp_ioctl(struct sock *sk, int cmd, int *karg)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int answ;
	bool slow;

	switch (cmd) {
	case SIOCINQ:
		if (sk->sk_state == TCP_LISTEN)
			return -EINVAL;

		/* tcp_inq() is evaluated with the socket (fast-)locked. */
		slow = lock_sock_fast(sk);
		answ = tcp_inq(sk);
		unlock_sock_fast(sk, slow);
		break;
	case SIOCATMARK:
		/* Non-zero when urgent data exists and the read position
		 * (copied_seq) sits exactly at urg_seq.
		 */
		answ = READ_ONCE(tp->urg_data) &&
		       READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq);
		break;
	case SIOCOUTQ:
		if (sk->sk_state == TCP_LISTEN)
			return -EINVAL;

		/* Nothing is reported while the handshake is in progress. */
		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
			answ = 0;
		else
			answ = READ_ONCE(tp->write_seq) - tp->snd_una;
		break;
	case SIOCOUTQNSD:
		if
(sk->sk_state == TCP_LISTEN)
			return -EINVAL;

		/* Same as SIOCOUTQ, but measured from snd_nxt instead of snd_una. */
		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
			answ = 0;
		else
			answ = READ_ONCE(tp->write_seq) -
			       READ_ONCE(tp->snd_nxt);
		break;
	default:
		return -ENOIOCTLCMD;
	}

	*karg = answ;
	return 0;
}
EXPORT_IPV6_MOD(tcp_ioctl);

/* Set PSH on @skb and remember the current write_seq as pushed_seq. */
void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
{
	TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
	tp->pushed_seq = tp->write_seq;
}

/* True once write_seq has moved more than half of max_window past the
 * last pushed sequence.
 */
static inline bool forced_push(const struct tcp_sock *tp)
{
	return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
}

/* Append @skb, still empty (seq == end_seq == write_seq), to the tail of
 * the write queue and charge its truesize to the socket.
 */
void tcp_skb_entail(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);

	tcb->seq     = tcb->end_seq = tp->write_seq;
	tcb->tcp_flags = TCPHDR_ACK;
	__skb_header_release(skb);
	psp_enqueue_set_decrypted(sk, skb);
	tcp_add_write_queue_tail(sk, skb);
	sk_wmem_queued_add(sk, skb->truesize);
	sk_mem_charge(sk, skb->truesize);
	/* TCP_NAGLE_PUSH is cleared whenever a new skb is queued. */
	if (tp->nonagle & TCP_NAGLE_PUSH)
		tp->nonagle &= ~TCP_NAGLE_PUSH;

	tcp_slow_start_after_idle_check(sk);
}

/* For MSG_OOB sends, record the current write_seq in snd_up. */
static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)
{
	if (flags & MSG_OOB)
		tp->snd_up = tp->write_seq;
}

/* If a not yet filled skb is pushed, do not send it if
 * we have data packets in Qdisc or NIC queues :
 * Because TX completion will happen shortly, it gives a chance
 * to coalesce future sendmsg() payload into this skb, without
 * need for a timer, and with no latency trade off.
 * As packets containing data payload have a bigger truesize
 * than pure acks (dataless) packets, the last checks prevent
 * autocorking if we only have an ACK in Qdisc/NIC queues,
 * or if TX completion was delayed after we processed ACK packet.
*/
static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb,
				int size_goal)
{
	return skb->len < size_goal &&
	       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_autocorking) &&
	       !tcp_rtx_queue_empty(sk) &&
	       refcount_read(&sk->sk_wmem_alloc) > skb->truesize &&
	       tcp_skb_can_collapse_to(skb);
}

/* Push the pending frames of @sk, unless autocorking decides to hold the
 * tail skb back.  Does nothing when the write queue is empty.
 */
void tcp_push(struct sock *sk, int flags, int mss_now,
	      int nonagle, int size_goal)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;

	skb = tcp_write_queue_tail(sk);
	if (!skb)
		return;
	if (!(flags & MSG_MORE) || forced_push(tp))
		tcp_mark_push(tp, skb);

	tcp_mark_urg(tp, flags);

	if (tcp_should_autocork(sk, skb, size_goal)) {

		/* avoid atomic op if TSQ_THROTTLED bit is already set */
		if (!test_bit(TSQ_THROTTLED, &sk->sk_tsq_flags)) {
			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING);
			set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
			smp_mb__after_atomic();
		}
		/* It is possible TX completion already happened
		 * before we set TSQ_THROTTLED.
		 */
		if (refcount_read(&sk->sk_wmem_alloc) > skb->truesize)
			return;
	}

	/* MSG_MORE overrides the caller-supplied nonagle mode. */
	if (flags & MSG_MORE)
		nonagle = TCP_NAGLE_CORK;

	__tcp_push_pending_frames(sk, mss_now, nonagle);
}

/* tcp_read_sock() actor used by splice: splice at most
 * min(rd_desc->count, len) bytes of @skb, starting at @offset, into the
 * pipe, and reduce rd_desc->count by the amount actually spliced.
 */
static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
				unsigned int offset, size_t len)
{
	struct tcp_splice_state *tss = rd_desc->arg.data;
	int ret;

	ret = skb_splice_bits(skb, skb->sk, offset, tss->pipe,
			      min(rd_desc->count, len), tss->flags);
	if (ret > 0)
		rd_desc->count -= ret;
	return ret;
}

/* Run one tcp_read_sock() pass with tcp_splice_data_recv() as actor. */
static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss)
{
	/* Store TCP splice context information in read_descriptor_t. */
	read_descriptor_t rd_desc = {
		.arg.data = tss,
		.count	  = tss->len,
	};

	return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv);
}

/**
 *  tcp_splice_read - splice data from TCP socket to a pipe
 * @sock:	socket to splice from
 * @ppos:	position (not valid)
 * @pipe:	pipe to splice to
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Description:
 *    Will read pages from given socket and fill them into a pipe.
*
 **/
ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
			struct pipe_inode_info *pipe, size_t len,
			unsigned int flags)
{
	struct sock *sk = sock->sk;
	struct tcp_splice_state tss = {
		.pipe = pipe,
		.len = len,
		.flags = flags,
	};
	long timeo;
	ssize_t spliced;
	int ret;

	sock_rps_record_flow(sk);
	/*
	 * We can't seek on a socket input
	 */
	if (unlikely(*ppos))
		return -ESPIPE;

	ret = spliced = 0;

	lock_sock(sk);

	timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK);
	while (tss.len) {
		ret = __tcp_splice_read(sk, &tss);
		if (ret < 0)
			break;
		else if (!ret) {
			/* Nothing spliced this round: decide whether to stop
			 * or to wait for more data.
			 */
			if (spliced)
				break;
			if (sock_flag(sk, SOCK_DONE))
				break;
			if (sk->sk_err) {
				ret = sock_error(sk);
				break;
			}
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				break;
			if (sk->sk_state == TCP_CLOSE) {
				/*
				 * This occurs when user tries to read
				 * from never connected socket.
				 */
				ret = -ENOTCONN;
				break;
			}
			if (!timeo) {
				ret = -EAGAIN;
				break;
			}
			/* if __tcp_splice_read() got nothing while we have
			 * an skb in receive queue, we do not want to loop.
			 * This might happen with URG data.
*/
			if (!skb_queue_empty(&sk->sk_receive_queue))
				break;
			ret = sk_wait_data(sk, &timeo, NULL);
			if (ret < 0)
				break;
			if (signal_pending(current)) {
				ret = sock_intr_errno(timeo);
				break;
			}
			continue;
		}
		tss.len -= ret;
		spliced += ret;

		if (!tss.len || !timeo)
			break;
		/* Drop and re-take the socket lock between rounds. */
		release_sock(sk);
		lock_sock(sk);

		if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
		    signal_pending(current))
			break;
	}

	release_sock(sk);

	/* Report progress in preference to the last error/zero result. */
	if (spliced)
		return spliced;

	return ret;
}
EXPORT_IPV6_MOD(tcp_splice_read);

/* Allocate an skb for the TCP write queue, with MAX_TCP_HEADER bytes of
 * headroom reserved.
 *
 * When @force_schedule is true the memory is scheduled unconditionally
 * via sk_forced_mem_schedule(); otherwise sk_wmem_schedule() may refuse,
 * in which case the skb is freed.  Returns the skb, or NULL on failure.
 */
struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, gfp_t gfp,
				     bool force_schedule)
{
	struct sk_buff *skb;

	skb = alloc_skb_fclone(MAX_TCP_HEADER, gfp);
	if (likely(skb)) {
		bool mem_scheduled;

		skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
		if (force_schedule) {
			mem_scheduled = true;
			sk_forced_mem_schedule(sk, skb->truesize);
		} else {
			mem_scheduled = sk_wmem_schedule(sk, skb->truesize);
		}
		if (likely(mem_scheduled)) {
			skb_reserve(skb, MAX_TCP_HEADER);
			skb->ip_summed = CHECKSUM_PARTIAL;
			INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
			return skb;
		}
		__kfree_skb(skb);
	} else {
		/* Allocation itself failed: flag memory pressure (unless the
		 * socket bypasses protocol memory accounting) and moderate
		 * the send buffer.
		 */
		if (!sk->sk_bypass_prot_mem)
			tcp_enter_memory_pressure(sk);
		sk_stream_moderate_sndbuf(sk);
	}
	return NULL;
}

/* Compute the size goal used when filling skbs.
 *
 * Returns @mss_now when @large_allowed is zero; otherwise a multiple of
 * @mss_now derived from tp->gso_segs, and never less than @mss_now.
 */
static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
				       int large_allowed)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 new_size_goal, size_goal;

	if (!large_allowed)
		return mss_now;

	/* Note : tcp_tso_autosize() will eventually split this later */
	new_size_goal = tcp_bound_to_half_wnd(tp, sk->sk_gso_max_size);

	/* We try hard to avoid divides here */
	size_goal = tp->gso_segs * mss_now;
	/* Only recompute gso_segs (one divide) when the cached value is no
	 * longer within one MSS of the new goal.
	 */
	if (unlikely(new_size_goal < size_goal ||
		     new_size_goal >= size_goal + mss_now)) {
		tp->gso_segs = min_t(u16, new_size_goal / mss_now,
				     sk->sk_gso_max_segs);
		size_goal = tp->gso_segs * mss_now;
	}

	return max(size_goal, mss_now);
}

/* Return the current MSS and store the matching size goal in *@size_goal.
 * The large (multi-segment) goal is not used for MSG_OOB sends.
 */
int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
{
	int mss_now;

	mss_now = tcp_current_mss(sk);
	*size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));

	return mss_now;
}

/* In some cases, sendmsg() could have added an skb to the write queue,
 * but failed adding payload on it. We need to remove it to consume less
 * memory, but more importantly be able to generate EPOLLOUT for Edge Trigger
 * epoll() users. Another reason is that tcp_write_xmit() does not like
 * finding an empty skb in the write queue.
 */
void tcp_remove_empty_skb(struct sock *sk)
{
	struct sk_buff *skb = tcp_write_queue_tail(sk);

	/* seq == end_seq means the tail skb carries no payload. */
	if (skb && TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
		tcp_unlink_write_queue(skb, sk);
		if (tcp_write_queue_empty(sk))
			tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
		tcp_wmem_free_skb(sk, skb);
	}
}

/* skb changing from pure zc to mixed, must charge zc
 *
 * Returns 0 on success (including when @skb is not pure zerocopy) and
 * -ENOMEM when the extra memory cannot be scheduled.
 */
static int tcp_downgrade_zcopy_pure(struct sock *sk, struct sk_buff *skb)
{
	if (unlikely(skb_zcopy_pure(skb))) {
		/* Part of truesize beyond the base skb allocation. */
		u32 extra = skb->truesize -
			    SKB_TRUESIZE(skb_end_offset(skb));

		if (!sk_wmem_schedule(sk, extra))
			return -ENOMEM;

		sk_mem_charge(sk, extra);
		skb_shinfo(skb)->flags &= ~SKBFL_PURE_ZEROCOPY;
	}
	return 0;
}


/* Try to schedule @copy bytes of send memory.
 *
 * Returns @copy when the full amount could be scheduled, otherwise the
 * (possibly smaller, possibly zero) amount that may be used.
 */
int tcp_wmem_schedule(struct sock *sk, int copy)
{
	int left;

	if (likely(sk_wmem_schedule(sk, copy)))
		return copy;

	/* We could be in trouble if we have nothing queued.
	 * Use whatever is left in sk->sk_forward_alloc and tcp_wmem[0]
	 * to guarantee some progress.
*/
	left = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[0]) - sk->sk_wmem_queued;
	if (left > 0)
		sk_forced_mem_schedule(sk, min(left, copy));
	return min(copy, sk->sk_forward_alloc);
}

/* Free the pending Fast Open request of @tp, if any, and clear the pointer. */
void tcp_free_fastopen_req(struct tcp_sock *tp)
{
	if (tp->fastopen_req) {
		kfree(tp->fastopen_req);
		tp->fastopen_req = NULL;
	}
}

/* Start a (client side) TCP Fast Open connect carrying data from @msg.
 *
 * Allocates tp->fastopen_req, then connects through
 * __inet_stream_connect().  On return *@copied holds the number of bytes
 * recorded in the request, provided the request still existed after the
 * connect call.
 *
 * Returns the connect result, or -EOPNOTSUPP (client TFO disabled or
 * AF_UNSPEC address), -EALREADY (a request is already pending) or
 * -ENOBUFS (allocation failure).
 */
int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied,
			 size_t size, struct ubuf_info *uarg)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_sock *inet = inet_sk(sk);
	struct sockaddr *uaddr = msg->msg_name;
	int err, flags;

	if (!(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen) &
	      TFO_CLIENT_ENABLE) ||
	    (uaddr && msg->msg_namelen >= sizeof(uaddr->sa_family) &&
	     uaddr->sa_family == AF_UNSPEC))
		return -EOPNOTSUPP;
	if (tp->fastopen_req)
		return -EALREADY; /* Another Fast Open is in progress */

	tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request),
				   sk->sk_allocation);
	if (unlikely(!tp->fastopen_req))
		return -ENOBUFS;
	tp->fastopen_req->data = msg;
	tp->fastopen_req->size = size;
	tp->fastopen_req->uarg = uarg;

	if (inet_test_bit(DEFER_CONNECT, sk)) {
		err = tcp_connect(sk);
		/* Same failure procedure as in tcp_v4/6_connect */
		if (err) {
			tcp_set_state(sk, TCP_CLOSE);
			inet->inet_dport = 0;
			sk->sk_route_caps = 0;
		}
	}
	flags = (msg->msg_flags & MSG_DONTWAIT) ?
O_NONBLOCK : 0;
	err = __inet_stream_connect(sk->sk_socket,
				    (struct sockaddr_unsized *)uaddr,
				    msg->msg_namelen, flags, 1);
	/* fastopen_req could already be freed in __inet_stream_connect
	 * if the connection times out or gets rst
	 */
	if (tp->fastopen_req) {
		*copied = tp->fastopen_req->copied;
		tcp_free_fastopen_req(tp);
		inet_clear_bit(DEFER_CONNECT, sk);
	}
	return err;
}

/* Queue the data described by @msg on the TCP write queue of @sk.
 *
 * Called with the socket locked; tcp_sendmsg() is the wrapper that takes
 * and releases the lock.  Returns the number of bytes accepted (including
 * bytes carried by a Fast Open SYN) or a negative error.
 */
int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
{
	struct net_devmem_dmabuf_binding *binding = NULL;
	struct tcp_sock *tp = tcp_sk(sk);
	struct ubuf_info *uarg = NULL;
	struct sk_buff *skb;
	struct sockcm_cookie sockc;
	int flags, err, copied = 0;
	int mss_now = 0, size_goal, copied_syn = 0;
	int process_backlog = 0;
	int sockc_err = 0;
	int zc = 0;
	long timeo;

	flags = msg->msg_flags;

	sockc = (struct sockcm_cookie){ .tsflags = READ_ONCE(sk->sk_tsflags) };
	if (msg->msg_controllen) {
		sockc_err = sock_cmsg_send(sk, msg, &sockc);
		/* Don't return error until MSG_FASTOPEN has been processed;
		 * that may succeed even if the cmsg is invalid.
*/
	}

	/* Select the transmit mode: zc stays 0 (copy), or becomes
	 * MSG_ZEROCOPY / MSG_SPLICE_PAGES when the route supports
	 * scatter-gather.
	 */
	if ((flags & MSG_ZEROCOPY) && size) {
		if (msg->msg_ubuf) {
			uarg = msg->msg_ubuf;
			if (sk->sk_route_caps & NETIF_F_SG)
				zc = MSG_ZEROCOPY;
		} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
			skb = tcp_write_queue_tail(sk);
			uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb),
						    !sockc_err && sockc.dmabuf_id);
			if (!uarg) {
				err = -ENOBUFS;
				goto out_err;
			}
			if (sk->sk_route_caps & NETIF_F_SG)
				zc = MSG_ZEROCOPY;
			else
				uarg_to_msgzc(uarg)->zerocopy = 0;

			if (!sockc_err && sockc.dmabuf_id) {
				binding = net_devmem_get_binding(sk, sockc.dmabuf_id);
				if (IS_ERR(binding)) {
					err = PTR_ERR(binding);
					/* Keep the error paths from putting an ERR_PTR. */
					binding = NULL;
					goto out_err;
				}
			}
		}
	} else if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES) && size) {
		if (sk->sk_route_caps & NETIF_F_SG)
			zc = MSG_SPLICE_PAGES;
	}

	/* A dmabuf id is only valid together with MSG_ZEROCOPY on a
	 * SOCK_ZEROCOPY socket.
	 */
	if (!sockc_err && sockc.dmabuf_id &&
	    (!(flags & MSG_ZEROCOPY) || !sock_flag(sk, SOCK_ZEROCOPY))) {
		err = -EINVAL;
		goto out_err;
	}

	if (unlikely(flags & MSG_FASTOPEN ||
		     inet_test_bit(DEFER_CONNECT, sk)) &&
	    !tp->repair) {
		err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size, uarg);
		if (err == -EINPROGRESS && copied_syn > 0)
			goto out;
		else if (err)
			goto out_err;
	}

	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);

	tcp_rate_check_app_limited(sk);  /* is sending application-limited? */

	/* Wait for a connection to finish. One exception is TCP Fast Open
	 * (passive side) where data is allowed to be sent before a connection
	 * is fully established.
	 */
	if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
	    !tcp_passive_fastopen(sk)) {
		err = sk_stream_wait_connect(sk, &timeo);
		if (err != 0)
			goto do_error;
	}

	if (unlikely(tp->repair)) {
		if (tp->repair_queue == TCP_RECV_QUEUE) {
			copied = tcp_send_rcvq(sk, msg, size);
			goto out_nopush;
		}

		err = -EINVAL;
		if (tp->repair_queue == TCP_NO_QUEUE)
			goto out_err;

		/* 'common' sending to sendq */
	}

	/* The cmsg error deferred above is reported now. */
	if (sockc_err) {
		err = sockc_err;
		goto out_err;
	}

	/* This should be in poll */
	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);

	/* Ok commence sending.
*/
	copied = 0;

restart:
	mss_now = tcp_send_mss(sk, &size_goal, flags);

	err = -EPIPE;
	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
		goto do_error;

	while (msg_data_left(msg)) {
		int copy = 0;

		/* Room left in the tail skb before it reaches size_goal. */
		skb = tcp_write_queue_tail(sk);
		if (skb)
			copy = size_goal - skb->len;

		trace_tcp_sendmsg_locked(sk, msg, skb, size_goal);

		if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) {
			bool first_skb;

new_segment:
			if (!sk_stream_memory_free(sk))
				goto wait_for_space;

			/* Every 16 new skbs, give the backlog a chance to run. */
			if (unlikely(process_backlog >= 16)) {
				process_backlog = 0;
				if (sk_flush_backlog(sk))
					goto restart;
			}
			/* With both queues empty, memory scheduling is forced. */
			first_skb = tcp_rtx_and_write_queues_empty(sk);
			skb = tcp_stream_alloc_skb(sk, sk->sk_allocation,
						   first_skb);
			if (!skb)
				goto wait_for_space;

			process_backlog++;

#ifdef CONFIG_SKB_DECRYPTED
			skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED);
#endif
			tcp_skb_entail(sk, skb);
			copy = size_goal;

			/* All packets are restored as if they have
			 * already been sent. skb_mstamp_ns isn't set to
			 * avoid wrong rtt estimation.
			 */
			if (tp->repair)
				TCP_SKB_CB(skb)->sacked |= TCPCB_REPAIRED;
		}

		/* Try to append data to the end of skb. */
		if (copy > msg_data_left(msg))
			copy = msg_data_left(msg);

		if (zc == 0) {
			/* Copy path: data is copied into the socket page frag. */
			bool merge = true;
			int i = skb_shinfo(skb)->nr_frags;
			struct page_frag *pfrag = sk_page_frag(sk);

			if (!sk_page_frag_refill(sk, pfrag))
				goto wait_for_space;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				/* No frag slot left: push this skb, start a new one. */
				if (i >= READ_ONCE(net_hotdata.sysctl_max_skb_frags)) {
					tcp_mark_push(tp, skb);
					goto new_segment;
				}
				merge = false;
			}

			copy = min_t(int, copy, pfrag->size - pfrag->offset);

			if (unlikely(skb_zcopy_pure(skb) || skb_zcopy_managed(skb))) {
				if (tcp_downgrade_zcopy_pure(sk, skb))
					goto wait_for_space;
				skb_zcopy_downgrade_managed(skb);
			}

			copy = tcp_wmem_schedule(sk, copy);
			if (!copy)
				goto wait_for_space;

			err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
						       pfrag->page,
						       pfrag->offset,
						       copy);
			if (err)
				goto do_error;

			/* Update the skb.
*/
			if (merge) {
				/* Extend the last frag in place. */
				skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			} else {
				/* New frag: it holds its own page reference. */
				skb_fill_page_desc(skb, i, pfrag->page,
						   pfrag->offset, copy);
				page_ref_inc(pfrag->page);
			}
			pfrag->offset += copy;
		} else if (zc == MSG_ZEROCOPY)  {
			/* First append to a fragless skb builds initial
			 * pure zerocopy skb
			 */
			if (!skb->len)
				skb_shinfo(skb)->flags |= SKBFL_PURE_ZEROCOPY;

			if (!skb_zcopy_pure(skb)) {
				copy = tcp_wmem_schedule(sk, copy);
				if (!copy)
					goto wait_for_space;
			}

			err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg,
						       binding);
			if (err == -EMSGSIZE || err == -EEXIST) {
				tcp_mark_push(tp, skb);
				goto new_segment;
			}
			if (err < 0)
				goto do_error;
			/* A non-negative result is the number of bytes added. */
			copy = err;
		} else if (zc == MSG_SPLICE_PAGES) {
			/* Splice in data if we can; copy if we can't. */
			if (tcp_downgrade_zcopy_pure(sk, skb))
				goto wait_for_space;
			copy = tcp_wmem_schedule(sk, copy);
			if (!copy)
				goto wait_for_space;

			err = skb_splice_from_iter(skb, &msg->msg_iter, copy);
			if (err < 0) {
				if (err == -EMSGSIZE) {
					tcp_mark_push(tp, skb);
					goto new_segment;
				}
				goto do_error;
			}
			copy = err;

			if (!(flags & MSG_NO_SHARED_FRAGS))
				skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG;

			sk_wmem_queued_add(sk, copy);
			sk_mem_charge(sk, copy);
		}

		/* PSH is cleared on the skb that receives the first chunk
		 * of this call.
		 */
		if (!copied)
			TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;

		WRITE_ONCE(tp->write_seq, tp->write_seq + copy);
		TCP_SKB_CB(skb)->end_seq += copy;
		tcp_skb_pcount_set(skb, 0);

		copied += copy;
		if (!msg_data_left(msg)) {
			if (unlikely(flags & MSG_EOR))
				TCP_SKB_CB(skb)->eor = 1;
			goto out;
		}

		/* Keep filling this skb until it reaches size_goal. */
		if (skb->len < size_goal || (flags & MSG_OOB) || unlikely(tp->repair))
			continue;

		if (forced_push(tp)) {
			tcp_mark_push(tp, skb);
			__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
		} else if (skb == tcp_send_head(sk))
			tcp_push_one(sk, mss_now);
		continue;

wait_for_space:
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		tcp_remove_empty_skb(sk);
		/* Push out what was queued so far before waiting. */
		if (copied)
			tcp_push(sk, flags & ~MSG_MORE, mss_now,
				 TCP_NAGLE_PUSH, size_goal);

		err = sk_stream_wait_memory(sk, &timeo);
		if (err != 0)
			goto do_error;

		/* Recompute MSS and size goal after waiting. */
		mss_now = tcp_send_mss(sk, &size_goal, flags);
	}
out:
	if (copied) {
		tcp_tx_timestamp(sk, &sockc);
		tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
	}
out_nopush:
	/* msg->msg_ubuf is pinned by the caller so we don't take extra refs */
	if (uarg && !msg->msg_ubuf)
		net_zcopy_put(uarg);
	if (binding)
		net_devmem_dmabuf_binding_put(binding);
	return copied + copied_syn;

do_error:
	tcp_remove_empty_skb(sk);

	/* Partial progress is reported as success; the error is dropped. */
	if (copied + copied_syn)
		goto out;
out_err:
	/* msg->msg_ubuf is pinned by the caller so we don't take extra refs */
	if (uarg && !msg->msg_ubuf)
		net_zcopy_put_abort(uarg, true);
	err = sk_stream_error(sk, flags, err);
	/* make sure we wake any epoll edge trigger waiter */
	if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) {
		sk->sk_write_space(sk);
		tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
	}
	if (binding)
		net_devmem_dmabuf_binding_put(binding);

	return err;
}
EXPORT_SYMBOL_GPL(tcp_sendmsg_locked);

/* Take the socket lock around tcp_sendmsg_locked() and return its result. */
int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
{
	int ret;

	lock_sock(sk);
	ret = tcp_sendmsg_locked(sk, msg, size);
	release_sock(sk);

	return ret;
}
EXPORT_SYMBOL(tcp_sendmsg);

/* Push whatever is pending on the write queue of @sock.  Returns
 * immediately, without taking the socket lock, when the queue is empty.
 */
void tcp_splice_eof(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct tcp_sock *tp = tcp_sk(sk);
	int mss_now, size_goal;

	if (!tcp_write_queue_tail(sk))
		return;

	lock_sock(sk);
	mss_now = tcp_send_mss(sk, &size_goal, 0);
	tcp_push(sk, 0, mss_now, tp->nonagle, size_goal);
	release_sock(sk);
}
EXPORT_IPV6_MOD_GPL(tcp_splice_eof);

/*
 *	Handle reading urgent data. BSD has very simple semantics for
 *	this, no blocking and very strange errors 8)
 */

static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* No URG data to read. */
	if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
	    tp->urg_data == TCP_URG_READ)
		return -EINVAL;	/* Yes this is right !
*/

	if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
		return -ENOTCONN;

	if (tp->urg_data & TCP_URG_VALID) {
		int err = 0;
		/* Truncation to char keeps the low byte of urg_data. */
		char c = tp->urg_data;

		/* Unless peeking, mark the urgent byte as consumed. */
		if (!(flags & MSG_PEEK))
			WRITE_ONCE(tp->urg_data, TCP_URG_READ);

		/* Read urgent data. */
		msg->msg_flags |= MSG_OOB;

		if (len > 0) {
			if (!(flags & MSG_TRUNC))
				err = memcpy_to_msg(msg, &c, 1);
			len = 1;
		} else
			msg->msg_flags |= MSG_TRUNC;

		return err ? -EFAULT : len;
	}

	if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
		return 0;

	/* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
	 * the available implementations agree in this case:
	 * this call should never block, independent of the
	 * blocking state of the socket.
	 * Mike <pall@rz.uni-karlsruhe.de>
	 */
	return -EAGAIN;
}

/* Copy the payload of every skb on the retransmit queue, then on the
 * write queue, into @msg.
 *
 * Returns the number of bytes copied, or the error reported by
 * skb_copy_datagram_msg().  @len is not used by this function.
 */
static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
{
	struct sk_buff *skb;
	int copied = 0, err = 0;

	skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
		err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
		if (err)
			return err;
		copied += skb->len;
	}

	skb_queue_walk(&sk->sk_write_queue, skb) {
		err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
		if (err)
			break;

		copied += skb->len;
	}

	return err ?: copied;
}

/* Clean up the receive buffer for full frames taken by the user,
 * then send an ACK if necessary.  COPIED is the number of bytes
 * tcp_recvmsg has given to the user so far, it speeds up the
 * calculation of whether or not we must ACK for the sake of
 * a window update.
 */
void __tcp_cleanup_rbuf(struct sock *sk, int copied)
{
	struct tcp_sock *tp = tcp_sk(sk);
	bool time_to_ack = false;

	if (inet_csk_ack_scheduled(sk)) {
		const struct inet_connection_sock *icsk = inet_csk(sk);

		if (/* Once-per-two-segments ACK was not sent by tcp_input.c */
		    tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
		    /*
		     * If this read emptied read buffer, we send ACK, if
		     * connection is not bidirectional, user drained
		     * receive buffer and there was a small segment
		     * in queue.
*/
		    (copied > 0 &&
		     ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
		      ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
		       !inet_csk_in_pingpong_mode(sk))) &&
		      !atomic_read(&sk->sk_rmem_alloc)))
			time_to_ack = true;
	}

	/* We send an ACK if we can now advertise a non-zero window
	 * which has been raised "significantly".
	 *
	 * Even if window raised up to infinity, do not send window open ACK
	 * in states, where we will not receive more. It is useless.
	 */
	if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
		__u32 rcv_window_now = tcp_receive_window(tp);

		/* Optimize, __tcp_select_window() is not cheap. */
		if (2*rcv_window_now <= tp->window_clamp) {
			__u32 new_window = __tcp_select_window(sk);

			/* Send ACK now, if this read freed lots of space
			 * in our buffer. Certainly, new_window is new window.
			 * We can advertise it now, if it is not less than current one.
			 * "Lots" means "at least twice" here.
			 */
			if (new_window && new_window >= 2 * rcv_window_now)
				time_to_ack = true;
		}
	}
	if (time_to_ack) {
		tcp_mstamp_refresh(tp);
		tcp_send_ack(sk);
	}
}

/* Wrapper around __tcp_cleanup_rbuf() that first warns if the skb at the
 * head of the receive queue lies entirely at or before copied_seq.
 */
void tcp_cleanup_rbuf(struct sock *sk, int copied)
{
	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
	struct tcp_sock *tp = tcp_sk(sk);

	WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
	     "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
	     tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
	__tcp_cleanup_rbuf(sk, copied);
}

/* Unlink @skb from the receive queue and free it.
 *
 * When the destructor is sock_rfree() it is run right away and the skb,
 * detached from the socket, is handed to skb_attempt_defer_free();
 * otherwise the skb is freed with __kfree_skb().
 */
static void tcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb)
{
	__skb_unlink(skb, &sk->sk_receive_queue);
	if (likely(skb->destructor == sock_rfree)) {
		sock_rfree(skb);
		skb->destructor = NULL;
		skb->sk = NULL;
		return skb_attempt_defer_free(skb);
	}
	__kfree_skb(skb);
}

/* Find the skb at the head of the receive queue that covers sequence
 * number @seq.
 *
 * On success the byte offset of @seq inside the skb is stored in *@off
 * and the skb is returned.  An skb carrying FIN is returned even when
 * @seq is past its data.  Head skbs that do not qualify are unlinked and
 * freed.  Returns NULL when the queue runs empty.
 */
struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
{
	struct sk_buff *skb;
	u32 offset;

	while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
		offset = seq - TCP_SKB_CB(skb)->seq;
		if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
			pr_err_once("%s: found a SYN, please report !\n", __func__);
			offset--;
		}
		if (offset
< skb->len || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) {
			*off = offset;
			return skb;
		}
		/* This looks weird, but this can happen if TCP collapsing
		 * split a fat GRO packet, while we released socket lock
		 * in skb_splice_bits()
		 */
		tcp_eat_recv_skb(sk, skb);
	}
	return NULL;
}
EXPORT_SYMBOL(tcp_recv_skb);

/*
 * This routine provides an alternative to tcp_recvmsg() for routines
 * that would like to handle copying from skbuffs directly in 'sendfile'
 * fashion.
 * Note:
 *	- It is assumed that the socket was locked by the caller.
 *	- The routine does not block.
 *	- At present, there is no support for reading OOB data
 *	  or for 'peeking' the socket using this routine
 *	  (although both would be easy to implement).
 *
 * The read position is taken from, and written back to, *copied_seq.
 * When noack is true, the receive-space adjustment and the receive
 * buffer cleanup (which may send an ACK) are skipped.
 */
static int __tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
			   sk_read_actor_t recv_actor, bool noack,
			   u32 *copied_seq)
{
	struct sk_buff *skb;
	struct tcp_sock *tp = tcp_sk(sk);
	u32 seq = *copied_seq;
	u32 offset;
	int copied = 0;

	if (sk->sk_state == TCP_LISTEN)
		return -ENOTCONN;
	while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
		if (offset < skb->len) {
			int used;
			size_t len;

			len = skb->len - offset;
			/* Stop reading if we hit a patch of urgent data */
			if (unlikely(tp->urg_data)) {
				u32 urg_offset = tp->urg_seq - seq;
				if (urg_offset < len)
					len = urg_offset;
				if (!len)
					break;
			}
			used = recv_actor(desc, skb, offset, len);
			if (used <= 0) {
				/* Report the actor's result only if nothing
				 * was consumed before.
				 */
				if (!copied)
					copied = used;
				break;
			}
			/* An actor must not consume more than it was offered. */
			if (WARN_ON_ONCE(used > len))
				used = len;
			seq += used;
			copied += used;
			offset += used;

			/* If recv_actor drops the lock (e.g. TCP splice
			 * receive) the skb pointer might be invalid when
			 * getting here: tcp_collapse might have deleted it
			 * while aggregating skbs from the socket queue.
			 */
			skb = tcp_recv_skb(sk, seq - 1, &offset);
			if (!skb)
				break;
			/* TCP coalescing might have appended data to the skb.
* Try to splice more frags */
			if (offset + 1 != skb->len)
				continue;
		}
		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
			tcp_eat_recv_skb(sk, skb);
			/* FIN occupies one sequence number. */
			++seq;
			break;
		}
		tcp_eat_recv_skb(sk, skb);
		if (!desc->count)
			break;
		WRITE_ONCE(*copied_seq, seq);
	}
	WRITE_ONCE(*copied_seq, seq);

	if (noack)
		goto out;

	tcp_rcv_space_adjust(sk);

	/* Clean up data we have read: This will do ACK frames. */
	if (copied > 0) {
		/* Called for its side effect of freeing fully consumed head skbs. */
		tcp_recv_skb(sk, seq, &offset);
		tcp_cleanup_rbuf(sk, copied);
	}
out:
	return copied;
}

/*
 * tcp_read_sock - __tcp_read_sock() on the socket copied_seq with the
 * ACK/cleanup step enabled (noack == false).
 */
int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
		  sk_read_actor_t recv_actor)
{
	return __tcp_read_sock(sk, desc, recv_actor, false,
			       &tcp_sk(sk)->copied_seq);
}
EXPORT_SYMBOL(tcp_read_sock);

/*
 * tcp_read_sock_noack - __tcp_read_sock() with a caller supplied @noack
 * flag and a caller supplied @copied_seq cursor.
 */
int tcp_read_sock_noack(struct sock *sk, read_descriptor_t *desc,
			sk_read_actor_t recv_actor, bool noack,
			u32 *copied_seq)
{
	return __tcp_read_sock(sk, desc, recv_actor, noack, copied_seq);
}

/*
 * tcp_read_skb - feed whole skbs from the receive queue to @recv_actor.
 *
 * Each skb is unlinked from sk_receive_queue before the actor is called.
 * The loop stops when the queue is empty, when the actor returns a negative
 * value, or after an skb that carried FIN.
 *
 * Returns -ENOTCONN for a listening socket, otherwise the sum of the actor
 * results, or the negative actor result when nothing was consumed before it.
 */
int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
{
	struct sk_buff *skb;
	int copied = 0;

	if (sk->sk_state == TCP_LISTEN)
		return -ENOTCONN;

	while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
		u8 tcp_flags;
		int used;

		__skb_unlink(skb, &sk->sk_receive_queue);
		WARN_ON_ONCE(!skb_set_owner_sk_safe(skb, sk));
		/* Sample the flags before the actor gets the skb. */
		tcp_flags = TCP_SKB_CB(skb)->tcp_flags;
		used = recv_actor(sk, skb);
		if (used < 0) {
			if (!copied)
				copied = used;
			break;
		}
		copied += used;

		if (tcp_flags & TCPHDR_FIN)
			break;
	}
	return copied;
}
EXPORT_IPV6_MOD(tcp_read_skb);

/*
 * tcp_read_done - advance copied_seq by up to @len bytes and free the skbs
 * that become fully consumed.
 * @sk: socket to operate on; nothing is done for a listening socket
 * @len: number of bytes to mark as read
 */
void tcp_read_done(struct sock *sk, size_t len)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 seq = tp->copied_seq;
	struct sk_buff *skb;
	size_t left;
	u32 offset;

	if (sk->sk_state == TCP_LISTEN)
		return;

	left = len;
	while (left && (skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
		int used;

		used = min_t(size_t, skb->len - offset, left);
		seq += used;
		left -= used;

		/* Part of this skb is still unread: keep it queued. */
		if (skb->len > offset + used)
			break;

		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
			tcp_eat_recv_skb(sk, skb);
			++seq;
			break;
		}
		tcp_eat_recv_skb(sk, skb);
	}
	WRITE_ONCE(tp->copied_seq, seq);

	tcp_rcv_space_adjust(sk);

	/*
Clean up data we have read: This will do ACK frames. */
	if (left != len)
		tcp_cleanup_rbuf(sk, len - left);
}
EXPORT_SYMBOL(tcp_read_done);

/* tcp_peek_len - return tcp_inq() for the sock behind @sock. */
int tcp_peek_len(struct socket *sock)
{
	return tcp_inq(sock->sk);
}
EXPORT_IPV6_MOD(tcp_peek_len);

/* Make sure sk_rcvbuf is big enough to satisfy SO_RCVLOWAT hint
 *
 * @val is capped at half of sk_rcvbuf when SOCK_RCVBUF_LOCK is set, else at
 * half of sysctl_tcp_rmem[2]. A capped value of 0 is stored as 1.
 * Always returns 0.
 */
int tcp_set_rcvlowat(struct sock *sk, int val)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int space, cap;

	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
		cap = sk->sk_rcvbuf >> 1;
	else
		cap = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1;
	val = min(val, cap);
	WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);

	/* Check if we need to signal EPOLLIN right now */
	tcp_data_ready(sk);

	/* A locked receive buffer size is left untouched. */
	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
		return 0;

	space = tcp_space_from_win(sk, val);
	if (space > sk->sk_rcvbuf) {
		WRITE_ONCE(sk->sk_rcvbuf, space);

		/* Raise a non-zero window clamp that is below the new low-water mark. */
		if (tp->window_clamp && tp->window_clamp < val)
			WRITE_ONCE(tp->window_clamp, val);
	}
	return 0;
}
EXPORT_IPV6_MOD(tcp_set_rcvlowat);

/*
 * tcp_update_recv_tstamps - copy the timestamps of @skb into @tss.
 *
 * ts[0] is filled from skb->tstamp and ts[2] from the hardware timestamp;
 * a slot whose source is zero is cleared. ts[1] is not touched here.
 */
void tcp_update_recv_tstamps(struct sk_buff *skb,
			     struct scm_timestamping_internal *tss)
{
	if (skb->tstamp)
		tss->ts[0] = ktime_to_timespec64(skb->tstamp);
	else
		tss->ts[0] = (struct timespec64) {0};

	if (skb_hwtstamps(skb)->hwtstamp)
		tss->ts[2] = ktime_to_timespec64(skb_hwtstamps(skb)->hwtstamp);
	else
		tss->ts[2] = (struct timespec64) {0};
}

#ifdef CONFIG_MMU
/* Empty ops table; its address is what marks a vma as set up by tcp_mmap(). */
static const struct vm_operations_struct tcp_vm_ops = {
};

/*
 * tcp_mmap - prepare @vma for TCP receive zerocopy.
 *
 * Rejects writable or executable mappings with -EPERM, clears the
 * VM_MAYWRITE and VM_MAYEXEC flags, sets VM_MIXEDMAP and installs tcp_vm_ops.
 * Returns 0 on success.
 */
int tcp_mmap(struct file *file, struct socket *sock,
	     struct vm_area_struct *vma)
{
	if (vma->vm_flags & (VM_WRITE | VM_EXEC))
		return -EPERM;
	vm_flags_clear(vma, VM_MAYWRITE | VM_MAYEXEC);

	/* Instruct vm_insert_page() to not mmap_read_lock(mm) */
	vm_flags_set(vma, VM_MIXEDMAP);

	vma->vm_ops = &tcp_vm_ops;
	return 0;
}
EXPORT_IPV6_MOD(tcp_mmap);

/*
 * skb_advance_to_frag - locate the page frag that holds byte @offset_skb.
 * @skb: skb to search
 * @offset_skb: byte offset from the start of the skb data
 * @offset_frag: receives the offset inside the returned frag
 *
 * Returns NULL when the offset is at or past skb->len, when it falls inside
 * the linear header area, or when the skb has a frag list.
 */
static skb_frag_t *skb_advance_to_frag(struct sk_buff *skb, u32 offset_skb,
				       u32 *offset_frag)
{
	skb_frag_t *frag;

	if (unlikely(offset_skb >= skb->len))
		return NULL;

	offset_skb -= skb_headlen(skb);
	/* A negative result means the offset is still in the linear part. */
	if ((int)offset_skb < 0 || skb_has_frag_list(skb))
		return NULL;

	frag =
skb_shinfo(skb)->frags;
	while (offset_skb) {
		if (skb_frag_size(frag) > offset_skb) {
			*offset_frag = offset_skb;
			return frag;
		}
		offset_skb -= skb_frag_size(frag);
		++frag;
	}
	/* The offset landed exactly on a frag boundary. */
	*offset_frag = 0;
	return frag;
}

/*
 * can_map_frag - true when @frag is exactly one page, starts at offset 0 in
 * that page, and the page is neither compound nor has a mapping set.
 */
static bool can_map_frag(const skb_frag_t *frag)
{
	struct page *page;

	if (skb_frag_size(frag) != PAGE_SIZE || skb_frag_off(frag))
		return false;

	page = skb_frag_page(frag);

	if (PageCompound(page) || page->mapping)
		return false;

	return true;
}

/*
 * find_next_mappable_frag - bytes to skip before the next mappable frag.
 * @frag: first frag to examine
 * @remaining_in_skb: byte budget for the scan
 *
 * Returns 0 when @frag itself is mappable, otherwise the summed size of the
 * unmappable frags walked before a mappable one or the budget was reached.
 */
static int find_next_mappable_frag(const skb_frag_t *frag,
				   int remaining_in_skb)
{
	int offset = 0;

	if (likely(can_map_frag(frag)))
		return 0;

	while (offset < remaining_in_skb && !can_map_frag(frag)) {
		offset += skb_frag_size(frag);
		++frag;
	}
	return offset;
}

/*
 * tcp_zerocopy_set_hint_for_skb - compute zc->recv_skip_hint for @skb.
 * @zc: zerocopy request whose recv_skip_hint field is written
 * @skb: skb that holds the next unread byte
 * @offset: offset of that byte inside @skb
 *
 * @sk is not referenced in the body.
 */
static void tcp_zerocopy_set_hint_for_skb(struct sock *sk,
					  struct tcp_zerocopy_receive *zc,
					  struct sk_buff *skb, u32 offset)
{
	u32 frag_offset, partial_frag_remainder = 0;
	int mappable_offset;
	skb_frag_t *frag;

	/* worst case: skip to next skb. try to improve on this case below */
	zc->recv_skip_hint = skb->len - offset;

	/* Find the frag containing this offset (and how far into that frag) */
	frag = skb_advance_to_frag(skb, offset, &frag_offset);
	if (!frag)
		return;

	if (frag_offset) {
		struct skb_shared_info *info = skb_shinfo(skb);

		/* We read part of the last frag, must recvmsg() rest of skb. */
		if (frag == &info->frags[info->nr_frags - 1])
			return;

		/* Else, we must at least read the remainder in this frag. */
		partial_frag_remainder = skb_frag_size(frag) - frag_offset;
		zc->recv_skip_hint -= partial_frag_remainder;
		++frag;
	}

	/* partial_frag_remainder: If part way through a frag, must read rest.
	 * mappable_offset: Bytes till next mappable frag, *not* counting bytes
	 * in partial_frag_remainder.
*/
	mappable_offset = find_next_mappable_frag(frag, zc->recv_skip_hint);
	zc->recv_skip_hint = mappable_offset + partial_frag_remainder;
}

/* Forward declaration: the definition comes later in this file. */
static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
			      int flags, struct scm_timestamping_internal *tss,
			      int *cmsg_flags);

/*
 * receive_fallback_to_copy - satisfy a zerocopy request with a plain copy.
 * @zc: request; length and recv_skip_hint are reset, copybuf_len is set to
 *	the number of bytes copied
 * @inq: number of bytes to copy into zc->copybuf_address
 *
 * Returns 0 on success, -EINVAL when copybuf_address does not survive the
 * round trip through unsigned long, or the error from import_ubuf() or
 * tcp_recvmsg_locked().
 */
static int receive_fallback_to_copy(struct sock *sk,
				    struct tcp_zerocopy_receive *zc, int inq,
				    struct scm_timestamping_internal *tss)
{
	unsigned long copy_address = (unsigned long)zc->copybuf_address;
	struct msghdr msg = {};
	int err;

	zc->length = 0;
	zc->recv_skip_hint = 0;

	/* Reject an address that was truncated by the cast above. */
	if (copy_address != zc->copybuf_address)
		return -EINVAL;

	err = import_ubuf(ITER_DEST, (void __user *)copy_address, inq,
			  &msg.msg_iter);
	if (err)
		return err;

	err = tcp_recvmsg_locked(sk, &msg, inq, MSG_DONTWAIT,
				 tss, &zc->msg_flags);
	if (err < 0)
		return err;

	zc->copybuf_len = err;
	if (likely(zc->copybuf_len)) {
		struct sk_buff *skb;
		u32 offset;

		/* Refresh the skip hint from whatever is queued next. */
		skb = tcp_recv_skb(sk, tcp_sk(sk)->copied_seq, &offset);
		if (skb)
			tcp_zerocopy_set_hint_for_skb(sk, zc, skb, offset);
	}
	return 0;
}

/*
 * tcp_copy_straggler_data - copy @copylen bytes of @skb into the copy buffer.
 * @offset: in/out offset inside @skb, advanced by @copylen on success
 * @seq: in/out sequence cursor, advanced by @copylen on success
 *
 * On success zc->recv_skip_hint is reduced by @copylen and @copylen is
 * returned. Otherwise a negative errno is returned.
 */
static int tcp_copy_straggler_data(struct tcp_zerocopy_receive *zc,
				   struct sk_buff *skb, u32 copylen,
				   u32 *offset, u32 *seq)
{
	unsigned long copy_address = (unsigned long)zc->copybuf_address;
	struct msghdr msg = {};
	int err;

	if (copy_address != zc->copybuf_address)
		return -EINVAL;

	err = import_ubuf(ITER_DEST, (void __user *)copy_address, copylen,
			  &msg.msg_iter);
	if (err)
		return err;
	err = skb_copy_datagram_msg(skb, *offset, &msg, copylen);
	if (err)
		return err;
	zc->recv_skip_hint -= copylen;
	*offset += copylen;
	*seq += copylen;
	return (__s32)copylen;
}

/*
 * tcp_zc_handle_leftover - copy data that could not be mapped.
 * @skb: skb the caller stopped at, or NULL
 * @seq: in/out sequence cursor
 * @copybuf_len: size of the user copy buffer
 *
 * The amount copied is the smaller of @copybuf_len and zc->recv_skip_hint.
 * Returns the number of bytes copied, or 0 when there is nothing to copy or
 * the copy failed.
 */
static int tcp_zc_handle_leftover(struct tcp_zerocopy_receive *zc,
				  struct sock *sk,
				  struct sk_buff *skb,
				  u32 *seq,
				  s32 copybuf_len,
				  struct scm_timestamping_internal *tss)
{
	u32 offset, copylen = min_t(u32, copybuf_len, zc->recv_skip_hint);

	if (!copylen)
		return 0;
	/* skb is null if inq < PAGE_SIZE.
*/
	if (skb) {
		offset = *seq - TCP_SKB_CB(skb)->seq;
	} else {
		/* NOTE(review): the lookup result is used without a NULL check;
		 * presumably callers guarantee queued data when copylen is
		 * non-zero. Confirm against the caller.
		 */
		skb = tcp_recv_skb(sk, *seq, &offset);
		if (TCP_SKB_CB(skb)->has_rxtstamp) {
			tcp_update_recv_tstamps(skb, tss);
			zc->msg_flags |= TCP_CMSG_TS;
		}
	}

	/* copybuf_len ends up holding either the copied size or a negative errno. */
	zc->copybuf_len = tcp_copy_straggler_data(zc, skb, copylen, &offset,
						  seq);
	return zc->copybuf_len < 0 ? 0 : copylen;
}

/*
 * tcp_zerocopy_vm_insert_batch_error - recover from a failed batch insert.
 * @pending_pages: pages that were not inserted by the failed attempt
 * @pages_remaining: number of entries in @pending_pages
 * @address: in/out user address cursor
 * @length: in/out count of bytes treated as mapped or pending
 * @seq: in/out sequence cursor
 * @err: error reported by the failed insert
 *
 * For -EBUSY with TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT set, the range is
 * zapped and the insert is retried once. Whatever still fails is subtracted
 * from *length and added back to zc->recv_skip_hint.
 *
 * Returns 0 when the retry mapped everything, else the remaining error.
 */
static int tcp_zerocopy_vm_insert_batch_error(struct vm_area_struct *vma,
					      struct page **pending_pages,
					      unsigned long pages_remaining,
					      unsigned long *address,
					      u32 *length,
					      u32 *seq,
					      struct tcp_zerocopy_receive *zc,
					      u32 total_bytes_to_map,
					      int err)
{
	/* At least one page did not map. Try zapping if we skipped earlier. */
	if (err == -EBUSY &&
	    zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT) {
		u32 maybe_zap_len;

		maybe_zap_len = total_bytes_to_map -  /* All bytes to map */
				*length + /* Mapped or pending */
				(pages_remaining * PAGE_SIZE); /* Failed map. */
		zap_page_range_single(vma, *address, maybe_zap_len, NULL);
		err = 0;
	}

	if (!err) {
		unsigned long leftover_pages = pages_remaining;
		int bytes_mapped;

		/* We called zap_page_range_single, try to reinsert. */
		err = vm_insert_pages(vma, *address,
				      pending_pages,
				      &pages_remaining);
		bytes_mapped = PAGE_SIZE * (leftover_pages - pages_remaining);
		*seq += bytes_mapped;
		*address += bytes_mapped;
	}
	if (err) {
		/* Either we were unable to zap, OR we zapped, retried an
		 * insert, and still had an issue. Either ways, pages_remaining
		 * is the number of pages we were unable to map, and we unroll
		 * some state we speculatively touched before.
*/
		const int bytes_not_mapped = PAGE_SIZE * pages_remaining;

		*length -= bytes_not_mapped;
		zc->recv_skip_hint += bytes_not_mapped;
	}
	return err;
}

/*
 * tcp_zerocopy_vm_insert_batch - insert a batch of pages into @vma.
 * @pages: pages to insert
 * @pages_to_map: number of entries in @pages
 * @address: in/out user address cursor, advanced by the bytes mapped
 * @length: in/out byte count, only adjusted on the error path
 * @seq: in/out sequence cursor, advanced by the bytes mapped
 *
 * Returns 0 on success. On failure the result of
 * tcp_zerocopy_vm_insert_batch_error() is returned.
 */
static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma,
					struct page **pages,
					unsigned int pages_to_map,
					unsigned long *address,
					u32 *length,
					u32 *seq,
					struct tcp_zerocopy_receive *zc,
					u32 total_bytes_to_map)
{
	unsigned long pages_remaining = pages_to_map;
	unsigned int pages_mapped;
	unsigned int bytes_mapped;
	int err;

	err = vm_insert_pages(vma, *address, pages, &pages_remaining);
	pages_mapped = pages_to_map - (unsigned int)pages_remaining;
	bytes_mapped = PAGE_SIZE * pages_mapped;
	/* Even if vm_insert_pages fails, it may have partially succeeded in
	 * mapping (some but not all of the pages).
	 */
	*seq += bytes_mapped;
	*address += bytes_mapped;

	if (likely(!err))
		return 0;

	/* Error: maybe zap and retry + rollback state for failed inserts. */
	return tcp_zerocopy_vm_insert_batch_error(vma, pages + pages_mapped,
		pages_remaining, address, length, seq, zc, total_bytes_to_map,
		err);
}

#define TCP_VALID_ZC_MSG_FLAGS   (TCP_CMSG_TS)
/*
 * tcp_zc_finalize_rx_tstamp - emit receive timestamps for a zerocopy call.
 *
 * Builds a temporary msghdr from zc->msg_control and zc->msg_controllen,
 * lets tcp_recv_timestamp() write into it, and copies the resulting control
 * pointer, length and flags back into @zc. zc->msg_flags is left at 0 when
 * either value does not survive the conversion to the msghdr field types.
 */
static void tcp_zc_finalize_rx_tstamp(struct sock *sk,
				      struct tcp_zerocopy_receive *zc,
				      struct scm_timestamping_internal *tss)
{
	unsigned long msg_control_addr;
	struct msghdr cmsg_dummy;

	msg_control_addr = (unsigned long)zc->msg_control;
	cmsg_dummy.msg_control_user = (void __user *)msg_control_addr;
	cmsg_dummy.msg_controllen =
		(__kernel_size_t)zc->msg_controllen;
	cmsg_dummy.msg_flags = in_compat_syscall()
		?
MSG_CMSG_COMPAT : 0;
	cmsg_dummy.msg_control_is_user = true;
	zc->msg_flags = 0;
	/* Proceed only if both values survived the narrowing casts. */
	if (zc->msg_control == msg_control_addr &&
	    zc->msg_controllen == cmsg_dummy.msg_controllen) {
		tcp_recv_timestamp(&cmsg_dummy, sk, tss);
		zc->msg_control = (__u64)
			((uintptr_t)cmsg_dummy.msg_control_user);
		zc->msg_controllen =
			(__u64)cmsg_dummy.msg_controllen;
		zc->msg_flags = (__u32)cmsg_dummy.msg_flags;
	}
}

/*
 * find_tcp_vma - look up the vma at @address and check it uses tcp_vm_ops.
 * @mm: address space to search
 * @address: user address inside the wanted vma
 * @mmap_locked: set to false when the vma came from lock_vma_under_rcu(),
 *	and to true when it was found under mmap_read_lock()
 *
 * Returns the vma with the corresponding lock still held, or NULL with no
 * lock held. *mmap_locked is only written on success.
 */
static struct vm_area_struct *find_tcp_vma(struct mm_struct *mm,
					   unsigned long address,
					   bool *mmap_locked)
{
	struct vm_area_struct *vma = lock_vma_under_rcu(mm, address);

	if (vma) {
		if (vma->vm_ops != &tcp_vm_ops) {
			vma_end_read(vma);
			return NULL;
		}
		*mmap_locked = false;
		return vma;
	}

	/* Fall back to a lookup under the mmap read lock. */
	mmap_read_lock(mm);
	vma = vma_lookup(mm, address);
	if (!vma || vma->vm_ops != &tcp_vm_ops) {
		mmap_read_unlock(mm);
		return NULL;
	}
	*mmap_locked = true;
	return vma;
}

#define TCP_ZEROCOPY_PAGE_BATCH_SIZE 32
/*
 * tcp_zerocopy_receive - map queued receive pages into a user vma.
 * @sk: socket to read from
 * @zc: request and result block: address, length, copybuf_len,
 *	recv_skip_hint, msg_flags and flags are read and/or written
 * @tss: receive timestamps are accumulated here
 *
 * Returns 0 on success or a negative errno: -EINVAL for an unaligned or
 * truncated address or a missing vma, -ENOTCONN for a listening socket,
 * -EIO when nothing is readable and SOCK_DONE is set, or an error from the
 * page insertion helpers.
 */
static int tcp_zerocopy_receive(struct sock *sk,
				struct tcp_zerocopy_receive *zc,
				struct scm_timestamping_internal *tss)
{
	u32 length = 0, offset, vma_len, avail_len, copylen = 0;
	unsigned long address = (unsigned long)zc->address;
	struct page *pages[TCP_ZEROCOPY_PAGE_BATCH_SIZE];
	s32 copybuf_len = zc->copybuf_len;
	struct tcp_sock *tp = tcp_sk(sk);
	const skb_frag_t *frags = NULL;
	unsigned int pages_to_map = 0;
	struct vm_area_struct *vma;
	struct sk_buff *skb = NULL;
	u32 seq = tp->copied_seq;
	u32 total_bytes_to_map;
	int inq = tcp_inq(sk);
	bool mmap_locked;
	int ret;

	zc->copybuf_len = 0;
	zc->msg_flags = 0;

	/* The address must be page aligned and must fit in unsigned long. */
	if (address & (PAGE_SIZE - 1) || address != zc->address)
		return -EINVAL;

	if (sk->sk_state == TCP_LISTEN)
		return -ENOTCONN;

	sock_rps_record_flow(sk);

	/* Everything queued fits in the copy buffer: copy instead of mapping. */
	if (inq && inq <= copybuf_len)
		return receive_fallback_to_copy(sk, zc, inq, tss);

	/* Less than one page queued: nothing to map, report it as a skip hint. */
	if (inq < PAGE_SIZE) {
		zc->length = 0;
		zc->recv_skip_hint = inq;
		if (!inq && sock_flag(sk, SOCK_DONE))
			return -EIO;
		return 0;
	}

	vma = find_tcp_vma(current->mm, address, &mmap_locked);
	if (!vma)
		return -EINVAL;

	vma_len = min_t(unsigned long, zc->length, vma->vm_end - address);
avail_len = min_t(u32, vma_len, inq);
	/* Round the mappable amount down to a whole number of pages. */
	total_bytes_to_map = avail_len & ~(PAGE_SIZE - 1);
	if (total_bytes_to_map) {
		if (!(zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT))
			zap_page_range_single(vma, address, total_bytes_to_map,
					      NULL);
		zc->length = total_bytes_to_map;
		zc->recv_skip_hint = 0;
	} else {
		zc->length = avail_len;
		zc->recv_skip_hint = avail_len;
	}
	ret = 0;
	/* Collect one page per iteration; insert in batches. */
	while (length + PAGE_SIZE <= zc->length) {
		int mappable_offset;
		struct page *page;

		/* Less than a page left in the current skb: move to the next one. */
		if (zc->recv_skip_hint < PAGE_SIZE) {
			u32 offset_frag;

			if (skb) {
				if (zc->recv_skip_hint > 0)
					break;
				skb = skb->next;
				offset = seq - TCP_SKB_CB(skb)->seq;
			} else {
				skb = tcp_recv_skb(sk, seq, &offset);
			}

			if (!skb_frags_readable(skb))
				break;

			if (TCP_SKB_CB(skb)->has_rxtstamp) {
				tcp_update_recv_tstamps(skb, tss);
				zc->msg_flags |= TCP_CMSG_TS;
			}
			zc->recv_skip_hint = skb->len - offset;
			frags = skb_advance_to_frag(skb, offset, &offset_frag);
			/* Stop unless the cursor sits exactly at the start of a frag. */
			if (!frags || offset_frag)
				break;
		}

		mappable_offset = find_next_mappable_frag(frags,
							  zc->recv_skip_hint);
		if (mappable_offset) {
			zc->recv_skip_hint = mappable_offset;
			break;
		}
		page = skb_frag_page(frags);
		if (WARN_ON_ONCE(!page))
			break;

		prefetchw(page);
		pages[pages_to_map++] = page;
		length += PAGE_SIZE;
		zc->recv_skip_hint -= PAGE_SIZE;
		frags++;
		if (pages_to_map == TCP_ZEROCOPY_PAGE_BATCH_SIZE ||
		    zc->recv_skip_hint < PAGE_SIZE) {
			/* Either full batch, or we're about to go to next skb
			 * (and we cannot unroll failed ops across skbs).
			 */
			ret = tcp_zerocopy_vm_insert_batch(vma, pages,
							   pages_to_map,
							   &address, &length,
							   &seq, zc,
							   total_bytes_to_map);
			if (ret)
				goto out;
			pages_to_map = 0;
		}
	}
	/* Flush a final partial batch. */
	if (pages_to_map) {
		ret = tcp_zerocopy_vm_insert_batch(vma, pages, pages_to_map,
						   &address, &length, &seq,
						   zc, total_bytes_to_map);
	}
out:
	/* Drop whichever lock find_tcp_vma() reported through mmap_locked. */
	if (mmap_locked)
		mmap_read_unlock(current->mm);
	else
		vma_end_read(vma);
	/* Try to copy straggler data.
*/
	if (!ret)
		copylen = tcp_zc_handle_leftover(zc, sk, skb, &seq, copybuf_len, tss);

	if (length + copylen) {
		/* Something was mapped or copied: publish progress and report success. */
		WRITE_ONCE(tp->copied_seq, seq);
		tcp_rcv_space_adjust(sk);

		/* Clean up data we have read: This will do ACK frames. */
		tcp_recv_skb(sk, seq, &offset);
		tcp_cleanup_rbuf(sk, length + copylen);
		ret = 0;
		if (length == zc->length)
			zc->recv_skip_hint = 0;
	} else {
		if (!zc->recv_skip_hint && sock_flag(sk, SOCK_DONE))
			ret = -EIO;
	}
	zc->length = length;
	return ret;
}
#endif

/* Similar to __sock_recv_timestamp, but does not require an skb
 *
 * @msg: message that receives the control messages
 * @sk: socket whose flags and sk_tsflags select what is reported
 * @tss: timestamps to report; ts[0] or ts[2] is zeroed when the matching
 *	timestamping flags are not enabled, and ts[1] is zeroed before the
 *	combined timestamping message is emitted
 */
void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
			struct scm_timestamping_internal *tss)
{
	int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW);
	u32 tsflags = READ_ONCE(sk->sk_tsflags);
	bool has_timestamping = false;

	if (tss->ts[0].tv_sec || tss->ts[0].tv_nsec) {
		/* Pick one of four cmsg layouts: ns or us resolution, new or old type. */
		if (sock_flag(sk, SOCK_RCVTSTAMP)) {
			if (sock_flag(sk, SOCK_RCVTSTAMPNS)) {
				if (new_tstamp) {
					struct __kernel_timespec kts = {
						.tv_sec = tss->ts[0].tv_sec,
						.tv_nsec = tss->ts[0].tv_nsec,
					};
					put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_NEW,
						 sizeof(kts), &kts);
				} else {
					struct __kernel_old_timespec ts_old = {
						.tv_sec = tss->ts[0].tv_sec,
						.tv_nsec = tss->ts[0].tv_nsec,
					};
					put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_OLD,
						 sizeof(ts_old), &ts_old);
				}
			} else {
				if (new_tstamp) {
					struct __kernel_sock_timeval stv = {
						.tv_sec = tss->ts[0].tv_sec,
						.tv_usec = tss->ts[0].tv_nsec / 1000,
					};
					put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW,
						 sizeof(stv), &stv);
				} else {
					struct __kernel_old_timeval tv = {
						.tv_sec = tss->ts[0].tv_sec,
						.tv_usec = tss->ts[0].tv_nsec / 1000,
					};
					put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD,
						 sizeof(tv), &tv);
				}
			}
		}

		if (tsflags & SOF_TIMESTAMPING_SOFTWARE &&
		    (tsflags & SOF_TIMESTAMPING_RX_SOFTWARE ||
		     !(tsflags & SOF_TIMESTAMPING_OPT_RX_FILTER)))
			has_timestamping = true;
		else
			tss->ts[0] = (struct timespec64) {0};
	}

	if (tss->ts[2].tv_sec || tss->ts[2].tv_nsec) {
		if (tsflags & SOF_TIMESTAMPING_RAW_HARDWARE &&
		    (tsflags & SOF_TIMESTAMPING_RX_HARDWARE ||
		     !(tsflags &
SOF_TIMESTAMPING_OPT_RX_FILTER)))
			has_timestamping = true;
		else
			tss->ts[2] = (struct timespec64) {0};
	}

	if (has_timestamping) {
		tss->ts[1] = (struct timespec64) {0};
		if (sock_flag(sk, SOCK_TSTAMP_NEW))
			put_cmsg_scm_timestamping64(msg, tss);
		else
			put_cmsg_scm_timestamping(msg, tss);
	}
}

/*
 * tcp_inq_hint - number of unread bytes, computed as rcv_nxt - copied_seq.
 *
 * The first attempt reads both fields without the socket lock. If the
 * result is negative or copied_seq changed in the meantime, the value is
 * recomputed under lock_sock().
 */
static int tcp_inq_hint(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	u32 copied_seq = READ_ONCE(tp->copied_seq);
	u32 rcv_nxt = READ_ONCE(tp->rcv_nxt);
	int inq;

	inq = rcv_nxt - copied_seq;
	if (unlikely(inq < 0 || copied_seq != READ_ONCE(tp->copied_seq))) {
		lock_sock(sk);
		inq = tp->rcv_nxt - tp->copied_seq;
		release_sock(sk);
	}
	/* After receiving a FIN, tell the user-space to continue reading
	 * by returning a non-zero inq.
	 */
	if (inq == 0 && sock_flag(sk, SOCK_DONE))
		inq = 1;
	return inq;
}

/* batch __xa_alloc() calls and reduce xa_lock()/xa_unlock() overhead. */
struct tcp_xa_pool {
	u8		max; /* max <= MAX_SKB_FRAGS */
	u8		idx; /* idx <= max */
	__u32		tokens[MAX_SKB_FRAGS];
	netmem_ref	netmems[MAX_SKB_FRAGS];
};

/*
 * tcp_xa_pool_commit_locked - publish used tokens and release unused ones.
 *
 * Entries below p->idx are exchanged from XA_ZERO_ENTRY to their netmem in
 * sk->sk_user_frags; entries from p->idx up to p->max are erased. The pool
 * is then reset to empty. The double-underscore xarray calls are used, and
 * the callers in this file hold xa_lock_bh() around the call.
 */
static void tcp_xa_pool_commit_locked(struct sock *sk, struct tcp_xa_pool *p)
{
	int i;

	/* Commit part that has been copied to user space. */
	for (i = 0; i < p->idx; i++)
		__xa_cmpxchg(&sk->sk_user_frags, p->tokens[i], XA_ZERO_ENTRY,
			     (__force void *)p->netmems[i], GFP_KERNEL);
	/* Rollback what has been pre-allocated and is no longer needed.
*/
	for (; i < p->max; i++)
		__xa_erase(&sk->sk_user_frags, p->tokens[i]);

	p->max = 0;
	p->idx = 0;
}

/*
 * tcp_xa_pool_commit - locked wrapper around tcp_xa_pool_commit_locked().
 * Does nothing when the pool holds no tokens (p->max == 0).
 */
static void tcp_xa_pool_commit(struct sock *sk, struct tcp_xa_pool *p)
{
	if (!p->max)
		return;

	xa_lock_bh(&sk->sk_user_frags);

	tcp_xa_pool_commit_locked(sk, p);

	xa_unlock_bh(&sk->sk_user_frags);
}

/*
 * tcp_xa_pool_refill - make sure the pool has at least one unused token.
 * @max_frags: upper bound on the number of tokens to pre-allocate
 *
 * Returns 0 immediately while unused tokens remain. Otherwise the pool is
 * committed and up to @max_frags new tokens are allocated under one lock
 * hold. Returns 0 when at least one token was allocated, else the
 * __xa_alloc() error.
 */
static int tcp_xa_pool_refill(struct sock *sk, struct tcp_xa_pool *p,
			      unsigned int max_frags)
{
	int err, k;

	if (p->idx < p->max)
		return 0;

	xa_lock_bh(&sk->sk_user_frags);

	tcp_xa_pool_commit_locked(sk, p);

	for (k = 0; k < max_frags; k++) {
		err = __xa_alloc(&sk->sk_user_frags, &p->tokens[k],
				 XA_ZERO_ENTRY, xa_limit_31b, GFP_KERNEL);
		if (err)
			break;
	}

	xa_unlock_bh(&sk->sk_user_frags);

	p->max = k;
	p->idx = 0;
	/* A partial refill still counts as success. */
	return k ? 0 : err;
}

/* On error, returns the -errno. On success, returns number of bytes sent to the
 * user. May not consume all of @remaining_len.
 *
 * @skb: first skb to deliver; readable skbs are rejected with -ENODEV
 * @offset: byte offset inside @skb at which delivery starts
 * @msg: linear header bytes are copied to msg->msg_iter and every piece is
 *	described by a control message
 */
static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb,
			      unsigned int offset, struct msghdr *msg,
			      int remaining_len)
{
	struct dmabuf_cmsg dmabuf_cmsg = { 0 };
	struct tcp_xa_pool tcp_xa_pool;
	unsigned int start;
	int i, copy, n;
	int sent = 0;
	int err = 0;

	tcp_xa_pool.max = 0;
	tcp_xa_pool.idx = 0;
	do {
		start = skb_headlen(skb);

		/* This path only handles skbs whose frags are not readable. */
		if (skb_frags_readable(skb)) {
			err = -ENODEV;
			goto out;
		}

		/* Copy header. */
		copy = start - offset;
		if (copy > 0) {
			copy = min(copy, remaining_len);

			n = copy_to_iter(skb->data + offset, copy,
					 &msg->msg_iter);
			if (n != copy) {
				err = -EFAULT;
				goto out;
			}

			offset += copy;
			remaining_len -= copy;

			/* First a dmabuf_cmsg for # bytes copied to user
			 * buffer.
*/
			memset(&dmabuf_cmsg, 0, sizeof(dmabuf_cmsg));
			dmabuf_cmsg.frag_size = copy;
			err = put_cmsg_notrunc(msg, SOL_SOCKET,
					       SO_DEVMEM_LINEAR,
					       sizeof(dmabuf_cmsg),
					       &dmabuf_cmsg);
			if (err)
				goto out;

			sent += copy;

			if (remaining_len == 0)
				goto out;
		}

		/* after that, send information of dmabuf pages through a
		 * sequence of cmsg
		 */
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
			struct net_iov *niov;
			u64 frag_offset;
			int end;

			/* !skb_frags_readable() should indicate that ALL the
			 * frags in this skb are dmabuf net_iovs. We're checking
			 * for that flag above, but also check individual frags
			 * here. If the tcp stack is not setting
			 * skb_frags_readable() correctly, we still don't want
			 * to crash here.
			 */
			if (!skb_frag_net_iov(frag)) {
				net_err_ratelimited("Found non-dmabuf skb with net_iov");
				err = -ENODEV;
				goto out;
			}

			niov = skb_frag_net_iov(frag);
			if (!net_is_devmem_iov(niov)) {
				err = -ENODEV;
				goto out;
			}

			end = start + skb_frag_size(frag);
			/* Bytes of this frag that lie at or after the cursor. */
			copy = end - offset;

			if (copy > 0) {
				copy = min(copy, remaining_len);

				frag_offset = net_iov_virtual_addr(niov) +
					      skb_frag_off(frag) + offset -
					      start;
				dmabuf_cmsg.frag_offset = frag_offset;
				dmabuf_cmsg.frag_size = copy;
				/* Ask for at most one token per frag still to visit. */
				err = tcp_xa_pool_refill(sk, &tcp_xa_pool,
							 skb_shinfo(skb)->nr_frags - i);
				if (err)
					goto out;

				/* Will perform the exchange later */
				dmabuf_cmsg.frag_token = tcp_xa_pool.tokens[tcp_xa_pool.idx];
				dmabuf_cmsg.dmabuf_id = net_devmem_iov_binding_id(niov);

				offset += copy;
				remaining_len -= copy;

				err = put_cmsg_notrunc(msg, SOL_SOCKET,
						       SO_DEVMEM_DMABUF,
						       sizeof(dmabuf_cmsg),
						       &dmabuf_cmsg);
				if (err)
					goto out;

				/* The reference is taken and the token marked used only
				 * after the cmsg was written successfully.
				 */
				atomic_long_inc(&niov->desc.pp_ref_count);
				tcp_xa_pool.netmems[tcp_xa_pool.idx++] = skb_frag_netmem(frag);

				sent += copy;

				if (remaining_len == 0)
					goto out;
			}
			start = end;
		}

		tcp_xa_pool_commit(sk, &tcp_xa_pool);
		if (!remaining_len)
			goto out;

		/* if remaining_len is not satisfied yet, we need to go to the
		 * next frag in the frag_list to satisfy remaining_len.
*/
		skb = skb_shinfo(skb)->frag_list ?: skb->next;

		/* Make the cursor relative to the start of the next skb. */
		offset = offset - start;
	} while (skb);

	if (remaining_len) {
		err = -EFAULT;
		goto out;
	}

out:
	tcp_xa_pool_commit(sk, &tcp_xa_pool);
	/* Report the error only when nothing was delivered. */
	if (!sent)
		sent = err;

	return sent;
}

/*
 *	This routine copies from a sock struct into the user buffer.
 *
 *	Technical note: in 2.3 we work on _locked_ socket, so that
 *	tricks with *seq access order and skb->users are not required.
 *	Probably, code can be easily improved even more.
 *
 *	@tss receives the timestamps of consumed skbs and *@cmsg_flags gets
 *	TCP_CMSG_INQ and/or TCP_CMSG_TS set so that the caller knows which
 *	control messages to emit.
 *
 *	Returns the number of bytes consumed or a negative errno.
 */

static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
			      int flags, struct scm_timestamping_internal *tss,
			      int *cmsg_flags)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int last_copied_dmabuf = -1; /* uninitialized */
	int copied = 0;
	u32 peek_seq;
	u32 *seq;
	unsigned long used;
	int err;
	int target;		/* Read at least this many bytes */
	long timeo;
	struct sk_buff *skb, *last;
	u32 peek_offset = 0;
	u32 urg_hole = 0;

	err = -ENOTCONN;
	if (sk->sk_state == TCP_LISTEN)
		goto out;

	if (tp->recvmsg_inq) {
		*cmsg_flags = TCP_CMSG_INQ;
		msg->msg_get_inq = 1;
	}
	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);

	/* Urgent data needs to be handled specially. */
	if (flags & MSG_OOB)
		goto recv_urg;

	/* In repair mode only MSG_PEEK reads are allowed. */
	if (unlikely(tp->repair)) {
		err = -EPERM;
		if (!(flags & MSG_PEEK))
			goto out;

		if (tp->repair_queue == TCP_SEND_QUEUE)
			goto recv_sndq;

		err = -EINVAL;
		if (tp->repair_queue == TCP_NO_QUEUE)
			goto out;

		/* 'common' recv queue MSG_PEEK-ing */
	}

	seq = &tp->copied_seq;
	/* When peeking, advance a private cursor instead of copied_seq. */
	if (flags & MSG_PEEK) {
		peek_offset = max(sk_peek_offset(sk, flags), 0);
		peek_seq = tp->copied_seq + peek_offset;
		seq = &peek_seq;
	}

	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);

	do {
		u32 offset;

		/* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
		if (unlikely(tp->urg_data) && tp->urg_seq == *seq) {
			if (copied)
				break;
			if (signal_pending(current)) {
				copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
				break;
			}
		}

		/* Next get a buffer.
*/

		last = skb_peek_tail(&sk->sk_receive_queue);
		skb_queue_walk(&sk->sk_receive_queue, skb) {
			last = skb;
			/* Now that we have two receive queues this
			 * shouldn't happen.
			 */
			if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
				 "TCP recvmsg seq # bug: copied %X, seq %X, rcvnxt %X, fl %X\n",
				 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
				 flags))
				break;

			offset = *seq - TCP_SKB_CB(skb)->seq;
			if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
				pr_err_once("%s: found a SYN, please report !\n", __func__);
				offset--;
			}
			if (offset < skb->len)
				goto found_ok_skb;
			if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
				goto found_fin_ok;
			WARN(!(flags & MSG_PEEK),
			     "TCP recvmsg seq # bug 2: copied %X, seq %X, rcvnxt %X, fl %X\n",
			     *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
		}

		/* Well, if we have backlog, try to process it now yet. */

		if (copied >= target && !READ_ONCE(sk->sk_backlog.tail))
			break;

		/* No usable skb was found. Decide whether to stop or wait. */
		if (copied) {
			if (!timeo ||
			    sk->sk_err ||
			    sk->sk_state == TCP_CLOSE ||
			    (sk->sk_shutdown & RCV_SHUTDOWN) ||
			    signal_pending(current))
				break;
		} else {
			if (sock_flag(sk, SOCK_DONE))
				break;

			if (sk->sk_err) {
				copied = sock_error(sk);
				break;
			}

			if (sk->sk_shutdown & RCV_SHUTDOWN)
				break;

			if (sk->sk_state == TCP_CLOSE) {
				/* This occurs when user tries to read
				 * from never connected socket.
				 */
				copied = -ENOTCONN;
				break;
			}

			if (!timeo) {
				copied = -EAGAIN;
				break;
			}

			if (signal_pending(current)) {
				copied = sock_intr_errno(timeo);
				break;
			}
		}

		if (copied >= target) {
			/* Do not sleep, just process backlog. */
			__sk_flush_backlog(sk);
		} else {
			tcp_cleanup_rbuf(sk, copied);
			err = sk_wait_data(sk, &timeo, last);
			if (err < 0) {
				/* Prefer the byte count over the wait error. */
				err = copied ? : err;
				goto out;
			}
		}

		/* A peek cursor that no longer lines up with copied_seq is reset. */
		if ((flags & MSG_PEEK) &&
		    (peek_seq - peek_offset - copied - urg_hole != tp->copied_seq)) {
			net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n",
					    current->comm,
					    task_pid_nr(current));
			peek_seq = tp->copied_seq + peek_offset;
		}
		continue;

found_ok_skb:
		/* Ok so how much can we use?
*/
		used = skb->len - offset;
		if (len < used)
			used = len;

		/* Do we have urgent data here? */
		if (unlikely(tp->urg_data)) {
			u32 urg_offset = tp->urg_seq - *seq;

			if (urg_offset < used) {
				if (!urg_offset) {
					/* Without SOCK_URGINLINE the urgent byte is skipped. */
					if (!sock_flag(sk, SOCK_URGINLINE)) {
						WRITE_ONCE(*seq, *seq + 1);
						urg_hole++;
						offset++;
						used--;
						if (!used)
							goto skip_copy;
					}
				} else
					used = urg_offset;
			}
		}

		if (!(flags & MSG_TRUNC)) {
			/* Do not mix readable and unreadable skbs in one call. */
			if (last_copied_dmabuf != -1 &&
			    last_copied_dmabuf != !skb_frags_readable(skb))
				break;

			if (skb_frags_readable(skb)) {
				err = skb_copy_datagram_msg(skb, offset, msg,
							    used);
				if (err) {
					/* Exception. Bailout! */
					if (!copied)
						copied = -EFAULT;
					break;
				}
			} else {
				if (!(flags & MSG_SOCK_DEVMEM)) {
					/* dmabuf skbs can only be received
					 * with the MSG_SOCK_DEVMEM flag.
					 */
					if (!copied)
						copied = -EFAULT;

					break;
				}

				err = tcp_recvmsg_dmabuf(sk, skb, offset, msg,
							 used);
				if (err < 0) {
					if (!copied)
						copied = err;

					break;
				}
				/* The dmabuf helper may deliver fewer bytes than asked. */
				used = err;
			}
		}

		last_copied_dmabuf = !skb_frags_readable(skb);

		WRITE_ONCE(*seq, *seq + used);
		copied += used;
		len -= used;
		if (flags & MSG_PEEK)
			sk_peek_offset_fwd(sk, used);
		else
			sk_peek_offset_bwd(sk, used);
		tcp_rcv_space_adjust(sk);

skip_copy:
		/* Once copied_seq is past the urgent byte, clear urg_data. */
		if (unlikely(tp->urg_data) && after(tp->copied_seq, tp->urg_seq)) {
			WRITE_ONCE(tp->urg_data, 0);
			tcp_fast_path_check(sk);
		}

		if (TCP_SKB_CB(skb)->has_rxtstamp) {
			tcp_update_recv_tstamps(skb, tss);
			*cmsg_flags |= TCP_CMSG_TS;
		}

		/* The skb still has unread bytes: keep it queued. */
		if (used + offset < skb->len)
			continue;

		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
			goto found_fin_ok;
		if (!(flags & MSG_PEEK))
			tcp_eat_recv_skb(sk, skb);
		continue;

found_fin_ok:
		/* Process the FIN. */
		WRITE_ONCE(*seq, *seq + 1);
		if (!(flags & MSG_PEEK))
			tcp_eat_recv_skb(sk, skb);
		break;
	} while (len > 0);

	/* According to UNIX98, msg_name/msg_namelen are ignored
	 * on connected socket. I was just happy when found this 8) --ANK
	 */

	/* Clean up data we have read: This will do ACK frames.
*/
	tcp_cleanup_rbuf(sk, copied);
	return copied;

out:
	return err;

recv_urg:
	err = tcp_recv_urg(sk, msg, len, flags);
	goto out;

recv_sndq:
	err = tcp_peek_sndq(sk, msg, len);
	goto out;
}

/*
 * tcp_recvmsg - receive entry point; takes the socket lock around
 * tcp_recvmsg_locked() and then emits the requested control messages.
 *
 * MSG_ERRQUEUE requests are passed to inet_recv_error(). For an established
 * socket with an empty receive queue, sk_busy_loop() may be tried before the
 * lock is taken. Timestamp and inq control messages are only written when
 * the locked helper did not fail.
 *
 * Returns what tcp_recvmsg_locked() or inet_recv_error() returned.
 */
int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
		int *addr_len)
{
	int cmsg_flags = 0, ret;
	struct scm_timestamping_internal tss;

	if (unlikely(flags & MSG_ERRQUEUE))
		return inet_recv_error(sk, msg, len, addr_len);

	if (sk_can_busy_loop(sk) &&
	    skb_queue_empty_lockless(&sk->sk_receive_queue) &&
	    sk->sk_state == TCP_ESTABLISHED)
		sk_busy_loop(sk, flags & MSG_DONTWAIT);

	lock_sock(sk);
	ret = tcp_recvmsg_locked(sk, msg, len, flags, &tss, &cmsg_flags);
	release_sock(sk);

	if ((cmsg_flags || msg->msg_get_inq) && ret >= 0) {
		if (cmsg_flags & TCP_CMSG_TS)
			tcp_recv_timestamp(msg, sk, &tss);
		if (msg->msg_get_inq) {
			msg->msg_inq = tcp_inq_hint(sk);
			if (cmsg_flags & TCP_CMSG_INQ)
				put_cmsg(msg, SOL_TCP, TCP_CM_INQ,
					 sizeof(msg->msg_inq), &msg->msg_inq);
		}
	}
	return ret;
}
EXPORT_IPV6_MOD(tcp_recvmsg);

/*
 * tcp_set_state - move @sk to @state and do the associated bookkeeping.
 *
 * Runs the BPF state callback when it is enabled on the socket, adjusts the
 * CURRESTAB and ESTABRESETS MIB counters based on the old and new state,
 * unhashes the socket (and may release its port) when entering TCP_CLOSE,
 * and stores the new state last.
 */
void tcp_set_state(struct sock *sk, int state)
{
	int oldstate = sk->sk_state;

	/* We defined a new enum for TCP states that are exported in BPF
	 * so as not force the internal TCP states to be frozen. The
	 * following checks will detect if an internal state value ever
	 * differs from the BPF value. If this ever happens, then we will
	 * need to remap the internal value to the BPF value before calling
	 * tcp_call_bpf_2arg.
*/
	/* One compile-time check per state value. */
	BUILD_BUG_ON((int)BPF_TCP_ESTABLISHED != (int)TCP_ESTABLISHED);
	BUILD_BUG_ON((int)BPF_TCP_SYN_SENT != (int)TCP_SYN_SENT);
	BUILD_BUG_ON((int)BPF_TCP_SYN_RECV != (int)TCP_SYN_RECV);
	BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT1 != (int)TCP_FIN_WAIT1);
	BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT2 != (int)TCP_FIN_WAIT2);
	BUILD_BUG_ON((int)BPF_TCP_TIME_WAIT != (int)TCP_TIME_WAIT);
	BUILD_BUG_ON((int)BPF_TCP_CLOSE != (int)TCP_CLOSE);
	BUILD_BUG_ON((int)BPF_TCP_CLOSE_WAIT != (int)TCP_CLOSE_WAIT);
	BUILD_BUG_ON((int)BPF_TCP_LAST_ACK != (int)TCP_LAST_ACK);
	BUILD_BUG_ON((int)BPF_TCP_LISTEN != (int)TCP_LISTEN);
	BUILD_BUG_ON((int)BPF_TCP_CLOSING != (int)TCP_CLOSING);
	BUILD_BUG_ON((int)BPF_TCP_NEW_SYN_RECV != (int)TCP_NEW_SYN_RECV);
	BUILD_BUG_ON((int)BPF_TCP_BOUND_INACTIVE != (int)TCP_BOUND_INACTIVE);
	BUILD_BUG_ON((int)BPF_TCP_MAX_STATES != (int)TCP_MAX_STATES);

	/* bpf uapi header bpf.h defines an anonymous enum with values
	 * BPF_TCP_* used by bpf programs. Currently gcc built vmlinux
	 * is able to emit this enum in DWARF due to the above BUILD_BUG_ON.
	 * But clang built vmlinux does not have this enum in DWARF
	 * since clang removes the above code before generating IR/debuginfo.
	 * Let us explicitly emit the type debuginfo to ensure the
	 * above-mentioned anonymous enum in the vmlinux DWARF and hence BTF
	 * regardless of which compiler is used.
*/
	BTF_TYPE_EMIT_ENUM(BPF_TCP_ESTABLISHED);

	if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_STATE_CB_FLAG))
		tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_STATE_CB, oldstate, state);

	switch (state) {
	case TCP_ESTABLISHED:
		if (oldstate != TCP_ESTABLISHED)
			TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
		break;
	case TCP_CLOSE_WAIT:
		if (oldstate == TCP_SYN_RECV)
			TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
		break;

	case TCP_CLOSE:
		if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED)
			TCP_INC_STATS(sock_net(sk), TCP_MIB_ESTABRESETS);

		sk->sk_prot->unhash(sk);
		if (inet_csk(sk)->icsk_bind_hash &&
		    !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
			inet_put_port(sk);
		fallthrough;
	default:
		/* Leaving ESTABLISHED or CLOSE_WAIT for any other state. */
		if (oldstate == TCP_ESTABLISHED || oldstate == TCP_CLOSE_WAIT)
			TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
	}

	/* Change state AFTER socket is unhashed to avoid closed
	 * socket sitting in hash tables.
	 */
	inet_sk_state_store(sk, state);
}
EXPORT_SYMBOL_GPL(tcp_set_state);

/*
 *	State processing on a close. This implements the state shift for
 *	sending our FIN frame. Note that we only send a FIN for some
 *	states. A shutdown() may have already sent the FIN, or we may be
 *	closed.
 */

/* Indexed by the current state. Each entry is the next state, optionally
 * ORed with TCP_ACTION_FIN.
 */
static const unsigned char new_state[16] = {
  /* current state:        new state:      action:	*/
  [0 /* (Invalid) */]	= TCP_CLOSE,
  [TCP_ESTABLISHED]	= TCP_FIN_WAIT1 | TCP_ACTION_FIN,
  [TCP_SYN_SENT]	= TCP_CLOSE,
  [TCP_SYN_RECV]	= TCP_FIN_WAIT1 | TCP_ACTION_FIN,
  [TCP_FIN_WAIT1]	= TCP_FIN_WAIT1,
  [TCP_FIN_WAIT2]	= TCP_FIN_WAIT2,
  [TCP_TIME_WAIT]	= TCP_CLOSE,
  [TCP_CLOSE]		= TCP_CLOSE,
  [TCP_CLOSE_WAIT]	= TCP_LAST_ACK  | TCP_ACTION_FIN,
  [TCP_LAST_ACK]	= TCP_LAST_ACK,
  [TCP_LISTEN]		= TCP_CLOSE,
  [TCP_CLOSING]		= TCP_CLOSING,
  [TCP_NEW_SYN_RECV]	= TCP_CLOSE,	/* should not happen ! */
};

/*
 * tcp_close_state - apply the new_state[] transition for the current state.
 *
 * Returns non-zero (TCP_ACTION_FIN) when the table entry has the
 * TCP_ACTION_FIN bit set, else 0.
 */
static int tcp_close_state(struct sock *sk)
{
	int next = (int)new_state[sk->sk_state];
	int ns = next & TCP_STATE_MASK;

	tcp_set_state(sk, ns);

	return next & TCP_ACTION_FIN;
}

/*
 *	Shutdown the sending side of a connection.
Much like close except
 *	that we don't receive shut down or sock_set_flag(sk, SOCK_DEAD).
 */

void tcp_shutdown(struct sock *sk, int how)
{
	/*	We need to grab some memory, and put together a FIN,
	 *	and then put it into the queue to be sent.
	 *		Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
	 */
	if (!(how & SEND_SHUTDOWN))
		return;

	/* If we've already sent a FIN, or it's a closed state, skip this. */
	if ((1 << sk->sk_state) &
	    (TCPF_ESTABLISHED | TCPF_SYN_SENT |
	     TCPF_CLOSE_WAIT)) {
		/* Clear out any half completed packets.  FIN if needed. */
		if (tcp_close_state(sk))
			tcp_send_fin(sk);
	}
}
EXPORT_IPV6_MOD(tcp_shutdown);

/*
 * tcp_orphan_count_sum - add up the per-cpu tcp_orphan_count values over all
 * possible CPUs. A negative total is reported as 0.
 */
int tcp_orphan_count_sum(void)
{
	int i, total = 0;

	for_each_possible_cpu(i)
		total += per_cpu(tcp_orphan_count, i);

	return max(total, 0);
}

/* Cached result of tcp_orphan_count_sum(), refreshed by tcp_orphan_timer. */
static int tcp_orphan_cache;
static struct timer_list tcp_orphan_timer;
#define TCP_ORPHAN_TIMER_PERIOD msecs_to_jiffies(100)

/* Timer callback: refresh the cached orphan count and re-arm the timer. */
static void tcp_orphan_update(struct timer_list *unused)
{
	WRITE_ONCE(tcp_orphan_cache, tcp_orphan_count_sum());
	mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD);
}

/*
 * tcp_too_many_orphans - true when the cached orphan count, shifted left by
 * @shift, is greater than sysctl_tcp_max_orphans.
 */
static bool tcp_too_many_orphans(int shift)
{
	return READ_ONCE(tcp_orphan_cache) << shift >
		READ_ONCE(sysctl_tcp_max_orphans);
}

/*
 * tcp_out_of_memory - true when the socket has more than SOCK_MIN_SNDBUF
 * bytes queued for write and the allocated protocol memory is above
 * sk_prot_mem_limits(sk, 2).
 */
static bool tcp_out_of_memory(const struct sock *sk)
{
	if (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
	    sk_memory_allocated(sk) > sk_prot_mem_limits(sk, 2))
		return true;
	return false;
}

/*
 * tcp_check_oom - evaluate both pressure checks and log each one that hits.
 * @shift: passed to tcp_too_many_orphans()
 *
 * Returns true when either check is true. The log lines are rate limited.
 */
bool tcp_check_oom(const struct sock *sk, int shift)
{
	bool too_many_orphans, out_of_socket_memory;

	too_many_orphans = tcp_too_many_orphans(shift);
	out_of_socket_memory = tcp_out_of_memory(sk);

	if (too_many_orphans)
		net_info_ratelimited("too many orphaned sockets\n");
	if (out_of_socket_memory)
		net_info_ratelimited("out of memory -- consider tuning tcp_mem\n");
	return too_many_orphans || out_of_socket_memory;
}

void __tcp_close(struct sock *sk, long timeout)
{
	bool data_was_unread = false;
	struct sk_buff *skb;
	int state;

	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);

	if (sk->sk_state == TCP_LISTEN) {
tcp_set_state(sk, TCP_CLOSE);

		/* Special case: listening socket. */
		inet_csk_listen_stop(sk);

		goto adjudge_to_death;
	}

	/*  We need to flush the recv. buffs.  We do this only on the
	 *  descriptor close, not protocol-sourced closes, because the
	 *  reader process may not have drained the data yet!
	 */
	while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
		u32 end_seq = TCP_SKB_CB(skb)->end_seq;

		/* Leave the FIN out of end_seq so that a bare FIN is not
		 * counted as unread data below.
		 */
		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
			end_seq--;
		if (after(end_seq, tcp_sk(sk)->copied_seq))
			data_was_unread = true;
		tcp_eat_recv_skb(sk, skb);
	}

	/* If socket has been already reset (e.g. in tcp_reset()) - kill it. */
	if (sk->sk_state == TCP_CLOSE)
		goto adjudge_to_death;

	/* As outlined in RFC 2525, section 2.17, we send a RST here because
	 * data was lost. To witness the awful effects of the old behavior of
	 * always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk
	 * GET in an FTP client, suspend the process, wait for the client to
	 * advertise a zero window, then kill -9 the FTP client, wheee...
	 * Note: timeout is always zero in such a case.
	 */
	if (unlikely(tcp_sk(sk)->repair)) {
		/* Repair mode: disconnect via the protocol hook; no RST or
		 * FIN is sent from this function on this path.
		 */
		sk->sk_prot->disconnect(sk, 0);
	} else if (data_was_unread) {
		/* Unread data was tossed, zap the connection. */
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
		tcp_set_state(sk, TCP_CLOSE);
		tcp_send_active_reset(sk, sk->sk_allocation,
				      SK_RST_REASON_TCP_ABORT_ON_CLOSE);
	} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
		/* Check zero linger _after_ checking for unread data. */
		sk->sk_prot->disconnect(sk, 0);
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
	} else if (tcp_close_state(sk)) {
		/* We FIN if the application ate all the data before
		 * zapping the connection.
		 */

		/* RED-PEN. Formally speaking, we have broken TCP state
		 * machine. State transitions:
		 *
		 * TCP_ESTABLISHED -> TCP_FIN_WAIT1
		 * TCP_SYN_RECV	-> TCP_FIN_WAIT1 (it is difficult)
		 * TCP_CLOSE_WAIT -> TCP_LAST_ACK
		 *
		 * are legal only when FIN has been sent (i.e. in window),
		 * rather than queued out of window. Purists blame.
		 *
		 * F.e. "RFC state" is ESTABLISHED,
		 * if Linux state is FIN-WAIT-1, but FIN is still not sent.
		 *
		 * The visible deviations are that sometimes
		 * we enter time-wait state, when it is not required really
		 * (harmless), do not send active resets, when they are
		 * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
		 * they look as CLOSING or LAST_ACK for Linux)
		 * Probably, I missed some more holelets.
		 * 						--ANK
		 * XXX (TFO) - To start off we don't support SYN+ACK+FIN
		 * in a single packet! (May consider it later but will
		 * probably need API support or TCP_CORK SYN-ACK until
		 * data is written and socket is closed.)
		 */
		tcp_send_fin(sk);
	}

	sk_stream_wait_close(sk, timeout);

adjudge_to_death:
	/* Snapshot the state before the backlog is processed so that a
	 * transition to TCP_CLOSE during __release_sock() can be detected.
	 */
	state = sk->sk_state;
	sock_hold(sk);
	sock_orphan(sk);

	local_bh_disable();
	bh_lock_sock(sk);
	/* remove backlog if any, without releasing ownership. */
	__release_sock(sk);

	tcp_orphan_count_inc();

	/* Have we already been destroyed by a softirq or backlog? */
	if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
		goto out;

	/*	This is a (useful) BSD violation of the RFC. There is a
	 *	problem with TCP as specified in that the other end could
	 *	keep a socket open forever with no application left this end.
	 *	We use a 1 minute timeout (about the same as BSD) then kill
	 *	our end. If they send after that then tough - BUT: long enough
	 *	that we won't make the old 4*rto = almost no time - whoops
	 *	reset mistake.
	 *
	 *	Nope, it was not mistake. It is really desired behaviour
	 *	f.e. on http servers, when such sockets are useless, but
	 *	consume significant resources. Let's do it with special
	 *	linger2	option.					--ANK
	 */

	if (sk->sk_state == TCP_FIN_WAIT2) {
		struct tcp_sock *tp = tcp_sk(sk);

		if (READ_ONCE(tp->linger2) < 0) {
			/* Negative linger2: abort with a RST instead of
			 * lingering in FIN_WAIT2.
			 */
			tcp_set_state(sk, TCP_CLOSE);
			tcp_send_active_reset(sk, GFP_ATOMIC,
					      SK_RST_REASON_TCP_ABORT_ON_LINGER);
			__NET_INC_STATS(sock_net(sk),
					LINUX_MIB_TCPABORTONLINGER);
		} else {
			const int tmo = tcp_fin_time(sk);

			/* A timeout longer than TCP_TIMEWAIT_LEN keeps the
			 * full socket on the keepalive timer for the excess;
			 * otherwise hand over to tcp_time_wait() right away.
			 */
			if (tmo > TCP_TIMEWAIT_LEN) {
				tcp_reset_keepalive_timer(sk,
						tmo - TCP_TIMEWAIT_LEN);
			} else {
				tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
				goto out;
			}
		}
	}
	if (sk->sk_state != TCP_CLOSE) {
		if (tcp_check_oom(sk, 0)) {
			tcp_set_state(sk, TCP_CLOSE);
			tcp_send_active_reset(sk, GFP_ATOMIC,
					      SK_RST_REASON_TCP_ABORT_ON_MEMORY);
			__NET_INC_STATS(sock_net(sk),
					LINUX_MIB_TCPABORTONMEMORY);
		} else if (!check_net(sock_net(sk))) {
			/* Not possible to send reset; just close */
			tcp_set_state(sk, TCP_CLOSE);
		}
	}

	if (sk->sk_state == TCP_CLOSE) {
		struct request_sock *req;

		req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk,
						lockdep_sock_is_held(sk));
		/* We could get here with a non-NULL req if the socket is
		 * aborted (e.g., closed with unread data) before 3WHS
		 * finishes.
		 */
		if (req)
			reqsk_fastopen_remove(sk, req, false);
		inet_csk_destroy_sock(sk);
	}
	/* Otherwise, socket is reprieved until protocol close.
*/ out: bh_unlock_sock(sk); local_bh_enable(); } void tcp_close(struct sock *sk, long timeout) { lock_sock(sk); __tcp_close(sk, timeout); release_sock(sk); if (!sk->sk_net_refcnt) inet_csk_clear_xmit_timers_sync(sk); sock_put(sk); } EXPORT_SYMBOL(tcp_close); /* These states need RST on ABORT according to RFC793 */ static inline bool tcp_need_reset(int state) { return (1 << state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_SYN_RECV); } static void tcp_rtx_queue_purge(struct sock *sk) { struct rb_node *p = rb_first(&sk->tcp_rtx_queue); tcp_sk(sk)->highest_sack = NULL; while (p) { struct sk_buff *skb = rb_to_skb(p); p = rb_next(p); /* Since we are deleting whole queue, no need to * list_del(&skb->tcp_tsorted_anchor) */ tcp_rtx_queue_unlink(skb, sk); tcp_wmem_free_skb(sk, skb); } } void tcp_write_queue_purge(struct sock *sk) { struct sk_buff *skb; tcp_chrono_stop(sk, TCP_CHRONO_BUSY); while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) { tcp_skb_tsorted_anchor_cleanup(skb); tcp_wmem_free_skb(sk, skb); } tcp_rtx_queue_purge(sk); INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue); tcp_clear_all_retrans_hints(tcp_sk(sk)); tcp_sk(sk)->packets_out = 0; inet_csk(sk)->icsk_backoff = 0; } int tcp_disconnect(struct sock *sk, int flags) { struct inet_sock *inet = inet_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int old_state = sk->sk_state; struct request_sock *req; u32 seq; if (old_state != TCP_CLOSE) tcp_set_state(sk, TCP_CLOSE); /* ABORT function of RFC793 */ if (old_state == TCP_LISTEN) { inet_csk_listen_stop(sk); } else if (unlikely(tp->repair)) { WRITE_ONCE(sk->sk_err, ECONNABORTED); } else if (tcp_need_reset(old_state)) { tcp_send_active_reset(sk, gfp_any(), SK_RST_REASON_TCP_STATE); WRITE_ONCE(sk->sk_err, ECONNRESET); } else if (tp->snd_nxt != tp->write_seq && (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK)) { /* The last check adjusts for discrepancy of Linux wrt. 
RFC
		 * states
		 */
		tcp_send_active_reset(sk, gfp_any(),
				      SK_RST_REASON_TCP_DISCONNECT_WITH_DATA);
		WRITE_ONCE(sk->sk_err, ECONNRESET);
	} else if (old_state == TCP_SYN_SENT)
		WRITE_ONCE(sk->sk_err, ECONNRESET);

	/* Stop timers and drop all queued data in both directions. */
	tcp_clear_xmit_timers(sk);
	__skb_queue_purge(&sk->sk_receive_queue);
	WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
	WRITE_ONCE(tp->urg_data, 0);
	sk_set_peek_off(sk, -1);
	tcp_write_queue_purge(sk);
	tcp_fastopen_active_disable_ofo_check(sk);
	skb_rbtree_purge(&tp->out_of_order_queue);

	inet->inet_dport = 0;

	inet_bhash2_reset_saddr(sk);

	WRITE_ONCE(sk->sk_shutdown, 0);
	sock_reset_flag(sk, SOCK_DONE);
	tp->srtt_us = 0;
	tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
	tp->rcv_rtt_last_tsecr = 0;

	/* Advance write_seq past the old send window; 0 is skipped. */
	seq = tp->write_seq + tp->max_window + 2;
	if (!seq)
		seq = 1;
	WRITE_ONCE(tp->write_seq, seq);

	/* Reset timer, congestion-control and ECN state to initial values. */
	icsk->icsk_backoff = 0;
	WRITE_ONCE(icsk->icsk_probes_out, 0);
	icsk->icsk_probes_tstamp = 0;
	icsk->icsk_rto = TCP_TIMEOUT_INIT;
	WRITE_ONCE(icsk->icsk_rto_min, TCP_RTO_MIN);
	WRITE_ONCE(icsk->icsk_delack_max, TCP_DELACK_MAX);
	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
	tcp_snd_cwnd_set(tp, TCP_INIT_CWND);
	tp->snd_cwnd_cnt = 0;
	tp->is_cwnd_limited = 0;
	tp->max_packets_out = 0;
	tp->window_clamp = 0;
	tp->delivered = 0;
	tp->delivered_ce = 0;
	tp->accecn_fail_mode = 0;
	tp->saw_accecn_opt = TCP_ACCECN_OPT_NOT_SEEN;
	tcp_accecn_init_counters(tp);
	tp->prev_ecnfield = 0;
	tp->accecn_opt_tstamp = 0;
	/* Let the congestion-control module release its private state
	 * before that state is zeroed.
	 */
	if (icsk->icsk_ca_initialized && icsk->icsk_ca_ops->release)
		icsk->icsk_ca_ops->release(sk);
	memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
	icsk->icsk_ca_initialized = 0;
	tcp_set_ca_state(sk, TCP_CA_Open);
	tp->is_sack_reneg = 0;
	tcp_clear_retrans(tp);
	tp->total_retrans = 0;
	inet_csk_delack_init(sk);
	/* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0
	 * issue in __tcp_select_window()
	 */
	icsk->icsk_ack.rcv_mss = TCP_MIN_MSS;
	memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
	__sk_dst_reset(sk);
	dst_release(unrcu_pointer(xchg(&sk->sk_rx_dst, NULL)));
	tcp_saved_syn_free(tp);

	/* Zero the per-connection counters. */
	tp->compressed_ack = 0;
	tp->segs_in = 0;
	tp->segs_out = 0;
	tp->bytes_sent = 0;
	tp->bytes_acked = 0;
	tp->bytes_received = 0;
	tp->bytes_retrans = 0;
	tp->data_segs_in = 0;
	tp->data_segs_out = 0;
	tp->duplicate_sack[0].start_seq = 0;
	tp->duplicate_sack[0].end_seq = 0;
	tp->dsack_dups = 0;
	tp->reord_seen = 0;
	tp->retrans_out = 0;
	tp->sacked_out = 0;
	tp->tlp_high_seq = 0;
	tp->last_oow_ack_time = 0;
	tp->plb_rehash = 0;
	/* There's a bubble in the pipe until at least the first ACK. */
	tp->app_limited = ~0U;
	tp->rate_app_limited = 1;
	tp->rack.mstamp = 0;
	tp->rack.advanced = 0;
	tp->rack.reo_wnd_steps = 1;
	tp->rack.last_delivered = 0;
	tp->rack.reo_wnd_persist = 0;
	tp->rack.dsack_seen = 0;
	tp->syn_data_acked = 0;
	tp->syn_fastopen_child = 0;
	tp->rx_opt.saw_tstamp = 0;
	tp->rx_opt.dsack = 0;
	tp->rx_opt.num_sacks = 0;
	tp->rcv_ooopack = 0;


	/* Clean up fastopen related fields */
	req = rcu_dereference_protected(tp->fastopen_rsk,
					lockdep_sock_is_held(sk));
	if (req)
		reqsk_fastopen_remove(sk, req, false);
	tcp_free_fastopen_req(tp);
	inet_clear_bit(DEFER_CONNECT, sk);
	tp->fastopen_client_fail = 0;

	/* A socket that still has a local port must still be bound. */
	WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);

	if (sk->sk_frag.page) {
		put_page(sk->sk_frag.page);
		sk->sk_frag.page = NULL;
		sk->sk_frag.offset = 0;
	}
	sk_error_report(sk);
	return 0;
}
EXPORT_SYMBOL(tcp_disconnect);

/* Repair operations need CAP_NET_ADMIN in the socket's user namespace and
 * are refused on listening sockets.
 */
static inline bool tcp_can_repair_sock(const struct sock *sk)
{
	return sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) &&
		(sk->sk_state != TCP_LISTEN);
}

/* TCP_REPAIR_WINDOW: install window state from a struct tcp_repair_window.
 *
 * Returns 0 on success, -EPERM when the socket is not in repair mode,
 * -EINVAL on a wrong length or inconsistent values, -EFAULT when the copy
 * from the caller's buffer fails.
 */
static int tcp_repair_set_window(struct tcp_sock *tp, sockptr_t optbuf, int len)
{
	struct tcp_repair_window opt;

	if (!tp->repair)
		return -EPERM;

	if (len != sizeof(opt))
		return -EINVAL;

	if (copy_from_sockptr(&opt, optbuf, sizeof(opt)))
		return -EFAULT;

	if (opt.max_window < opt.snd_wnd)
		return -EINVAL;

	if (after(opt.snd_wl1, tp->rcv_nxt + opt.rcv_wnd))
		return -EINVAL;

	if (after(opt.rcv_wup, tp->rcv_nxt))
		return -EINVAL;

	tp->snd_wl1	= opt.snd_wl1;
	tp->snd_wnd	= opt.snd_wnd;
	tp->max_window	= opt.max_window;

	tp->rcv_wnd	= opt.rcv_wnd;
	tp->rcv_wup	= opt.rcv_wup;

	return 0;
}

/* TCP_REPAIR_OPTIONS: apply an array of struct tcp_repair_opt to @sk.
 *
 * Entries are consumed one at a time while at least sizeof(opt) bytes
 * remain; trailing bytes shorter than one entry and unknown opt_code values
 * are ignored.  Returns 0, -EFAULT on a failed copy, -EFBIG when a window
 * scale exceeds TCP_MAX_WSCALE, or -EINVAL when SACK_PERM/TIMESTAMP carry a
 * non-zero value.  Entries applied before an error stay applied.
 */
static int tcp_repair_options_est(struct sock *sk, sockptr_t optbuf,
		unsigned int len)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_repair_opt opt;
	size_t offset = 0;

	while (len >= sizeof(opt)) {
		if (copy_from_sockptr_offset(&opt, optbuf, offset, sizeof(opt)))
			return -EFAULT;

		offset += sizeof(opt);
		len -= sizeof(opt);

		switch (opt.opt_code) {
		case TCPOPT_MSS:
			tp->rx_opt.mss_clamp = opt.opt_val;
			tcp_mtup_init(sk);
			break;
		case TCPOPT_WINDOW:
			{
				/* low 16 bits: send scale, high bits: receive scale */
				u16 snd_wscale = opt.opt_val & 0xFFFF;
				u16 rcv_wscale = opt.opt_val >> 16;

				if (snd_wscale > TCP_MAX_WSCALE || rcv_wscale > TCP_MAX_WSCALE)
					return -EFBIG;

				tp->rx_opt.snd_wscale = snd_wscale;
				tp->rx_opt.rcv_wscale = rcv_wscale;
				tp->rx_opt.wscale_ok = 1;
			}
			break;
		case TCPOPT_SACK_PERM:
			if (opt.opt_val != 0)
				return -EINVAL;

			tp->rx_opt.sack_ok |= TCP_SACK_SEEN;
			break;
		case TCPOPT_TIMESTAMP:
			if (opt.opt_val != 0)
				return -EINVAL;

			tp->rx_opt.tstamp_ok = 1;
			break;
		}
	}

	return 0;
}

DEFINE_STATIC_KEY_FALSE(tcp_tx_delay_enabled);
EXPORT_IPV6_MOD(tcp_tx_delay_enabled);

/* Prepare @sk for a new TCP_TX_DELAY of @val.
 *
 * Enables the tcp_tx_delay_enabled static key the first time a non-zero
 * delay is requested, and adjusts RTT-derived state on an established flow.
 * tp->tcp_tx_delay itself is not written here.
 */
static void tcp_enable_tx_delay(struct sock *sk, int val)
{
	struct tcp_sock *tp = tcp_sk(sk);
	/* same << 3 scaling as tp->srtt_us */
	s32 delta = (val - tp->tcp_tx_delay) << 3;

	if (val && !static_branch_unlikely(&tcp_tx_delay_enabled)) {
		static int __tcp_tx_delay_enabled = 0;

		/* cmpxchg() lets exactly one caller enable the key and log. */
		if (cmpxchg(&__tcp_tx_delay_enabled, 0, 1) == 0) {
			static_branch_enable(&tcp_tx_delay_enabled);
			pr_info("TCP_TX_DELAY enabled\n");
		}
	}
	/* If we change tcp_tx_delay on a live flow, adjust tp->srtt_us,
	 * tp->rtt_min, icsk_rto and sk->sk_pacing_rate.
	 * This is best effort.
	 */
	if (delta && sk->sk_state == TCP_ESTABLISHED) {
		s64 srtt = (s64)tp->srtt_us + delta;

		tp->srtt_us = clamp_t(s64, srtt, 1, ~0U);

		/* Note: does not deal with non zero icsk_backoff */
		tcp_set_rto(sk);

		minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U);

		tcp_update_pacing_rate(sk);
	}
}

/* When set indicates to always queue non-full frames.  Later the user clears
 * this option and we transmit any pending partial frames in the queue.
This is * meant to be used alongside sendfile() to get properly filled frames when the * user (for example) must write out headers with a write() call first and then * use sendfile to send out the data parts. * * TCP_CORK can be set together with TCP_NODELAY and it is stronger than * TCP_NODELAY. */ void __tcp_sock_set_cork(struct sock *sk, bool on) { struct tcp_sock *tp = tcp_sk(sk); if (on) { tp->nonagle |= TCP_NAGLE_CORK; } else { tp->nonagle &= ~TCP_NAGLE_CORK; if (tp->nonagle & TCP_NAGLE_OFF) tp->nonagle |= TCP_NAGLE_PUSH; tcp_push_pending_frames(sk); } } void tcp_sock_set_cork(struct sock *sk, bool on) { lock_sock(sk); __tcp_sock_set_cork(sk, on); release_sock(sk); } EXPORT_SYMBOL(tcp_sock_set_cork); /* TCP_NODELAY is weaker than TCP_CORK, so that this option on corked socket is * remembered, but it is not activated until cork is cleared. * * However, when TCP_NODELAY is set we make an explicit push, which overrides * even TCP_CORK for currently queued segments. */ void __tcp_sock_set_nodelay(struct sock *sk, bool on) { if (on) { tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH; tcp_push_pending_frames(sk); } else { tcp_sk(sk)->nonagle &= ~TCP_NAGLE_OFF; } } void tcp_sock_set_nodelay(struct sock *sk) { lock_sock(sk); __tcp_sock_set_nodelay(sk, true); release_sock(sk); } EXPORT_SYMBOL(tcp_sock_set_nodelay); static void __tcp_sock_set_quickack(struct sock *sk, int val) { if (!val) { inet_csk_enter_pingpong_mode(sk); return; } inet_csk_exit_pingpong_mode(sk); if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && inet_csk_ack_scheduled(sk)) { inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_PUSHED; tcp_cleanup_rbuf(sk, 1); if (!(val & 1)) inet_csk_enter_pingpong_mode(sk); } } void tcp_sock_set_quickack(struct sock *sk, int val) { lock_sock(sk); __tcp_sock_set_quickack(sk, val); release_sock(sk); } EXPORT_SYMBOL(tcp_sock_set_quickack); int tcp_sock_set_syncnt(struct sock *sk, int val) { if (val < 1 || val > MAX_TCP_SYNCNT) return -EINVAL; 
WRITE_ONCE(inet_csk(sk)->icsk_syn_retries, val); return 0; } EXPORT_SYMBOL(tcp_sock_set_syncnt); int tcp_sock_set_user_timeout(struct sock *sk, int val) { /* Cap the max time in ms TCP will retry or probe the window * before giving up and aborting (ETIMEDOUT) a connection. */ if (val < 0) return -EINVAL; WRITE_ONCE(inet_csk(sk)->icsk_user_timeout, val); return 0; } EXPORT_SYMBOL(tcp_sock_set_user_timeout); int tcp_sock_set_keepidle_locked(struct sock *sk, int val) { struct tcp_sock *tp = tcp_sk(sk); if (val < 1 || val > MAX_TCP_KEEPIDLE) return -EINVAL; /* Paired with WRITE_ONCE() in keepalive_time_when() */ WRITE_ONCE(tp->keepalive_time, val * HZ); if (sock_flag(sk, SOCK_KEEPOPEN) && !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { u32 elapsed = keepalive_time_elapsed(tp); if (tp->keepalive_time > elapsed) elapsed = tp->keepalive_time - elapsed; else elapsed = 0; tcp_reset_keepalive_timer(sk, elapsed); } return 0; } int tcp_sock_set_keepidle(struct sock *sk, int val) { int err; lock_sock(sk); err = tcp_sock_set_keepidle_locked(sk, val); release_sock(sk); return err; } EXPORT_SYMBOL(tcp_sock_set_keepidle); int tcp_sock_set_keepintvl(struct sock *sk, int val) { if (val < 1 || val > MAX_TCP_KEEPINTVL) return -EINVAL; WRITE_ONCE(tcp_sk(sk)->keepalive_intvl, val * HZ); return 0; } EXPORT_SYMBOL(tcp_sock_set_keepintvl); int tcp_sock_set_keepcnt(struct sock *sk, int val) { if (val < 1 || val > MAX_TCP_KEEPCNT) return -EINVAL; /* Paired with READ_ONCE() in keepalive_probes() */ WRITE_ONCE(tcp_sk(sk)->keepalive_probes, val); return 0; } EXPORT_SYMBOL(tcp_sock_set_keepcnt); int tcp_set_window_clamp(struct sock *sk, int val) { u32 old_window_clamp, new_window_clamp, new_rcv_ssthresh; struct tcp_sock *tp = tcp_sk(sk); if (!val) { if (sk->sk_state != TCP_CLOSE) return -EINVAL; WRITE_ONCE(tp->window_clamp, 0); return 0; } old_window_clamp = tp->window_clamp; new_window_clamp = max_t(int, SOCK_MIN_RCVBUF / 2, val); if (new_window_clamp == old_window_clamp) return 0; 
WRITE_ONCE(tp->window_clamp, new_window_clamp);

	/* Need to apply the reserved mem provisioning only
	 * when shrinking the window clamp.
	 */
	if (new_window_clamp < old_window_clamp) {
		__tcp_adjust_rcv_ssthresh(sk, new_window_clamp);
	} else {
		/* Growing the clamp: rcv_ssthresh may rise to
		 * min(rcv_wnd, new clamp) but is never lowered here.
		 */
		new_rcv_ssthresh = min(tp->rcv_wnd, new_window_clamp);
		tp->rcv_ssthresh = max(new_rcv_ssthresh, tp->rcv_ssthresh);
	}
	return 0;
}

/* TCP_MAXSEG: record the user-requested MSS.  0 is accepted; any other
 * value must lie in [TCP_MIN_MSS, MAX_TCP_WINDOW], otherwise -EINVAL.
 * Does not take the socket lock.
 */
int tcp_sock_set_maxseg(struct sock *sk, int val)
{
	/* Values greater than interface MTU won't take effect. However
	 * at the point when this call is done we typically don't yet
	 * know which interface is going to be used
	 */
	if (val && (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW))
		return -EINVAL;

	WRITE_ONCE(tcp_sk(sk)->rx_opt.user_mss, val);
	return 0;
}

/*
 *	Socket option code for TCP.
 *
 *	Options are handled in three passes:
 *	  1. string/blob options (TCP_CONGESTION, TCP_ULP, TCP_FASTOPEN_KEY),
 *	     each with its own length check and locking;
 *	  2. int options that are set without taking the socket lock;
 *	  3. everything else, between sockopt_lock_sock() and
 *	     sockopt_release_sock().
 *
 *	@level is not examined here (tcp_setsockopt() filters on it).
 *	Returns 0 on success or a negative errno; an unknown @optname
 *	yields -ENOPROTOOPT.
 */
int do_tcp_setsockopt(struct sock *sk, int level, int optname,
		      sockptr_t optval, unsigned int optlen)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct net *net = sock_net(sk);
	int val;
	int err = 0;

	/* These are data/string values, all the others are ints */
	switch (optname) {
	case TCP_CONGESTION: {
		char name[TCP_CA_NAME_MAX];

		if (optlen < 1)
			return -EINVAL;

		/* Copy at most TCP_CA_NAME_MAX-1 bytes so that the
		 * terminator written below always fits in name[].
		 */
		val = strncpy_from_sockptr(name, optval,
					min_t(long, TCP_CA_NAME_MAX-1, optlen));
		if (val < 0)
			return -EFAULT;
		name[val] = 0;

		sockopt_lock_sock(sk);
		err = tcp_set_congestion_control(sk, name, !has_current_bpf_ctx(),
						 sockopt_ns_capable(sock_net(sk)->user_ns,
								    CAP_NET_ADMIN));
		sockopt_release_sock(sk);
		return err;
	}
	case TCP_ULP: {
		char name[TCP_ULP_NAME_MAX];

		if (optlen < 1)
			return -EINVAL;

		val = strncpy_from_sockptr(name, optval,
					min_t(long, TCP_ULP_NAME_MAX - 1,
					      optlen));
		if (val < 0)
			return -EFAULT;
		name[val] = 0;

		sockopt_lock_sock(sk);
		err = tcp_set_ulp(sk, name);
		sockopt_release_sock(sk);
		return err;
	}
	case TCP_FASTOPEN_KEY: {
		__u8 key[TCP_FASTOPEN_KEY_BUF_LENGTH];
		__u8 *backup_key = NULL;

		/* Allow a backup key as well to facilitate key rotation
		 * First key is the active one.
		 */
		if (optlen != TCP_FASTOPEN_KEY_LENGTH &&
		    optlen != TCP_FASTOPEN_KEY_BUF_LENGTH)
			return -EINVAL;

		if (copy_from_sockptr(key, optval, optlen))
			return -EFAULT;

		if (optlen == TCP_FASTOPEN_KEY_BUF_LENGTH)
			backup_key = key + TCP_FASTOPEN_KEY_LENGTH;

		return tcp_fastopen_reset_cipher(net, sk, key, backup_key);
	}
	default:
		/* not a string/blob option: handled as an int below */
		break;
	}

	if (optlen < sizeof(int))
		return -EINVAL;

	if (copy_from_sockptr(&val, optval, sizeof(val)))
		return -EFAULT;

	/* Handle options that can be set without locking the socket. */
	switch (optname) {
	case TCP_SYNCNT:
		return tcp_sock_set_syncnt(sk, val);
	case TCP_USER_TIMEOUT:
		return tcp_sock_set_user_timeout(sk, val);
	case TCP_KEEPINTVL:
		return tcp_sock_set_keepintvl(sk, val);
	case TCP_KEEPCNT:
		return tcp_sock_set_keepcnt(sk, val);
	case TCP_LINGER2:
		/* Negative disables (-1); larger values are capped at
		 * TCP_FIN_TIMEOUT_MAX; otherwise seconds become jiffies.
		 */
		if (val < 0)
			WRITE_ONCE(tp->linger2, -1);
		else if (val > TCP_FIN_TIMEOUT_MAX / HZ)
			WRITE_ONCE(tp->linger2, TCP_FIN_TIMEOUT_MAX);
		else
			WRITE_ONCE(tp->linger2, val * HZ);
		return 0;
	case TCP_DEFER_ACCEPT:
		/* Translate value in seconds to number of retransmits */
		WRITE_ONCE(icsk->icsk_accept_queue.rskq_defer_accept,
			   secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
					   TCP_RTO_MAX / HZ));
		return 0;
	case TCP_RTO_MAX_MS:
		if (val < MSEC_PER_SEC || val > TCP_RTO_MAX_SEC * MSEC_PER_SEC)
			return -EINVAL;
		WRITE_ONCE(inet_csk(sk)->icsk_rto_max, msecs_to_jiffies(val));
		return 0;
	case TCP_RTO_MIN_US: {
		int rto_min = usecs_to_jiffies(val);

		if (rto_min > TCP_RTO_MIN || rto_min < TCP_TIMEOUT_MIN)
			return -EINVAL;
		WRITE_ONCE(inet_csk(sk)->icsk_rto_min, rto_min);
		return 0;
	}
	case TCP_DELACK_MAX_US: {
		int delack_max = usecs_to_jiffies(val);

		if (delack_max > TCP_DELACK_MAX || delack_max < TCP_TIMEOUT_MIN)
			return -EINVAL;
		WRITE_ONCE(inet_csk(sk)->icsk_delack_max, delack_max);
		return 0;
	}
	case TCP_MAXSEG:
		return tcp_sock_set_maxseg(sk, val);
	}

	/* Everything below runs with the socket locked; each case sets err
	 * and breaks so that the single release at the end is reached.
	 */
	sockopt_lock_sock(sk);

	switch (optname) {
	case TCP_NODELAY:
		__tcp_sock_set_nodelay(sk, val);
		break;

	case TCP_THIN_LINEAR_TIMEOUTS:
		if (val < 0 || val > 1)
			err = -EINVAL;
		else
			tp->thin_lto = val;
		break;

	case TCP_THIN_DUPACK:
		/* The value is validated only; nothing is stored. */
		if (val < 0 || val > 1)
			err = -EINVAL;
		break;

	case TCP_REPAIR:
		if (!tcp_can_repair_sock(sk))
			err = -EPERM;
		else if (val == TCP_REPAIR_ON) {
			tp->repair = 1;
			sk->sk_reuse = SK_FORCE_REUSE;
			tp->repair_queue = TCP_NO_QUEUE;
		} else if (val == TCP_REPAIR_OFF) {
			tp->repair = 0;
			sk->sk_reuse = SK_NO_REUSE;
			tcp_send_window_probe(sk);
		} else if (val == TCP_REPAIR_OFF_NO_WP) {
			/* Same as TCP_REPAIR_OFF, without the window probe. */
			tp->repair = 0;
			sk->sk_reuse = SK_NO_REUSE;
		} else
			err = -EINVAL;

		break;

	case TCP_REPAIR_QUEUE:
		if (!tp->repair)
			err = -EPERM;
		else if ((unsigned int)val < TCP_QUEUES_NR)
			tp->repair_queue = val;
		else
			err = -EINVAL;
		break;

	case TCP_QUEUE_SEQ:
		/* Sequence numbers may be set only on a closed socket whose
		 * selected repair queue is empty.
		 */
		if (sk->sk_state != TCP_CLOSE) {
			err = -EPERM;
		} else if (tp->repair_queue == TCP_SEND_QUEUE) {
			if (!tcp_rtx_queue_empty(sk))
				err = -EPERM;
			else
				WRITE_ONCE(tp->write_seq, val);
		} else if (tp->repair_queue == TCP_RECV_QUEUE) {
			if (tp->rcv_nxt != tp->copied_seq) {
				err = -EPERM;
			} else {
				WRITE_ONCE(tp->rcv_nxt, val);
				WRITE_ONCE(tp->copied_seq, val);
			}
		} else {
			err = -EINVAL;
		}
		break;

	case TCP_REPAIR_OPTIONS:
		if (!tp->repair)
			err = -EINVAL;
		else if (sk->sk_state == TCP_ESTABLISHED && !tp->bytes_sent)
			err = tcp_repair_options_est(sk, optval, optlen);
		else
			err = -EPERM;
		break;

	case TCP_CORK:
		__tcp_sock_set_cork(sk, val);
		break;

	case TCP_KEEPIDLE:
		err = tcp_sock_set_keepidle_locked(sk, val);
		break;
	case TCP_SAVE_SYN:
		/* 0: disable, 1: enable, 2: start from ether_header */
		if (val < 0 || val > 2)
			err = -EINVAL;
		else
			tp->save_syn = val;
		break;

	case TCP_WINDOW_CLAMP:
		err = tcp_set_window_clamp(sk, val);
		break;

	case TCP_QUICKACK:
		__tcp_sock_set_quickack(sk, val);
		break;

	case TCP_AO_REPAIR:
		if (!tcp_can_repair_sock(sk)) {
			err = -EPERM;
			break;
		}
		err = tcp_ao_set_repair(sk, optval, optlen);
		break;
#ifdef CONFIG_TCP_AO
	case TCP_AO_ADD_KEY:
	case TCP_AO_DEL_KEY:
	case TCP_AO_INFO: {
		/* If this is the first TCP-AO setsockopt() on the socket,
		 * sk_state has to be LISTEN or CLOSE. Allow TCP_REPAIR
		 * in any state.
		 */
		if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
			goto ao_parse;
		if (rcu_dereference_protected(tcp_sk(sk)->ao_info,
					      lockdep_sock_is_held(sk)))
			goto ao_parse;
		if (tp->repair)
			goto ao_parse;
		err = -EISCONN;
		break;
ao_parse:
		err = tp->af_specific->ao_parse(sk, optname, optval, optlen);
		break;
	}
#endif
#ifdef CONFIG_TCP_MD5SIG
	case TCP_MD5SIG:
	case TCP_MD5SIG_EXT:
		err = tp->af_specific->md5_parse(sk, optname, optval, optlen);
		break;
#endif
	case TCP_FASTOPEN:
		if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
		    TCPF_LISTEN))) {
			tcp_fastopen_init_key_once(net);

			fastopen_queue_tune(sk, val);
		} else {
			err = -EINVAL;
		}
		break;
	case TCP_FASTOPEN_CONNECT:
		if (val > 1 || val < 0) {
			err = -EINVAL;
		} else if (READ_ONCE(net->ipv4.sysctl_tcp_fastopen) &
			   TFO_CLIENT_ENABLE) {
			if (sk->sk_state == TCP_CLOSE)
				tp->fastopen_connect = val;
			else
				err = -EINVAL;
		} else {
			err = -EOPNOTSUPP;
		}
		break;
	case TCP_FASTOPEN_NO_COOKIE:
		if (val > 1 || val < 0)
			err = -EINVAL;
		else if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
			err = -EINVAL;
		else
			tp->fastopen_no_cookie = val;
		break;
	case TCP_TIMESTAMP:
		if (!tp->repair) {
			err = -EPERM;
			break;
		}
		/* val is an opaque field,
		 * and low order bit contains usec_ts enable bit.
		 * It's a best effort, and we do not care if user makes an error.
		 */
		tp->tcp_usec_ts = val & 1;
		WRITE_ONCE(tp->tsoffset, val - tcp_clock_ts(tp->tcp_usec_ts));
		break;
	case TCP_REPAIR_WINDOW:
		err = tcp_repair_set_window(tp, optval, optlen);
		break;
	case TCP_NOTSENT_LOWAT:
		WRITE_ONCE(tp->notsent_lowat, val);
		sk->sk_write_space(sk);
		break;
	case TCP_INQ:
		if (val > 1 || val < 0)
			err = -EINVAL;
		else
			tp->recvmsg_inq = val;
		break;
	case TCP_TX_DELAY:
		/* tp->srtt_us is u32, and is shifted by 3 */
		if (val < 0 || val >= (1U << (31 - 3))) {
			err = -EINVAL;
			break;
		}
		/* tcp_enable_tx_delay() reads the old tp->tcp_tx_delay, so it
		 * has to run before the new value is stored.
		 */
		tcp_enable_tx_delay(sk, val);
		WRITE_ONCE(tp->tcp_tx_delay, val);
		break;
	default:
		err = -ENOPROTOOPT;
		break;
	}

	sockopt_release_sock(sk);
	return err;
}

/* setsockopt() entry point: options at a level other than SOL_TCP are
 * forwarded to the address family's setsockopt handler, everything else
 * goes to do_tcp_setsockopt().
 */
int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
		   unsigned int optlen)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);

	if (level != SOL_TCP)
		/* Paired with WRITE_ONCE() in do_ipv6_setsockopt() and tcp_v6_connect() */
		return READ_ONCE(icsk->icsk_af_ops)->setsockopt(sk, level, optname,
								optval, optlen);
	return do_tcp_setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_IPV6_MOD(tcp_setsockopt);

/* Fill the three chrono fields of @info (tcpi_busy_time, tcpi_rwnd_limited,
 * tcpi_sndbuf_limited) from tp->chrono_stat[].  No other field of @info is
 * written.
 */
static void tcp_get_info_chrono_stats(const struct tcp_sock *tp,
				      struct tcp_info *info)
{
	u64 stats[__TCP_CHRONO_MAX], total = 0;
	enum tcp_chrono i;

	for (i = TCP_CHRONO_BUSY; i < __TCP_CHRONO_MAX; ++i) {
		/* chrono_stat[] is indexed from 0, the enum from
		 * TCP_CHRONO_BUSY, hence the i - 1.
		 */
		stats[i] = tp->chrono_stat[i - 1];
		/* Add the time accumulated by the currently running chrono. */
		if (i == tp->chrono_type)
			stats[i] += tcp_jiffies32 - tp->chrono_start;
		/* jiffies to microseconds */
		stats[i] *= USEC_PER_SEC / HZ;
		total += stats[i];
	}

	info->tcpi_busy_time = total;
	info->tcpi_rwnd_limited = stats[TCP_CHRONO_RWND_LIMITED];
	info->tcpi_sndbuf_limited = stats[TCP_CHRONO_SNDBUF_LIMITED];
}

/* Return information about state of tcp endpoint in API format.
*/
/* @info is zeroed first.  It stays all-zero for a socket that is not
 * SOCK_STREAM; a listener gets only the state, pacing-rate, reordering and
 * cwnd fields plus the aliased backlog fields.  Every other field is filled
 * between lock_sock_fast() and unlock_sock_fast().
 */
void tcp_get_info(struct sock *sk, struct tcp_info *info)
{
	const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */
	const struct inet_connection_sock *icsk = inet_csk(sk);
	/* Indexes into the *_ecn_bytes[] arrays: ECN codepoint minus one. */
	const u8 ect1_idx = INET_ECN_ECT_1 - 1;
	const u8 ect0_idx = INET_ECN_ECT_0 - 1;
	const u8 ce_idx = INET_ECN_CE - 1;
	unsigned long rate;
	u32 now;
	u64 rate64;
	bool slow;

	memset(info, 0, sizeof(*info));
	if (sk->sk_type != SOCK_STREAM)
		return;

	info->tcpi_state = inet_sk_state_load(sk);

	/* Report meaningful fields for all TCP states, including listeners */
	/* ~0UL ("all ones" in unsigned long) is widened to ~0ULL so it stays
	 * all-ones in the 64-bit field.
	 */
	rate = READ_ONCE(sk->sk_pacing_rate);
	rate64 = (rate != ~0UL) ? rate : ~0ULL;
	info->tcpi_pacing_rate = rate64;

	rate = READ_ONCE(sk->sk_max_pacing_rate);
	rate64 = (rate != ~0UL) ? rate : ~0ULL;
	info->tcpi_max_pacing_rate = rate64;

	info->tcpi_reordering = tp->reordering;
	info->tcpi_snd_cwnd = tcp_snd_cwnd(tp);

	if (info->tcpi_state == TCP_LISTEN) {
		/* listeners aliased fields :
		 * tcpi_unacked -> Number of children ready for accept()
		 * tcpi_sacked  -> max backlog
		 */
		info->tcpi_unacked = READ_ONCE(sk->sk_ack_backlog);
		info->tcpi_sacked = READ_ONCE(sk->sk_max_ack_backlog);
		return;
	}

	slow = lock_sock_fast(sk);

	info->tcpi_ca_state = icsk->icsk_ca_state;
	info->tcpi_retransmits = icsk->icsk_retransmits;
	info->tcpi_probes = icsk->icsk_probes_out;
	info->tcpi_backoff = icsk->icsk_backoff;

	/* Negotiated / observed option flags */
	if (tp->rx_opt.tstamp_ok)
		info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
	if (tcp_is_sack(tp))
		info->tcpi_options |= TCPI_OPT_SACK;
	if (tp->rx_opt.wscale_ok) {
		info->tcpi_options |= TCPI_OPT_WSCALE;
		info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
		info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
	}

	if (tcp_ecn_mode_any(tp))
		info->tcpi_options |= TCPI_OPT_ECN;
	if (tp->ecn_flags & TCP_ECN_SEEN)
		info->tcpi_options |= TCPI_OPT_ECN_SEEN;
	if (tp->syn_data_acked)
		info->tcpi_options |= TCPI_OPT_SYN_DATA;
	if (tp->tcp_usec_ts)
		info->tcpi_options |= TCPI_OPT_USEC_TS;
	if (tp->syn_fastopen_child)
		info->tcpi_options |= TCPI_OPT_TFO_CHILD;

	info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
	info->tcpi_ato = jiffies_to_usecs(min_t(u32, icsk->icsk_ack.ato,
						tcp_delack_max(sk)));
	info->tcpi_snd_mss = tp->mss_cache;
	info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;

	info->tcpi_unacked = tp->packets_out;
	info->tcpi_sacked = tp->sacked_out;

	info->tcpi_lost = tp->lost_out;
	info->tcpi_retrans = tp->retrans_out;

	/* "Time since last ..." values, in milliseconds */
	now = tcp_jiffies32;
	info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
	info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
	info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);

	info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
	info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
	/* srtt_us and mdev_us are stored shifted left by 3 and 2 bits */
	info->tcpi_rtt = tp->srtt_us >> 3;
	info->tcpi_rttvar = tp->mdev_us >> 2;
	info->tcpi_snd_ssthresh = tp->snd_ssthresh;
	info->tcpi_advmss = tp->advmss;

	info->tcpi_rcv_rtt = tp->rcv_rtt_est.rtt_us >> 3;
	info->tcpi_rcv_space = tp->rcvq_space.space;

	info->tcpi_total_retrans = tp->total_retrans;

	info->tcpi_bytes_acked = tp->bytes_acked;
	info->tcpi_bytes_received = tp->bytes_received;
	/* Clamped at 0 in case snd_nxt is ahead of write_seq */
	info->tcpi_notsent_bytes = max_t(int, 0, tp->write_seq - tp->snd_nxt);
	tcp_get_info_chrono_stats(tp, info);

	info->tcpi_segs_out = tp->segs_out;

	/* segs_in and data_segs_in can be updated from tcp_segs_in() from BH */
	info->tcpi_segs_in = READ_ONCE(tp->segs_in);
	info->tcpi_data_segs_in = READ_ONCE(tp->data_segs_in);

	info->tcpi_min_rtt = tcp_min_rtt(tp);
	info->tcpi_data_segs_out = tp->data_segs_out;

	info->tcpi_delivery_rate_app_limited = tp->rate_app_limited ? 1 : 0;
	/* A zero delivery rate leaves the (already zeroed) field untouched */
	rate64 = tcp_compute_delivery_rate(tp);
	if (rate64)
		info->tcpi_delivery_rate = rate64;
	info->tcpi_delivered = tp->delivered;
	info->tcpi_delivered_ce = tp->delivered_ce;
	info->tcpi_bytes_sent = tp->bytes_sent;
	info->tcpi_bytes_retrans = tp->bytes_retrans;
	info->tcpi_dsack_dups = tp->dsack_dups;
	info->tcpi_reord_seen = tp->reord_seen;
	info->tcpi_rcv_ooopack = tp->rcv_ooopack;
	info->tcpi_snd_wnd = tp->snd_wnd;
	info->tcpi_rcv_wnd = tp->rcv_wnd;
	info->tcpi_rehash = tp->plb_rehash + tp->timeout_rehash;
	info->tcpi_fastopen_client_fail = tp->fastopen_client_fail;

	info->tcpi_total_rto = tp->total_rto;
	info->tcpi_total_rto_recoveries = tp->total_rto_recoveries;
	info->tcpi_total_rto_time = tp->total_rto_time;
	/* A non-zero rto_stamp adds the time elapsed since that stamp */
	if (tp->rto_stamp)
		info->tcpi_total_rto_time += tcp_clock_ms() - tp->rto_stamp;

	info->tcpi_accecn_fail_mode = tp->accecn_fail_mode;
	info->tcpi_accecn_opt_seen = tp->saw_accecn_opt;
	info->tcpi_received_ce = tp->received_ce;
	info->tcpi_delivered_e1_bytes = tp->delivered_ecn_bytes[ect1_idx];
	info->tcpi_delivered_e0_bytes = tp->delivered_ecn_bytes[ect0_idx];
	info->tcpi_delivered_ce_bytes = tp->delivered_ecn_bytes[ce_idx];
	info->tcpi_received_e1_bytes = tp->received_ecn_bytes[ect1_idx];
	info->tcpi_received_e0_bytes = tp->received_ecn_bytes[ect0_idx];
	info->tcpi_received_ce_bytes = tp->received_ecn_bytes[ce_idx];

	unlock_sock_fast(sk, slow);
}
EXPORT_SYMBOL_GPL(tcp_get_info);

/* Size of the skb allocated by tcp_get_timestamping_opt_stats(): one
 * netlink attribute slot per TCP_NLA_* attribute listed below.  Keep this
 * list in step with the nla_put_*() calls made there.
 */
static size_t tcp_opt_stats_get_size(void)
{
	return
		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BUSY */
		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_RWND_LIMITED */
		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_SNDBUF_LIMITED */
		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_DATA_SEGS_OUT */
		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_TOTAL_RETRANS */
		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_PACING_RATE */
		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_DELIVERY_RATE */
		nla_total_size(sizeof(u32)) + /* TCP_NLA_SND_CWND */
		nla_total_size(sizeof(u32)) + /* TCP_NLA_REORDERING */
		nla_total_size(sizeof(u32)) + /* TCP_NLA_MIN_RTT */
		nla_total_size(sizeof(u8)) + /* TCP_NLA_RECUR_RETRANS */
		nla_total_size(sizeof(u8)) + /* TCP_NLA_DELIVERY_RATE_APP_LMT */
		nla_total_size(sizeof(u32)) + /* TCP_NLA_SNDQ_SIZE */
		nla_total_size(sizeof(u8)) + /* TCP_NLA_CA_STATE */
		nla_total_size(sizeof(u32)) + /* TCP_NLA_SND_SSTHRESH */
		nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED */
		nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED_CE */
		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_SENT */
		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_RETRANS */
		nla_total_size(sizeof(u32)) + /* TCP_NLA_DSACK_DUPS */
		nla_total_size(sizeof(u32)) + /* TCP_NLA_REORD_SEEN */
		nla_total_size(sizeof(u32)) + /* TCP_NLA_SRTT */
		nla_total_size(sizeof(u16)) + /* TCP_NLA_TIMEOUT_REHASH */
		nla_total_size(sizeof(u32)) + /* TCP_NLA_BYTES_NOTSENT */
		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_EDT */
		nla_total_size(sizeof(u8)) + /* TCP_NLA_TTL */
		nla_total_size(sizeof(u32)) + /* TCP_NLA_REHASH */
		0;
}

/* Returns TTL or hop limit of an incoming packet from skb.
*/
static u8 tcp_skb_ttl_or_hop_limit(const struct sk_buff *skb)
{
	if (skb->protocol == htons(ETH_P_IP))
		return ip_hdr(skb)->ttl;
	else if (skb->protocol == htons(ETH_P_IPV6))
		return ipv6_hdr(skb)->hop_limit;
	else
		return 0;	/* neither IPv4 nor IPv6: no value to report */
}

/*
 * Build an skb holding TCP_NLA_* statistics attributes for @sk.
 *
 * @sk:       socket whose counters are reported
 * @orig_skb: skb whose skb_mstamp_ns is reported as TCP_NLA_EDT
 * @ack_skb:  optional; when non-NULL its TTL / hop limit is reported as
 *            TCP_NLA_TTL, otherwise that attribute is omitted
 *
 * The skb is allocated with GFP_ATOMIC and sized by
 * tcp_opt_stats_get_size().  Returns the new skb, or NULL if the allocation
 * fails.  The nla_put_*() return values are not checked here.
 */
struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk,
					       const struct sk_buff *orig_skb,
					       const struct sk_buff *ack_skb)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *stats;
	struct tcp_info info;
	unsigned long rate;
	u64 rate64;

	stats = alloc_skb(tcp_opt_stats_get_size(), GFP_ATOMIC);
	if (!stats)
		return NULL;

	/* Only the chrono fields of info are read below. */
	tcp_get_info_chrono_stats(tp, &info);
	nla_put_u64_64bit(stats, TCP_NLA_BUSY,
			  info.tcpi_busy_time, TCP_NLA_PAD);
	nla_put_u64_64bit(stats, TCP_NLA_RWND_LIMITED,
			  info.tcpi_rwnd_limited, TCP_NLA_PAD);
	nla_put_u64_64bit(stats, TCP_NLA_SNDBUF_LIMITED,
			  info.tcpi_sndbuf_limited, TCP_NLA_PAD);
	nla_put_u64_64bit(stats, TCP_NLA_DATA_SEGS_OUT,
			  tp->data_segs_out, TCP_NLA_PAD);
	nla_put_u64_64bit(stats, TCP_NLA_TOTAL_RETRANS,
			  tp->total_retrans, TCP_NLA_PAD);

	/* Map the all-ones unsigned long pacing rate to an all-ones u64. */
	rate = READ_ONCE(sk->sk_pacing_rate);
	rate64 = (rate != ~0UL) ? rate : ~0ULL;
	nla_put_u64_64bit(stats, TCP_NLA_PACING_RATE, rate64, TCP_NLA_PAD);

	rate64 = tcp_compute_delivery_rate(tp);
	nla_put_u64_64bit(stats, TCP_NLA_DELIVERY_RATE, rate64, TCP_NLA_PAD);

	nla_put_u32(stats, TCP_NLA_SND_CWND, tcp_snd_cwnd(tp));
	nla_put_u32(stats, TCP_NLA_REORDERING, tp->reordering);
	nla_put_u32(stats, TCP_NLA_MIN_RTT, tcp_min_rtt(tp));

	nla_put_u8(stats, TCP_NLA_RECUR_RETRANS,
		   READ_ONCE(inet_csk(sk)->icsk_retransmits));
	nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited);
	nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, tp->snd_ssthresh);
	nla_put_u32(stats, TCP_NLA_DELIVERED, tp->delivered);
	nla_put_u32(stats, TCP_NLA_DELIVERED_CE, tp->delivered_ce);

	nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una);
	nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state);

	nla_put_u64_64bit(stats, TCP_NLA_BYTES_SENT, tp->bytes_sent,
			  TCP_NLA_PAD);
	nla_put_u64_64bit(stats, TCP_NLA_BYTES_RETRANS, tp->bytes_retrans,
			  TCP_NLA_PAD);
	nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups);
	nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen);
	nla_put_u32(stats, TCP_NLA_SRTT, tp->srtt_us >> 3);
	nla_put_u16(stats, TCP_NLA_TIMEOUT_REHASH, tp->timeout_rehash);
	/* Clamp at zero: the signed difference may be negative. */
	nla_put_u32(stats, TCP_NLA_BYTES_NOTSENT,
		    max_t(int, 0, tp->write_seq - tp->snd_nxt));
	nla_put_u64_64bit(stats, TCP_NLA_EDT, orig_skb->skb_mstamp_ns,
			  TCP_NLA_PAD);
	if (ack_skb)
		nla_put_u8(stats, TCP_NLA_TTL,
			   tcp_skb_ttl_or_hop_limit(ack_skb));

	nla_put_u32(stats, TCP_NLA_REHASH, tp->plb_rehash + tp->timeout_rehash);
	return stats;
}

/*
 * TCP-level getsockopt() implementation operating on sockptr_t arguments.
 *
 * The caller-supplied length is read from @optlen, rejected with -EINVAL if
 * negative, and clamped to sizeof(int).  Options that report a plain integer
 * store it in val and break out of the switch; the common tail then writes
 * the length and the value back.  Options with larger or variable payloads
 * re-read @optlen themselves and return directly from their case.
 *
 * Returns 0 on success or a negative errno (-EFAULT on failed copies,
 * -ENOPROTOOPT for unknown options, plus option-specific errors).
 */
int do_tcp_getsockopt(struct sock *sk, int level,
		      int optname, sockptr_t optval, sockptr_t optlen)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct net *net = sock_net(sk);
	int user_mss;
	int val, len;

	if (copy_from_sockptr(&len, optlen, sizeof(int)))
		return -EFAULT;

	if (len < 0)
		return -EINVAL;

	len = min_t(unsigned int, len, sizeof(int));

	switch (optname) {
	case TCP_MAXSEG:
		val = tp->mss_cache;
		user_mss =
READ_ONCE(tp->rx_opt.user_mss);
		/* In CLOSE/LISTEN state report the user-set MSS when there is one. */
		if (user_mss &&
		    ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
			val = user_mss;
		/* Repair mode takes precedence and reports the MSS clamp. */
		if (tp->repair)
			val = tp->rx_opt.mss_clamp;
		break;
	case TCP_NODELAY:
		val = !!(tp->nonagle&TCP_NAGLE_OFF);
		break;
	case TCP_CORK:
		val = !!(tp->nonagle&TCP_NAGLE_CORK);
		break;
	case TCP_KEEPIDLE:
		val = keepalive_time_when(tp) / HZ;
		break;
	case TCP_KEEPINTVL:
		val = keepalive_intvl_when(tp) / HZ;
		break;
	case TCP_KEEPCNT:
		val = keepalive_probes(tp);
		break;
	case TCP_SYNCNT:
		/* A zero per-socket value falls back to the per-netns sysctl. */
		val = READ_ONCE(icsk->icsk_syn_retries) ? :
			READ_ONCE(net->ipv4.sysctl_tcp_syn_retries);
		break;
	case TCP_LINGER2:
		val = READ_ONCE(tp->linger2);
		/* Negative values are returned unchanged; zero uses the sysctl. */
		if (val >= 0)
			val = (val ? : READ_ONCE(net->ipv4.sysctl_tcp_fin_timeout)) / HZ;
		break;
	case TCP_DEFER_ACCEPT:
		val = READ_ONCE(icsk->icsk_accept_queue.rskq_defer_accept);
		val = retrans_to_secs(val, TCP_TIMEOUT_INIT / HZ,
				      TCP_RTO_MAX / HZ);
		break;
	case TCP_WINDOW_CLAMP:
		val = READ_ONCE(tp->window_clamp);
		break;
	case TCP_INFO: {
		struct tcp_info info;

		/*
		 * Re-read the caller's length: len was clamped to sizeof(int)
		 * before the switch, which is too small for this payload.
		 */
		if (copy_from_sockptr(&len, optlen, sizeof(int)))
			return -EFAULT;

		tcp_get_info(sk, &info);

		len = min_t(unsigned int, len, sizeof(info));
		if (copy_to_sockptr(optlen, &len, sizeof(int)))
			return -EFAULT;
		if (copy_to_sockptr(optval, &info, len))
			return -EFAULT;
		return 0;
	}
	case TCP_CC_INFO: {
		const struct tcp_congestion_ops *ca_ops;
		union tcp_cc_info info;
		size_t sz = 0;
		int attr;

		if (copy_from_sockptr(&len, optlen, sizeof(int)))
			return -EFAULT;

		/* sz stays 0 (nothing copied) when there is no get_info hook. */
		ca_ops = icsk->icsk_ca_ops;
		if (ca_ops && ca_ops->get_info)
			sz = ca_ops->get_info(sk, ~0U, &attr, &info);

		len = min_t(unsigned int, len, sz);
		if (copy_to_sockptr(optlen, &len, sizeof(int)))
			return -EFAULT;
		if (copy_to_sockptr(optval, &info, len))
			return -EFAULT;
		return 0;
	}
	case TCP_QUICKACK:
		val = !inet_csk_in_pingpong_mode(sk);
		break;

	case TCP_CONGESTION:
		if (copy_from_sockptr(&len, optlen, sizeof(int)))
			return -EFAULT;
		len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
		if (copy_to_sockptr(optlen, &len, sizeof(int)))
			return -EFAULT;
		if (copy_to_sockptr(optval,
icsk->icsk_ca_ops->name, len))
			return -EFAULT;
		return 0;

	case TCP_ULP:
		if (copy_from_sockptr(&len, optlen, sizeof(int)))
			return -EFAULT;
		len = min_t(unsigned int, len, TCP_ULP_NAME_MAX);
		/* No ULP attached: report a zero length and copy no name. */
		if (!icsk->icsk_ulp_ops) {
			len = 0;
			if (copy_to_sockptr(optlen, &len, sizeof(int)))
				return -EFAULT;
			return 0;
		}
		if (copy_to_sockptr(optlen, &len, sizeof(int)))
			return -EFAULT;
		if (copy_to_sockptr(optval, icsk->icsk_ulp_ops->name, len))
			return -EFAULT;
		return 0;

	case TCP_FASTOPEN_KEY: {
		u64 key[TCP_FASTOPEN_KEY_BUF_LENGTH / sizeof(u64)];
		unsigned int key_len;

		if (copy_from_sockptr(&len, optlen, sizeof(int)))
			return -EFAULT;

		/* Helper's return value is scaled by the per-key length. */
		key_len = tcp_fastopen_get_cipher(net, icsk, key) *
				TCP_FASTOPEN_KEY_LENGTH;
		len = min_t(unsigned int, len, key_len);
		if (copy_to_sockptr(optlen, &len, sizeof(int)))
			return -EFAULT;
		if (copy_to_sockptr(optval, key, len))
			return -EFAULT;
		return 0;
	}
	case TCP_THIN_LINEAR_TIMEOUTS:
		val = tp->thin_lto;
		break;

	case TCP_THIN_DUPACK:
		val = 0;	/* always reported as 0 */
		break;

	case TCP_REPAIR:
		val = tp->repair;
		break;

	case TCP_REPAIR_QUEUE:
		if (tp->repair)
			val = tp->repair_queue;
		else
			return -EINVAL;
		break;

	case TCP_REPAIR_WINDOW: {
		struct tcp_repair_window opt;

		if (copy_from_sockptr(&len, optlen, sizeof(int)))
			return -EFAULT;

		/* The caller's buffer must match the structure size exactly. */
		if (len != sizeof(opt))
			return -EINVAL;

		if (!tp->repair)
			return -EPERM;

		opt.snd_wl1	= tp->snd_wl1;
		opt.snd_wnd	= tp->snd_wnd;
		opt.max_window	= tp->max_window;
		opt.rcv_wnd	= tp->rcv_wnd;
		opt.rcv_wup	= tp->rcv_wup;

		if (copy_to_sockptr(optval, &opt, len))
			return -EFAULT;
		return 0;
	}
	case TCP_QUEUE_SEQ:
		if (tp->repair_queue == TCP_SEND_QUEUE)
			val = tp->write_seq;
		else if (tp->repair_queue == TCP_RECV_QUEUE)
			val = tp->rcv_nxt;
		else
			return -EINVAL;
		break;

	case TCP_USER_TIMEOUT:
		val = READ_ONCE(icsk->icsk_user_timeout);
		break;

	case TCP_FASTOPEN:
		val = READ_ONCE(icsk->icsk_accept_queue.fastopenq.max_qlen);
		break;

	case TCP_FASTOPEN_CONNECT:
		val = tp->fastopen_connect;
		break;

	case TCP_FASTOPEN_NO_COOKIE:
		val = tp->fastopen_no_cookie;
		break;

	case TCP_TX_DELAY:
		val =
READ_ONCE(tp->tcp_tx_delay);
		break;

	case TCP_TIMESTAMP:
		val = tcp_clock_ts(tp->tcp_usec_ts) + READ_ONCE(tp->tsoffset);
		/* The low bit of the result mirrors tp->tcp_usec_ts. */
		if (tp->tcp_usec_ts)
			val |= 1;
		else
			val &= ~1;
		break;
	case TCP_NOTSENT_LOWAT:
		val = READ_ONCE(tp->notsent_lowat);
		break;
	case TCP_INQ:
		val = tp->recvmsg_inq;
		break;
	case TCP_SAVE_SYN:
		val = tp->save_syn;
		break;
	case TCP_SAVED_SYN: {
		if (copy_from_sockptr(&len, optlen, sizeof(int)))
			return -EFAULT;

		sockopt_lock_sock(sk);
		if (tp->saved_syn) {
			/*
			 * Buffer too small: report the required length through
			 * optlen and fail with -EINVAL; the saved SYN is kept.
			 */
			if (len < tcp_saved_syn_len(tp->saved_syn)) {
				len = tcp_saved_syn_len(tp->saved_syn);
				if (copy_to_sockptr(optlen, &len, sizeof(int))) {
					sockopt_release_sock(sk);
					return -EFAULT;
				}
				sockopt_release_sock(sk);
				return -EINVAL;
			}
			len = tcp_saved_syn_len(tp->saved_syn);
			if (copy_to_sockptr(optlen, &len, sizeof(int))) {
				sockopt_release_sock(sk);
				return -EFAULT;
			}
			if (copy_to_sockptr(optval, tp->saved_syn->data, len)) {
				sockopt_release_sock(sk);
				return -EFAULT;
			}
			/* The saved SYN is freed once it has been copied out. */
			tcp_saved_syn_free(tp);
			sockopt_release_sock(sk);
		} else {
			/* Nothing saved: report a zero length. */
			sockopt_release_sock(sk);
			len = 0;
			if (copy_to_sockptr(optlen, &len, sizeof(int)))
				return -EFAULT;
		}
		return 0;
	}
#ifdef CONFIG_MMU
	case TCP_ZEROCOPY_RECEIVE: {
		struct scm_timestamping_internal tss;
		struct tcp_zerocopy_receive zc = {};
		int err;

		if (copy_from_sockptr(&len, optlen, sizeof(int)))
			return -EFAULT;
		/* The caller must supply at least the fields up to 'length'. */
		if (len < 0 ||
		    len < offsetofend(struct tcp_zerocopy_receive, length))
			return -EINVAL;
		/*
		 * Caller's struct is larger than ours: accept it only when
		 * check_zeroed_sockptr() returns 1 for the extra bytes, then
		 * shrink len to our size and report that back.
		 */
		if (unlikely(len > sizeof(zc))) {
			err = check_zeroed_sockptr(optval, sizeof(zc),
						   len - sizeof(zc));
			if (err < 1)
				return err == 0 ?
-EINVAL : err;
			len = sizeof(zc);
			if (copy_to_sockptr(optlen, &len, sizeof(int)))
				return -EFAULT;
		}
		if (copy_from_sockptr(&zc, optval, len))
			return -EFAULT;
		if (zc.reserved)
			return -EINVAL;
		if (zc.msg_flags &  ~(TCP_VALID_ZC_MSG_FLAGS))
			return -EINVAL;
		sockopt_lock_sock(sk);
		err = tcp_zerocopy_receive(sk, &zc, &tss);
		err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname,
							  &zc, &len, err);
		sockopt_release_sock(sk);
		/*
		 * len selects the label to enter at; each label falls through
		 * to the ones below it, so a longer caller struct gets more
		 * trailing fields filled in before the copy-out.
		 */
		if (len >= offsetofend(struct tcp_zerocopy_receive, msg_flags))
			goto zerocopy_rcv_cmsg;
		switch (len) {
		case offsetofend(struct tcp_zerocopy_receive, msg_flags):
			goto zerocopy_rcv_cmsg;
		case offsetofend(struct tcp_zerocopy_receive, msg_controllen):
		case offsetofend(struct tcp_zerocopy_receive, msg_control):
		case offsetofend(struct tcp_zerocopy_receive, flags):
		case offsetofend(struct tcp_zerocopy_receive, copybuf_len):
		case offsetofend(struct tcp_zerocopy_receive, copybuf_address):
		case offsetofend(struct tcp_zerocopy_receive, err):
			goto zerocopy_rcv_sk_err;
		case offsetofend(struct tcp_zerocopy_receive, inq):
			goto zerocopy_rcv_inq;
		case offsetofend(struct tcp_zerocopy_receive, length):
		default:
			goto zerocopy_rcv_out;
		}
zerocopy_rcv_cmsg:
		if (zc.msg_flags & TCP_CMSG_TS)
			tcp_zc_finalize_rx_tstamp(sk, &zc, &tss);
		else
			zc.msg_flags = 0;
zerocopy_rcv_sk_err:
		if (!err)
			zc.err = sock_error(sk);
zerocopy_rcv_inq:
		zc.inq = tcp_inq_hint(sk);
zerocopy_rcv_out:
		/* The result struct is copied back only when err is 0. */
		if (!err && copy_to_sockptr(optval, &zc, len))
			err = -EFAULT;
		return err;
	}
#endif
	case TCP_AO_REPAIR:
		if (!tcp_can_repair_sock(sk))
			return -EPERM;
		return tcp_ao_get_repair(sk, optval, optlen);
	case TCP_AO_GET_KEYS:
	case TCP_AO_INFO: {
		int err;

		/* Both TCP-AO queries run with the socket lock held. */
		sockopt_lock_sock(sk);
		if (optname == TCP_AO_GET_KEYS)
			err = tcp_ao_get_mkts(sk, optval, optlen);
		else
			err = tcp_ao_get_sock_info(sk, optval, optlen);
		sockopt_release_sock(sk);

		return err;
	}
	case TCP_IS_MPTCP:
		val = 0;
		break;
	case TCP_RTO_MAX_MS:
		val = jiffies_to_msecs(tcp_rto_max(sk));
		break;
	case TCP_RTO_MIN_US:
		val =
jiffies_to_usecs(READ_ONCE(inet_csk(sk)->icsk_rto_min));
		break;
	case TCP_DELACK_MAX_US:
		val = jiffies_to_usecs(READ_ONCE(inet_csk(sk)->icsk_delack_max));
		break;
	default:
		return -ENOPROTOOPT;
	}

	/* Common tail for options that stored an integer result in val. */
	if (copy_to_sockptr(optlen, &len, sizeof(int)))
		return -EFAULT;
	if (copy_to_sockptr(optval, &val, len))
		return -EFAULT;
	return 0;
}

/*
 * Returns true only for SOL_TCP / TCP_ZEROCOPY_RECEIVE, false for every
 * other level/option pair.
 */
bool tcp_bpf_bypass_getsockopt(int level, int optname)
{
	/* TCP do_tcp_getsockopt has optimized getsockopt implementation
	 * to avoid extra socket lock for TCP_ZEROCOPY_RECEIVE.
	 */
	if (level == SOL_TCP && optname == TCP_ZEROCOPY_RECEIVE)
		return true;

	return false;
}
EXPORT_IPV6_MOD(tcp_bpf_bypass_getsockopt);

/*
 * getsockopt() entry point taking user-space pointers.
 *
 * Levels other than SOL_TCP are forwarded to the getsockopt handler of the
 * socket's icsk_af_ops; SOL_TCP requests go to do_tcp_getsockopt() with the
 * user pointers wrapped via USER_SOCKPTR().
 */
int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
		   int __user *optlen)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	if (level != SOL_TCP)
		/* Paired with WRITE_ONCE() in do_ipv6_setsockopt() and tcp_v6_connect() */
		return READ_ONCE(icsk->icsk_af_ops)->getsockopt(sk, level, optname,
								optval, optlen);
	return do_tcp_getsockopt(sk, level, optname, USER_SOCKPTR(optval),
				 USER_SOCKPTR(optlen));
}
EXPORT_IPV6_MOD(tcp_getsockopt);

#ifdef CONFIG_TCP_MD5SIG
/*
 * Feed the data of @skb that follows the first @header_len bytes of its TCP
 * header into the MD5 context @ctx: linear area, then page frags, then the
 * skbs on the frag list.
 */
void tcp_md5_hash_skb_data(struct md5_ctx *ctx, const struct sk_buff *skb,
			   unsigned int header_len)
{
	const unsigned int head_data_len = skb_headlen(skb) > header_len ?
skb_headlen(skb) - header_len : 0;
	const struct skb_shared_info *shi = skb_shinfo(skb);
	struct sk_buff *frag_iter;
	unsigned int i;

	/* Linear part beyond the header (length 0 if the head is shorter). */
	md5_update(ctx, (const u8 *)tcp_hdr(skb) + header_len, head_data_len);

	/* Page fragments: each page is mapped only while it is being hashed. */
	for (i = 0; i < shi->nr_frags; ++i) {
		const skb_frag_t *f = &shi->frags[i];
		u32 p_off, p_len, copied;
		const void *vaddr;
		struct page *p;

		skb_frag_foreach_page(f, skb_frag_off(f), skb_frag_size(f),
				      p, p_off, p_len, copied) {
			vaddr = kmap_local_page(p);
			md5_update(ctx, vaddr + p_off, p_len);
			kunmap_local(vaddr);
		}
	}

	/* Frag-list skbs are hashed recursively, with no header to skip. */
	skb_walk_frags(skb, frag_iter)
		tcp_md5_hash_skb_data(ctx, frag_iter, 0);
}
EXPORT_IPV6_MOD(tcp_md5_hash_skb_data);

/* Feed the bytes of @key (key->keylen of them) into the MD5 context @ctx. */
void tcp_md5_hash_key(struct md5_ctx *ctx,
		      const struct tcp_md5sig_key *key)
{
	u8 keylen = READ_ONCE(key->keylen); /* paired with WRITE_ONCE() in tcp_md5_do_add */

	/* We use data_race() because tcp_md5_do_add() might change
	 * key->key under us
	 */
	data_race(({ md5_update(ctx, key->key, keylen), 0; }));
}
EXPORT_IPV6_MOD(tcp_md5_hash_key);

/*
 * Validate the TCP-MD5 signature at @hash_location against the key found
 * for (@saddr, @family, @l3index) on @sk.
 *
 * Returns SKB_NOT_DROPPED_YET when the signature matches, otherwise an MD5
 * drop reason (the matching MIB counter and tracepoint are also updated).
 *
 * Called with rcu_read_lock()
 */
static enum skb_drop_reason
tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb,
		     const void *saddr, const void *daddr,
		     int family, int l3index, const __u8 *hash_location)
{
	/* This gets called for each TCP segment that has TCP-MD5 option.
	 * We have 2 drop cases:
	 * o An MD5 signature is present, but we're not expecting one.
	 * o The MD5 signature is wrong.
	 */
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	u8 newhash[16];

	key = tcp_md5_do_lookup(sk, l3index, saddr, family);
	if (!key) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
		trace_tcp_hash_md5_unexpected(sk, skb);
		return SKB_DROP_REASON_TCP_MD5UNEXPECTED;
	}

	/* Check the signature.
	 * To support dual stack listeners, we need to handle
	 * IPv4-mapped case.
	 */
	if (family == AF_INET)
		tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
	else
		tp->af_specific->calc_md5_hash(newhash, key, NULL, skb);
	/*
	 * NOTE(review): memcmp() is not a constant-time comparison; confirm
	 * whether a constant-time helper is wanted for this digest check.
	 */
	if (memcmp(hash_location, newhash, 16) != 0) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
		trace_tcp_hash_md5_mismatch(sk, skb);
		return SKB_DROP_REASON_TCP_MD5FAILURE;
	}
	return SKB_NOT_DROPPED_YET;
}
#else
/* Without CONFIG_TCP_MD5SIG no check is performed and nothing is dropped. */
static inline enum skb_drop_reason
tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb,
		     const void *saddr, const void *daddr,
		     int family, int l3index, const __u8 *hash_location)
{
	return SKB_NOT_DROPPED_YET;
}

#endif

/*
 * Check the authentication options (TCP-MD5 / TCP-AO) of an inbound segment.
 *
 * @req is optional; when set, a segment whose use of TCP-AO differs from
 * what tcp_rsk_used_ao(@req) reports is dropped.  @dif is used as the L3
 * index only when @sdif is non-zero.
 *
 * Returns SKB_NOT_DROPPED_YET if the segment may proceed, otherwise the
 * drop reason.
 *
 * Called with rcu_read_lock()
 */
enum skb_drop_reason
tcp_inbound_hash(struct sock *sk, const struct request_sock *req,
		 const struct sk_buff *skb,
		 const void *saddr, const void *daddr,
		 int family, int dif, int sdif)
{
	const struct tcphdr *th = tcp_hdr(skb);
	const struct tcp_ao_hdr *aoh;
	const __u8 *md5_location;
	int l3index;

	/* Invalid option or two times meet any of auth options */
	if (tcp_parse_auth_options(th, &md5_location, &aoh)) {
		trace_tcp_hash_bad_header(sk, skb);
		return SKB_DROP_REASON_TCP_AUTH_HDR;
	}

	if (req) {
		if (tcp_rsk_used_ao(req) != !!aoh) {
			u8 keyid, rnext, maclen;

			/* Values are gathered only for the tracepoint below. */
			if (aoh) {
				keyid = aoh->keyid;
				rnext = aoh->rnext_keyid;
				maclen = tcp_ao_hdr_maclen(aoh);
			} else {
				keyid = rnext = maclen = 0;
			}

			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOBAD);
			trace_tcp_ao_handshake_failure(sk, skb, keyid, rnext, maclen);
			return SKB_DROP_REASON_TCP_AOFAILURE;
		}
	}

	/* sdif set, means packet ingressed via a device
	 * in an L3 domain and dif is set to the l3mdev
	 */
	l3index = sdif ? dif : 0;

	/* Fast path: unsigned segments */
	if (likely(!md5_location && !aoh)) {
		/* Drop if there's TCP-MD5 or TCP-AO key with any rcvid/sndid
		 * for the remote peer. On TCP-AO established connection
		 * the last key is impossible to remove, so there's
		 * always at least one current_key.
*/
		if (tcp_ao_required(sk, saddr, family, l3index, true)) {
			trace_tcp_hash_ao_required(sk, skb);
			return SKB_DROP_REASON_TCP_AONOTFOUND;
		}
		if (unlikely(tcp_md5_do_lookup(sk, l3index, saddr, family))) {
			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
			trace_tcp_hash_md5_required(sk, skb);
			return SKB_DROP_REASON_TCP_MD5NOTFOUND;
		}
		return SKB_NOT_DROPPED_YET;
	}

	/* Signed segment: TCP-AO is checked in preference to TCP-MD5. */
	if (aoh)
		return tcp_inbound_ao_hash(sk, skb, family, req, l3index, aoh);

	return tcp_inbound_md5_hash(sk, skb, saddr, daddr, family,
				    l3index, md5_location);
}
EXPORT_IPV6_MOD_GPL(tcp_inbound_hash);

/*
 * Move @sk to TCP_CLOSE and finish it off: clear the transmit timers, drop
 * any fastopen request socket, set both shutdown bits, then either notify
 * the owner through sk_state_change() or, if the socket is already marked
 * SOCK_DEAD, destroy it.
 */
void tcp_done(struct sock *sk)
{
	struct request_sock *req;

	/* We might be called with a new socket, after
	 * inet_csk_prepare_forced_close() has been called
	 * so we can not use lockdep_sock_is_held(sk)
	 */
	req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, 1);

	/* A socket still in the handshake counts as a failed attempt. */
	if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
		TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);

	tcp_set_state(sk, TCP_CLOSE);
	tcp_clear_xmit_timers(sk);
	if (req)
		reqsk_fastopen_remove(sk, req, false);

	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_state_change(sk);
	else
		inet_csk_destroy_sock(sk);
}
EXPORT_SYMBOL_GPL(tcp_done);

/*
 * Abort the connection represented by @sk, reporting @err to it.
 *
 * Request sockets (TCP_NEW_SYN_RECV) are dropped from their listener's queue
 * and TIME_WAIT sockets are descheduled; both cases return 0 without
 * touching the socket lock.  For other sockets the socket lock is taken
 * unless a BPF context is active, a listener is stopped first, a reset is
 * sent when tcp_need_reset() says so, and the socket is finished with
 * tcp_done_with_error().
 *
 * Returns 0 on success, or -ENOENT if the socket is already in TCP_CLOSE.
 */
int tcp_abort(struct sock *sk, int err)
{
	int state = inet_sk_state_load(sk);

	if (state == TCP_NEW_SYN_RECV) {
		struct request_sock *req = inet_reqsk(sk);

		local_bh_disable();
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		local_bh_enable();
		return 0;
	}
	if (state == TCP_TIME_WAIT) {
		struct inet_timewait_sock *tw = inet_twsk(sk);

		/* Extra reference taken for inet_twsk_deschedule_put() below. */
		refcount_inc(&tw->tw_refcnt);
		local_bh_disable();
		inet_twsk_deschedule_put(tw);
		local_bh_enable();
		return 0;
	}

	/* BPF context ensures sock locking. */
	if (!has_current_bpf_ctx())
		/* Don't race with userspace socket closes such as tcp_close. */
		lock_sock(sk);

	/* Avoid closing the same socket twice.
*/
	if (sk->sk_state == TCP_CLOSE) {
		if (!has_current_bpf_ctx())
			release_sock(sk);
		return -ENOENT;
	}

	if (sk->sk_state == TCP_LISTEN) {
		tcp_set_state(sk, TCP_CLOSE);
		inet_csk_listen_stop(sk);
	}

	/* Don't race with BH socket closes such as inet_csk_listen_stop. */
	local_bh_disable();
	bh_lock_sock(sk);

	if (tcp_need_reset(sk->sk_state))
		tcp_send_active_reset(sk, GFP_ATOMIC,
				      SK_RST_REASON_TCP_STATE);
	tcp_done_with_error(sk, err);

	bh_unlock_sock(sk);
	local_bh_enable();
	if (!has_current_bpf_ctx())
		release_sock(sk);
	return 0;
}
EXPORT_SYMBOL_GPL(tcp_abort);

extern struct tcp_congestion_ops tcp_reno;

/* Value of the "thash_entries=" boot parameter; stays 0 when not given. */
static __initdata unsigned long thash_entries;

/*
 * Parse the "thash_entries=" boot parameter into thash_entries.
 * Returns 1 when the value was parsed, 0 when @str is NULL or kstrtoul()
 * rejects it.
 */
static int __init set_thash_entries(char *str)
{
	ssize_t ret;

	if (!str)
		return 0;

	ret = kstrtoul(str, 0, &thash_entries);
	if (ret)
		return 0;

	return 1;
}
__setup("thash_entries=", set_thash_entries);

/*
 * Initialise the three sysctl_tcp_mem thresholds from the number of free
 * buffer pages: the base limit is 1/16 of them (at least 128), entry 0 is
 * 3/4 of the limit, entry 1 the limit itself, and entry 2 twice entry 0.
 */
static void __init tcp_init_mem(void)
{
	unsigned long limit = nr_free_buffer_pages() / 16;

	limit = max(limit, 128UL);
	sysctl_tcp_mem[0] = limit / 4 * 3;		/* 4.68 % */
	sysctl_tcp_mem[1] = limit;			/* 6.25 % */
	sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;	/* 9.37 % */
}

/*
 * Assert, one CACHELINE_ASSERT_GROUP_MEMBER() per field, that the listed
 * struct tcp_sock members belong to the named cacheline group.  The section
 * comments give the access pattern each group is meant for.
 */
static void __init tcp_struct_check(void)
{
	/* TX read-mostly hotpath cache lines */
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, max_window);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, rcv_ssthresh);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, reordering);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, notsent_lowat);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, gso_segs);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, retransmit_skb_hint);

#if IS_ENABLED(CONFIG_TLS_DEVICE)
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, tcp_clean_acked);
#endif

	/* TXRX read-mostly hotpath cache lines */
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, tsoffset);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, snd_wnd);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, mss_cache);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, snd_cwnd);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, prr_out);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, lost_out);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, sacked_out);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, scaling_ratio);

	/* RX read-mostly hotpath cache lines */
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, copied_seq);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, snd_wl1);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, tlp_high_seq);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rttvar_us);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, retrans_out);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, advmss);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, urg_data);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, lost);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rtt_min);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, out_of_order_queue);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, snd_ssthresh);

	/* TX read-write hotpath cache lines */
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, segs_out);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, data_segs_out);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, bytes_sent);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, snd_sml);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, chrono_start);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, chrono_stat);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, write_seq);
	CACHELINE_ASSERT_GROUP_MEMBER(struct
tcp_sock, tcp_sock_write_tx, pushed_seq);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, lsndtime);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, mdev_us);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tcp_wstamp_ns);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, accecn_opt_tstamp);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, rtt_seq);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tsorted_sent_queue);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, highest_sack);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, ecn_flags);

	/* TXRX read-write hotpath cache lines */
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, pred_flags);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, tcp_clock_cache);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, tcp_mstamp);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_nxt);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_nxt);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_una);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, window_clamp);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, srtt_us);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, packets_out);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_up);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered_ce);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ce);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ecn_bytes);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, app_limited);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock,
tcp_sock_write_txrx, rcv_wnd);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_tstamp);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rx_opt);

	/* RX read-write hotpath cache lines */
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_received);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, segs_in);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, data_segs_in);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_wup);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, max_packets_out);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, cwnd_usage_seq);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rate_delivered);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rate_interval_us);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_last_tsecr);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, delivered_ecn_bytes);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, first_tx_mstamp);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, delivered_mstamp);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_acked);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_est);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcvq_space);
}

/*
 * Boot-time initialisation of the TCP layer: layout checks, the socket
 * counter and orphan timer, the hash tables and bind-bucket caches, the
 * memory and buffer-size defaults, and finally the TCP sub-components.
 */
void __init tcp_init(void)
{
	int max_rshare, max_wshare, cnt;
	unsigned long limit;
	unsigned int i;

	BUILD_BUG_ON(TCP_MIN_SND_MSS <= MAX_TCP_OPTION_SPACE);
	/* struct tcp_skb_cb must fit inside the skb control buffer. */
	BUILD_BUG_ON(sizeof(struct tcp_skb_cb) >
		     sizeof_field(struct sk_buff, cb));

	tcp_struct_check();

	percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);

	timer_setup(&tcp_orphan_timer, tcp_orphan_update, TIMER_DEFERRABLE);
	mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD);

	inet_hashinfo2_init(&tcp_hashinfo, "tcp_listen_portaddr_hash",
			    thash_entries, 21,  /* one
slot per 2 MB*/
			    0, 64 * 1024);
	/* SLAB_PANIC is passed, so the cache pointers are not NULL-checked. */
	tcp_hashinfo.bind_bucket_cachep =
		kmem_cache_create("tcp_bind_bucket",
				  sizeof(struct inet_bind_bucket), 0,
				  SLAB_HWCACHE_ALIGN | SLAB_PANIC |
				  SLAB_ACCOUNT,
				  NULL);
	tcp_hashinfo.bind2_bucket_cachep =
		kmem_cache_create("tcp_bind2_bucket",
				  sizeof(struct inet_bind2_bucket), 0,
				  SLAB_HWCACHE_ALIGN | SLAB_PANIC |
				  SLAB_ACCOUNT,
				  NULL);

	/* Size and allocate the main established and bind bucket
	 * hash tables.
	 *
	 * The methodology is similar to that of the buffer cache.
	 */
	tcp_hashinfo.ehash =
		alloc_large_system_hash("TCP established",
					sizeof(struct inet_ehash_bucket),
					thash_entries,
					17, /* one slot per 128 KB of memory */
					0,
					NULL,
					&tcp_hashinfo.ehash_mask,
					0,
					thash_entries ? 0 : 512 * 1024);
	/* Each chain's nulls value is its own bucket index. */
	for (i = 0; i <= tcp_hashinfo.ehash_mask; i++)
		INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);

	if (inet_ehash_locks_alloc(&tcp_hashinfo))
		panic("TCP: failed to alloc ehash_locks");
	/*
	 * One allocation with doubled bucket size holds both bind tables:
	 * bhash2 starts bhash_size buckets after bhash.
	 */
	tcp_hashinfo.bhash =
		alloc_large_system_hash("TCP bind",
					2 * sizeof(struct inet_bind_hashbucket),
					tcp_hashinfo.ehash_mask + 1,
					17, /* one slot per 128 KB of memory */
					0,
					&tcp_hashinfo.bhash_size,
					NULL,
					0,
					64 * 1024);
	/* The allocator stored a shift in bhash_size; convert it to a count. */
	tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size;
	tcp_hashinfo.bhash2 = tcp_hashinfo.bhash + tcp_hashinfo.bhash_size;
	for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
		spin_lock_init(&tcp_hashinfo.bhash[i].lock);
		INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
		spin_lock_init(&tcp_hashinfo.bhash2[i].lock);
		INIT_HLIST_HEAD(&tcp_hashinfo.bhash2[i].chain);
	}

	tcp_hashinfo.pernet = false;

	/* Default orphan limit: half the number of established-hash buckets. */
	cnt = tcp_hashinfo.ehash_mask + 1;
	sysctl_tcp_max_orphans = cnt / 2;

	tcp_init_mem();
	/* Set per-socket limits to no more than 1/128 the pressure threshold */
	limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
	max_wshare = min(4UL*1024*1024, limit);
	max_rshare = min(32UL*1024*1024, limit);

	init_net.ipv4.sysctl_tcp_wmem[0] = PAGE_SIZE;
	init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024;
	init_net.ipv4.sysctl_tcp_wmem[2] = max(64*1024, max_wshare);

	init_net.ipv4.sysctl_tcp_rmem[0] =
PAGE_SIZE;
	init_net.ipv4.sysctl_tcp_rmem[1] = 131072;
	init_net.ipv4.sysctl_tcp_rmem[2] = max(131072, max_rshare);

	pr_info("Hash tables configured (established %u bind %u)\n",
		tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);

	tcp_v4_init();
	tcp_metrics_init();
	/* Registering the built-in Reno congestion control must not fail. */
	BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0);
	tcp_tsq_work_init();
	mptcp_init();
}
|
| 12 4 6 3 3 13 1 6 6 39 1 23 5 8 2 35 2 12 2 21 33 1 1 2 10 3 3 2 5 8 7 1 2 1 3 9 1 1 1 6 3 1 9 5 2 1 11 11 11 11 11 11 16 1 20 1 20 1 37 1 22 15 5 6 5 1 13 13 2 9 7 6 4 4 11 11 6 7 4 3 3 1 2 104 61 52 2 39 2 7 5 3 3 4 3 4 3 4 4 4 1 2 1 1 1 1 1 1 1 1 1 1 3 2 3 9 1 103 100 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 
457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 
957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 | // SPDX-License-Identifier: GPL-2.0-or-later /* * User level driver support for input subsystem * * Heavily based on evdev.c by Vojtech Pavlik * * Author: Aristeu Sergio Rozanski Filho <aris@cathedrallabs.org> * * Changes/Revisions: * 0.4 01/09/2014 (Benjamin Tissoires <benjamin.tissoires@redhat.com>) * - add UI_GET_SYSNAME ioctl * 0.3 09/04/2006 (Anssi Hannula <anssi.hannula@gmail.com>) * - updated ff support for the changes in kernel interface * - added MODULE_VERSION * 0.2 16/10/2004 (Micah Dowty <micah@navi.cx>) * - added force feedback support * - added UI_SET_PHYS * 0.1 20/06/2002 * - first public version */ #include <uapi/linux/uinput.h> #include <linux/poll.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/init.h> #include <linux/fs.h> #include <linux/miscdevice.h> #include <linux/overflow.h> #include <linux/input/mt.h> #include "../input-compat.h" #define UINPUT_NAME "uinput" #define UINPUT_BUFFER_SIZE 16 #define UINPUT_NUM_REQUESTS 16 #define UINPUT_TIMESTAMP_ALLOWED_OFFSET_SECS 10 enum uinput_state { 
UIST_NEW_DEVICE, UIST_SETUP_COMPLETE, UIST_CREATED }; struct uinput_request { unsigned int id; unsigned int code; /* UI_FF_UPLOAD, UI_FF_ERASE */ int retval; struct completion done; union { unsigned int effect_id; struct { struct ff_effect *effect; struct ff_effect *old; } upload; } u; }; struct uinput_device { struct input_dev *dev; struct mutex mutex; enum uinput_state state; wait_queue_head_t waitq; unsigned char ready; unsigned char head; unsigned char tail; struct input_event buff[UINPUT_BUFFER_SIZE]; unsigned int ff_effects_max; struct uinput_request *requests[UINPUT_NUM_REQUESTS]; wait_queue_head_t requests_waitq; spinlock_t requests_lock; }; static int uinput_dev_event(struct input_dev *dev, unsigned int type, unsigned int code, int value) { struct uinput_device *udev = input_get_drvdata(dev); struct timespec64 ts; ktime_get_ts64(&ts); udev->buff[udev->head] = (struct input_event) { .input_event_sec = ts.tv_sec, .input_event_usec = ts.tv_nsec / NSEC_PER_USEC, .type = type, .code = code, .value = value, }; udev->head = (udev->head + 1) % UINPUT_BUFFER_SIZE; wake_up_interruptible(&udev->waitq); return 0; } /* Atomically allocate an ID for the given request. Returns 0 on success. */ static bool uinput_request_alloc_id(struct uinput_device *udev, struct uinput_request *request) { unsigned int id; bool reserved = false; spin_lock(&udev->requests_lock); for (id = 0; id < UINPUT_NUM_REQUESTS; id++) { if (!udev->requests[id]) { request->id = id; udev->requests[id] = request; reserved = true; break; } } spin_unlock(&udev->requests_lock); return reserved; } static struct uinput_request *uinput_request_find(struct uinput_device *udev, unsigned int id) { /* Find an input request, by ID. Returns NULL if the ID isn't valid. */ if (id >= UINPUT_NUM_REQUESTS) return NULL; return udev->requests[id]; } static int uinput_request_reserve_slot(struct uinput_device *udev, struct uinput_request *request) { /* Allocate slot. If none are available right away, wait. 
*/ return wait_event_interruptible(udev->requests_waitq, uinput_request_alloc_id(udev, request)); } static void uinput_request_release_slot(struct uinput_device *udev, unsigned int id) { /* Mark slot as available */ spin_lock(&udev->requests_lock); udev->requests[id] = NULL; spin_unlock(&udev->requests_lock); wake_up(&udev->requests_waitq); } static int uinput_request_send(struct uinput_device *udev, struct uinput_request *request) { int retval; retval = mutex_lock_interruptible(&udev->mutex); if (retval) return retval; if (udev->state != UIST_CREATED) { retval = -ENODEV; goto out; } init_completion(&request->done); /* * Tell our userspace application about this new request * by queueing an input event. */ uinput_dev_event(udev->dev, EV_UINPUT, request->code, request->id); out: mutex_unlock(&udev->mutex); return retval; } static int uinput_request_submit(struct uinput_device *udev, struct uinput_request *request) { int retval; retval = uinput_request_reserve_slot(udev, request); if (retval) return retval; retval = uinput_request_send(udev, request); if (retval) goto out; if (!wait_for_completion_timeout(&request->done, 30 * HZ)) { retval = -ETIMEDOUT; goto out; } retval = request->retval; out: uinput_request_release_slot(udev, request->id); return retval; } /* * Fail all outstanding requests so handlers don't wait for the userspace * to finish processing them. 
*/ static void uinput_flush_requests(struct uinput_device *udev) { struct uinput_request *request; int i; spin_lock(&udev->requests_lock); for (i = 0; i < UINPUT_NUM_REQUESTS; i++) { request = udev->requests[i]; if (request) { request->retval = -ENODEV; complete(&request->done); } } spin_unlock(&udev->requests_lock); } static void uinput_dev_set_gain(struct input_dev *dev, u16 gain) { uinput_dev_event(dev, EV_FF, FF_GAIN, gain); } static void uinput_dev_set_autocenter(struct input_dev *dev, u16 magnitude) { uinput_dev_event(dev, EV_FF, FF_AUTOCENTER, magnitude); } static int uinput_dev_playback(struct input_dev *dev, int effect_id, int value) { return uinput_dev_event(dev, EV_FF, effect_id, value); } static int uinput_dev_upload_effect(struct input_dev *dev, struct ff_effect *effect, struct ff_effect *old) { struct uinput_device *udev = input_get_drvdata(dev); struct uinput_request request; /* * uinput driver does not currently support periodic effects with * custom waveform since it does not have a way to pass buffer of * samples (custom_data) to userspace. If ever there is a device * supporting custom waveforms we would need to define an additional * ioctl (UI_UPLOAD_SAMPLES) but for now we just bail out. 
*/ if (effect->type == FF_PERIODIC && effect->u.periodic.waveform == FF_CUSTOM) return -EINVAL; request.code = UI_FF_UPLOAD; request.u.upload.effect = effect; request.u.upload.old = old; return uinput_request_submit(udev, &request); } static int uinput_dev_erase_effect(struct input_dev *dev, int effect_id) { struct uinput_device *udev = input_get_drvdata(dev); struct uinput_request request; if (!test_bit(EV_FF, dev->evbit)) return -ENOSYS; request.code = UI_FF_ERASE; request.u.effect_id = effect_id; return uinput_request_submit(udev, &request); } static int uinput_dev_flush(struct input_dev *dev, struct file *file) { /* * If we are called with file == NULL that means we are tearing * down the device, and therefore we can not handle FF erase * requests: either we are handling UI_DEV_DESTROY (and holding * the udev->mutex), or the file descriptor is closed and there is * nobody on the other side anymore. */ return file ? input_ff_flush(dev, file) : 0; } static void uinput_destroy_device(struct uinput_device *udev) { const char *name, *phys; struct input_dev *dev = udev->dev; enum uinput_state old_state = udev->state; udev->state = UIST_NEW_DEVICE; if (dev) { name = dev->name; phys = dev->phys; if (old_state == UIST_CREATED) { uinput_flush_requests(udev); input_unregister_device(dev); } else { input_free_device(dev); } kfree(name); kfree(phys); udev->dev = NULL; } } static int uinput_create_device(struct uinput_device *udev) { struct input_dev *dev = udev->dev; int error, nslot; if (udev->state != UIST_SETUP_COMPLETE) { printk(KERN_DEBUG "%s: write device info first\n", UINPUT_NAME); return -EINVAL; } if (test_bit(EV_ABS, dev->evbit)) { input_alloc_absinfo(dev); if (!dev->absinfo) { error = -EINVAL; goto fail1; } if (test_bit(ABS_MT_SLOT, dev->absbit)) { nslot = input_abs_get_max(dev, ABS_MT_SLOT) + 1; error = input_mt_init_slots(dev, nslot, 0); if (error) goto fail1; } else if (test_bit(ABS_MT_POSITION_X, dev->absbit)) { input_set_events_per_packet(dev, 60); } } if 
(test_bit(EV_FF, dev->evbit) && !udev->ff_effects_max) { printk(KERN_DEBUG "%s: ff_effects_max should be non-zero when FF_BIT is set\n", UINPUT_NAME); error = -EINVAL; goto fail1; } if (udev->ff_effects_max) { error = input_ff_create(dev, udev->ff_effects_max); if (error) goto fail1; dev->ff->upload = uinput_dev_upload_effect; dev->ff->erase = uinput_dev_erase_effect; dev->ff->playback = uinput_dev_playback; dev->ff->set_gain = uinput_dev_set_gain; dev->ff->set_autocenter = uinput_dev_set_autocenter; /* * The standard input_ff_flush() implementation does * not quite work for uinput as we can't reasonably * handle FF requests during device teardown. */ dev->flush = uinput_dev_flush; } dev->event = uinput_dev_event; input_set_drvdata(udev->dev, udev); error = input_register_device(udev->dev); if (error) goto fail2; udev->state = UIST_CREATED; return 0; fail2: input_ff_destroy(dev); fail1: uinput_destroy_device(udev); return error; } static int uinput_open(struct inode *inode, struct file *file) { struct uinput_device *newdev; newdev = kzalloc(sizeof(*newdev), GFP_KERNEL); if (!newdev) return -ENOMEM; mutex_init(&newdev->mutex); spin_lock_init(&newdev->requests_lock); init_waitqueue_head(&newdev->requests_waitq); init_waitqueue_head(&newdev->waitq); newdev->state = UIST_NEW_DEVICE; file->private_data = newdev; stream_open(inode, file); return 0; } static int uinput_validate_absinfo(struct input_dev *dev, unsigned int code, const struct input_absinfo *abs) { int min, max, range; min = abs->minimum; max = abs->maximum; if ((min != 0 || max != 0) && max < min) { printk(KERN_DEBUG "%s: invalid abs[%02x] min:%d max:%d\n", UINPUT_NAME, code, min, max); return -EINVAL; } if (!check_sub_overflow(max, min, &range) && abs->flat > range) { printk(KERN_DEBUG "%s: abs_flat #%02x out of range: %d (min:%d/max:%d)\n", UINPUT_NAME, code, abs->flat, min, max); return -EINVAL; } /* * Limit number of contacts to a reasonable value (100). 
This * ensures that we need less than 2 pages for struct input_mt * (we are not using in-kernel slot assignment so not going to * allocate memory for the "red" table), and we should have no * trouble getting this much memory. */ if (code == ABS_MT_SLOT && max > 99) { printk(KERN_DEBUG "%s: unreasonably large number of slots requested: %d\n", UINPUT_NAME, max); return -EINVAL; } return 0; } static int uinput_validate_absbits(struct input_dev *dev) { unsigned int cnt; int error; if (!test_bit(EV_ABS, dev->evbit)) return 0; /* * Check if absmin/absmax/absfuzz/absflat are sane. */ for_each_set_bit(cnt, dev->absbit, ABS_CNT) { if (!dev->absinfo) return -EINVAL; error = uinput_validate_absinfo(dev, cnt, &dev->absinfo[cnt]); if (error) return error; } return 0; } static int uinput_dev_setup(struct uinput_device *udev, struct uinput_setup __user *arg) { struct uinput_setup setup; struct input_dev *dev; if (udev->state == UIST_CREATED) return -EINVAL; if (copy_from_user(&setup, arg, sizeof(setup))) return -EFAULT; if (!setup.name[0]) return -EINVAL; dev = udev->dev; dev->id = setup.id; udev->ff_effects_max = setup.ff_effects_max; kfree(dev->name); dev->name = kstrndup(setup.name, UINPUT_MAX_NAME_SIZE, GFP_KERNEL); if (!dev->name) return -ENOMEM; udev->state = UIST_SETUP_COMPLETE; return 0; } static int uinput_abs_setup(struct uinput_device *udev, struct uinput_setup __user *arg, size_t size) { struct uinput_abs_setup setup = {}; struct input_dev *dev; int error; if (size > sizeof(setup)) return -E2BIG; if (udev->state == UIST_CREATED) return -EINVAL; if (copy_from_user(&setup, arg, size)) return -EFAULT; if (setup.code > ABS_MAX) return -ERANGE; dev = udev->dev; error = uinput_validate_absinfo(dev, setup.code, &setup.absinfo); if (error) return error; input_alloc_absinfo(dev); if (!dev->absinfo) return -ENOMEM; set_bit(setup.code, dev->absbit); dev->absinfo[setup.code] = setup.absinfo; return 0; } /* legacy setup via write() */ static int uinput_setup_device_legacy(struct 
uinput_device *udev, const char __user *buffer, size_t count) { struct uinput_user_dev *user_dev; struct input_dev *dev; int i; int retval; if (count != sizeof(struct uinput_user_dev)) return -EINVAL; if (!udev->dev) { udev->dev = input_allocate_device(); if (!udev->dev) return -ENOMEM; } dev = udev->dev; user_dev = memdup_user(buffer, sizeof(struct uinput_user_dev)); if (IS_ERR(user_dev)) return PTR_ERR(user_dev); udev->ff_effects_max = user_dev->ff_effects_max; /* Ensure name is filled in */ if (!user_dev->name[0]) { retval = -EINVAL; goto exit; } kfree(dev->name); dev->name = kstrndup(user_dev->name, UINPUT_MAX_NAME_SIZE, GFP_KERNEL); if (!dev->name) { retval = -ENOMEM; goto exit; } dev->id.bustype = user_dev->id.bustype; dev->id.vendor = user_dev->id.vendor; dev->id.product = user_dev->id.product; dev->id.version = user_dev->id.version; for (i = 0; i < ABS_CNT; i++) { input_abs_set_max(dev, i, user_dev->absmax[i]); input_abs_set_min(dev, i, user_dev->absmin[i]); input_abs_set_fuzz(dev, i, user_dev->absfuzz[i]); input_abs_set_flat(dev, i, user_dev->absflat[i]); } retval = uinput_validate_absbits(dev); if (retval < 0) goto exit; udev->state = UIST_SETUP_COMPLETE; retval = count; exit: kfree(user_dev); return retval; } /* * Returns true if the given timestamp is valid (i.e., if all the following * conditions are satisfied), false otherwise. 
* 1) given timestamp is positive * 2) it's within the allowed offset before the current time * 3) it's not in the future */ static bool is_valid_timestamp(const ktime_t timestamp) { ktime_t zero_time; ktime_t current_time; ktime_t min_time; ktime_t offset; zero_time = ktime_set(0, 0); if (ktime_compare(zero_time, timestamp) >= 0) return false; current_time = ktime_get(); offset = ktime_set(UINPUT_TIMESTAMP_ALLOWED_OFFSET_SECS, 0); min_time = ktime_sub(current_time, offset); if (ktime_after(min_time, timestamp) || ktime_after(timestamp, current_time)) return false; return true; } static ssize_t uinput_inject_events(struct uinput_device *udev, const char __user *buffer, size_t count) { struct input_event ev; size_t bytes = 0; ktime_t timestamp; if (count != 0 && count < input_event_size()) return -EINVAL; while (bytes + input_event_size() <= count) { /* * Note that even if some events were fetched successfully * we are still going to return EFAULT instead of partial * count to let userspace know that it got it's buffers * all wrong. */ if (input_event_from_user(buffer + bytes, &ev)) return -EFAULT; timestamp = ktime_set(ev.input_event_sec, ev.input_event_usec * NSEC_PER_USEC); if (is_valid_timestamp(timestamp)) input_set_timestamp(udev->dev, timestamp); input_event(udev->dev, ev.type, ev.code, ev.value); bytes += input_event_size(); cond_resched(); } return bytes; } static ssize_t uinput_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { struct uinput_device *udev = file->private_data; int retval; if (count == 0) return 0; retval = mutex_lock_interruptible(&udev->mutex); if (retval) return retval; retval = udev->state == UIST_CREATED ? 
uinput_inject_events(udev, buffer, count) : uinput_setup_device_legacy(udev, buffer, count); mutex_unlock(&udev->mutex); return retval; } static bool uinput_fetch_next_event(struct uinput_device *udev, struct input_event *event) { bool have_event; spin_lock_irq(&udev->dev->event_lock); have_event = udev->head != udev->tail; if (have_event) { *event = udev->buff[udev->tail]; udev->tail = (udev->tail + 1) % UINPUT_BUFFER_SIZE; } spin_unlock_irq(&udev->dev->event_lock); return have_event; } static ssize_t uinput_events_to_user(struct uinput_device *udev, char __user *buffer, size_t count) { struct input_event event; size_t read = 0; while (read + input_event_size() <= count && uinput_fetch_next_event(udev, &event)) { if (input_event_to_user(buffer + read, &event)) return -EFAULT; read += input_event_size(); } return read; } static ssize_t uinput_read(struct file *file, char __user *buffer, size_t count, loff_t *ppos) { struct uinput_device *udev = file->private_data; ssize_t retval; if (count != 0 && count < input_event_size()) return -EINVAL; do { retval = mutex_lock_interruptible(&udev->mutex); if (retval) return retval; if (udev->state != UIST_CREATED) retval = -ENODEV; else if (udev->head == udev->tail && (file->f_flags & O_NONBLOCK)) retval = -EAGAIN; else retval = uinput_events_to_user(udev, buffer, count); mutex_unlock(&udev->mutex); if (retval || count == 0) break; if (!(file->f_flags & O_NONBLOCK)) retval = wait_event_interruptible(udev->waitq, udev->head != udev->tail || udev->state != UIST_CREATED); } while (retval == 0); return retval; } static __poll_t uinput_poll(struct file *file, poll_table *wait) { struct uinput_device *udev = file->private_data; __poll_t mask = EPOLLOUT | EPOLLWRNORM; /* uinput is always writable */ poll_wait(file, &udev->waitq, wait); if (udev->head != udev->tail) mask |= EPOLLIN | EPOLLRDNORM; return mask; } static int uinput_release(struct inode *inode, struct file *file) { struct uinput_device *udev = file->private_data; 
uinput_destroy_device(udev); kfree(udev); return 0; } #ifdef CONFIG_COMPAT struct uinput_ff_upload_compat { __u32 request_id; __s32 retval; struct ff_effect_compat effect; struct ff_effect_compat old; }; static int uinput_ff_upload_to_user(char __user *buffer, const struct uinput_ff_upload *ff_up) { if (in_compat_syscall()) { struct uinput_ff_upload_compat ff_up_compat; memset(&ff_up_compat, 0, sizeof(ff_up_compat)); ff_up_compat.request_id = ff_up->request_id; ff_up_compat.retval = ff_up->retval; /* * It so happens that the pointer that gives us the trouble * is the last field in the structure. Since we don't support * custom waveforms in uinput anyway we can just copy the whole * thing (to the compat size) and ignore the pointer. */ memcpy(&ff_up_compat.effect, &ff_up->effect, sizeof(struct ff_effect_compat)); memcpy(&ff_up_compat.old, &ff_up->old, sizeof(struct ff_effect_compat)); if (copy_to_user(buffer, &ff_up_compat, sizeof(struct uinput_ff_upload_compat))) return -EFAULT; } else { if (copy_to_user(buffer, ff_up, sizeof(struct uinput_ff_upload))) return -EFAULT; } return 0; } static int uinput_ff_upload_from_user(const char __user *buffer, struct uinput_ff_upload *ff_up) { if (in_compat_syscall()) { struct uinput_ff_upload_compat ff_up_compat; if (copy_from_user(&ff_up_compat, buffer, sizeof(struct uinput_ff_upload_compat))) return -EFAULT; ff_up->request_id = ff_up_compat.request_id; ff_up->retval = ff_up_compat.retval; memcpy(&ff_up->effect, &ff_up_compat.effect, sizeof(struct ff_effect_compat)); memcpy(&ff_up->old, &ff_up_compat.old, sizeof(struct ff_effect_compat)); } else { if (copy_from_user(ff_up, buffer, sizeof(struct uinput_ff_upload))) return -EFAULT; } return 0; } #else static int uinput_ff_upload_to_user(char __user *buffer, const struct uinput_ff_upload *ff_up) { if (copy_to_user(buffer, ff_up, sizeof(struct uinput_ff_upload))) return -EFAULT; return 0; } static int uinput_ff_upload_from_user(const char __user *buffer, struct uinput_ff_upload 
*ff_up) { if (copy_from_user(ff_up, buffer, sizeof(struct uinput_ff_upload))) return -EFAULT; return 0; } #endif #define uinput_set_bit(_arg, _bit, _max) \ ({ \ int __ret = 0; \ if (udev->state == UIST_CREATED) \ __ret = -EINVAL; \ else if ((_arg) > (_max)) \ __ret = -EINVAL; \ else set_bit((_arg), udev->dev->_bit); \ __ret; \ }) static int uinput_str_to_user(void __user *dest, const char *str, unsigned int maxlen) { char __user *p = dest; int len, ret; if (!str) return -ENOENT; if (maxlen == 0) return -EINVAL; len = strlen(str) + 1; if (len > maxlen) len = maxlen; ret = copy_to_user(p, str, len); if (ret) return -EFAULT; /* force terminating '\0' */ ret = put_user(0, p + len - 1); return ret ? -EFAULT : len; } static long uinput_ioctl_handler(struct file *file, unsigned int cmd, unsigned long arg, void __user *p) { int retval; struct uinput_device *udev = file->private_data; struct uinput_ff_upload ff_up; struct uinput_ff_erase ff_erase; struct uinput_request *req; char *phys; const char *name; unsigned int size; retval = mutex_lock_interruptible(&udev->mutex); if (retval) return retval; if (!udev->dev) { udev->dev = input_allocate_device(); if (!udev->dev) { retval = -ENOMEM; goto out; } } switch (cmd) { case UI_GET_VERSION: if (put_user(UINPUT_VERSION, (unsigned int __user *)p)) retval = -EFAULT; goto out; case UI_DEV_CREATE: retval = uinput_create_device(udev); goto out; case UI_DEV_DESTROY: uinput_destroy_device(udev); goto out; case UI_DEV_SETUP: retval = uinput_dev_setup(udev, p); goto out; /* UI_ABS_SETUP is handled in the variable size ioctls */ case UI_SET_EVBIT: retval = uinput_set_bit(arg, evbit, EV_MAX); goto out; case UI_SET_KEYBIT: retval = uinput_set_bit(arg, keybit, KEY_MAX); goto out; case UI_SET_RELBIT: retval = uinput_set_bit(arg, relbit, REL_MAX); goto out; case UI_SET_ABSBIT: retval = uinput_set_bit(arg, absbit, ABS_MAX); goto out; case UI_SET_MSCBIT: retval = uinput_set_bit(arg, mscbit, MSC_MAX); goto out; case UI_SET_LEDBIT: retval = 
uinput_set_bit(arg, ledbit, LED_MAX); goto out; case UI_SET_SNDBIT: retval = uinput_set_bit(arg, sndbit, SND_MAX); goto out; case UI_SET_FFBIT: retval = uinput_set_bit(arg, ffbit, FF_MAX); goto out; case UI_SET_SWBIT: retval = uinput_set_bit(arg, swbit, SW_MAX); goto out; case UI_SET_PROPBIT: retval = uinput_set_bit(arg, propbit, INPUT_PROP_MAX); goto out; case UI_SET_PHYS: if (udev->state == UIST_CREATED) { retval = -EINVAL; goto out; } phys = strndup_user(p, 1024); if (IS_ERR(phys)) { retval = PTR_ERR(phys); goto out; } kfree(udev->dev->phys); udev->dev->phys = phys; goto out; case UI_BEGIN_FF_UPLOAD: retval = uinput_ff_upload_from_user(p, &ff_up); if (retval) goto out; req = uinput_request_find(udev, ff_up.request_id); if (!req || req->code != UI_FF_UPLOAD || !req->u.upload.effect) { retval = -EINVAL; goto out; } ff_up.retval = 0; ff_up.effect = *req->u.upload.effect; if (req->u.upload.old) ff_up.old = *req->u.upload.old; else memset(&ff_up.old, 0, sizeof(struct ff_effect)); retval = uinput_ff_upload_to_user(p, &ff_up); goto out; case UI_BEGIN_FF_ERASE: if (copy_from_user(&ff_erase, p, sizeof(ff_erase))) { retval = -EFAULT; goto out; } req = uinput_request_find(udev, ff_erase.request_id); if (!req || req->code != UI_FF_ERASE) { retval = -EINVAL; goto out; } ff_erase.retval = 0; ff_erase.effect_id = req->u.effect_id; if (copy_to_user(p, &ff_erase, sizeof(ff_erase))) { retval = -EFAULT; goto out; } goto out; case UI_END_FF_UPLOAD: retval = uinput_ff_upload_from_user(p, &ff_up); if (retval) goto out; req = uinput_request_find(udev, ff_up.request_id); if (!req || req->code != UI_FF_UPLOAD || !req->u.upload.effect) { retval = -EINVAL; goto out; } req->retval = ff_up.retval; complete(&req->done); goto out; case UI_END_FF_ERASE: if (copy_from_user(&ff_erase, p, sizeof(ff_erase))) { retval = -EFAULT; goto out; } req = uinput_request_find(udev, ff_erase.request_id); if (!req || req->code != UI_FF_ERASE) { retval = -EINVAL; goto out; } req->retval = ff_erase.retval; 
complete(&req->done); goto out; } size = _IOC_SIZE(cmd); /* Now check variable-length commands */ switch (cmd & ~IOCSIZE_MASK) { case UI_GET_SYSNAME(0): if (udev->state != UIST_CREATED) { retval = -ENOENT; goto out; } name = dev_name(&udev->dev->dev); retval = uinput_str_to_user(p, name, size); goto out; case UI_ABS_SETUP & ~IOCSIZE_MASK: retval = uinput_abs_setup(udev, p, size); goto out; } retval = -EINVAL; out: mutex_unlock(&udev->mutex); return retval; } static long uinput_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { return uinput_ioctl_handler(file, cmd, arg, (void __user *)arg); } #ifdef CONFIG_COMPAT /* * These IOCTLs change their size and thus their numbers between * 32 and 64 bits. */ #define UI_SET_PHYS_COMPAT \ _IOW(UINPUT_IOCTL_BASE, 108, compat_uptr_t) #define UI_BEGIN_FF_UPLOAD_COMPAT \ _IOWR(UINPUT_IOCTL_BASE, 200, struct uinput_ff_upload_compat) #define UI_END_FF_UPLOAD_COMPAT \ _IOW(UINPUT_IOCTL_BASE, 201, struct uinput_ff_upload_compat) static long uinput_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { switch (cmd) { case UI_SET_PHYS_COMPAT: cmd = UI_SET_PHYS; break; case UI_BEGIN_FF_UPLOAD_COMPAT: cmd = UI_BEGIN_FF_UPLOAD; break; case UI_END_FF_UPLOAD_COMPAT: cmd = UI_END_FF_UPLOAD; break; } return uinput_ioctl_handler(file, cmd, arg, compat_ptr(arg)); } #endif static const struct file_operations uinput_fops = { .owner = THIS_MODULE, .open = uinput_open, .release = uinput_release, .read = uinput_read, .write = uinput_write, .poll = uinput_poll, .unlocked_ioctl = uinput_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = uinput_compat_ioctl, #endif }; static struct miscdevice uinput_misc = { .fops = &uinput_fops, .minor = UINPUT_MINOR, .name = UINPUT_NAME, }; module_misc_device(uinput_misc); MODULE_ALIAS_MISCDEV(UINPUT_MINOR); MODULE_ALIAS("devname:" UINPUT_NAME); MODULE_AUTHOR("Aristeu Sergio Rozanski Filho"); MODULE_DESCRIPTION("User level driver support for input subsystem"); MODULE_LICENSE("GPL"); |
| 88 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_FRAG_H__ #define __NET_FRAG_H__ #include <linux/rhashtable-types.h> #include <linux/completion.h> #include <linux/in6.h> #include <linux/rbtree_types.h> #include <linux/refcount.h> #include <net/dropreason-core.h> /* Per netns frag queues directory */ struct fqdir { /* sysctls */ long high_thresh; long low_thresh; int timeout; int max_dist; struct inet_frags *f; struct net *net; bool dead; struct rhashtable rhashtable ____cacheline_aligned_in_smp; /* Keep atomic mem on separate cachelines in structs that include it */ atomic_long_t mem ____cacheline_aligned_in_smp; struct work_struct destroy_work; struct llist_node free_list; }; /** * enum: fragment queue flags * * @INET_FRAG_FIRST_IN: first fragment has arrived * @INET_FRAG_LAST_IN: final fragment has arrived * @INET_FRAG_COMPLETE: frag queue has been processed and is due for destruction * @INET_FRAG_HASH_DEAD: inet_frag_kill() has not removed fq from rhashtable * @INET_FRAG_DROP: if skbs must be dropped (instead of being consumed) */ enum { INET_FRAG_FIRST_IN = BIT(0), INET_FRAG_LAST_IN = BIT(1), INET_FRAG_COMPLETE = BIT(2), INET_FRAG_HASH_DEAD = BIT(3), INET_FRAG_DROP = BIT(4), }; struct frag_v4_compare_key { __be32 saddr; __be32 daddr; u32 user; u32 vif; __be16 id; u16 protocol; }; struct frag_v6_compare_key { struct in6_addr saddr; struct 
in6_addr daddr; u32 user; __be32 id; u32 iif; }; /** * struct inet_frag_queue - fragment queue * * @node: rhash node * @key: keys identifying this frag. * @timer: queue expiration timer * @lock: spinlock protecting this frag * @refcnt: reference count of the queue * @rb_fragments: received fragments rb-tree root * @fragments_tail: received fragments tail * @last_run_head: the head of the last "run". see ip_fragment.c * @stamp: timestamp of the last received fragment * @len: total length of the original datagram * @meat: length of received fragments so far * @tstamp_type: stamp has a mono delivery time (EDT) * @flags: fragment queue flags * @max_size: maximum received fragment size * @fqdir: pointer to struct fqdir * @rcu: rcu head for freeing deferall */ struct inet_frag_queue { struct rhash_head node; union { struct frag_v4_compare_key v4; struct frag_v6_compare_key v6; } key; struct timer_list timer; spinlock_t lock; refcount_t refcnt; struct rb_root rb_fragments; struct sk_buff *fragments_tail; struct sk_buff *last_run_head; ktime_t stamp; int len; int meat; u8 tstamp_type; __u8 flags; u16 max_size; struct fqdir *fqdir; struct rcu_head rcu; }; struct inet_frags { unsigned int qsize; void (*constructor)(struct inet_frag_queue *q, const void *arg); void (*destructor)(struct inet_frag_queue *); void (*frag_expire)(struct timer_list *t); struct kmem_cache *frags_cachep; const char *frags_cache_name; struct rhashtable_params rhash_params; refcount_t refcnt; struct completion completion; }; int inet_frags_init(struct inet_frags *); void inet_frags_fini(struct inet_frags *); int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net); void fqdir_pre_exit(struct fqdir *fqdir); void fqdir_exit(struct fqdir *fqdir); void inet_frag_kill(struct inet_frag_queue *q, int *refs); void inet_frag_destroy(struct inet_frag_queue *q); struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key); void inet_frag_queue_flush(struct inet_frag_queue *q, enum 
skb_drop_reason reason); static inline void inet_frag_putn(struct inet_frag_queue *q, int refs) { if (refs && refcount_sub_and_test(refs, &q->refcnt)) inet_frag_destroy(q); } /* Memory Tracking Functions. */ static inline long frag_mem_limit(const struct fqdir *fqdir) { return atomic_long_read(&fqdir->mem); } static inline void sub_frag_mem_limit(struct fqdir *fqdir, long val) { atomic_long_sub(val, &fqdir->mem); } static inline void add_frag_mem_limit(struct fqdir *fqdir, long val) { atomic_long_add(val, &fqdir->mem); } /* RFC 3168 support : * We want to check ECN values of all fragments, do detect invalid combinations. * In ipq->ecn, we store the OR value of each ip4_frag_ecn() fragment value. */ #define IPFRAG_ECN_NOT_ECT 0x01 /* one frag had ECN_NOT_ECT */ #define IPFRAG_ECN_ECT_1 0x02 /* one frag had ECN_ECT_1 */ #define IPFRAG_ECN_ECT_0 0x04 /* one frag had ECN_ECT_0 */ #define IPFRAG_ECN_CE 0x08 /* one frag had ECN_CE */ extern const u8 ip_frag_ecn_table[16]; /* Return values of inet_frag_queue_insert() */ #define IPFRAG_OK 0 #define IPFRAG_DUP 1 #define IPFRAG_OVERLAP 2 int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb, int offset, int end); void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb, struct sk_buff *parent); void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head, void *reasm_data, bool try_coalesce); struct sk_buff *inet_frag_pull_head(struct inet_frag_queue *q); #endif |
| 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_SPECCTRL_H_ #define _ASM_X86_SPECCTRL_H_ #include <linux/thread_info.h> #include <asm/nospec-branch.h> #include <asm/msr.h> /* * On VMENTER we must preserve whatever view of the SPEC_CTRL MSR * the guest has, while on VMEXIT we restore the host view. This * would be easier if SPEC_CTRL were architecturally maskable or * shadowable for guests but this is not (currently) the case. * Takes the guest view of SPEC_CTRL MSR as a parameter and also * the guest's version of VIRT_SPEC_CTRL, if emulated. */ extern void x86_virt_spec_ctrl(u64 guest_virt_spec_ctrl, bool guest); /** * x86_spec_ctrl_set_guest - Set speculation control registers for the guest * @guest_spec_ctrl: The guest content of MSR_SPEC_CTRL * @guest_virt_spec_ctrl: The guest controlled bits of MSR_VIRT_SPEC_CTRL * (may get translated to MSR_AMD64_LS_CFG bits) * * Avoids writing to the MSR if the content/bits are the same */ static inline void x86_spec_ctrl_set_guest(u64 guest_virt_spec_ctrl) { x86_virt_spec_ctrl(guest_virt_spec_ctrl, true); } /** * x86_spec_ctrl_restore_host - Restore host speculation control registers * @guest_spec_ctrl: The guest content of MSR_SPEC_CTRL * @guest_virt_spec_ctrl: The guest controlled bits of MSR_VIRT_SPEC_CTRL * (may get translated to MSR_AMD64_LS_CFG bits) * * Avoids writing to the MSR if the content/bits are the same */ static inline void x86_spec_ctrl_restore_host(u64 guest_virt_spec_ctrl) { x86_virt_spec_ctrl(guest_virt_spec_ctrl, false); } /* AMD specific Speculative Store Bypass MSR data */ extern u64 x86_amd_ls_cfg_base; extern u64 x86_amd_ls_cfg_ssbd_mask; static inline u64 ssbd_tif_to_spec_ctrl(u64 tifn) { 
BUILD_BUG_ON(TIF_SSBD < SPEC_CTRL_SSBD_SHIFT); return (tifn & _TIF_SSBD) >> (TIF_SSBD - SPEC_CTRL_SSBD_SHIFT); } static inline u64 stibp_tif_to_spec_ctrl(u64 tifn) { BUILD_BUG_ON(TIF_SPEC_IB < SPEC_CTRL_STIBP_SHIFT); return (tifn & _TIF_SPEC_IB) >> (TIF_SPEC_IB - SPEC_CTRL_STIBP_SHIFT); } static inline unsigned long ssbd_spec_ctrl_to_tif(u64 spec_ctrl) { BUILD_BUG_ON(TIF_SSBD < SPEC_CTRL_SSBD_SHIFT); return (spec_ctrl & SPEC_CTRL_SSBD) << (TIF_SSBD - SPEC_CTRL_SSBD_SHIFT); } static inline unsigned long stibp_spec_ctrl_to_tif(u64 spec_ctrl) { BUILD_BUG_ON(TIF_SPEC_IB < SPEC_CTRL_STIBP_SHIFT); return (spec_ctrl & SPEC_CTRL_STIBP) << (TIF_SPEC_IB - SPEC_CTRL_STIBP_SHIFT); } static inline u64 ssbd_tif_to_amd_ls_cfg(u64 tifn) { return (tifn & _TIF_SSBD) ? x86_amd_ls_cfg_ssbd_mask : 0ULL; } /* * This can be used in noinstr functions & should only be called in bare * metal context. */ static __always_inline void __update_spec_ctrl(u64 val) { __this_cpu_write(x86_spec_ctrl_current, val); native_wrmsrq(MSR_IA32_SPEC_CTRL, val); } #ifdef CONFIG_SMP extern void speculative_store_bypass_ht_init(void); #else static inline void speculative_store_bypass_ht_init(void) { } #endif extern void speculation_ctrl_update(unsigned long tif); extern void speculation_ctrl_update_current(void); extern bool itlb_multihit_kvm_mitigation; #endif |
| 1 7 1 6 6 6 5 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 | // SPDX-License-Identifier: GPL-2.0-or-later /* * sctp_offload - GRO/GSO Offloading for SCTP * * Copyright (C) 2015, Marcelo Ricardo Leitner <marcelo.leitner@gmail.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/kprobes.h> #include <linux/socket.h> #include <linux/sctp.h> #include <linux/proc_fs.h> #include <linux/vmalloc.h> #include <linux/module.h> #include <linux/kfifo.h> #include <linux/time.h> #include <net/net_namespace.h> #include <linux/skbuff.h> #include <net/sctp/sctp.h> #include <net/sctp/checksum.h> #include <net/protocol.h> #include <net/gso.h> static __le32 sctp_gso_make_checksum(struct sk_buff *skb) { skb->ip_summed = CHECKSUM_NONE; skb->csum_not_inet = 0; /* csum and csum_start in GSO CB may be needed to do the UDP * checksum when it's a UDP tunneling packet. */ SKB_GSO_CB(skb)->csum = (__force __wsum)~0; SKB_GSO_CB(skb)->csum_start = skb_headroom(skb) + skb->len; return sctp_compute_cksum(skb, skb_transport_offset(skb)); } static struct sk_buff *sctp_gso_segment(struct sk_buff *skb, netdev_features_t features) { struct sk_buff *segs = ERR_PTR(-EINVAL); struct sctphdr *sh; if (!skb_is_gso_sctp(skb)) goto out; sh = sctp_hdr(skb); if (!pskb_may_pull(skb, sizeof(*sh))) goto out; __skb_pull(skb, sizeof(*sh)); if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) { /* Packet is from an untrusted source, reset gso_segs. 
*/ struct skb_shared_info *pinfo = skb_shinfo(skb); struct sk_buff *frag_iter; pinfo->gso_segs = 0; if (skb->len != skb->data_len) { /* Means we have chunks in here too */ pinfo->gso_segs++; } skb_walk_frags(skb, frag_iter) pinfo->gso_segs++; segs = NULL; goto out; } segs = skb_segment(skb, (features | NETIF_F_HW_CSUM) & ~NETIF_F_SG); if (IS_ERR(segs)) goto out; /* All that is left is update SCTP CRC if necessary */ if (!(features & NETIF_F_SCTP_CRC)) { for (skb = segs; skb; skb = skb->next) { if (skb->ip_summed == CHECKSUM_PARTIAL) { sh = sctp_hdr(skb); sh->checksum = sctp_gso_make_checksum(skb); } } } out: return segs; } static const struct net_offload sctp_offload = { .callbacks = { .gso_segment = sctp_gso_segment, }, }; static const struct net_offload sctp6_offload = { .callbacks = { .gso_segment = sctp_gso_segment, }, }; int __init sctp_offload_init(void) { int ret; ret = inet_add_offload(&sctp_offload, IPPROTO_SCTP); if (ret) goto out; ret = inet6_add_offload(&sctp6_offload, IPPROTO_SCTP); if (ret) goto ipv4; return ret; ipv4: inet_del_offload(&sctp_offload, IPPROTO_SCTP); out: return ret; } |
| 25 9 16 16 16 4 1 3 23 10 13 128 129 4 2 3 10 10 9 9 4 5 9 1 1 1 7 1 8 14 14 124 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 | // SPDX-License-Identifier: GPL-2.0-only /* * vxcan.c - Virtual CAN Tunnel for cross namespace communication * * This code is derived from drivers/net/can/vcan.c for the virtual CAN * specific parts and from drivers/net/veth.c to implement the netlink API * for network interface pairs in a common and established way. 
* * Copyright (c) 2017 Oliver Hartkopp <socketcan@hartkopp.net> */ #include <linux/ethtool.h> #include <linux/module.h> #include <linux/init.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/if_ether.h> #include <linux/can.h> #include <linux/can/dev.h> #include <linux/can/skb.h> #include <linux/can/vxcan.h> #include <linux/can/can-ml.h> #include <linux/slab.h> #include <net/rtnetlink.h> #define DRV_NAME "vxcan" MODULE_DESCRIPTION("Virtual CAN Tunnel"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Oliver Hartkopp <socketcan@hartkopp.net>"); MODULE_ALIAS_RTNL_LINK(DRV_NAME); struct vxcan_priv { struct net_device __rcu *peer; }; static netdev_tx_t vxcan_xmit(struct sk_buff *oskb, struct net_device *dev) { struct vxcan_priv *priv = netdev_priv(dev); struct net_device *peer; struct net_device_stats *peerstats, *srcstats = &dev->stats; struct sk_buff *skb; unsigned int len; if (can_dropped_invalid_skb(dev, oskb)) return NETDEV_TX_OK; rcu_read_lock(); peer = rcu_dereference(priv->peer); if (unlikely(!peer)) { kfree_skb(oskb); dev->stats.tx_dropped++; goto out_unlock; } skb_tx_timestamp(oskb); skb = skb_clone(oskb, GFP_ATOMIC); if (skb) { consume_skb(oskb); } else { kfree_skb(oskb); goto out_unlock; } /* reset CAN GW hop counter */ skb->csum_start = 0; skb->pkt_type = PACKET_BROADCAST; skb->dev = peer; skb->ip_summed = CHECKSUM_UNNECESSARY; len = can_skb_get_data_len(skb); if (netif_rx(skb) == NET_RX_SUCCESS) { srcstats->tx_packets++; srcstats->tx_bytes += len; peerstats = &peer->stats; peerstats->rx_packets++; peerstats->rx_bytes += len; } out_unlock: rcu_read_unlock(); return NETDEV_TX_OK; } static int vxcan_open(struct net_device *dev) { struct vxcan_priv *priv = netdev_priv(dev); struct net_device *peer = rtnl_dereference(priv->peer); if (!peer) return -ENOTCONN; if (peer->flags & IFF_UP) { netif_carrier_on(dev); netif_carrier_on(peer); } return 0; } static int vxcan_close(struct net_device *dev) { struct vxcan_priv *priv = netdev_priv(dev); struct 
net_device *peer = rtnl_dereference(priv->peer); netif_carrier_off(dev); if (peer) netif_carrier_off(peer); return 0; } static int vxcan_get_iflink(const struct net_device *dev) { struct vxcan_priv *priv = netdev_priv(dev); struct net_device *peer; int iflink; rcu_read_lock(); peer = rcu_dereference(priv->peer); iflink = peer ? READ_ONCE(peer->ifindex) : 0; rcu_read_unlock(); return iflink; } static int vxcan_change_mtu(struct net_device *dev, int new_mtu) { /* Do not allow changing the MTU while running */ if (dev->flags & IFF_UP) return -EBUSY; if (new_mtu != CAN_MTU && new_mtu != CANFD_MTU && !can_is_canxl_dev_mtu(new_mtu)) return -EINVAL; WRITE_ONCE(dev->mtu, new_mtu); return 0; } static const struct net_device_ops vxcan_netdev_ops = { .ndo_open = vxcan_open, .ndo_stop = vxcan_close, .ndo_start_xmit = vxcan_xmit, .ndo_get_iflink = vxcan_get_iflink, .ndo_change_mtu = vxcan_change_mtu, }; static const struct ethtool_ops vxcan_ethtool_ops = { .get_ts_info = ethtool_op_get_ts_info, }; static void vxcan_setup(struct net_device *dev) { struct can_ml_priv *can_ml; dev->type = ARPHRD_CAN; dev->mtu = CANXL_MTU; dev->hard_header_len = 0; dev->addr_len = 0; dev->tx_queue_len = 0; dev->flags = IFF_NOARP; dev->netdev_ops = &vxcan_netdev_ops; dev->ethtool_ops = &vxcan_ethtool_ops; dev->needs_free_netdev = true; can_ml = netdev_priv(dev) + ALIGN(sizeof(struct vxcan_priv), NETDEV_ALIGN); can_set_ml_priv(dev, can_ml); } /* forward declaration for rtnl_create_link() */ static struct rtnl_link_ops vxcan_link_ops; static int vxcan_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct net *peer_net = rtnl_newlink_peer_net(params); struct nlattr **data = params->data; struct nlattr **tb = params->tb; struct vxcan_priv *priv; struct net_device *peer; struct nlattr *peer_tb[IFLA_MAX + 1], **tbp = tb; char ifname[IFNAMSIZ]; unsigned char name_assign_type; struct ifinfomsg *ifmp = NULL; int err; /* register peer device */ if (data 
&& data[VXCAN_INFO_PEER]) { struct nlattr *nla_peer = data[VXCAN_INFO_PEER]; ifmp = nla_data(nla_peer); rtnl_nla_parse_ifinfomsg(peer_tb, nla_peer, extack); tbp = peer_tb; } if (ifmp && tbp[IFLA_IFNAME]) { nla_strscpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ); name_assign_type = NET_NAME_USER; } else { snprintf(ifname, IFNAMSIZ, DRV_NAME "%%d"); name_assign_type = NET_NAME_ENUM; } peer = rtnl_create_link(peer_net, ifname, name_assign_type, &vxcan_link_ops, tbp, extack); if (IS_ERR(peer)) return PTR_ERR(peer); if (ifmp && dev->ifindex) peer->ifindex = ifmp->ifi_index; err = register_netdevice(peer); if (err < 0) { free_netdev(peer); return err; } netif_carrier_off(peer); err = rtnl_configure_link(peer, ifmp, 0, NULL); if (err < 0) goto unregister_network_device; /* register first device */ if (tb[IFLA_IFNAME]) nla_strscpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ); else snprintf(dev->name, IFNAMSIZ, DRV_NAME "%%d"); err = register_netdevice(dev); if (err < 0) goto unregister_network_device; netif_carrier_off(dev); /* cross link the device pair */ priv = netdev_priv(dev); rcu_assign_pointer(priv->peer, peer); priv = netdev_priv(peer); rcu_assign_pointer(priv->peer, dev); return 0; unregister_network_device: unregister_netdevice(peer); return err; } static void vxcan_dellink(struct net_device *dev, struct list_head *head) { struct vxcan_priv *priv; struct net_device *peer; priv = netdev_priv(dev); peer = rtnl_dereference(priv->peer); /* Note : dellink() is called from default_device_exit_batch(), * before a rcu_synchronize() point. The devices are guaranteed * not being freed before one RCU grace period. 
*/ RCU_INIT_POINTER(priv->peer, NULL); unregister_netdevice_queue(dev, head); if (peer) { priv = netdev_priv(peer); RCU_INIT_POINTER(priv->peer, NULL); unregister_netdevice_queue(peer, head); } } static const struct nla_policy vxcan_policy[VXCAN_INFO_MAX + 1] = { [VXCAN_INFO_PEER] = { .len = sizeof(struct ifinfomsg) }, }; static struct net *vxcan_get_link_net(const struct net_device *dev) { struct vxcan_priv *priv = netdev_priv(dev); struct net_device *peer = rtnl_dereference(priv->peer); return peer ? dev_net(peer) : dev_net(dev); } static struct rtnl_link_ops vxcan_link_ops = { .kind = DRV_NAME, .priv_size = ALIGN(sizeof(struct vxcan_priv), NETDEV_ALIGN) + sizeof(struct can_ml_priv), .setup = vxcan_setup, .newlink = vxcan_newlink, .dellink = vxcan_dellink, .policy = vxcan_policy, .peer_type = VXCAN_INFO_PEER, .maxtype = VXCAN_INFO_MAX, .get_link_net = vxcan_get_link_net, }; static __init int vxcan_init(void) { pr_info("vxcan: Virtual CAN Tunnel driver\n"); return rtnl_link_register(&vxcan_link_ops); } static __exit void vxcan_exit(void) { rtnl_link_unregister(&vxcan_link_ops); } module_init(vxcan_init); module_exit(vxcan_exit); |
| 2 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 
525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 | // SPDX-License-Identifier: GPL-2.0 /* * Witness Service client for CIFS * * Copyright (c) 2020 Samuel Cabrero <scabrero@suse.de> */ #include <linux/kref.h> #include <net/genetlink.h> #include <uapi/linux/cifs/cifs_netlink.h> #include "cifs_swn.h" #include "cifsglob.h" #include "cifsproto.h" #include "fscache.h" #include "cifs_debug.h" #include "netlink.h" static DEFINE_IDR(cifs_swnreg_idr); static DEFINE_MUTEX(cifs_swnreg_idr_mutex); struct cifs_swn_reg { int id; struct kref ref_count; const char *net_name; const char *share_name; bool net_name_notify; bool share_name_notify; bool ip_notify; struct cifs_tcon *tcon; }; static int cifs_swn_auth_info_krb(struct cifs_tcon *tcon, struct sk_buff *skb) { int ret; ret = nla_put_flag(skb, CIFS_GENL_ATTR_SWN_KRB_AUTH); if (ret < 0) return ret; return 0; } static int cifs_swn_auth_info_ntlm(struct cifs_tcon *tcon, struct sk_buff *skb) { int ret; if (tcon->ses->user_name != NULL) { ret = nla_put_string(skb, CIFS_GENL_ATTR_SWN_USER_NAME, tcon->ses->user_name); if (ret < 0) return ret; } if (tcon->ses->password != NULL) { ret = nla_put_string(skb, CIFS_GENL_ATTR_SWN_PASSWORD, tcon->ses->password); if (ret < 0) return ret; } if (tcon->ses->domainName != NULL) { ret = nla_put_string(skb, CIFS_GENL_ATTR_SWN_DOMAIN_NAME, tcon->ses->domainName); if (ret < 0) return ret; } return 0; } /* * Sends a register message to the userspace daemon based on the 
registration. * The authentication information to connect to the witness service is bundled * into the message. */ static int cifs_swn_send_register_message(struct cifs_swn_reg *swnreg) { struct sk_buff *skb; struct genlmsghdr *hdr; enum securityEnum authtype; struct sockaddr_storage *addr; int ret; skb = genlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb) return -ENOMEM; hdr = genlmsg_put(skb, 0, 0, &cifs_genl_family, 0, CIFS_GENL_CMD_SWN_REGISTER); if (hdr == NULL) { ret = -ENOMEM; goto nlmsg_fail; } ret = nla_put_u32(skb, CIFS_GENL_ATTR_SWN_REGISTRATION_ID, swnreg->id); if (ret < 0) goto nlmsg_fail; ret = nla_put_string(skb, CIFS_GENL_ATTR_SWN_NET_NAME, swnreg->net_name); if (ret < 0) goto nlmsg_fail; ret = nla_put_string(skb, CIFS_GENL_ATTR_SWN_SHARE_NAME, swnreg->share_name); if (ret < 0) goto nlmsg_fail; /* * If there is an address stored use it instead of the server address, because we are * in the process of reconnecting to it after a share has been moved or we have been * told to switch to it (client move message). In these cases we unregister from the * server address and register to the new address when we receive the notification. 
*/ if (swnreg->tcon->ses->server->use_swn_dstaddr) addr = &swnreg->tcon->ses->server->swn_dstaddr; else addr = &swnreg->tcon->ses->server->dstaddr; ret = nla_put(skb, CIFS_GENL_ATTR_SWN_IP, sizeof(struct sockaddr_storage), addr); if (ret < 0) goto nlmsg_fail; if (swnreg->net_name_notify) { ret = nla_put_flag(skb, CIFS_GENL_ATTR_SWN_NET_NAME_NOTIFY); if (ret < 0) goto nlmsg_fail; } if (swnreg->share_name_notify) { ret = nla_put_flag(skb, CIFS_GENL_ATTR_SWN_SHARE_NAME_NOTIFY); if (ret < 0) goto nlmsg_fail; } if (swnreg->ip_notify) { ret = nla_put_flag(skb, CIFS_GENL_ATTR_SWN_IP_NOTIFY); if (ret < 0) goto nlmsg_fail; } authtype = cifs_select_sectype(swnreg->tcon->ses->server, swnreg->tcon->ses->sectype); switch (authtype) { case Kerberos: ret = cifs_swn_auth_info_krb(swnreg->tcon, skb); if (ret < 0) { cifs_dbg(VFS, "%s: Failed to get kerberos auth info: %d\n", __func__, ret); goto nlmsg_fail; } break; case NTLMv2: case RawNTLMSSP: ret = cifs_swn_auth_info_ntlm(swnreg->tcon, skb); if (ret < 0) { cifs_dbg(VFS, "%s: Failed to get NTLM auth info: %d\n", __func__, ret); goto nlmsg_fail; } break; default: cifs_dbg(VFS, "%s: secType %d not supported!\n", __func__, authtype); ret = -EINVAL; goto nlmsg_fail; } genlmsg_end(skb, hdr); genlmsg_multicast(&cifs_genl_family, skb, 0, CIFS_GENL_MCGRP_SWN, GFP_ATOMIC); cifs_dbg(FYI, "%s: Message to register for network name %s with id %d sent\n", __func__, swnreg->net_name, swnreg->id); return 0; nlmsg_fail: genlmsg_cancel(skb, hdr); nlmsg_free(skb); return ret; } /* * Sends an uregister message to the userspace daemon based on the registration */ static int cifs_swn_send_unregister_message(struct cifs_swn_reg *swnreg) { struct sk_buff *skb; struct genlmsghdr *hdr; int ret; skb = genlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (skb == NULL) return -ENOMEM; hdr = genlmsg_put(skb, 0, 0, &cifs_genl_family, 0, CIFS_GENL_CMD_SWN_UNREGISTER); if (hdr == NULL) { ret = -ENOMEM; goto nlmsg_fail; } ret = nla_put_u32(skb, 
CIFS_GENL_ATTR_SWN_REGISTRATION_ID, swnreg->id); if (ret < 0) goto nlmsg_fail; ret = nla_put_string(skb, CIFS_GENL_ATTR_SWN_NET_NAME, swnreg->net_name); if (ret < 0) goto nlmsg_fail; ret = nla_put_string(skb, CIFS_GENL_ATTR_SWN_SHARE_NAME, swnreg->share_name); if (ret < 0) goto nlmsg_fail; ret = nla_put(skb, CIFS_GENL_ATTR_SWN_IP, sizeof(struct sockaddr_storage), &swnreg->tcon->ses->server->dstaddr); if (ret < 0) goto nlmsg_fail; if (swnreg->net_name_notify) { ret = nla_put_flag(skb, CIFS_GENL_ATTR_SWN_NET_NAME_NOTIFY); if (ret < 0) goto nlmsg_fail; } if (swnreg->share_name_notify) { ret = nla_put_flag(skb, CIFS_GENL_ATTR_SWN_SHARE_NAME_NOTIFY); if (ret < 0) goto nlmsg_fail; } if (swnreg->ip_notify) { ret = nla_put_flag(skb, CIFS_GENL_ATTR_SWN_IP_NOTIFY); if (ret < 0) goto nlmsg_fail; } genlmsg_end(skb, hdr); genlmsg_multicast(&cifs_genl_family, skb, 0, CIFS_GENL_MCGRP_SWN, GFP_ATOMIC); cifs_dbg(FYI, "%s: Message to unregister for network name %s with id %d sent\n", __func__, swnreg->net_name, swnreg->id); return 0; nlmsg_fail: genlmsg_cancel(skb, hdr); nlmsg_free(skb); return ret; } /* * Try to find a matching registration for the tcon's server name and share name. * Calls to this function must be protected by cifs_swnreg_idr_mutex. 
* TODO Try to avoid memory allocations */ static struct cifs_swn_reg *cifs_find_swn_reg(struct cifs_tcon *tcon) { struct cifs_swn_reg *swnreg; int id; const char *share_name; const char *net_name; net_name = extract_hostname(tcon->tree_name); if (IS_ERR(net_name)) { int ret; ret = PTR_ERR(net_name); cifs_dbg(VFS, "%s: failed to extract host name from target '%s': %d\n", __func__, tcon->tree_name, ret); return ERR_PTR(-EINVAL); } share_name = extract_sharename(tcon->tree_name); if (IS_ERR(share_name)) { int ret; ret = PTR_ERR(share_name); cifs_dbg(VFS, "%s: failed to extract share name from target '%s': %d\n", __func__, tcon->tree_name, ret); kfree(net_name); return ERR_PTR(-EINVAL); } idr_for_each_entry(&cifs_swnreg_idr, swnreg, id) { if (strcasecmp(swnreg->net_name, net_name) != 0 || strcasecmp(swnreg->share_name, share_name) != 0) { continue; } cifs_dbg(FYI, "Existing swn registration for %s:%s found\n", swnreg->net_name, swnreg->share_name); kfree(net_name); kfree(share_name); return swnreg; } kfree(net_name); kfree(share_name); return ERR_PTR(-EEXIST); } /* * Get a registration for the tcon's server and share name, allocating a new one if it does not * exists */ static struct cifs_swn_reg *cifs_get_swn_reg(struct cifs_tcon *tcon) { struct cifs_swn_reg *reg = NULL; int ret; mutex_lock(&cifs_swnreg_idr_mutex); /* Check if we are already registered for this network and share names */ reg = cifs_find_swn_reg(tcon); if (!IS_ERR(reg)) { kref_get(®->ref_count); goto unlock; } else if (PTR_ERR(reg) != -EEXIST) { goto unlock; } reg = kmalloc(sizeof(struct cifs_swn_reg), GFP_ATOMIC); if (reg == NULL) { ret = -ENOMEM; goto fail_unlock; } kref_init(®->ref_count); reg->id = idr_alloc(&cifs_swnreg_idr, reg, 1, 0, GFP_ATOMIC); if (reg->id < 0) { cifs_dbg(FYI, "%s: failed to allocate registration id\n", __func__); ret = reg->id; goto fail; } reg->net_name = extract_hostname(tcon->tree_name); if (IS_ERR(reg->net_name)) { ret = PTR_ERR(reg->net_name); cifs_dbg(VFS, "%s: failed 
to extract host name from target: %d\n", __func__, ret); goto fail_idr; } reg->share_name = extract_sharename(tcon->tree_name); if (IS_ERR(reg->share_name)) { ret = PTR_ERR(reg->share_name); cifs_dbg(VFS, "%s: failed to extract share name from target: %d\n", __func__, ret); goto fail_net_name; } reg->net_name_notify = true; reg->share_name_notify = true; reg->ip_notify = (tcon->capabilities & SMB2_SHARE_CAP_SCALEOUT); reg->tcon = tcon; unlock: mutex_unlock(&cifs_swnreg_idr_mutex); return reg; fail_net_name: kfree(reg->net_name); fail_idr: idr_remove(&cifs_swnreg_idr, reg->id); fail: kfree(reg); fail_unlock: mutex_unlock(&cifs_swnreg_idr_mutex); return ERR_PTR(ret); } static void cifs_swn_reg_release(struct kref *ref) { struct cifs_swn_reg *swnreg = container_of(ref, struct cifs_swn_reg, ref_count); int ret; ret = cifs_swn_send_unregister_message(swnreg); if (ret < 0) cifs_dbg(VFS, "%s: Failed to send unregister message: %d\n", __func__, ret); idr_remove(&cifs_swnreg_idr, swnreg->id); kfree(swnreg->net_name); kfree(swnreg->share_name); kfree(swnreg); } static void cifs_put_swn_reg(struct cifs_swn_reg *swnreg) { mutex_lock(&cifs_swnreg_idr_mutex); kref_put(&swnreg->ref_count, cifs_swn_reg_release); mutex_unlock(&cifs_swnreg_idr_mutex); } static int cifs_swn_resource_state_changed(struct cifs_swn_reg *swnreg, const char *name, int state) { switch (state) { case CIFS_SWN_RESOURCE_STATE_UNAVAILABLE: cifs_dbg(FYI, "%s: resource name '%s' become unavailable\n", __func__, name); cifs_signal_cifsd_for_reconnect(swnreg->tcon->ses->server, true); break; case CIFS_SWN_RESOURCE_STATE_AVAILABLE: cifs_dbg(FYI, "%s: resource name '%s' become available\n", __func__, name); cifs_signal_cifsd_for_reconnect(swnreg->tcon->ses->server, true); break; case CIFS_SWN_RESOURCE_STATE_UNKNOWN: cifs_dbg(FYI, "%s: resource name '%s' changed to unknown state\n", __func__, name); break; } return 0; } static bool cifs_sockaddr_equal(struct sockaddr_storage *addr1, struct sockaddr_storage *addr2) { 
if (addr1->ss_family != addr2->ss_family) return false; if (addr1->ss_family == AF_INET) { return (memcmp(&((const struct sockaddr_in *)addr1)->sin_addr, &((const struct sockaddr_in *)addr2)->sin_addr, sizeof(struct in_addr)) == 0); } if (addr1->ss_family == AF_INET6) { return (memcmp(&((const struct sockaddr_in6 *)addr1)->sin6_addr, &((const struct sockaddr_in6 *)addr2)->sin6_addr, sizeof(struct in6_addr)) == 0); } return false; } static int cifs_swn_store_swn_addr(const struct sockaddr_storage *new, const struct sockaddr_storage *old, struct sockaddr_storage *dst) { __be16 port = cpu_to_be16(CIFS_PORT); if (old->ss_family == AF_INET) { struct sockaddr_in *ipv4 = (struct sockaddr_in *)old; port = ipv4->sin_port; } else if (old->ss_family == AF_INET6) { struct sockaddr_in6 *ipv6 = (struct sockaddr_in6 *)old; port = ipv6->sin6_port; } if (new->ss_family == AF_INET) { struct sockaddr_in *ipv4 = (struct sockaddr_in *)new; ipv4->sin_port = port; } else if (new->ss_family == AF_INET6) { struct sockaddr_in6 *ipv6 = (struct sockaddr_in6 *)new; ipv6->sin6_port = port; } *dst = *new; return 0; } static int cifs_swn_reconnect(struct cifs_tcon *tcon, struct sockaddr_storage *addr) { int ret = 0; /* Store the reconnect address */ cifs_server_lock(tcon->ses->server); if (cifs_sockaddr_equal(&tcon->ses->server->dstaddr, addr)) goto unlock; ret = cifs_swn_store_swn_addr(addr, &tcon->ses->server->dstaddr, &tcon->ses->server->swn_dstaddr); if (ret < 0) { cifs_dbg(VFS, "%s: failed to store address: %d\n", __func__, ret); goto unlock; } tcon->ses->server->use_swn_dstaddr = true; /* * Unregister to stop receiving notifications for the old IP address. */ ret = cifs_swn_unregister(tcon); if (ret < 0) { cifs_dbg(VFS, "%s: Failed to unregister for witness notifications: %d\n", __func__, ret); goto unlock; } /* * And register to receive notifications for the new IP address now that we have * stored the new address. 
*/ ret = cifs_swn_register(tcon); if (ret < 0) { cifs_dbg(VFS, "%s: Failed to register for witness notifications: %d\n", __func__, ret); goto unlock; } cifs_signal_cifsd_for_reconnect(tcon->ses->server, false); unlock: cifs_server_unlock(tcon->ses->server); return ret; } static int cifs_swn_client_move(struct cifs_swn_reg *swnreg, struct sockaddr_storage *addr) { struct sockaddr_in *ipv4 = (struct sockaddr_in *)addr; struct sockaddr_in6 *ipv6 = (struct sockaddr_in6 *)addr; if (addr->ss_family == AF_INET) cifs_dbg(FYI, "%s: move to %pI4\n", __func__, &ipv4->sin_addr); else if (addr->ss_family == AF_INET6) cifs_dbg(FYI, "%s: move to %pI6\n", __func__, &ipv6->sin6_addr); return cifs_swn_reconnect(swnreg->tcon, addr); } int cifs_swn_notify(struct sk_buff *skb, struct genl_info *info) { struct cifs_swn_reg *swnreg; char name[256]; int type; if (info->attrs[CIFS_GENL_ATTR_SWN_REGISTRATION_ID]) { int swnreg_id; swnreg_id = nla_get_u32(info->attrs[CIFS_GENL_ATTR_SWN_REGISTRATION_ID]); mutex_lock(&cifs_swnreg_idr_mutex); swnreg = idr_find(&cifs_swnreg_idr, swnreg_id); mutex_unlock(&cifs_swnreg_idr_mutex); if (swnreg == NULL) { cifs_dbg(FYI, "%s: registration id %d not found\n", __func__, swnreg_id); return -EINVAL; } } else { cifs_dbg(FYI, "%s: missing registration id attribute\n", __func__); return -EINVAL; } if (info->attrs[CIFS_GENL_ATTR_SWN_NOTIFICATION_TYPE]) { type = nla_get_u32(info->attrs[CIFS_GENL_ATTR_SWN_NOTIFICATION_TYPE]); } else { cifs_dbg(FYI, "%s: missing notification type attribute\n", __func__); return -EINVAL; } switch (type) { case CIFS_SWN_NOTIFICATION_RESOURCE_CHANGE: { int state; if (info->attrs[CIFS_GENL_ATTR_SWN_RESOURCE_NAME]) { nla_strscpy(name, info->attrs[CIFS_GENL_ATTR_SWN_RESOURCE_NAME], sizeof(name)); } else { cifs_dbg(FYI, "%s: missing resource name attribute\n", __func__); return -EINVAL; } if (info->attrs[CIFS_GENL_ATTR_SWN_RESOURCE_STATE]) { state = nla_get_u32(info->attrs[CIFS_GENL_ATTR_SWN_RESOURCE_STATE]); } else { cifs_dbg(FYI, "%s: 
missing resource state attribute\n", __func__); return -EINVAL; } return cifs_swn_resource_state_changed(swnreg, name, state); } case CIFS_SWN_NOTIFICATION_CLIENT_MOVE: { struct sockaddr_storage addr; if (info->attrs[CIFS_GENL_ATTR_SWN_IP]) { nla_memcpy(&addr, info->attrs[CIFS_GENL_ATTR_SWN_IP], sizeof(addr)); } else { cifs_dbg(FYI, "%s: missing IP address attribute\n", __func__); return -EINVAL; } return cifs_swn_client_move(swnreg, &addr); } default: cifs_dbg(FYI, "%s: unknown notification type %d\n", __func__, type); break; } return 0; } int cifs_swn_register(struct cifs_tcon *tcon) { struct cifs_swn_reg *swnreg; int ret; swnreg = cifs_get_swn_reg(tcon); if (IS_ERR(swnreg)) return PTR_ERR(swnreg); ret = cifs_swn_send_register_message(swnreg); if (ret < 0) { cifs_dbg(VFS, "%s: Failed to send swn register message: %d\n", __func__, ret); /* Do not put the swnreg or return error, the echo task will retry */ } return 0; } int cifs_swn_unregister(struct cifs_tcon *tcon) { struct cifs_swn_reg *swnreg; mutex_lock(&cifs_swnreg_idr_mutex); swnreg = cifs_find_swn_reg(tcon); if (IS_ERR(swnreg)) { mutex_unlock(&cifs_swnreg_idr_mutex); return PTR_ERR(swnreg); } mutex_unlock(&cifs_swnreg_idr_mutex); cifs_put_swn_reg(swnreg); return 0; } void cifs_swn_dump(struct seq_file *m) { struct cifs_swn_reg *swnreg; struct sockaddr_in *sa; struct sockaddr_in6 *sa6; int id; seq_puts(m, "Witness registrations:"); mutex_lock(&cifs_swnreg_idr_mutex); idr_for_each_entry(&cifs_swnreg_idr, swnreg, id) { seq_printf(m, "\nId: %u Refs: %u Network name: '%s'%s Share name: '%s'%s Ip address: ", id, kref_read(&swnreg->ref_count), swnreg->net_name, swnreg->net_name_notify ? "(y)" : "(n)", swnreg->share_name, swnreg->share_name_notify ? 
"(y)" : "(n)"); switch (swnreg->tcon->ses->server->dstaddr.ss_family) { case AF_INET: sa = (struct sockaddr_in *) &swnreg->tcon->ses->server->dstaddr; seq_printf(m, "%pI4", &sa->sin_addr.s_addr); break; case AF_INET6: sa6 = (struct sockaddr_in6 *) &swnreg->tcon->ses->server->dstaddr; seq_printf(m, "%pI6", &sa6->sin6_addr.s6_addr); if (sa6->sin6_scope_id) seq_printf(m, "%%%u", sa6->sin6_scope_id); break; default: seq_puts(m, "(unknown)"); } seq_printf(m, "%s", swnreg->ip_notify ? "(y)" : "(n)"); } mutex_unlock(&cifs_swnreg_idr_mutex); seq_puts(m, "\n"); } void cifs_swn_check(void) { struct cifs_swn_reg *swnreg; int id; int ret; mutex_lock(&cifs_swnreg_idr_mutex); idr_for_each_entry(&cifs_swnreg_idr, swnreg, id) { ret = cifs_swn_send_register_message(swnreg); if (ret < 0) cifs_dbg(FYI, "%s: Failed to send register message: %d\n", __func__, ret); } mutex_unlock(&cifs_swnreg_idr_mutex); } |
| 55 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __ASM_GENERIC_CHECKSUM_H #define __ASM_GENERIC_CHECKSUM_H #include <linux/bitops.h> /* * computes the checksum of a memory block at buff, length len, * and adds in "sum" (32-bit) * * returns a 32-bit number suitable for feeding into itself * or csum_tcpudp_magic * * this function must be called with even lengths, except * for the last fragment, which may be odd * * it's best to have buff aligned on a 32-bit boundary */ extern __wsum csum_partial(const void *buff, int len, __wsum sum); #ifndef ip_fast_csum /* * This is a version of ip_compute_csum() optimized for IP headers, * which always checksum on 4 octet boundaries. */ extern __sum16 ip_fast_csum(const void *iph, unsigned int ihl); #endif #ifndef csum_fold /* * Fold a partial checksum */ static inline __sum16 csum_fold(__wsum csum) { u32 sum = (__force u32)csum; return (__force __sum16)((~sum - ror32(sum, 16)) >> 16); } #endif #ifndef csum_tcpudp_nofold /* * computes the checksum of the TCP/UDP pseudo-header * returns a 16-bit checksum, already complemented */ extern __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __wsum sum); #endif #ifndef csum_tcpudp_magic static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __wsum sum) { return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum)); } #endif /* * this routine is used for miscellaneous IP-like checksums, mainly * in icmp.c */ extern __sum16 ip_compute_csum(const void *buff, int len); #endif /* __ASM_GENERIC_CHECKSUM_H */ |
| 2 25 2 6 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 22 3 20 5 5 5 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 
510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 
1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 | // SPDX-License-Identifier: GPL-2.0-or-later /* Linux driver for Philips webcam USB and Video4Linux interface part. (C) 1999-2004 Nemosoft Unv. (C) 2004-2006 Luc Saillard (luc@saillard.org) (C) 2011 Hans de Goede <hdegoede@redhat.com> NOTE: this version of pwc is an unofficial (modified) release of pwc & pcwx driver and thus may have bugs that are not present in the original version. Please send bug reports and support requests to <luc@saillard.org>. The decompression routines have been implemented by reverse-engineering the Nemosoft binary pwcx module. Caveat emptor. */ /* This code forms the interface between the USB layers and the Philips specific stuff. Some adanved stuff of the driver falls under an NDA, signed between me and Philips B.V., Eindhoven, the Netherlands, and is thus not distributed in source form. The binary pwcx.o module contains the code that falls under the NDA. 
In case you're wondering: 'pwc' stands for "Philips WebCam", but I really didn't want to type 'philips_web_cam' every time (I'm lazy as any Linux kernel hacker, but I don't like uncomprehensible abbreviations without explanation). Oh yes, convention: to disctinguish between all the various pointers to device-structures, I use these names for the pointer variables: udev: struct usb_device * vdev: struct video_device (member of pwc_dev) pdev: struct pwc_devive * */ /* Contributors: - Alvarado: adding whitebalance code - Alistar Moire: QuickCam 3000 Pro device/product ID - Tony Hoyle: Creative Labs Webcam 5 device/product ID - Mark Burazin: solving hang in VIDIOCSYNC when camera gets unplugged - Jk Fang: Sotec Afina Eye ID - Xavier Roche: QuickCam Pro 4000 ID - Jens Knudsen: QuickCam Zoom ID - J. Debert: QuickCam for Notebooks ID - Pham Thanh Nam: webcam snapshot button as an event input device */ #include <linux/errno.h> #include <linux/init.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/poll.h> #include <linux/slab.h> #ifdef CONFIG_USB_PWC_INPUT_EVDEV #include <linux/usb/input.h> #endif #include <linux/vmalloc.h> #include <asm/io.h> #include "pwc.h" #include "pwc-kiara.h" #include "pwc-timon.h" #include "pwc-dec23.h" #include "pwc-dec1.h" #define CREATE_TRACE_POINTS #include <trace/events/pwc.h> /* Function prototypes and driver templates */ /* hotplug device table support */ static const struct usb_device_id pwc_device_table [] = { { USB_DEVICE(0x041E, 0x400C) }, /* Creative Webcam 5 */ { USB_DEVICE(0x041E, 0x4011) }, /* Creative Webcam Pro Ex */ { USB_DEVICE(0x046D, 0x08B0) }, /* Logitech QuickCam 3000 Pro */ { USB_DEVICE(0x046D, 0x08B1) }, /* Logitech QuickCam Notebook Pro */ { USB_DEVICE(0x046D, 0x08B2) }, /* Logitech QuickCam 4000 Pro */ { USB_DEVICE(0x046D, 0x08B3) }, /* Logitech QuickCam Zoom (old model) */ { USB_DEVICE(0x046D, 0x08B4) }, /* Logitech QuickCam Zoom (new model) */ { USB_DEVICE(0x046D, 0x08B5) }, /* Logitech QuickCam 
Orbit/Sphere */ { USB_DEVICE(0x046D, 0x08B6) }, /* Logitech/Cisco VT Camera */ { USB_DEVICE(0x046D, 0x08B7) }, /* Logitech ViewPort AV 100 */ { USB_DEVICE(0x046D, 0x08B8) }, /* Logitech QuickCam */ { USB_DEVICE(0x0471, 0x0302) }, /* Philips PCA645VC */ { USB_DEVICE(0x0471, 0x0303) }, /* Philips PCA646VC */ { USB_DEVICE(0x0471, 0x0304) }, /* Askey VC010 type 2 */ { USB_DEVICE(0x0471, 0x0307) }, /* Philips PCVC675K (Vesta) */ { USB_DEVICE(0x0471, 0x0308) }, /* Philips PCVC680K (Vesta Pro) */ { USB_DEVICE(0x0471, 0x030C) }, /* Philips PCVC690K (Vesta Pro Scan) */ { USB_DEVICE(0x0471, 0x0310) }, /* Philips PCVC730K (ToUCam Fun)/PCVC830 (ToUCam II) */ { USB_DEVICE(0x0471, 0x0311) }, /* Philips PCVC740K (ToUCam Pro)/PCVC840 (ToUCam II) */ { USB_DEVICE(0x0471, 0x0312) }, /* Philips PCVC750K (ToUCam Pro Scan) */ { USB_DEVICE(0x0471, 0x0313) }, /* Philips PCVC720K/40 (ToUCam XS) */ { USB_DEVICE(0x0471, 0x0329) }, /* Philips SPC 900NC webcam */ { USB_DEVICE(0x0471, 0x032C) }, /* Philips SPC 880NC webcam */ { USB_DEVICE(0x04CC, 0x8116) }, /* Sotec Afina Eye */ { USB_DEVICE(0x055D, 0x9000) }, /* Samsung MPC-C10 */ { USB_DEVICE(0x055D, 0x9001) }, /* Samsung MPC-C30 */ { USB_DEVICE(0x055D, 0x9002) }, /* Samsung SNC-35E (Ver3.0) */ { USB_DEVICE(0x069A, 0x0001) }, /* Askey VC010 type 1 */ { USB_DEVICE(0x06BE, 0x8116) }, /* AME Co. 
Afina Eye */ { USB_DEVICE(0x0d81, 0x1900) }, /* Visionite VCS-UC300 */ { USB_DEVICE(0x0d81, 0x1910) }, /* Visionite VCS-UM100 */ { } }; MODULE_DEVICE_TABLE(usb, pwc_device_table); static int usb_pwc_probe(struct usb_interface *intf, const struct usb_device_id *id); static void usb_pwc_disconnect(struct usb_interface *intf); static void pwc_isoc_cleanup(struct pwc_device *pdev); static struct usb_driver pwc_driver = { .name = "Philips webcam", /* name */ .id_table = pwc_device_table, .probe = usb_pwc_probe, /* probe() */ .disconnect = usb_pwc_disconnect, /* disconnect() */ }; #define MAX_DEV_HINTS 20 #define MAX_ISOC_ERRORS 20 #ifdef CONFIG_USB_PWC_DEBUG int pwc_trace = PWC_DEBUG_LEVEL; #endif static int power_save = -1; static int leds[2] = { 100, 0 }; /***/ static const struct v4l2_file_operations pwc_fops = { .owner = THIS_MODULE, .open = v4l2_fh_open, .release = vb2_fop_release, .read = vb2_fop_read, .poll = vb2_fop_poll, .mmap = vb2_fop_mmap, .unlocked_ioctl = video_ioctl2, }; static const struct video_device pwc_template = { .name = "Philips Webcam", /* Filled in later */ .release = video_device_release_empty, .fops = &pwc_fops, .ioctl_ops = &pwc_ioctl_ops, }; /***************************************************************************/ /* Private functions */ static void *pwc_alloc_urb_buffer(struct usb_device *dev, size_t size, dma_addr_t *dma_handle) { struct device *dmadev = dev->bus->sysdev; void *buffer = kmalloc(size, GFP_KERNEL); if (!buffer) return NULL; *dma_handle = dma_map_single(dmadev, buffer, size, DMA_FROM_DEVICE); if (dma_mapping_error(dmadev, *dma_handle)) { kfree(buffer); return NULL; } return buffer; } static void pwc_free_urb_buffer(struct usb_device *dev, size_t size, void *buffer, dma_addr_t dma_handle) { struct device *dmadev = dev->bus->sysdev; dma_unmap_single(dmadev, dma_handle, size, DMA_FROM_DEVICE); kfree(buffer); } static struct pwc_frame_buf *pwc_get_next_fill_buf(struct pwc_device *pdev) { unsigned long flags = 0; struct 
pwc_frame_buf *buf = NULL; spin_lock_irqsave(&pdev->queued_bufs_lock, flags); if (list_empty(&pdev->queued_bufs)) goto leave; buf = list_entry(pdev->queued_bufs.next, struct pwc_frame_buf, list); list_del(&buf->list); leave: spin_unlock_irqrestore(&pdev->queued_bufs_lock, flags); return buf; } static void pwc_snapshot_button(struct pwc_device *pdev, int down) { if (down) { PWC_TRACE("Snapshot button pressed.\n"); } else { PWC_TRACE("Snapshot button released.\n"); } #ifdef CONFIG_USB_PWC_INPUT_EVDEV if (pdev->button_dev) { input_report_key(pdev->button_dev, KEY_CAMERA, down); input_sync(pdev->button_dev); } #endif } static void pwc_frame_complete(struct pwc_device *pdev) { struct pwc_frame_buf *fbuf = pdev->fill_buf; /* The ToUCam Fun CMOS sensor causes the firmware to send 2 or 3 bogus frames on the USB wire after an exposure change. This conditition is however detected in the cam and a bit is set in the header. */ if (pdev->type == 730) { unsigned char *ptr = (unsigned char *)fbuf->data; if (ptr[1] == 1 && ptr[0] & 0x10) { PWC_TRACE("Hyundai CMOS sensor bug. Dropping frame.\n"); pdev->drop_frames += 2; } if ((ptr[0] ^ pdev->vmirror) & 0x01) { pwc_snapshot_button(pdev, ptr[0] & 0x01); } if ((ptr[0] ^ pdev->vmirror) & 0x02) { if (ptr[0] & 0x02) PWC_TRACE("Image is mirrored.\n"); else PWC_TRACE("Image is normal.\n"); } pdev->vmirror = ptr[0] & 0x03; /* Sometimes the trailer of the 730 is still sent as a 4 byte packet after a short frame; this condition is filtered out specifically. A 4 byte frame doesn't make sense anyway. So we get either this sequence: drop_bit set -> 4 byte frame -> short frame -> good frame Or this one: drop_bit set -> short frame -> good frame So we drop either 3 or 2 frames in all! 
*/ if (fbuf->filled == 4) pdev->drop_frames++; } else if (pdev->type == 740 || pdev->type == 720) { unsigned char *ptr = (unsigned char *)fbuf->data; if ((ptr[0] ^ pdev->vmirror) & 0x01) { pwc_snapshot_button(pdev, ptr[0] & 0x01); } pdev->vmirror = ptr[0] & 0x03; } /* In case we were instructed to drop the frame, do so silently. */ if (pdev->drop_frames > 0) { pdev->drop_frames--; } else { /* Check for underflow first */ if (fbuf->filled < pdev->frame_total_size) { PWC_DEBUG_FLOW("Frame buffer underflow (%d bytes); discarded.\n", fbuf->filled); } else { fbuf->vb.field = V4L2_FIELD_NONE; fbuf->vb.sequence = pdev->vframe_count; vb2_buffer_done(&fbuf->vb.vb2_buf, VB2_BUF_STATE_DONE); pdev->fill_buf = NULL; pdev->vsync = 0; } } /* !drop_frames */ pdev->vframe_count++; } /* This gets called for the Isochronous pipe (video). This is done in * interrupt time, so it has to be fast, not crash, and not stall. Neat. */ static void pwc_isoc_handler(struct urb *urb) { struct pwc_device *pdev = (struct pwc_device *)urb->context; struct device *dmadev = urb->dev->bus->sysdev; int i, fst, flen; unsigned char *iso_buf = NULL; trace_pwc_handler_enter(urb, pdev); if (urb->status == -ENOENT || urb->status == -ECONNRESET || urb->status == -ESHUTDOWN) { PWC_DEBUG_OPEN("URB (%p) unlinked %ssynchronously.\n", urb, urb->status == -ENOENT ? 
"" : "a"); return; } if (pdev->fill_buf == NULL) pdev->fill_buf = pwc_get_next_fill_buf(pdev); if (urb->status != 0) { const char *errmsg; errmsg = "Unknown"; switch(urb->status) { case -ENOSR: errmsg = "Buffer error (overrun)"; break; case -EPIPE: errmsg = "Stalled (device not responding)"; break; case -EOVERFLOW: errmsg = "Babble (bad cable?)"; break; case -EPROTO: errmsg = "Bit-stuff error (bad cable?)"; break; case -EILSEQ: errmsg = "CRC/Timeout (could be anything)"; break; case -ETIME: errmsg = "Device does not respond"; break; } PWC_ERROR("pwc_isoc_handler() called with status %d [%s].\n", urb->status, errmsg); /* Give up after a number of contiguous errors */ if (++pdev->visoc_errors > MAX_ISOC_ERRORS) { PWC_ERROR("Too many ISOC errors, bailing out.\n"); if (pdev->fill_buf) { vb2_buffer_done(&pdev->fill_buf->vb.vb2_buf, VB2_BUF_STATE_ERROR); pdev->fill_buf = NULL; } } pdev->vsync = 0; /* Drop the current frame */ goto handler_end; } /* Reset ISOC error counter. We did get here, after all. */ pdev->visoc_errors = 0; dma_sync_single_for_cpu(dmadev, urb->transfer_dma, urb->transfer_buffer_length, DMA_FROM_DEVICE); /* vsync: 0 = don't copy data 1 = sync-hunt 2 = synched */ /* Compact data */ for (i = 0; i < urb->number_of_packets; i++) { fst = urb->iso_frame_desc[i].status; flen = urb->iso_frame_desc[i].actual_length; iso_buf = urb->transfer_buffer + urb->iso_frame_desc[i].offset; if (fst != 0) { PWC_ERROR("Iso frame %d has error %d\n", i, fst); continue; } if (flen > 0 && pdev->vsync) { struct pwc_frame_buf *fbuf = pdev->fill_buf; if (pdev->vsync == 1) { fbuf->vb.vb2_buf.timestamp = ktime_get_ns(); pdev->vsync = 2; } if (flen + fbuf->filled > pdev->frame_total_size) { PWC_ERROR("Frame overflow (%d > %d)\n", flen + fbuf->filled, pdev->frame_total_size); pdev->vsync = 0; /* Let's wait for an EOF */ } else { memcpy(fbuf->data + fbuf->filled, iso_buf, flen); fbuf->filled += flen; } } if (flen < pdev->vlast_packet_size) { /* Shorter packet... 
end of frame */ if (pdev->vsync == 2) pwc_frame_complete(pdev); if (pdev->fill_buf == NULL) pdev->fill_buf = pwc_get_next_fill_buf(pdev); if (pdev->fill_buf) { pdev->fill_buf->filled = 0; pdev->vsync = 1; } } pdev->vlast_packet_size = flen; } dma_sync_single_for_device(dmadev, urb->transfer_dma, urb->transfer_buffer_length, DMA_FROM_DEVICE); handler_end: trace_pwc_handler_exit(urb, pdev); i = usb_submit_urb(urb, GFP_ATOMIC); if (i != 0) PWC_ERROR("Error (%d) re-submitting urb in pwc_isoc_handler.\n", i); } /* Both v4l2_lock and vb_queue_lock should be locked when calling this */ static int pwc_isoc_init(struct pwc_device *pdev) { struct usb_device *udev; struct urb *urb; int i, j, ret; struct usb_interface *intf; struct usb_host_interface *idesc = NULL; int compression = 0; /* 0..3 = uncompressed..high */ pdev->vsync = 0; pdev->vlast_packet_size = 0; pdev->fill_buf = NULL; pdev->vframe_count = 0; pdev->visoc_errors = 0; udev = pdev->udev; retry: /* We first try with low compression and then retry with a higher compression setting if there is not enough bandwidth. 
*/ ret = pwc_set_video_mode(pdev, pdev->width, pdev->height, pdev->pixfmt, pdev->vframes, &compression, 1); /* Get the current alternate interface, adjust packet size */ intf = usb_ifnum_to_if(udev, 0); if (intf) idesc = usb_altnum_to_altsetting(intf, pdev->valternate); if (!idesc) return -EIO; /* Search video endpoint */ pdev->vmax_packet_size = -1; for (i = 0; i < idesc->desc.bNumEndpoints; i++) { if ((idesc->endpoint[i].desc.bEndpointAddress & 0xF) == pdev->vendpoint) { pdev->vmax_packet_size = le16_to_cpu(idesc->endpoint[i].desc.wMaxPacketSize); break; } } if (pdev->vmax_packet_size < 0 || pdev->vmax_packet_size > ISO_MAX_FRAME_SIZE) { PWC_ERROR("Failed to find packet size for video endpoint in current alternate setting.\n"); return -ENFILE; /* Odd error, that should be noticeable */ } /* Set alternate interface */ PWC_DEBUG_OPEN("Setting alternate interface %d\n", pdev->valternate); ret = usb_set_interface(pdev->udev, 0, pdev->valternate); if (ret == -ENOSPC && compression < 3) { compression++; goto retry; } if (ret < 0) return ret; /* Allocate and init Isochronuous urbs */ for (i = 0; i < MAX_ISO_BUFS; i++) { urb = usb_alloc_urb(ISO_FRAMES_PER_DESC, GFP_KERNEL); if (urb == NULL) { pwc_isoc_cleanup(pdev); return -ENOMEM; } pdev->urbs[i] = urb; PWC_DEBUG_MEMORY("Allocated URB at 0x%p\n", urb); urb->interval = 1; // devik urb->dev = udev; urb->pipe = usb_rcvisocpipe(udev, pdev->vendpoint); urb->transfer_flags = URB_ISO_ASAP | URB_NO_TRANSFER_DMA_MAP; urb->transfer_buffer_length = ISO_BUFFER_SIZE; urb->transfer_buffer = pwc_alloc_urb_buffer(udev, urb->transfer_buffer_length, &urb->transfer_dma); if (urb->transfer_buffer == NULL) { PWC_ERROR("Failed to allocate urb buffer %d\n", i); pwc_isoc_cleanup(pdev); return -ENOMEM; } urb->complete = pwc_isoc_handler; urb->context = pdev; urb->start_frame = 0; urb->number_of_packets = ISO_FRAMES_PER_DESC; for (j = 0; j < ISO_FRAMES_PER_DESC; j++) { urb->iso_frame_desc[j].offset = j * ISO_MAX_FRAME_SIZE; 
urb->iso_frame_desc[j].length = pdev->vmax_packet_size; } } /* link */ for (i = 0; i < MAX_ISO_BUFS; i++) { ret = usb_submit_urb(pdev->urbs[i], GFP_KERNEL); if (ret == -ENOSPC && compression < 3) { compression++; pwc_isoc_cleanup(pdev); goto retry; } if (ret) { PWC_ERROR("isoc_init() submit_urb %d failed with error %d\n", i, ret); pwc_isoc_cleanup(pdev); return ret; } PWC_DEBUG_MEMORY("URB 0x%p submitted.\n", pdev->urbs[i]); } /* All is done... */ PWC_DEBUG_OPEN("<< pwc_isoc_init()\n"); return 0; } static void pwc_iso_stop(struct pwc_device *pdev) { int i; /* Unlinking ISOC buffers one by one */ for (i = 0; i < MAX_ISO_BUFS; i++) { if (pdev->urbs[i]) { PWC_DEBUG_MEMORY("Unlinking URB %p\n", pdev->urbs[i]); usb_kill_urb(pdev->urbs[i]); } } } static void pwc_iso_free(struct pwc_device *pdev) { int i; /* Freeing ISOC buffers one by one */ for (i = 0; i < MAX_ISO_BUFS; i++) { struct urb *urb = pdev->urbs[i]; if (urb) { PWC_DEBUG_MEMORY("Freeing URB\n"); if (urb->transfer_buffer) pwc_free_urb_buffer(urb->dev, urb->transfer_buffer_length, urb->transfer_buffer, urb->transfer_dma); usb_free_urb(urb); pdev->urbs[i] = NULL; } } } /* Both v4l2_lock and vb_queue_lock should be locked when calling this */ static void pwc_isoc_cleanup(struct pwc_device *pdev) { PWC_DEBUG_OPEN(">> pwc_isoc_cleanup()\n"); pwc_iso_stop(pdev); pwc_iso_free(pdev); usb_set_interface(pdev->udev, 0, 0); PWC_DEBUG_OPEN("<< pwc_isoc_cleanup()\n"); } /* Must be called with vb_queue_lock hold */ static void pwc_cleanup_queued_bufs(struct pwc_device *pdev, enum vb2_buffer_state state) { unsigned long flags = 0; spin_lock_irqsave(&pdev->queued_bufs_lock, flags); while (!list_empty(&pdev->queued_bufs)) { struct pwc_frame_buf *buf; buf = list_entry(pdev->queued_bufs.next, struct pwc_frame_buf, list); list_del(&buf->list); vb2_buffer_done(&buf->vb.vb2_buf, state); } spin_unlock_irqrestore(&pdev->queued_bufs_lock, flags); } #ifdef CONFIG_USB_PWC_DEBUG static const char *pwc_sensor_type_to_string(unsigned int 
sensor_type) { switch(sensor_type) { case 0x00: return "Hyundai CMOS sensor"; case 0x20: return "Sony CCD sensor + TDA8787"; case 0x2E: return "Sony CCD sensor + Exas 98L59"; case 0x2F: return "Sony CCD sensor + ADI 9804"; case 0x30: return "Sharp CCD sensor + TDA8787"; case 0x3E: return "Sharp CCD sensor + Exas 98L59"; case 0x3F: return "Sharp CCD sensor + ADI 9804"; case 0x40: return "UPA 1021 sensor"; case 0x100: return "VGA sensor"; case 0x101: return "PAL MR sensor"; default: return "unknown type of sensor"; } } #endif /***************************************************************************/ /* Video4Linux functions */ static void pwc_video_release(struct v4l2_device *v) { struct pwc_device *pdev = container_of(v, struct pwc_device, v4l2_dev); v4l2_ctrl_handler_free(&pdev->ctrl_handler); v4l2_device_unregister(&pdev->v4l2_dev); kfree(pdev->ctrl_buf); kfree(pdev); } /***************************************************************************/ /* Videobuf2 operations */ static int queue_setup(struct vb2_queue *vq, unsigned int *nbuffers, unsigned int *nplanes, unsigned int sizes[], struct device *alloc_devs[]) { struct pwc_device *pdev = vb2_get_drv_priv(vq); int size; if (*nbuffers < MIN_FRAMES) *nbuffers = MIN_FRAMES; else if (*nbuffers > MAX_FRAMES) *nbuffers = MAX_FRAMES; *nplanes = 1; size = pwc_get_size(pdev, MAX_WIDTH, MAX_HEIGHT); sizes[0] = PAGE_ALIGN(pwc_image_sizes[size][0] * pwc_image_sizes[size][1] * 3 / 2); return 0; } static int buffer_init(struct vb2_buffer *vb) { struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb); struct pwc_frame_buf *buf = container_of(vbuf, struct pwc_frame_buf, vb); /* need vmalloc since frame buffer > 128K */ buf->data = vzalloc(PWC_FRAME_SIZE); if (buf->data == NULL) return -ENOMEM; return 0; } static int buffer_prepare(struct vb2_buffer *vb) { struct pwc_device *pdev = vb2_get_drv_priv(vb->vb2_queue); /* Don't allow queueing new buffers after device disconnection */ if (!pdev->udev) return -ENODEV; return 0; } 
static void buffer_finish(struct vb2_buffer *vb) { struct pwc_device *pdev = vb2_get_drv_priv(vb->vb2_queue); struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb); struct pwc_frame_buf *buf = container_of(vbuf, struct pwc_frame_buf, vb); if (vb->state == VB2_BUF_STATE_DONE) { /* * Application has called dqbuf and is getting back a buffer * we've filled, take the pwc data we've stored in buf->data * and decompress it into a usable format, storing the result * in the vb2_buffer. */ pwc_decompress(pdev, buf); } } static void buffer_cleanup(struct vb2_buffer *vb) { struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb); struct pwc_frame_buf *buf = container_of(vbuf, struct pwc_frame_buf, vb); vfree(buf->data); } static void buffer_queue(struct vb2_buffer *vb) { struct pwc_device *pdev = vb2_get_drv_priv(vb->vb2_queue); struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb); struct pwc_frame_buf *buf = container_of(vbuf, struct pwc_frame_buf, vb); unsigned long flags = 0; /* Check the device has not disconnected between prep and queuing */ if (!pdev->udev) { vb2_buffer_done(vb, VB2_BUF_STATE_ERROR); return; } spin_lock_irqsave(&pdev->queued_bufs_lock, flags); list_add_tail(&buf->list, &pdev->queued_bufs); spin_unlock_irqrestore(&pdev->queued_bufs_lock, flags); } static int start_streaming(struct vb2_queue *vq, unsigned int count) { struct pwc_device *pdev = vb2_get_drv_priv(vq); int r; if (!pdev->udev) return -ENODEV; if (mutex_lock_interruptible(&pdev->v4l2_lock)) return -ERESTARTSYS; /* Turn on camera and set LEDS on */ pwc_camera_power(pdev, 1); pwc_set_leds(pdev, leds[0], leds[1]); r = pwc_isoc_init(pdev); if (r) { /* If we failed turn camera and LEDS back off */ pwc_set_leds(pdev, 0, 0); pwc_camera_power(pdev, 0); /* And cleanup any queued bufs!! 
*/ pwc_cleanup_queued_bufs(pdev, VB2_BUF_STATE_QUEUED); } mutex_unlock(&pdev->v4l2_lock); return r; } static void stop_streaming(struct vb2_queue *vq) { struct pwc_device *pdev = vb2_get_drv_priv(vq); mutex_lock(&pdev->v4l2_lock); if (pdev->udev) { pwc_set_leds(pdev, 0, 0); pwc_camera_power(pdev, 0); pwc_isoc_cleanup(pdev); } pwc_cleanup_queued_bufs(pdev, VB2_BUF_STATE_ERROR); if (pdev->fill_buf) vb2_buffer_done(&pdev->fill_buf->vb.vb2_buf, VB2_BUF_STATE_ERROR); mutex_unlock(&pdev->v4l2_lock); } static const struct vb2_ops pwc_vb_queue_ops = { .queue_setup = queue_setup, .buf_init = buffer_init, .buf_prepare = buffer_prepare, .buf_finish = buffer_finish, .buf_cleanup = buffer_cleanup, .buf_queue = buffer_queue, .start_streaming = start_streaming, .stop_streaming = stop_streaming, }; /***************************************************************************/ /* USB functions */ /* This function gets called when a new device is plugged in or the usb core * is loaded. */ static int usb_pwc_probe(struct usb_interface *intf, const struct usb_device_id *id) { struct usb_device *udev = interface_to_usbdev(intf); struct pwc_device *pdev = NULL; int vendor_id, product_id, type_id; int rc; int features = 0; int compression = 0; int my_power_save = power_save; char serial_number[30], *name; vendor_id = le16_to_cpu(udev->descriptor.idVendor); product_id = le16_to_cpu(udev->descriptor.idProduct); /* Check if we can handle this device */ PWC_DEBUG_PROBE("probe() called [%04X %04X], if %d\n", vendor_id, product_id, intf->altsetting->desc.bInterfaceNumber); /* the interfaces are probed one by one. We are only interested in the video interface (0) now. Interface 1 is the Audio Control, and interface 2 Audio itself. 
*/ if (intf->altsetting->desc.bInterfaceNumber > 0) return -ENODEV; if (vendor_id == 0x0471) { switch (product_id) { case 0x0302: PWC_INFO("Philips PCA645VC USB webcam detected.\n"); name = "Philips 645 webcam"; type_id = 645; break; case 0x0303: PWC_INFO("Philips PCA646VC USB webcam detected.\n"); name = "Philips 646 webcam"; type_id = 646; break; case 0x0304: PWC_INFO("Askey VC010 type 2 USB webcam detected.\n"); name = "Askey VC010 webcam"; type_id = 646; break; case 0x0307: PWC_INFO("Philips PCVC675K (Vesta) USB webcam detected.\n"); name = "Philips 675 webcam"; type_id = 675; break; case 0x0308: PWC_INFO("Philips PCVC680K (Vesta Pro) USB webcam detected.\n"); name = "Philips 680 webcam"; type_id = 680; break; case 0x030C: PWC_INFO("Philips PCVC690K (Vesta Pro Scan) USB webcam detected.\n"); name = "Philips 690 webcam"; type_id = 690; break; case 0x0310: PWC_INFO("Philips PCVC730K (ToUCam Fun)/PCVC830 (ToUCam II) USB webcam detected.\n"); name = "Philips 730 webcam"; type_id = 730; break; case 0x0311: PWC_INFO("Philips PCVC740K (ToUCam Pro)/PCVC840 (ToUCam II) USB webcam detected.\n"); name = "Philips 740 webcam"; type_id = 740; break; case 0x0312: PWC_INFO("Philips PCVC750K (ToUCam Pro Scan) USB webcam detected.\n"); name = "Philips 750 webcam"; type_id = 750; break; case 0x0313: PWC_INFO("Philips PCVC720K/40 (ToUCam XS) USB webcam detected.\n"); name = "Philips 720K/40 webcam"; type_id = 720; break; case 0x0329: PWC_INFO("Philips SPC 900NC USB webcam detected.\n"); name = "Philips SPC 900NC webcam"; type_id = 740; break; case 0x032C: PWC_INFO("Philips SPC 880NC USB webcam detected.\n"); name = "Philips SPC 880NC webcam"; type_id = 740; break; default: return -ENODEV; } } else if (vendor_id == 0x069A) { switch(product_id) { case 0x0001: PWC_INFO("Askey VC010 type 1 USB webcam detected.\n"); name = "Askey VC010 webcam"; type_id = 645; break; default: return -ENODEV; } } else if (vendor_id == 0x046d) { switch(product_id) { case 0x08b0: PWC_INFO("Logitech 
QuickCam Pro 3000 USB webcam detected.\n"); name = "Logitech QuickCam Pro 3000"; type_id = 740; /* CCD sensor */ break; case 0x08b1: PWC_INFO("Logitech QuickCam Notebook Pro USB webcam detected.\n"); name = "Logitech QuickCam Notebook Pro"; type_id = 740; /* CCD sensor */ break; case 0x08b2: PWC_INFO("Logitech QuickCam 4000 Pro USB webcam detected.\n"); name = "Logitech QuickCam Pro 4000"; type_id = 740; /* CCD sensor */ if (my_power_save == -1) my_power_save = 1; break; case 0x08b3: PWC_INFO("Logitech QuickCam Zoom USB webcam detected.\n"); name = "Logitech QuickCam Zoom"; type_id = 740; /* CCD sensor */ break; case 0x08B4: PWC_INFO("Logitech QuickCam Zoom (new model) USB webcam detected.\n"); name = "Logitech QuickCam Zoom"; type_id = 740; /* CCD sensor */ if (my_power_save == -1) my_power_save = 1; break; case 0x08b5: PWC_INFO("Logitech QuickCam Orbit/Sphere USB webcam detected.\n"); name = "Logitech QuickCam Orbit"; type_id = 740; /* CCD sensor */ if (my_power_save == -1) my_power_save = 1; features |= FEATURE_MOTOR_PANTILT; break; case 0x08b6: PWC_INFO("Logitech/Cisco VT Camera webcam detected.\n"); name = "Cisco VT Camera"; type_id = 740; /* CCD sensor */ break; case 0x08b7: PWC_INFO("Logitech ViewPort AV 100 webcam detected.\n"); name = "Logitech ViewPort AV 100"; type_id = 740; /* CCD sensor */ break; case 0x08b8: /* Where this released? 
*/ PWC_INFO("Logitech QuickCam detected (reserved ID).\n"); name = "Logitech QuickCam (res.)"; type_id = 730; /* Assuming CMOS */ break; default: return -ENODEV; } } else if (vendor_id == 0x055d) { /* I don't know the difference between the C10 and the C30; I suppose the difference is the sensor, but both cameras work equally well with a type_id of 675 */ switch(product_id) { case 0x9000: PWC_INFO("Samsung MPC-C10 USB webcam detected.\n"); name = "Samsung MPC-C10"; type_id = 675; break; case 0x9001: PWC_INFO("Samsung MPC-C30 USB webcam detected.\n"); name = "Samsung MPC-C30"; type_id = 675; break; case 0x9002: PWC_INFO("Samsung SNC-35E (v3.0) USB webcam detected.\n"); name = "Samsung MPC-C30"; type_id = 740; break; default: return -ENODEV; } } else if (vendor_id == 0x041e) { switch(product_id) { case 0x400c: PWC_INFO("Creative Labs Webcam 5 detected.\n"); name = "Creative Labs Webcam 5"; type_id = 730; if (my_power_save == -1) my_power_save = 1; break; case 0x4011: PWC_INFO("Creative Labs Webcam Pro Ex detected.\n"); name = "Creative Labs Webcam Pro Ex"; type_id = 740; break; default: return -ENODEV; } } else if (vendor_id == 0x04cc) { switch(product_id) { case 0x8116: PWC_INFO("Sotec Afina Eye USB webcam detected.\n"); name = "Sotec Afina Eye"; type_id = 730; break; default: return -ENODEV; } } else if (vendor_id == 0x06be) { switch(product_id) { case 0x8116: /* This is essentially the same cam as the Sotec Afina Eye */ PWC_INFO("AME Co. Afina Eye USB webcam detected.\n"); name = "AME Co. 
Afina Eye"; type_id = 750; break; default: return -ENODEV; } } else if (vendor_id == 0x0d81) { switch(product_id) { case 0x1900: PWC_INFO("Visionite VCS-UC300 USB webcam detected.\n"); name = "Visionite VCS-UC300"; type_id = 740; /* CCD sensor */ break; case 0x1910: PWC_INFO("Visionite VCS-UM100 USB webcam detected.\n"); name = "Visionite VCS-UM100"; type_id = 730; /* CMOS sensor */ break; default: return -ENODEV; } } else return -ENODEV; /* Not any of the know types; but the list keeps growing. */ if (my_power_save == -1) my_power_save = 0; memset(serial_number, 0, 30); usb_string(udev, udev->descriptor.iSerialNumber, serial_number, 29); PWC_DEBUG_PROBE("Device serial number is %s\n", serial_number); if (udev->descriptor.bNumConfigurations > 1) PWC_WARNING("Warning: more than 1 configuration available.\n"); /* Allocate structure, initialize pointers, mutexes, etc. and link it to the usb_device */ pdev = kzalloc(sizeof(struct pwc_device), GFP_KERNEL); if (pdev == NULL) { PWC_ERROR("Oops, could not allocate memory for pwc_device.\n"); return -ENOMEM; } pdev->type = type_id; pdev->features = features; pwc_construct(pdev); /* set min/max sizes correct */ mutex_init(&pdev->v4l2_lock); mutex_init(&pdev->vb_queue_lock); spin_lock_init(&pdev->queued_bufs_lock); INIT_LIST_HEAD(&pdev->queued_bufs); pdev->udev = udev; pdev->power_save = my_power_save; /* Init videobuf2 queue structure */ pdev->vb_queue.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; pdev->vb_queue.io_modes = VB2_MMAP | VB2_USERPTR | VB2_READ; pdev->vb_queue.drv_priv = pdev; pdev->vb_queue.buf_struct_size = sizeof(struct pwc_frame_buf); pdev->vb_queue.ops = &pwc_vb_queue_ops; pdev->vb_queue.mem_ops = &vb2_vmalloc_memops; pdev->vb_queue.timestamp_flags = V4L2_BUF_FLAG_TIMESTAMP_MONOTONIC; pdev->vb_queue.lock = &pdev->vb_queue_lock; rc = vb2_queue_init(&pdev->vb_queue); if (rc < 0) { PWC_ERROR("Oops, could not initialize vb2 queue.\n"); goto err_free_mem; } /* Init video_device structure */ pdev->vdev = pwc_template; 
strscpy(pdev->vdev.name, name, sizeof(pdev->vdev.name)); pdev->vdev.queue = &pdev->vb_queue; video_set_drvdata(&pdev->vdev, pdev); pdev->release = le16_to_cpu(udev->descriptor.bcdDevice); PWC_DEBUG_PROBE("Release: %04x\n", pdev->release); /* Allocate USB command buffers */ pdev->ctrl_buf = kmalloc(sizeof(pdev->cmd_buf), GFP_KERNEL); if (!pdev->ctrl_buf) { PWC_ERROR("Oops, could not allocate memory for pwc_device.\n"); rc = -ENOMEM; goto err_free_mem; } #ifdef CONFIG_USB_PWC_DEBUG /* Query sensor type */ if (pwc_get_cmos_sensor(pdev, &rc) >= 0) { PWC_DEBUG_OPEN("This %s camera is equipped with a %s (%d).\n", pdev->vdev.name, pwc_sensor_type_to_string(rc), rc); } #endif /* Set the leds off */ pwc_set_leds(pdev, 0, 0); /* Setup initial videomode */ rc = pwc_set_video_mode(pdev, MAX_WIDTH, MAX_HEIGHT, V4L2_PIX_FMT_YUV420, 30, &compression, 1); if (rc) goto err_free_mem; /* Register controls (and read default values from camera */ rc = pwc_init_controls(pdev); if (rc) { PWC_ERROR("Failed to register v4l2 controls (%d).\n", rc); goto err_free_mem; } /* And powerdown the camera until streaming starts */ pwc_camera_power(pdev, 0); /* Register the v4l2_device structure */ pdev->v4l2_dev.release = pwc_video_release; rc = v4l2_device_register(&intf->dev, &pdev->v4l2_dev); if (rc) { PWC_ERROR("Failed to register v4l2-device (%d).\n", rc); goto err_free_controls; } pdev->v4l2_dev.ctrl_handler = &pdev->ctrl_handler; pdev->vdev.v4l2_dev = &pdev->v4l2_dev; pdev->vdev.lock = &pdev->v4l2_lock; pdev->vdev.device_caps = V4L2_CAP_VIDEO_CAPTURE | V4L2_CAP_STREAMING | V4L2_CAP_READWRITE; rc = video_register_device(&pdev->vdev, VFL_TYPE_VIDEO, -1); if (rc < 0) { PWC_ERROR("Failed to register as video device (%d).\n", rc); goto err_unregister_v4l2_dev; } PWC_INFO("Registered as %s.\n", video_device_node_name(&pdev->vdev)); #ifdef CONFIG_USB_PWC_INPUT_EVDEV /* register webcam snapshot button input device */ pdev->button_dev = input_allocate_device(); if (!pdev->button_dev) { rc = -ENOMEM; 
goto err_video_unreg; } usb_make_path(udev, pdev->button_phys, sizeof(pdev->button_phys)); strlcat(pdev->button_phys, "/input0", sizeof(pdev->button_phys)); pdev->button_dev->name = "PWC snapshot button"; pdev->button_dev->phys = pdev->button_phys; usb_to_input_id(pdev->udev, &pdev->button_dev->id); pdev->button_dev->dev.parent = &pdev->udev->dev; pdev->button_dev->evbit[0] = BIT_MASK(EV_KEY); pdev->button_dev->keybit[BIT_WORD(KEY_CAMERA)] = BIT_MASK(KEY_CAMERA); rc = input_register_device(pdev->button_dev); if (rc) { input_free_device(pdev->button_dev); pdev->button_dev = NULL; goto err_video_unreg; } #endif return 0; #ifdef CONFIG_USB_PWC_INPUT_EVDEV err_video_unreg: video_unregister_device(&pdev->vdev); #endif err_unregister_v4l2_dev: v4l2_device_unregister(&pdev->v4l2_dev); err_free_controls: v4l2_ctrl_handler_free(&pdev->ctrl_handler); err_free_mem: kfree(pdev->ctrl_buf); kfree(pdev); return rc; } /* The user yanked out the cable... */ static void usb_pwc_disconnect(struct usb_interface *intf) { struct v4l2_device *v = usb_get_intfdata(intf); struct pwc_device *pdev = container_of(v, struct pwc_device, v4l2_dev); mutex_lock(&pdev->vb_queue_lock); mutex_lock(&pdev->v4l2_lock); /* No need to keep the urbs around after disconnection */ if (pdev->vb_queue.streaming) pwc_isoc_cleanup(pdev); pdev->udev = NULL; v4l2_device_disconnect(&pdev->v4l2_dev); video_unregister_device(&pdev->vdev); mutex_unlock(&pdev->v4l2_lock); mutex_unlock(&pdev->vb_queue_lock); #ifdef CONFIG_USB_PWC_INPUT_EVDEV if (pdev->button_dev) input_unregister_device(pdev->button_dev); #endif v4l2_device_put(&pdev->v4l2_dev); } /* * Initialization code & module stuff */ static unsigned int leds_nargs; #ifdef CONFIG_USB_PWC_DEBUG module_param_named(trace, pwc_trace, int, 0644); #endif module_param(power_save, int, 0644); module_param_array(leds, int, &leds_nargs, 0444); #ifdef CONFIG_USB_PWC_DEBUG MODULE_PARM_DESC(trace, "For debugging purposes"); #endif MODULE_PARM_DESC(power_save, "Turn power saving 
for new cameras on or off"); MODULE_PARM_DESC(leds, "LED on,off time in milliseconds"); MODULE_DESCRIPTION("Philips & OEM USB webcam driver"); MODULE_AUTHOR("Luc Saillard <luc@saillard.org>"); MODULE_LICENSE("GPL"); MODULE_ALIAS("pwcx"); MODULE_VERSION( PWC_VERSION ); module_usb_driver(pwc_driver); |
| 8 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. 
*/ #ifndef __XFS_LINUX__ #define __XFS_LINUX__ #include <linux/types.h> #include <linux/uuid.h> /* * Kernel specific type declarations for XFS */ typedef __s64 xfs_off_t; /* <file offset> type */ typedef unsigned long long xfs_ino_t; /* <inode> type */ typedef __s64 xfs_daddr_t; /* <disk address> type */ typedef __u32 xfs_dev_t; typedef __u32 xfs_nlink_t; #include "xfs_types.h" #include <linux/semaphore.h> #include <linux/mm.h> #include <linux/sched/mm.h> #include <linux/kernel.h> #include <linux/blkdev.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/crc32c.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/file.h> #include <linux/filelock.h> #include <linux/swap.h> #include <linux/errno.h> #include <linux/sched/signal.h> #include <linux/bitops.h> #include <linux/major.h> #include <linux/pagemap.h> #include <linux/vfs.h> #include <linux/seq_file.h> #include <linux/init.h> #include <linux/list.h> #include <linux/proc_fs.h> #include <linux/sort.h> #include <linux/cpu.h> #include <linux/notifier.h> #include <linux/delay.h> #include <linux/log2.h> #include <linux/rwsem.h> #include <linux/spinlock.h> #include <linux/random.h> #include <linux/ctype.h> #include <linux/writeback.h> #include <linux/capability.h> #include <linux/kthread.h> #include <linux/freezer.h> #include <linux/list_sort.h> #include <linux/ratelimit.h> #include <linux/rhashtable.h> #include <linux/xattr.h> #include <linux/mnt_idmapping.h> #include <linux/debugfs.h> #include <asm/page.h> #include <asm/div64.h> #include <asm/param.h> #include <linux/uaccess.h> #include <asm/byteorder.h> #include <linux/unaligned.h> #include "xfs_fs.h" #include "xfs_stats.h" #include "xfs_sysctl.h" #include "xfs_iops.h" #include "xfs_aops.h" #include "xfs_super.h" #include "xfs_cksum.h" #include "xfs_buf.h" #include "xfs_message.h" #include "xfs_drain.h" #include "xfs_hooks.h" #ifdef __BIG_ENDIAN #define XFS_NATIVE_HOST 1 #else #undef XFS_NATIVE_HOST #endif #define xfs_panic_mask 
xfs_params.panic_mask.val #define xfs_error_level xfs_params.error_level.val #define xfs_syncd_centisecs xfs_params.syncd_timer.val #define xfs_stats_clear xfs_params.stats_clear.val #define xfs_inherit_sync xfs_params.inherit_sync.val #define xfs_inherit_nodump xfs_params.inherit_nodump.val #define xfs_inherit_noatime xfs_params.inherit_noatim.val #define xfs_inherit_nosymlinks xfs_params.inherit_nosym.val #define xfs_rotorstep xfs_params.rotorstep.val #define xfs_inherit_nodefrag xfs_params.inherit_nodfrg.val #define xfs_fstrm_centisecs xfs_params.fstrm_timer.val #define xfs_blockgc_secs xfs_params.blockgc_timer.val #define current_cpu() (raw_smp_processor_id()) #define current_set_flags_nested(sp, f) \ (*(sp) = current->flags, current->flags |= (f)) #define current_restore_flags_nested(sp, f) \ (current->flags = ((current->flags & ~(f)) | (*(sp) & (f)))) #define NBBY 8 /* number of bits per byte */ /* * Size of block device i/o is parameterized here. * Currently the system supports page-sized i/o. */ #define BLKDEV_IOSHIFT PAGE_SHIFT #define BLKDEV_IOSIZE (1<<BLKDEV_IOSHIFT) /* number of BB's per block device block */ #define BLKDEV_BB BTOBB(BLKDEV_IOSIZE) #define ENOATTR ENODATA /* Attribute not found */ #define EWRONGFS EINVAL /* Mount with wrong filesystem type */ #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ #define EFSBADCRC EBADMSG /* Bad CRC detected */ #define __return_address __builtin_return_address(0) /* * Return the address of a label. Use barrier() so that the optimizer * won't reorder code to refactor the error jumpouts into a single * return, which throws off the reported address. */ #define __this_address ({ __label__ __here; __here: barrier(); &&__here; }) #define howmany(x, y) (((x)+((y)-1))/(y)) static inline void delay(long ticks) { schedule_timeout_uninterruptible(ticks); } /* * XFS wrapper structure for sysfs support. 
It depends on external data * structures and is embedded in various internal data structures to implement * the XFS sysfs object heirarchy. Define it here for broad access throughout * the codebase. */ struct xfs_kobj { struct kobject kobject; struct completion complete; }; struct xstats { struct xfsstats __percpu *xs_stats; struct xfs_kobj xs_kobj; }; extern struct xstats xfsstats; static inline dev_t xfs_to_linux_dev_t(xfs_dev_t dev) { return MKDEV(sysv_major(dev) & 0x1ff, sysv_minor(dev)); } static inline xfs_dev_t linux_to_xfs_dev_t(dev_t dev) { return sysv_encode_dev(dev); } /* * Various platform dependent calls that don't fit anywhere else */ #define xfs_sort(a,n,s,fn) sort(a,n,s,fn,NULL) #define xfs_stack_trace() dump_stack() static inline uint64_t rounddown_64(uint64_t x, uint32_t y) { do_div(x, y); return x * y; } static inline uint64_t roundup_64(uint64_t x, uint32_t y) { x += y - 1; do_div(x, y); return x * y; } static inline uint64_t howmany_64(uint64_t x, uint32_t y) { x += y - 1; do_div(x, y); return x; } static inline bool isaligned_64(uint64_t x, uint32_t y) { return do_div(x, y) == 0; } /* If @b is a power of 2, return log2(b). Else return -1. */ static inline int8_t log2_if_power2(unsigned long b) { return is_power_of_2(b) ? ilog2(b) : -1; } /* If @b is a power of 2, return a mask of the lower bits, else return zero. */ static inline unsigned long long mask64_if_power2(unsigned long b) { return is_power_of_2(b) ? b - 1 : 0; } int xfs_rw_bdev(struct block_device *bdev, sector_t sector, unsigned int count, char *data, enum req_op op); #define ASSERT_ALWAYS(expr) \ (likely(expr) ? (void)0 : assfail(NULL, #expr, __FILE__, __LINE__)) #ifdef DEBUG #define ASSERT(expr) \ (likely(expr) ? (void)0 : assfail(NULL, #expr, __FILE__, __LINE__)) #else /* !DEBUG */ #ifdef XFS_WARN #define ASSERT(expr) \ (likely(expr) ? 
(void)0 : asswarn(NULL, #expr, __FILE__, __LINE__)) #else /* !DEBUG && !XFS_WARN */ #define ASSERT(expr) ((void)0) #endif /* XFS_WARN */ #endif /* DEBUG */ #define XFS_IS_CORRUPT(mp, expr) \ (unlikely(expr) ? xfs_corruption_error(#expr, XFS_ERRLEVEL_LOW, (mp), \ NULL, 0, __FILE__, __LINE__, \ __this_address), \ true : false) #define STATIC static noinline #ifdef CONFIG_XFS_RT /* * make sure we ignore the inode flag if the filesystem doesn't have a * configured realtime device. */ #define XFS_IS_REALTIME_INODE(ip) \ (((ip)->i_diflags & XFS_DIFLAG_REALTIME) && \ (ip)->i_mount->m_rtdev_targp) #define XFS_IS_REALTIME_MOUNT(mp) ((mp)->m_rtdev_targp ? 1 : 0) #else #define XFS_IS_REALTIME_INODE(ip) (0) #define XFS_IS_REALTIME_MOUNT(mp) (0) #endif /* * Starting in Linux 4.15, the %p (raw pointer value) printk modifier * prints a hashed version of the pointer to avoid leaking kernel * pointers into dmesg. If we're trying to debug the kernel we want the * raw values, so override this behavior as best we can. */ #ifdef DEBUG # define PTR_FMT "%px" #else # define PTR_FMT "%p" #endif /* * Helper for IO routines to grab backing pages from allocated kernel memory. */ static inline struct page * kmem_to_page(void *addr) { if (is_vmalloc_addr(addr)) return vmalloc_to_page(addr); return virt_to_page(addr); } #endif /* __XFS_LINUX__ */ |
| 142 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 | /* * linux/drivers/video/console/fbcon.h -- Low level frame buffer based console driver * * Copyright (C) 1997 Geert Uytterhoeven * * This file is subject to the terms and conditions of the GNU General Public * License. See the file COPYING in the main directory of this archive * for more details. 
*/ #ifndef _VIDEO_FBCON_H #define _VIDEO_FBCON_H #include <linux/types.h> #include <linux/vt_buffer.h> #include <linux/vt_kern.h> #include <linux/workqueue.h> #include <asm/io.h> /* * This is the interface between the low-level console driver and the * low-level frame buffer device */ struct fbcon_display { /* Filled in by the low-level console driver */ const u_char *fontdata; int userfont; /* != 0 if fontdata kmalloc()ed */ #ifdef CONFIG_FRAMEBUFFER_CONSOLE_LEGACY_ACCELERATION u_short scrollmode; /* Scroll Method, use fb_scrollmode() */ #endif u_short inverse; /* != 0 text black on white as default */ short yscroll; /* Hardware scrolling */ int vrows; /* number of virtual rows */ int cursor_shape; int con_rotate; u32 xres_virtual; u32 yres_virtual; u32 height; u32 width; u32 bits_per_pixel; u32 grayscale; u32 nonstd; u32 accel_flags; u32 rotate; struct fb_bitfield red; struct fb_bitfield green; struct fb_bitfield blue; struct fb_bitfield transp; const struct fb_videomode *mode; }; struct fbcon_bitops { void (*bmove)(struct vc_data *vc, struct fb_info *info, int sy, int sx, int dy, int dx, int height, int width); void (*clear)(struct vc_data *vc, struct fb_info *info, int sy, int sx, int height, int width, int fb, int bg); void (*putcs)(struct vc_data *vc, struct fb_info *info, const unsigned short *s, int count, int yy, int xx, int fg, int bg); void (*clear_margins)(struct vc_data *vc, struct fb_info *info, int color, int bottom_only); void (*cursor)(struct vc_data *vc, struct fb_info *info, bool enable, int fg, int bg); int (*update_start)(struct fb_info *info); int (*rotate_font)(struct fb_info *info, struct vc_data *vc); }; struct fbcon_par { struct fb_var_screeninfo var; /* copy of the current fb_var_screeninfo */ struct delayed_work cursor_work; /* Cursor timer */ struct fb_cursor cursor_state; struct fbcon_display *p; struct fb_info *info; int currcon; /* Current VC. 
*/ int cur_blink_jiffies; int cursor_flash; int cursor_reset; int blank_state; int graphics; int save_graphics; /* for debug enter/leave */ bool initialized; int rotate; int cur_rotate; char *cursor_data; u8 *fontbuffer; u8 *fontdata; u8 *cursor_src; u32 cursor_size; u32 fd_size; const struct fbcon_bitops *bitops; }; /* * Attribute Decoding */ /* Color */ #define attr_fgcol(fgshift,s) \ (((s) >> (fgshift)) & 0x0f) #define attr_bgcol(bgshift,s) \ (((s) >> (bgshift)) & 0x0f) /* Monochrome */ #define attr_bold(s) \ ((s) & 0x200) #define attr_reverse(s) \ ((s) & 0x800) #define attr_underline(s) \ ((s) & 0x400) #define attr_blink(s) \ ((s) & 0x8000) static inline int mono_col(const struct fb_info *info) { __u32 max_len; max_len = max(info->var.green.length, info->var.red.length); max_len = max(info->var.blue.length, max_len); return (~(0xfff << max_len)) & 0xff; } /* * Scroll Method */ /* There are several methods fbcon can use to move text around the screen: * * Operation Pan Wrap *--------------------------------------------- * SCROLL_MOVE copyarea No No * SCROLL_PAN_MOVE copyarea Yes No * SCROLL_WRAP_MOVE copyarea No Yes * SCROLL_REDRAW imageblit No No * SCROLL_PAN_REDRAW imageblit Yes No * SCROLL_WRAP_REDRAW imageblit No Yes * * (SCROLL_WRAP_REDRAW is not implemented yet) * * In general, fbcon will choose the best scrolling * method based on the rule below: * * Pan/Wrap > accel imageblit > accel copyarea > * soft imageblit > (soft copyarea) * * Exception to the rule: Pan + accel copyarea is * preferred over Pan + accel imageblit. * * The above is typical for PCI/AGP cards. Unless * overridden, fbcon will never use soft copyarea. * * If you need to override the above rule, set the * appropriate flags in fb_info->flags. For example, * to prefer copyarea over imageblit, set * FBINFO_READS_FAST. 
* * Other notes: * + use the hardware engine to move the text * (hw-accelerated copyarea() and fillrect()) * + use hardware-supported panning on a large virtual screen * + amifb can not only pan, but also wrap the display by N lines * (i.e. visible line i = physical line (i+N) % yres). * + read what's already rendered on the screen and * write it in a different place (this is cfb_copyarea()) * + re-render the text to the screen * * Whether to use wrapping or panning can only be figured out at * runtime (when we know whether our font height is a multiple * of the pan/wrap step) * */ #define SCROLL_MOVE 0x001 #define SCROLL_PAN_MOVE 0x002 #define SCROLL_WRAP_MOVE 0x003 #define SCROLL_REDRAW 0x004 #define SCROLL_PAN_REDRAW 0x005 static inline u_short fb_scrollmode(struct fbcon_display *fb) { #ifdef CONFIG_FRAMEBUFFER_CONSOLE_LEGACY_ACCELERATION return fb->scrollmode; #else /* hardcoded to SCROLL_REDRAW if acceleration was disabled. */ return SCROLL_REDRAW; #endif } #ifdef CONFIG_FB_TILEBLITTING extern void fbcon_set_tileops(struct vc_data *vc, struct fb_info *info); #endif extern void fbcon_set_bitops_ur(struct fbcon_par *par); extern int soft_cursor(struct fb_info *info, struct fb_cursor *cursor); #define FBCON_ATTRIBUTE_UNDERLINE 1 #define FBCON_ATTRIBUTE_REVERSE 2 #define FBCON_ATTRIBUTE_BOLD 4 static inline int real_y(struct fbcon_display *p, int ypos) { int rows = p->vrows; ypos += p->yscroll; return ypos < rows ? ypos : ypos - rows; } static inline int get_attribute(struct fb_info *info, u16 c) { int attribute = 0; if (fb_get_color_depth(&info->var, &info->fix) == 1) { if (attr_underline(c)) attribute |= FBCON_ATTRIBUTE_UNDERLINE; if (attr_reverse(c)) attribute |= FBCON_ATTRIBUTE_REVERSE; if (attr_bold(c)) attribute |= FBCON_ATTRIBUTE_BOLD; } return attribute; } #define FBCON_SWAP(i,r,v) ({ \ typeof(r) _r = (r); \ typeof(v) _v = (v); \ (void) (&_r == &_v); \ (i == FB_ROTATE_UR || i == FB_ROTATE_UD) ? _r : _v; }) #endif /* _VIDEO_FBCON_H */ |
| 15 9629 2 2 2 2 2 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 
521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 
1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 
1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner * * NOHZ implementation for low and high resolution timers * * Started by: Thomas Gleixner and Ingo Molnar */ #include <linux/compiler.h> #include <linux/cpu.h> #include <linux/err.h> #include <linux/hrtimer.h> #include <linux/interrupt.h> #include <linux/kernel_stat.h> #include <linux/percpu.h> #include <linux/nmi.h> #include <linux/profile.h> #include <linux/sched/signal.h> #include 
<linux/sched/clock.h> #include <linux/sched/stat.h> #include <linux/sched/nohz.h> #include <linux/sched/loadavg.h> #include <linux/module.h> #include <linux/irq_work.h> #include <linux/posix-timers.h> #include <linux/context_tracking.h> #include <linux/mm.h> #include <asm/irq_regs.h> #include "tick-internal.h" #include <trace/events/timer.h> /* * Per-CPU nohz control structure */ static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); struct tick_sched *tick_get_tick_sched(int cpu) { return &per_cpu(tick_cpu_sched, cpu); } /* * The time when the last jiffy update happened. Write access must hold * jiffies_lock and jiffies_seq. tick_nohz_next_event() needs to get a * consistent view of jiffies and last_jiffies_update. */ static ktime_t last_jiffies_update; /* * Must be called with interrupts disabled ! */ static void tick_do_update_jiffies64(ktime_t now) { unsigned long ticks = 1; ktime_t delta, nextp; /* * 64-bit can do a quick check without holding the jiffies lock and * without looking at the sequence count. The smp_load_acquire() * pairs with the update done later in this function. * * 32-bit cannot do that because the store of 'tick_next_period' * consists of two 32-bit stores, and the first store could be * moved by the CPU to a random point in the future. */ if (IS_ENABLED(CONFIG_64BIT)) { if (ktime_before(now, smp_load_acquire(&tick_next_period))) return; } else { unsigned int seq; /* * Avoid contention on 'jiffies_lock' and protect the quick * check with the sequence count. */ do { seq = read_seqcount_begin(&jiffies_seq); nextp = tick_next_period; } while (read_seqcount_retry(&jiffies_seq, seq)); if (ktime_before(now, nextp)) return; } /* Quick check failed, i.e. update is required. */ raw_spin_lock(&jiffies_lock); /* * Re-evaluate with the lock held. Another CPU might have done the * update already. 
*/ if (ktime_before(now, tick_next_period)) { raw_spin_unlock(&jiffies_lock); return; } write_seqcount_begin(&jiffies_seq); delta = ktime_sub(now, tick_next_period); if (unlikely(delta >= TICK_NSEC)) { /* Slow path for long idle sleep times */ s64 incr = TICK_NSEC; ticks += ktime_divns(delta, incr); last_jiffies_update = ktime_add_ns(last_jiffies_update, incr * ticks); } else { last_jiffies_update = ktime_add_ns(last_jiffies_update, TICK_NSEC); } /* Advance jiffies to complete the 'jiffies_seq' protected job */ jiffies_64 += ticks; /* Keep the tick_next_period variable up to date */ nextp = ktime_add_ns(last_jiffies_update, TICK_NSEC); if (IS_ENABLED(CONFIG_64BIT)) { /* * Pairs with smp_load_acquire() in the lockless quick * check above, and ensures that the update to 'jiffies_64' is * not reordered vs. the store to 'tick_next_period', neither * by the compiler nor by the CPU. */ smp_store_release(&tick_next_period, nextp); } else { /* * A plain store is good enough on 32-bit, as the quick check * above is protected by the sequence count. */ tick_next_period = nextp; } /* * Release the sequence count. calc_global_load() below is not * protected by it, but 'jiffies_lock' needs to be held to prevent * concurrent invocations. */ write_seqcount_end(&jiffies_seq); calc_global_load(); raw_spin_unlock(&jiffies_lock); update_wall_time(); } /* * Initialize and return retrieve the jiffies update. */ static ktime_t tick_init_jiffy_update(void) { ktime_t period; raw_spin_lock(&jiffies_lock); write_seqcount_begin(&jiffies_seq); /* Have we started the jiffies update yet ? */ if (last_jiffies_update == 0) { u32 rem; /* * Ensure that the tick is aligned to a multiple of * TICK_NSEC. 
*/ div_u64_rem(tick_next_period, TICK_NSEC, &rem); if (rem) tick_next_period += TICK_NSEC - rem; last_jiffies_update = tick_next_period; } period = last_jiffies_update; write_seqcount_end(&jiffies_seq); raw_spin_unlock(&jiffies_lock); return period; } static inline int tick_sched_flag_test(struct tick_sched *ts, unsigned long flag) { return !!(ts->flags & flag); } static inline void tick_sched_flag_set(struct tick_sched *ts, unsigned long flag) { lockdep_assert_irqs_disabled(); ts->flags |= flag; } static inline void tick_sched_flag_clear(struct tick_sched *ts, unsigned long flag) { lockdep_assert_irqs_disabled(); ts->flags &= ~flag; } /* * Allow only one non-timekeeper CPU at a time update jiffies from * the timer tick. * * Returns true if update was run. */ static bool tick_limited_update_jiffies64(struct tick_sched *ts, ktime_t now) { static atomic_t in_progress; int inp; inp = atomic_read(&in_progress); if (inp || !atomic_try_cmpxchg(&in_progress, &inp, 1)) return false; if (ts->last_tick_jiffies == jiffies) tick_do_update_jiffies64(now); atomic_set(&in_progress, 0); return true; } #define MAX_STALLED_JIFFIES 5 static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) { int tick_cpu, cpu = smp_processor_id(); /* * Check if the do_timer duty was dropped. We don't care about * concurrency: This happens only when the CPU in charge went * into a long sleep. If two CPUs happen to assign themselves to * this duty, then the jiffies update is still serialized by * 'jiffies_lock'. * * If nohz_full is enabled, this should not happen because the * 'tick_do_timer_cpu' CPU never relinquishes. 
*/ tick_cpu = READ_ONCE(tick_do_timer_cpu); if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && unlikely(tick_cpu == TICK_DO_TIMER_NONE)) { #ifdef CONFIG_NO_HZ_FULL WARN_ON_ONCE(tick_nohz_full_running); #endif WRITE_ONCE(tick_do_timer_cpu, cpu); tick_cpu = cpu; } /* Check if jiffies need an update */ if (tick_cpu == cpu) tick_do_update_jiffies64(now); /* * If the jiffies update stalled for too long (timekeeper in stop_machine() * or VMEXIT'ed for several msecs), force an update. */ if (ts->last_tick_jiffies != jiffies) { ts->stalled_jiffies = 0; ts->last_tick_jiffies = READ_ONCE(jiffies); } else { if (++ts->stalled_jiffies >= MAX_STALLED_JIFFIES) { if (tick_limited_update_jiffies64(ts, now)) { ts->stalled_jiffies = 0; ts->last_tick_jiffies = READ_ONCE(jiffies); } } } if (tick_sched_flag_test(ts, TS_FLAG_INIDLE)) ts->got_idle_tick = 1; } static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) { /* * When we are idle and the tick is stopped, we have to touch * the watchdog as we might not schedule for a really long * time. This happens on completely idle SMP systems while * waiting on the login prompt. We also increment the "start of * idle" jiffy stamp so the idle accounting adjustment we do * when we go busy again does not account too many ticks. */ if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && tick_sched_flag_test(ts, TS_FLAG_STOPPED)) { touch_softlockup_watchdog_sched(); if (is_idle_task(current)) ts->idle_jiffies++; /* * In case the current tick fired too early past its expected * expiration, make sure we don't bypass the next clock reprogramming * to the same deadline. */ ts->next_tick = 0; } update_process_times(user_mode(regs)); profile_tick(CPU_PROFILING); } /* * We rearm the timer until we get disabled by the idle code. * Called with interrupts disabled. 
*/ static enum hrtimer_restart tick_nohz_handler(struct hrtimer *timer) { struct tick_sched *ts = container_of(timer, struct tick_sched, sched_timer); struct pt_regs *regs = get_irq_regs(); ktime_t now = ktime_get(); tick_sched_do_timer(ts, now); /* * Do not call when we are not in IRQ context and have * no valid 'regs' pointer */ if (regs) tick_sched_handle(ts, regs); else ts->next_tick = 0; /* * In dynticks mode, tick reprogram is deferred: * - to the idle task if in dynticks-idle * - to IRQ exit if in full-dynticks. */ if (unlikely(tick_sched_flag_test(ts, TS_FLAG_STOPPED))) return HRTIMER_NORESTART; hrtimer_forward(timer, now, TICK_NSEC); return HRTIMER_RESTART; } #ifdef CONFIG_NO_HZ_FULL cpumask_var_t tick_nohz_full_mask; EXPORT_SYMBOL_GPL(tick_nohz_full_mask); bool tick_nohz_full_running; EXPORT_SYMBOL_GPL(tick_nohz_full_running); static atomic_t tick_dep_mask; static bool check_tick_dependency(atomic_t *dep) { int val = atomic_read(dep); if (val & TICK_DEP_MASK_POSIX_TIMER) { trace_tick_stop(0, TICK_DEP_MASK_POSIX_TIMER); return true; } if (val & TICK_DEP_MASK_PERF_EVENTS) { trace_tick_stop(0, TICK_DEP_MASK_PERF_EVENTS); return true; } if (val & TICK_DEP_MASK_SCHED) { trace_tick_stop(0, TICK_DEP_MASK_SCHED); return true; } if (val & TICK_DEP_MASK_CLOCK_UNSTABLE) { trace_tick_stop(0, TICK_DEP_MASK_CLOCK_UNSTABLE); return true; } if (val & TICK_DEP_MASK_RCU) { trace_tick_stop(0, TICK_DEP_MASK_RCU); return true; } if (val & TICK_DEP_MASK_RCU_EXP) { trace_tick_stop(0, TICK_DEP_MASK_RCU_EXP); return true; } return false; } static bool can_stop_full_tick(int cpu, struct tick_sched *ts) { lockdep_assert_irqs_disabled(); if (unlikely(!cpu_online(cpu))) return false; if (check_tick_dependency(&tick_dep_mask)) return false; if (check_tick_dependency(&ts->tick_dep_mask)) return false; if (check_tick_dependency(¤t->tick_dep_mask)) return false; if (check_tick_dependency(¤t->signal->tick_dep_mask)) return false; return true; } static void nohz_full_kick_func(struct 
irq_work *work) { /* Empty, the tick restart happens on tick_nohz_irq_exit() */ } static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = IRQ_WORK_INIT_HARD(nohz_full_kick_func); /* * Kick this CPU if it's full dynticks in order to force it to * re-evaluate its dependency on the tick and restart it if necessary. * This kick, unlike tick_nohz_full_kick_cpu() and tick_nohz_full_kick_all(), * is NMI safe. */ static void tick_nohz_full_kick(void) { if (!tick_nohz_full_cpu(smp_processor_id())) return; irq_work_queue(this_cpu_ptr(&nohz_full_kick_work)); } /* * Kick the CPU if it's full dynticks in order to force it to * re-evaluate its dependency on the tick and restart it if necessary. */ void tick_nohz_full_kick_cpu(int cpu) { if (!tick_nohz_full_cpu(cpu)) return; irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu); } static void tick_nohz_kick_task(struct task_struct *tsk) { int cpu; /* * If the task is not running, run_posix_cpu_timers() * has nothing to elapse, and an IPI can then be optimized out. * * activate_task() STORE p->tick_dep_mask * STORE p->on_rq * __schedule() (switch to task 'p') smp_mb() (atomic_fetch_or()) * LOCK rq->lock LOAD p->on_rq * smp_mb__after_spin_lock() * tick_nohz_task_switch() * LOAD p->tick_dep_mask * * XXX given a task picks up the dependency on schedule(), should we * only care about tasks that are currently on the CPU instead of all * that are on the runqueue? * * That is, does this want to be: task_on_cpu() / task_curr()? */ if (!sched_task_on_rq(tsk)) return; /* * If the task concurrently migrates to another CPU, * we guarantee it sees the new tick dependency upon * schedule. 
* * set_task_cpu(p, cpu); * STORE p->cpu = @cpu * __schedule() (switch to task 'p') * LOCK rq->lock * smp_mb__after_spin_lock() STORE p->tick_dep_mask * tick_nohz_task_switch() smp_mb() (atomic_fetch_or()) * LOAD p->tick_dep_mask LOAD p->cpu */ cpu = task_cpu(tsk); preempt_disable(); if (cpu_online(cpu)) tick_nohz_full_kick_cpu(cpu); preempt_enable(); } /* * Kick all full dynticks CPUs in order to force these to re-evaluate * their dependency on the tick and restart it if necessary. */ static void tick_nohz_full_kick_all(void) { int cpu; if (!tick_nohz_full_running) return; preempt_disable(); for_each_cpu_and(cpu, tick_nohz_full_mask, cpu_online_mask) tick_nohz_full_kick_cpu(cpu); preempt_enable(); } static void tick_nohz_dep_set_all(atomic_t *dep, enum tick_dep_bits bit) { int prev; prev = atomic_fetch_or(BIT(bit), dep); if (!prev) tick_nohz_full_kick_all(); } /* * Set a global tick dependency. Used by perf events that rely on freq and * unstable clocks. */ void tick_nohz_dep_set(enum tick_dep_bits bit) { tick_nohz_dep_set_all(&tick_dep_mask, bit); } void tick_nohz_dep_clear(enum tick_dep_bits bit) { atomic_andnot(BIT(bit), &tick_dep_mask); } /* * Set per-CPU tick dependency. Used by scheduler and perf events in order to * manage event-throttling. 
*/ void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit) { int prev; struct tick_sched *ts; ts = per_cpu_ptr(&tick_cpu_sched, cpu); prev = atomic_fetch_or(BIT(bit), &ts->tick_dep_mask); if (!prev) { preempt_disable(); /* Perf needs local kick that is NMI safe */ if (cpu == smp_processor_id()) { tick_nohz_full_kick(); } else { /* Remote IRQ work not NMI-safe */ if (!WARN_ON_ONCE(in_nmi())) tick_nohz_full_kick_cpu(cpu); } preempt_enable(); } } EXPORT_SYMBOL_GPL(tick_nohz_dep_set_cpu); void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit) { struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu); atomic_andnot(BIT(bit), &ts->tick_dep_mask); } EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_cpu); /* * Set a per-task tick dependency. RCU needs this. Also posix CPU timers * in order to elapse per task timers. */ void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit) { if (!atomic_fetch_or(BIT(bit), &tsk->tick_dep_mask)) tick_nohz_kick_task(tsk); } EXPORT_SYMBOL_GPL(tick_nohz_dep_set_task); void tick_nohz_dep_clear_task(struct task_struct *tsk, enum tick_dep_bits bit) { atomic_andnot(BIT(bit), &tsk->tick_dep_mask); } EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_task); /* * Set a per-taskgroup tick dependency. Posix CPU timers need this in order to elapse * per process timers. */ void tick_nohz_dep_set_signal(struct task_struct *tsk, enum tick_dep_bits bit) { int prev; struct signal_struct *sig = tsk->signal; prev = atomic_fetch_or(BIT(bit), &sig->tick_dep_mask); if (!prev) { struct task_struct *t; lockdep_assert_held(&tsk->sighand->siglock); __for_each_thread(sig, t) tick_nohz_kick_task(t); } } void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bit) { atomic_andnot(BIT(bit), &sig->tick_dep_mask); } /* * Re-evaluate the need for the tick as we switch the current task. * It might need the tick due to per task/process properties: * perf events, posix CPU timers, ... 
*/ void __tick_nohz_task_switch(void) { struct tick_sched *ts; if (!tick_nohz_full_cpu(smp_processor_id())) return; ts = this_cpu_ptr(&tick_cpu_sched); if (tick_sched_flag_test(ts, TS_FLAG_STOPPED)) { if (atomic_read(¤t->tick_dep_mask) || atomic_read(¤t->signal->tick_dep_mask)) tick_nohz_full_kick(); } } /* Get the boot-time nohz CPU list from the kernel parameters. */ void __init tick_nohz_full_setup(cpumask_var_t cpumask) { alloc_bootmem_cpumask_var(&tick_nohz_full_mask); cpumask_copy(tick_nohz_full_mask, cpumask); tick_nohz_full_running = true; } bool tick_nohz_cpu_hotpluggable(unsigned int cpu) { /* * The 'tick_do_timer_cpu' CPU handles housekeeping duty (unbound * timers, workqueues, timekeeping, ...) on behalf of full dynticks * CPUs. It must remain online when nohz full is enabled. */ if (tick_nohz_full_running && READ_ONCE(tick_do_timer_cpu) == cpu) return false; return true; } static int tick_nohz_cpu_down(unsigned int cpu) { return tick_nohz_cpu_hotpluggable(cpu) ? 0 : -EBUSY; } void __init tick_nohz_init(void) { int cpu, ret; if (!tick_nohz_full_running) return; /* * Full dynticks uses IRQ work to drive the tick rescheduling on safe * locking contexts. But then we need IRQ work to raise its own * interrupts to avoid circular dependency on the tick. 
*/ if (!arch_irq_work_has_interrupt()) { pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support IRQ work self-IPIs\n"); cpumask_clear(tick_nohz_full_mask); tick_nohz_full_running = false; return; } if (IS_ENABLED(CONFIG_PM_SLEEP_SMP) && !IS_ENABLED(CONFIG_PM_SLEEP_SMP_NONZERO_CPU)) { cpu = smp_processor_id(); if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { pr_warn("NO_HZ: Clearing %d from nohz_full range " "for timekeeping\n", cpu); cpumask_clear_cpu(cpu, tick_nohz_full_mask); } } for_each_cpu(cpu, tick_nohz_full_mask) ct_cpu_track_user(cpu); ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "kernel/nohz:predown", NULL, tick_nohz_cpu_down); WARN_ON(ret < 0); pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n", cpumask_pr_args(tick_nohz_full_mask)); } #endif /* #ifdef CONFIG_NO_HZ_FULL */ /* * NOHZ - aka dynamic tick functionality */ #ifdef CONFIG_NO_HZ_COMMON /* * NO HZ enabled ? */ bool tick_nohz_enabled __read_mostly = true; unsigned long tick_nohz_active __read_mostly; /* * Enable / Disable tickless mode */ static int __init setup_tick_nohz(char *str) { return (kstrtobool(str, &tick_nohz_enabled) == 0); } __setup("nohz=", setup_tick_nohz); bool tick_nohz_tick_stopped(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); return tick_sched_flag_test(ts, TS_FLAG_STOPPED); } bool tick_nohz_tick_stopped_cpu(int cpu) { struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu); return tick_sched_flag_test(ts, TS_FLAG_STOPPED); } /** * tick_nohz_update_jiffies - update jiffies when idle was interrupted * @now: current ktime_t * * Called from interrupt entry when the CPU was idle * * In case the sched_tick was stopped on this CPU, we have to check if jiffies * must be updated. Otherwise an interrupt handler could use a stale jiffy * value. We do this unconditionally on any CPU, as we don't know whether the * CPU, which has the update task assigned, is in a long sleep. 
*/ static void tick_nohz_update_jiffies(ktime_t now) { unsigned long flags; __this_cpu_write(tick_cpu_sched.idle_waketime, now); local_irq_save(flags); tick_do_update_jiffies64(now); local_irq_restore(flags); touch_softlockup_watchdog_sched(); } /* * Close the idle period that started at ts->idle_entrytime: the elapsed time * is added to the iowait sleep time when nr_iowait_cpu() is non-zero for this * CPU, to the idle sleep time otherwise. TS_FLAG_IDLE_ACTIVE is cleared inside * the idle_sleeptime_seq write section. */ static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now) { ktime_t delta; if (WARN_ON_ONCE(!tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE))) return; delta = ktime_sub(now, ts->idle_entrytime); write_seqcount_begin(&ts->idle_sleeptime_seq); if (nr_iowait_cpu(smp_processor_id()) > 0) ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta); else ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); ts->idle_entrytime = now; tick_sched_flag_clear(ts, TS_FLAG_IDLE_ACTIVE); write_seqcount_end(&ts->idle_sleeptime_seq); sched_clock_idle_wakeup_event(); } /* * Open an idle period: record the entry time and set TS_FLAG_IDLE_ACTIVE * inside the idle_sleeptime_seq write section. */ static void tick_nohz_start_idle(struct tick_sched *ts) { write_seqcount_begin(&ts->idle_sleeptime_seq); ts->idle_entrytime = ktime_get(); tick_sched_flag_set(ts, TS_FLAG_IDLE_ACTIVE); write_seqcount_end(&ts->idle_sleeptime_seq); sched_clock_idle_sleep_event(); } /* * Read *sleeptime consistently against idle_sleeptime_seq writers. When * @compute_delta is true and TS_FLAG_IDLE_ACTIVE is set, the time elapsed * since ts->idle_entrytime is added. If @last_update_time is not NULL it * receives the sampling time in microseconds. Returns the result in * microseconds, or -1 (converted to u64) when tick_nohz_active is zero. */ static u64 get_cpu_sleep_time_us(struct tick_sched *ts, ktime_t *sleeptime, bool compute_delta, u64 *last_update_time) { ktime_t now, idle; unsigned int seq; if (!tick_nohz_active) return -1; now = ktime_get(); if (last_update_time) *last_update_time = ktime_to_us(now); /* Retry until no writer interfered with the snapshot */ do { seq = read_seqcount_begin(&ts->idle_sleeptime_seq); if (tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE) && compute_delta) { ktime_t delta = ktime_sub(now, ts->idle_entrytime); idle = ktime_add(*sleeptime, delta); } else { idle = *sleeptime; } } while (read_seqcount_retry(&ts->idle_sleeptime_seq, seq)); return ktime_to_us(idle); } /** * get_cpu_idle_time_us - get the total idle time of a CPU * @cpu: CPU number to query * @last_update_time: variable to store update time in. Do not update * counters if NULL. * * Return the cumulative idle time (since boot) for a given * CPU, in microseconds.
Note that this is partially broken due to * the counter of iowait tasks that can be remotely updated without * any synchronization. Therefore it is possible to observe backward * values within two consecutive reads. * * This time is measured via accounting rather than sampling, * and is as accurate as ktime_get() is. * * Return: -1 if NOHZ is not enabled, else total idle time of the @cpu */ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); /* The in-progress idle period is only added while no task on @cpu is in iowait */ return get_cpu_sleep_time_us(ts, &ts->idle_sleeptime, !nr_iowait_cpu(cpu), last_update_time); } EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); /** * get_cpu_iowait_time_us - get the total iowait time of a CPU * @cpu: CPU number to query * @last_update_time: variable to store update time in. Do not update * counters if NULL. * * Return the cumulative iowait time (since boot) for a given * CPU, in microseconds. Note this is partially broken due to * the counter of iowait tasks that can be remotely updated without * any synchronization. Therefore it is possible to observe backward * values within two consecutive reads. * * This time is measured via accounting rather than sampling, * and is as accurate as ktime_get() is.
* * Return: -1 if NOHZ is not enabled, else total iowait time of @cpu */ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); /* The in-progress idle period is only added while some task on @cpu is in iowait */ return get_cpu_sleep_time_us(ts, &ts->iowait_sleeptime, nr_iowait_cpu(cpu), last_update_time); } EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); /* * Re-arm the tick timer: cancel it, reset its expiry to ts->last_tick, forward * it past @now in TICK_NSEC steps and program it through the hrtimer or the * clockevent path depending on TS_FLAG_HIGHRES. */ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) { hrtimer_cancel(&ts->sched_timer); hrtimer_set_expires(&ts->sched_timer, ts->last_tick); /* Forward the time to expire in the future */ hrtimer_forward(&ts->sched_timer, now, TICK_NSEC); if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) { hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED_HARD); } else { tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); } /* * Reset to make sure the next tick stop doesn't get fooled by past * cached clock deadline. */ ts->next_tick = 0; } /* Test the TIMER_SOFTIRQ bit in the value returned by local_timers_pending() */ static inline bool local_timer_softirq_pending(void) { return local_timers_pending() & BIT(TIMER_SOFTIRQ); } /* * Read jiffies and the time when jiffies were updated last */ u64 get_jiffies_update(unsigned long *basej) { unsigned long basejiff; unsigned int seq; u64 basemono; do { seq = read_seqcount_begin(&jiffies_seq); basemono = last_jiffies_update; basejiff = jiffies; } while (read_seqcount_retry(&jiffies_seq, seq)); *basej = basejiff; return basemono; } /** * tick_nohz_next_event() - return the clock monotonic based next event * @ts: pointer to tick_sched struct * @cpu: CPU number * * Return: * *%0 - When the next event is a maximum of TICK_NSEC in the future * and the tick is not stopped yet * *%next_event - Next event based on clock monotonic */ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) { u64 basemono, next_tick, delta, expires; unsigned long basejiff; int tick_cpu; basemono = get_jiffies_update(&basejiff); ts->last_jiffies = basejiff; ts->timer_expires_base = basemono; /* * Keep the periodic tick, when RCU, architecture or irq_work * requests it.
* Aside of that, check whether the local timer softirq is * pending. If so, its a bad idea to call get_next_timer_interrupt(), * because there is an already expired timer, so it will request * immediate expiry, which rearms the hardware timer with a * minimal delta, which brings us back to this place * immediately. Lather, rinse and repeat... */ if (rcu_needs_cpu() || arch_needs_cpu() || irq_work_needs_cpu() || local_timer_softirq_pending()) { next_tick = basemono + TICK_NSEC; } else { /* * Get the next pending timer. If high resolution * timers are enabled this only takes the timer wheel * timers into account. If high resolution timers are * disabled this also looks at the next expiring * hrtimer. */ next_tick = get_next_timer_interrupt(basejiff, basemono); ts->next_timer = next_tick; } /* Make sure next_tick is never before basemono! */ if (WARN_ON_ONCE(basemono > next_tick)) next_tick = basemono; /* * If the tick is due in the next period, keep it ticking or * force prod the timer. */ delta = next_tick - basemono; if (delta <= (u64)TICK_NSEC) { /* * We've not stopped the tick yet, and there's a timer in the * next period, so no point in stopping it either, bail. */ if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED)) { ts->timer_expires = 0; goto out; } } /* * If this CPU is the one which had the do_timer() duty last, we limit * the sleep time to the timekeeping 'max_deferment' value. * Otherwise we can sleep as long as we want.
*/ delta = timekeeping_max_deferment(); tick_cpu = READ_ONCE(tick_do_timer_cpu); if (tick_cpu != cpu && (tick_cpu != TICK_DO_TIMER_NONE || !tick_sched_flag_test(ts, TS_FLAG_DO_TIMER_LAST))) delta = KTIME_MAX; /* Calculate the next expiry time */ if (delta < (KTIME_MAX - basemono)) expires = basemono + delta; else expires = KTIME_MAX; ts->timer_expires = min_t(u64, expires, next_tick); out: return ts->timer_expires; } /* * Stop the tick, or reprogram an already stopped tick, on this CPU. Works from * the values cached in @ts by tick_nohz_next_event() (last_jiffies, * timer_expires_base, timer_expires) and consumes timer_expires_base. */ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) { struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); unsigned long basejiff = ts->last_jiffies; u64 basemono = ts->timer_expires_base; bool timer_idle = tick_sched_flag_test(ts, TS_FLAG_STOPPED); int tick_cpu; u64 expires; /* Make sure we won't be trying to stop it twice in a row. */ ts->timer_expires_base = 0; /* * Now the tick should be stopped definitely - so the timer base needs * to be marked idle as well to not miss a newly queued timer. */ expires = timer_base_try_to_set_idle(basejiff, basemono, &timer_idle); if (expires > ts->timer_expires) { /* * This path could only happen when the first timer was removed * between calculating the possible sleep length and now (when * high resolution mode is not active, timer could also be a * hrtimer). * * We have to stick to the original calculated expiry value to * not stop the tick for too long with a shallow C-state (which * was programmed by cpuidle because of an early next expiration * value). */ expires = ts->timer_expires; } /* If the timer base is not idle, retain the not yet stopped tick. */ if (!timer_idle) return; /* * If this CPU is the one which updates jiffies, then give up * the assignment and let it be taken by the CPU which runs * the tick timer next, which might be this CPU as well. If we * don't drop this here, the jiffies might be stale and * do_timer() never gets invoked. Keep track of the fact that it * was the one which had the do_timer() duty last.
*/ tick_cpu = READ_ONCE(tick_do_timer_cpu); if (tick_cpu == cpu) { WRITE_ONCE(tick_do_timer_cpu, TICK_DO_TIMER_NONE); tick_sched_flag_set(ts, TS_FLAG_DO_TIMER_LAST); } else if (tick_cpu != TICK_DO_TIMER_NONE) { tick_sched_flag_clear(ts, TS_FLAG_DO_TIMER_LAST); } /* Skip reprogram of event if it's not changed */ if (tick_sched_flag_test(ts, TS_FLAG_STOPPED) && (expires == ts->next_tick)) { /* Sanity check: make sure clockevent is actually programmed */ if (expires == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer)) return; WARN_ONCE(1, "basemono: %llu ts->next_tick: %llu dev->next_event: %llu " "timer->active: %d timer->expires: %llu\n", basemono, ts->next_tick, dev->next_event, hrtimer_active(&ts->sched_timer), hrtimer_get_expires(&ts->sched_timer)); } /* * tick_nohz_stop_tick() can be called several times before * tick_nohz_restart_sched_tick() is called. This happens when * interrupts arrive which do not cause a reschedule. In the first * call we save the current tick time, so we can restart the * scheduler tick in tick_nohz_restart_sched_tick(). */ if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED)) { calc_load_nohz_start(); quiet_vmstat(); ts->last_tick = hrtimer_get_expires(&ts->sched_timer); tick_sched_flag_set(ts, TS_FLAG_STOPPED); trace_tick_stop(1, TICK_DEP_MASK_NONE); } ts->next_tick = expires; /* * If the expiration time == KTIME_MAX, then we simply stop * the tick timer.
*/ if (unlikely(expires == KTIME_MAX)) { if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) hrtimer_cancel(&ts->sched_timer); else tick_program_event(KTIME_MAX, 1); return; } if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) { hrtimer_start(&ts->sched_timer, expires, HRTIMER_MODE_ABS_PINNED_HARD); } else { hrtimer_set_expires(&ts->sched_timer, expires); tick_program_event(expires, 1); } } /* Keep the tick running: only invalidate the cached expiry base */ static void tick_nohz_retain_tick(struct tick_sched *ts) { ts->timer_expires_base = 0; } #ifdef CONFIG_NO_HZ_FULL /* Stop the tick if tick_nohz_next_event() returns non-zero, retain it otherwise */ static void tick_nohz_full_stop_tick(struct tick_sched *ts, int cpu) { if (tick_nohz_next_event(ts, cpu)) tick_nohz_stop_tick(ts, cpu); else tick_nohz_retain_tick(ts); } #endif /* CONFIG_NO_HZ_FULL */ /* * Leave the stopped-tick state: update jiffies, clear TS_FLAG_STOPPED and * re-arm the tick timer via tick_nohz_restart(). */ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) { /* Update jiffies first */ tick_do_update_jiffies64(now); /* * Clear the timer idle flag, so we avoid IPIs on remote queueing and * the clock forward checks in the enqueue path: */ timer_clear_idle(); calc_load_nohz_stop(); touch_softlockup_watchdog_sched(); /* Cancel the scheduled timer and restore the tick: */ tick_sched_flag_clear(ts, TS_FLAG_STOPPED); tick_nohz_restart(ts, now); } /* * With CONFIG_NO_HZ_FULL: stop the tick when can_stop_full_tick() allows it, * otherwise restart it if it is currently stopped. Empty without that option. */ static void __tick_nohz_full_update_tick(struct tick_sched *ts, ktime_t now) { #ifdef CONFIG_NO_HZ_FULL int cpu = smp_processor_id(); if (can_stop_full_tick(cpu, ts)) tick_nohz_full_stop_tick(ts, cpu); else if (tick_sched_flag_test(ts, TS_FLAG_STOPPED)) tick_nohz_restart_sched_tick(ts, now); #endif } /* Re-evaluate the tick on a nohz_full CPU that has TS_FLAG_NOHZ set; no-op otherwise */ static void tick_nohz_full_update_tick(struct tick_sched *ts) { if (!tick_nohz_full_cpu(smp_processor_id())) return; if (!tick_sched_flag_test(ts, TS_FLAG_NOHZ)) return; __tick_nohz_full_update_tick(ts, ktime_get()); } /* * A pending softirq outside an IRQ (or softirq disabled section) context * should be waiting for ksoftirqd to handle it. Therefore we shouldn't * reach this code due to the need_resched() early check in can_stop_idle_tick().
* * However if we are between CPUHP_AP_SMPBOOT_THREADS and CPU_TEARDOWN_CPU on the * cpu_down() process, softirqs can still be raised while ksoftirqd is parked, * triggering the code below, since wakep_softirqd() is ignored. * */ static bool report_idle_softirq(void) { static int ratelimit; unsigned int pending = local_softirq_pending(); if (likely(!pending)) return false; /* Some softirqs claim to be safe against hotplug and ksoftirqd parking */ if (!cpu_active(smp_processor_id())) { pending &= ~SOFTIRQ_HOTPLUG_SAFE_MASK; if (!pending) return false; } /* On RT, softirq handling may be waiting on some lock */ if (local_bh_blocked()) return false; /* Emit the warning at most ten times */ if (ratelimit < 10) { pr_warn("NOHZ tick-stop error: local softirq work is pending, handler #%02x!!!\n", pending); ratelimit++; } return true; } /* * Check whether the idle path may stop the tick on @cpu: TS_FLAG_NOHZ must be * set, no reschedule and no reportable softirq may be pending, and with nohz * full enabled @cpu must not be the 'tick_do_timer_cpu' CPU. */ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) { WARN_ON_ONCE(cpu_is_offline(cpu)); if (unlikely(!tick_sched_flag_test(ts, TS_FLAG_NOHZ))) return false; if (need_resched()) return false; if (unlikely(report_idle_softirq())) return false; if (tick_nohz_full_enabled()) { int tick_cpu = READ_ONCE(tick_do_timer_cpu); /* * Keep the tick alive to guarantee timekeeping progression * if there are full dynticks CPUs around */ if (tick_cpu == cpu) return false; /* Should not happen for nohz-full */ if (WARN_ON_ONCE(tick_cpu == TICK_DO_TIMER_NONE)) return false; } return true; } /** * tick_nohz_idle_stop_tick - stop the idle tick from the idle task * * When the next event is more than a tick into the future, stop the idle tick */ void tick_nohz_idle_stop_tick(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); int cpu = smp_processor_id(); ktime_t expires; /* * If tick_nohz_get_sleep_length() ran tick_nohz_next_event(), the * tick timer expiration time is known already.
*/ if (ts->timer_expires_base) expires = ts->timer_expires; else if (can_stop_idle_tick(cpu, ts)) expires = tick_nohz_next_event(ts, cpu); else return; ts->idle_calls++; if (expires > 0LL) { int was_stopped = tick_sched_flag_test(ts, TS_FLAG_STOPPED); tick_nohz_stop_tick(ts, cpu); ts->idle_sleeps++; ts->idle_expires = expires; /* Only on the transition from running to stopped tick */ if (!was_stopped && tick_sched_flag_test(ts, TS_FLAG_STOPPED)) { ts->idle_jiffies = ts->last_jiffies; nohz_balance_enter_idle(cpu); } } else { tick_nohz_retain_tick(ts); } } /* Retain the tick of the current CPU: drops its cached expiry base */ void tick_nohz_idle_retain_tick(void) { tick_nohz_retain_tick(this_cpu_ptr(&tick_cpu_sched)); } /** * tick_nohz_idle_enter - prepare for entering idle on the current CPU * * Called when we start the idle loop. */ void tick_nohz_idle_enter(void) { struct tick_sched *ts; lockdep_assert_irqs_enabled(); local_irq_disable(); ts = this_cpu_ptr(&tick_cpu_sched); WARN_ON_ONCE(ts->timer_expires_base); tick_sched_flag_set(ts, TS_FLAG_INIDLE); tick_nohz_start_idle(ts); local_irq_enable(); } /** * tick_nohz_irq_exit - Notify the tick about IRQ exit * * A timer may have been added/modified/deleted either by the current IRQ, * or by another place using this IRQ as a notification. This IRQ may have * also updated the RCU callback list. These events may require a * re-evaluation of the next tick. Depending on the context: * * 1) If the CPU is idle and no resched is pending, just proceed with idle * time accounting. The next tick will be re-evaluated on the next idle * loop iteration. * * 2) If the CPU is nohz_full: * * 2.1) If there is any tick dependency, restart the tick if stopped. * * 2.2) If there is no tick dependency, (re-)evaluate the next tick and * stop/update it accordingly.
*/ void tick_nohz_irq_exit(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (tick_sched_flag_test(ts, TS_FLAG_INIDLE)) tick_nohz_start_idle(ts); else tick_nohz_full_update_tick(ts); } /** * tick_nohz_idle_got_tick - Check whether or not the tick handler has run * * Return: %true if the tick handler has run, otherwise %false */ bool tick_nohz_idle_got_tick(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); /* Reading the indicator also resets it */ if (ts->got_idle_tick) { ts->got_idle_tick = 0; return true; } return false; } /** * tick_nohz_get_next_hrtimer - return the next expiration time for the hrtimer * or the tick, whichever expires first. Note that, if the tick has been * stopped, it returns the next hrtimer. * * Called from power state control code with interrupts disabled * * Return: the next expiration time */ ktime_t tick_nohz_get_next_hrtimer(void) { return __this_cpu_read(tick_cpu_device.evtdev)->next_event; } /** * tick_nohz_get_sleep_length - return the expected length of the current sleep * @delta_next: duration until the next event if the tick cannot be stopped * * Called from power state control code with interrupts disabled. * * The return value of this function and/or the value returned by it through the * @delta_next pointer can be negative which must be taken into account by its * callers. * * Return: the expected length of the current sleep */ ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next) { struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); int cpu = smp_processor_id(); /* * The idle entry time is expected to be a sufficient approximation of * the current time at this point.
*/ ktime_t now = ts->idle_entrytime; ktime_t next_event; WARN_ON_ONCE(!tick_sched_flag_test(ts, TS_FLAG_INIDLE)); *delta_next = ktime_sub(dev->next_event, now); if (!can_stop_idle_tick(cpu, ts)) return *delta_next; next_event = tick_nohz_next_event(ts, cpu); if (!next_event) return *delta_next; /* * If the next highres timer to expire is earlier than 'next_event', the * idle governor needs to know that. */ next_event = min_t(u64, next_event, hrtimer_next_event_without(&ts->sched_timer)); return ktime_sub(next_event, now); } /** * tick_nohz_get_idle_calls_cpu - return the current idle calls counter value * for a particular CPU. * @cpu: target CPU number * * Called from the schedutil frequency scaling governor in scheduler context. * * Return: the current idle calls counter value for @cpu */ unsigned long tick_nohz_get_idle_calls_cpu(int cpu) { struct tick_sched *ts = tick_get_tick_sched(cpu); return ts->idle_calls; } /* * Record the idle exit time and, unless vtime accounting is enabled on this * CPU, account the jiffies elapsed since ts->idle_jiffies as idle ticks. */ static void tick_nohz_account_idle_time(struct tick_sched *ts, ktime_t now) { unsigned long ticks; ts->idle_exittime = now; if (vtime_accounting_enabled_this_cpu()) return; /* * We stopped the tick in idle. update_process_times() would miss the * time we slept, as it does only a 1 tick accounting. * Enforce that this is accounted to idle ! */ ticks = jiffies - ts->idle_jiffies; /* * We might be one off. Do not randomly account a huge number of ticks!
*/ if (ticks && ticks < LONG_MAX) account_idle_ticks(ticks); } /* Restart the tick of the current CPU if it is stopped and account the idle time */ void tick_nohz_idle_restart_tick(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (tick_sched_flag_test(ts, TS_FLAG_STOPPED)) { ktime_t now = ktime_get(); tick_nohz_restart_sched_tick(ts, now); tick_nohz_account_idle_time(ts, now); } } /* * Idle exit with a stopped tick: nohz_full CPUs re-evaluate the tick, all * other CPUs restart it. The idle time is accounted in both cases. */ static void tick_nohz_idle_update_tick(struct tick_sched *ts, ktime_t now) { if (tick_nohz_full_cpu(smp_processor_id())) __tick_nohz_full_update_tick(ts, now); else tick_nohz_restart_sched_tick(ts, now); tick_nohz_account_idle_time(ts, now); } /** * tick_nohz_idle_exit - Update the tick upon idle task exit * * When the idle task exits, update the tick depending on the * following situations: * * 1) If the CPU is not in nohz_full mode (most cases), then * restart the tick. * * 2) If the CPU is in nohz_full mode (corner case): * 2.1) If the tick can be kept stopped (no tick dependencies) * then re-evaluate the next tick and try to keep it stopped * as long as possible. * 2.2) If the tick has dependencies, restart the tick. * */ void tick_nohz_idle_exit(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); bool idle_active, tick_stopped; ktime_t now; local_irq_disable(); WARN_ON_ONCE(!tick_sched_flag_test(ts, TS_FLAG_INIDLE)); WARN_ON_ONCE(ts->timer_expires_base); tick_sched_flag_clear(ts, TS_FLAG_INIDLE); idle_active = tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE); tick_stopped = tick_sched_flag_test(ts, TS_FLAG_STOPPED); /* 'now' is only read on the paths guarded by the same two flags */ if (idle_active || tick_stopped) now = ktime_get(); if (idle_active) tick_nohz_stop_idle(ts, now); if (tick_stopped) tick_nohz_idle_update_tick(ts, now); local_irq_enable(); } /* * In low-resolution mode, the tick handler must be implemented directly * at the clockevent level. hrtimer can't be used instead, because its * infrastructure actually relies on the tick itself as a backend in * low-resolution mode (see hrtimer_run_queues()).
*/ static void tick_nohz_lowres_handler(struct clock_event_device *dev) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); dev->next_event = KTIME_MAX; /* Reprogram the clockevent only when the handler asks for a restart */ if (likely(tick_nohz_handler(&ts->sched_timer) == HRTIMER_RESTART)) tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); } /* * Set TS_FLAG_NOHZ on @ts when NOHZ is enabled; timers_update_nohz() is * called only by the first caller that sets bit 0 of tick_nohz_active. */ static inline void tick_nohz_activate(struct tick_sched *ts) { if (!tick_nohz_enabled) return; tick_sched_flag_set(ts, TS_FLAG_NOHZ); /* One update is enough */ if (!test_and_set_bit(0, &tick_nohz_active)) timers_update_nohz(); } /** * tick_nohz_switch_to_nohz - switch to NOHZ mode */ static void tick_nohz_switch_to_nohz(void) { if (!tick_nohz_enabled) return; if (tick_switch_to_oneshot(tick_nohz_lowres_handler)) return; /* * Recycle the hrtimer in 'ts', so we can share the * highres code. */ tick_setup_sched_timer(false); } /* * IRQ entry hook: returns early unless the tick is stopped or idle accounting * is active; otherwise closes the idle period and refreshes jiffies as needed. */ static inline void tick_nohz_irq_enter(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); ktime_t now; if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED | TS_FLAG_IDLE_ACTIVE)) return; now = ktime_get(); if (tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE)) tick_nohz_stop_idle(ts, now); /* * If all CPUs are idle we may need to update a stale jiffies value. * Note nohz_full is a special case: a timekeeper is guaranteed to stay * alive but it might be busy looping with interrupts disabled in some * rare case (typically stop machine). So we must make sure we have a * last resort.
*/ if (tick_sched_flag_test(ts, TS_FLAG_STOPPED)) tick_nohz_update_jiffies(now); } #else /* Empty stubs used when CONFIG_NO_HZ_COMMON is not set */ static inline void tick_nohz_switch_to_nohz(void) { } static inline void tick_nohz_irq_enter(void) { } static inline void tick_nohz_activate(struct tick_sched *ts) { } #endif /* CONFIG_NO_HZ_COMMON */ /* * Called from irq_enter() to notify about the possible interruption of idle() */ void tick_irq_enter(void) { tick_check_oneshot_broadcast_this_cpu(); tick_nohz_irq_enter(); } static int sched_skew_tick; /* Parse the "skew_tick" early parameter into sched_skew_tick */ static int __init skew_tick(char *str) { get_option(&str, &sched_skew_tick); return 0; } early_param("skew_tick", skew_tick); /** * tick_setup_sched_timer - setup the tick emulation timer * @hrtimer: whether to use the hrtimer or not */ void tick_setup_sched_timer(bool hrtimer) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); /* Emulate tick processing via per-CPU hrtimers: */ hrtimer_setup(&ts->sched_timer, tick_nohz_handler, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && hrtimer) tick_sched_flag_set(ts, TS_FLAG_HIGHRES); /* Get the next period (per-CPU) */ hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); /* Offset the tick to avert 'jiffies_lock' contention. */ if (sched_skew_tick) { /* Spread the CPUs evenly over the first half of a tick period */ u64 offset = TICK_NSEC >> 1; do_div(offset, num_possible_cpus()); offset *= smp_processor_id(); hrtimer_add_expires_ns(&ts->sched_timer, offset); } hrtimer_forward_now(&ts->sched_timer, TICK_NSEC); if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && hrtimer) hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED_HARD); else tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); tick_nohz_activate(ts); } /* * Shut down the tick and make sure the CPU won't try to retake the timekeeping * duty before disabling IRQs in idle for the last time.
*/ void tick_sched_timer_dying(int cpu) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); ktime_t idle_sleeptime, iowait_sleeptime; unsigned long idle_calls, idle_sleeps; /* This must happen before hrtimers are migrated! */ if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) hrtimer_cancel(&ts->sched_timer); idle_sleeptime = ts->idle_sleeptime; iowait_sleeptime = ts->iowait_sleeptime; idle_calls = ts->idle_calls; idle_sleeps = ts->idle_sleeps; memset(ts, 0, sizeof(*ts)); ts->idle_sleeptime = idle_sleeptime; ts->iowait_sleeptime = iowait_sleeptime; ts->idle_calls = idle_calls; ts->idle_sleeps = idle_sleeps; } /* * Async notification about clocksource changes */ void tick_clock_notify(void) { int cpu; for_each_possible_cpu(cpu) set_bit(0, &per_cpu(tick_cpu_sched, cpu).check_clocks); } /* * Async notification about clock event changes */ void tick_oneshot_notify(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); set_bit(0, &ts->check_clocks); } /* * Check if a change happened, which makes oneshot possible. * * Called cyclically from the hrtimer softirq (driven by the timer * softirq). 'allow_nohz' signals that we can switch into low-res NOHZ * mode, because high resolution timers are disabled (either compile * or runtime). Called with interrupts disabled. */ int tick_check_oneshot_change(int allow_nohz) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (!test_and_clear_bit(0, &ts->check_clocks)) return 0; if (tick_sched_flag_test(ts, TS_FLAG_NOHZ)) return 0; if (!timekeeping_valid_for_hres() || !tick_is_oneshot_available()) return 0; if (!allow_nohz) return 1; tick_nohz_switch_to_nohz(); return 0; } |
| 8 1 6 1 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 | // SPDX-License-Identifier: GPL-2.0-only /* * (C) 2013 Astaro GmbH & Co KG */ #include <linux/module.h> #include <linux/skbuff.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_labels.h> #include <linux/netfilter/x_tables.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Florian Westphal <fw@strlen.de>"); MODULE_DESCRIPTION("Xtables: add/match connection tracking labels"); MODULE_ALIAS("ipt_connlabel"); MODULE_ALIAS("ip6t_connlabel"); static bool connlabel_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_connlabel_mtinfo *info = par->matchinfo; enum ip_conntrack_info ctinfo; struct nf_conn_labels *labels; struct nf_conn *ct; bool invert = info->options & XT_CONNLABEL_OP_INVERT; ct = nf_ct_get(skb, &ctinfo); if (ct == NULL) return invert; labels = nf_ct_labels_find(ct); if (!labels) return invert; if (test_bit(info->bit, labels->bits)) return !invert; if (info->options & XT_CONNLABEL_OP_SET) { if (!test_and_set_bit(info->bit, labels->bits)) nf_conntrack_event_cache(IPCT_LABEL, ct); return !invert; } return invert; } static int connlabel_mt_check(const struct xt_mtchk_param *par) { const int options = XT_CONNLABEL_OP_INVERT | XT_CONNLABEL_OP_SET; struct xt_connlabel_mtinfo *info = par->matchinfo; int ret; if (info->options & ~options) { pr_info_ratelimited("Unknown options in mask %x\n", info->options); return -EINVAL; } ret = nf_ct_netns_get(par->net, par->family); if (ret < 0) { pr_info_ratelimited("cannot load conntrack support for proto=%u\n", par->family); return ret; } ret = nf_connlabels_get(par->net, info->bit); if (ret < 0) nf_ct_netns_put(par->net, 
par->family); return ret; } static void connlabel_mt_destroy(const struct xt_mtdtor_param *par) { nf_connlabels_put(par->net); nf_ct_netns_put(par->net, par->family); } static struct xt_match connlabels_mt_reg __read_mostly = { .name = "connlabel", .family = NFPROTO_UNSPEC, .checkentry = connlabel_mt_check, .match = connlabel_mt, .matchsize = sizeof(struct xt_connlabel_mtinfo), .destroy = connlabel_mt_destroy, .me = THIS_MODULE, }; static int __init connlabel_mt_init(void) { return xt_register_match(&connlabels_mt_reg); } static void __exit connlabel_mt_exit(void) { xt_unregister_match(&connlabels_mt_reg); } module_init(connlabel_mt_init); module_exit(connlabel_mt_exit); |
| 1031 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 | // SPDX-License-Identifier: GPL-2.0 struct io_tctx_node { struct list_head ctx_node; struct task_struct *task; struct io_ring_ctx *ctx; }; int io_uring_alloc_task_context(struct task_struct *task, struct io_ring_ctx *ctx); void io_uring_del_tctx_node(unsigned long index); int __io_uring_add_tctx_node(struct io_ring_ctx *ctx); int __io_uring_add_tctx_node_from_submit(struct io_ring_ctx *ctx); void io_uring_clean_tctx(struct io_uring_task *tctx); void io_uring_unreg_ringfd(void); int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg, unsigned nr_args); int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg, unsigned nr_args); /* * Note that this task has used io_uring. We use it for cancelation purposes. */ static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx) { struct io_uring_task *tctx = current->io_uring; if (likely(tctx && tctx->last == ctx)) return 0; return __io_uring_add_tctx_node_from_submit(ctx); } |
| 4010 3031 4014 4009 4010 2616 3996 4010 4020 4009 3023 3026 3031 2393 3409 3416 3031 2798 3018 3016 3035 2390 2391 2396 2387 2392 2658 2813 2815 3 147 2850 2675 2768 1 1 1 1 1 1 1 2861 2774 2859 2679 2836 2854 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 | // SPDX-License-Identifier: GPL-2.0-only /* * klist.c - Routines for manipulating klists. 
* * Copyright (C) 2005 Patrick Mochel * * This klist interface provides a couple of structures that wrap around * struct list_head to provide explicit list "head" (struct klist) and list * "node" (struct klist_node) objects. For struct klist, a spinlock is * included that protects access to the actual list itself. struct * klist_node provides a pointer to the klist that owns it and a kref * reference count that indicates the number of current users of that node * in the list. * * The entire point is to provide an interface for iterating over a list * that is safe and allows for modification of the list during the * iteration (e.g. insertion and removal), including modification of the * current node on the list. * * It works using a 3rd object type - struct klist_iter - that is declared * and initialized before an iteration. klist_next() is used to acquire the * next element in the list. It returns NULL if there are no more items. * Internally, that routine takes the klist's lock, decrements the * reference count of the previous klist_node and increments the count of * the next klist_node. It then drops the lock and returns. * * There are primitives for adding and removing nodes to/from a klist. * When deleting, klist_del() will simply decrement the reference count. * Only when the count goes to 0 is the node removed from the list. * klist_remove() will try to delete the node from the list and block until * it is actually removed. This is useful for objects (like devices) that * have been removed from the system and must be freed (but must wait until * all accessors have finished). */ #include <linux/klist.h> #include <linux/export.h> #include <linux/sched.h> /* * Use the lowest bit of n_klist to mark deleted nodes and exclude * dead ones from iteration. 
*/ #define KNODE_DEAD 1LU #define KNODE_KLIST_MASK ~KNODE_DEAD static struct klist *knode_klist(struct klist_node *knode) { return (struct klist *) ((unsigned long)knode->n_klist & KNODE_KLIST_MASK); } static bool knode_dead(struct klist_node *knode) { return (unsigned long)knode->n_klist & KNODE_DEAD; } static void knode_set_klist(struct klist_node *knode, struct klist *klist) { knode->n_klist = klist; /* no knode deserves to start its life dead */ WARN_ON(knode_dead(knode)); } static void knode_kill(struct klist_node *knode) { /* and no knode should die twice ever either, see we're very humane */ WARN_ON(knode_dead(knode)); *(unsigned long *)&knode->n_klist |= KNODE_DEAD; } /** * klist_init - Initialize a klist structure. * @k: The klist we're initializing. * @get: The get function for the embedding object (NULL if none) * @put: The put function for the embedding object (NULL if none) * * Initialises the klist structure. If the klist_node structures are * going to be embedded in refcounted objects (necessary for safe * deletion) then the get/put arguments are used to initialise * functions that take and release references on the embedding * objects. */ void klist_init(struct klist *k, void (*get)(struct klist_node *), void (*put)(struct klist_node *)) { INIT_LIST_HEAD(&k->k_list); spin_lock_init(&k->k_lock); k->get = get; k->put = put; } EXPORT_SYMBOL_GPL(klist_init); static void add_head(struct klist *k, struct klist_node *n) { spin_lock(&k->k_lock); list_add(&n->n_node, &k->k_list); spin_unlock(&k->k_lock); } static void add_tail(struct klist *k, struct klist_node *n) { spin_lock(&k->k_lock); list_add_tail(&n->n_node, &k->k_list); spin_unlock(&k->k_lock); } static void klist_node_init(struct klist *k, struct klist_node *n) { INIT_LIST_HEAD(&n->n_node); kref_init(&n->n_ref); knode_set_klist(n, k); if (k->get) k->get(n); } /** * klist_add_head - Initialize a klist_node and add it to front. * @n: node we're adding. * @k: klist it's going on. 
*/ void klist_add_head(struct klist_node *n, struct klist *k) { klist_node_init(k, n); add_head(k, n); } EXPORT_SYMBOL_GPL(klist_add_head); /** * klist_add_tail - Initialize a klist_node and add it to back. * @n: node we're adding. * @k: klist it's going on. */ void klist_add_tail(struct klist_node *n, struct klist *k) { klist_node_init(k, n); add_tail(k, n); } EXPORT_SYMBOL_GPL(klist_add_tail); /** * klist_add_behind - Init a klist_node and add it after an existing node * @n: node we're adding. * @pos: node to put @n after */ void klist_add_behind(struct klist_node *n, struct klist_node *pos) { struct klist *k = knode_klist(pos); klist_node_init(k, n); spin_lock(&k->k_lock); list_add(&n->n_node, &pos->n_node); spin_unlock(&k->k_lock); } EXPORT_SYMBOL_GPL(klist_add_behind); /** * klist_add_before - Init a klist_node and add it before an existing node * @n: node we're adding. * @pos: node to put @n after */ void klist_add_before(struct klist_node *n, struct klist_node *pos) { struct klist *k = knode_klist(pos); klist_node_init(k, n); spin_lock(&k->k_lock); list_add_tail(&n->n_node, &pos->n_node); spin_unlock(&k->k_lock); } EXPORT_SYMBOL_GPL(klist_add_before); struct klist_waiter { struct list_head list; struct klist_node *node; struct task_struct *process; int woken; }; static DEFINE_SPINLOCK(klist_remove_lock); static LIST_HEAD(klist_remove_waiters); static void klist_release(struct kref *kref) { struct klist_waiter *waiter, *tmp; struct klist_node *n = container_of(kref, struct klist_node, n_ref); WARN_ON(!knode_dead(n)); list_del(&n->n_node); spin_lock(&klist_remove_lock); list_for_each_entry_safe(waiter, tmp, &klist_remove_waiters, list) { if (waiter->node != n) continue; list_del(&waiter->list); waiter->woken = 1; mb(); wake_up_process(waiter->process); } spin_unlock(&klist_remove_lock); knode_set_klist(n, NULL); } static int klist_dec_and_del(struct klist_node *n) { return kref_put(&n->n_ref, klist_release); } static void klist_put(struct klist_node *n, bool 
kill) { struct klist *k = knode_klist(n); void (*put)(struct klist_node *) = k->put; spin_lock(&k->k_lock); if (kill) knode_kill(n); if (!klist_dec_and_del(n)) put = NULL; spin_unlock(&k->k_lock); if (put) put(n); } /** * klist_del - Decrement the reference count of node and try to remove. * @n: node we're deleting. */ void klist_del(struct klist_node *n) { klist_put(n, true); } EXPORT_SYMBOL_GPL(klist_del); /** * klist_remove - Decrement the refcount of node and wait for it to go away. * @n: node we're removing. */ void klist_remove(struct klist_node *n) { struct klist_waiter waiter; waiter.node = n; waiter.process = current; waiter.woken = 0; spin_lock(&klist_remove_lock); list_add(&waiter.list, &klist_remove_waiters); spin_unlock(&klist_remove_lock); klist_del(n); for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); if (waiter.woken) break; schedule(); } __set_current_state(TASK_RUNNING); } EXPORT_SYMBOL_GPL(klist_remove); /** * klist_node_attached - Say whether a node is bound to a list or not. * @n: Node that we're testing. */ int klist_node_attached(struct klist_node *n) { return (n->n_klist != NULL); } EXPORT_SYMBOL_GPL(klist_node_attached); /** * klist_iter_init_node - Initialize a klist_iter structure. * @k: klist we're iterating. * @i: klist_iter we're filling. * @n: node to start with. * * Similar to klist_iter_init(), but starts the action off with @n, * instead of with the list head. */ void klist_iter_init_node(struct klist *k, struct klist_iter *i, struct klist_node *n) { i->i_klist = k; i->i_cur = NULL; if (n && kref_get_unless_zero(&n->n_ref)) i->i_cur = n; } EXPORT_SYMBOL_GPL(klist_iter_init_node); /** * klist_iter_init - Iniitalize a klist_iter structure. * @k: klist we're iterating. * @i: klist_iter structure we're filling. * * Similar to klist_iter_init_node(), but start with the list head. 
*/ void klist_iter_init(struct klist *k, struct klist_iter *i) { klist_iter_init_node(k, i, NULL); } EXPORT_SYMBOL_GPL(klist_iter_init); /** * klist_iter_exit - Finish a list iteration. * @i: Iterator structure. * * Must be called when done iterating over list, as it decrements the * refcount of the current node. Necessary in case iteration exited before * the end of the list was reached, and always good form. */ void klist_iter_exit(struct klist_iter *i) { if (i->i_cur) { klist_put(i->i_cur, false); i->i_cur = NULL; } } EXPORT_SYMBOL_GPL(klist_iter_exit); static struct klist_node *to_klist_node(struct list_head *n) { return container_of(n, struct klist_node, n_node); } /** * klist_prev - Ante up prev node in list. * @i: Iterator structure. * * First grab list lock. Decrement the reference count of the previous * node, if there was one. Grab the prev node, increment its reference * count, drop the lock, and return that prev node. */ struct klist_node *klist_prev(struct klist_iter *i) { void (*put)(struct klist_node *) = i->i_klist->put; struct klist_node *last = i->i_cur; struct klist_node *prev; unsigned long flags; spin_lock_irqsave(&i->i_klist->k_lock, flags); if (last) { prev = to_klist_node(last->n_node.prev); if (!klist_dec_and_del(last)) put = NULL; } else prev = to_klist_node(i->i_klist->k_list.prev); i->i_cur = NULL; while (prev != to_klist_node(&i->i_klist->k_list)) { if (likely(!knode_dead(prev))) { kref_get(&prev->n_ref); i->i_cur = prev; break; } prev = to_klist_node(prev->n_node.prev); } spin_unlock_irqrestore(&i->i_klist->k_lock, flags); if (put && last) put(last); return i->i_cur; } EXPORT_SYMBOL_GPL(klist_prev); /** * klist_next - Ante up next node in list. * @i: Iterator structure. * * First grab list lock. Decrement the reference count of the previous * node, if there was one. Grab the next node, increment its reference * count, drop the lock, and return that next node. 
*/ struct klist_node *klist_next(struct klist_iter *i) { void (*put)(struct klist_node *) = i->i_klist->put; struct klist_node *last = i->i_cur; struct klist_node *next; unsigned long flags; spin_lock_irqsave(&i->i_klist->k_lock, flags); if (last) { next = to_klist_node(last->n_node.next); if (!klist_dec_and_del(last)) put = NULL; } else next = to_klist_node(i->i_klist->k_list.next); i->i_cur = NULL; while (next != to_klist_node(&i->i_klist->k_list)) { if (likely(!knode_dead(next))) { kref_get(&next->n_ref); i->i_cur = next; break; } next = to_klist_node(next->n_node.next); } spin_unlock_irqrestore(&i->i_klist->k_lock, flags); if (put && last) put(last); return i->i_cur; } EXPORT_SYMBOL_GPL(klist_next); |
| 11 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 | // SPDX-License-Identifier: GPL-2.0 #ifndef IORING_CANCEL_H #define IORING_CANCEL_H #include <linux/io_uring_types.h> struct io_cancel_data { struct io_ring_ctx *ctx; union { u64 data; struct file *file; }; u8 opcode; u32 flags; int seq; }; int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags); int io_try_cancel(struct io_uring_task *tctx, struct io_cancel_data *cd, unsigned int issue_flags); int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg); bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd); bool io_match_task_safe(struct io_kiocb *head, struct io_uring_task *tctx, bool cancel_all); bool io_cancel_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, struct hlist_head *list, bool cancel_all, bool (*cancel)(struct io_kiocb *)); int io_cancel_remove(struct io_ring_ctx *ctx, struct io_cancel_data *cd, unsigned int issue_flags, struct hlist_head *list, bool (*cancel)(struct io_kiocb *)); __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct io_uring_task *tctx, bool cancel_all, bool is_sqpoll_thread); __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); __cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data); static inline bool io_cancel_match_sequence(struct io_kiocb *req, int sequence) { if (req->cancel_seq_set && sequence == req->work.cancel_seq) return true; req->cancel_seq_set = true; req->work.cancel_seq = sequence; return false; } #endif |
| 282 282 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 | // SPDX-License-Identifier: GPL-2.0 /* * Helpers for IOMMU drivers implementing SVA */ #include <linux/mmu_context.h> #include <linux/mutex.h> #include <linux/sched/mm.h> #include <linux/iommu.h> #include "iommu-priv.h" static DEFINE_MUTEX(iommu_sva_lock); static bool iommu_sva_present; static LIST_HEAD(iommu_sva_mms); static struct iommu_domain *iommu_sva_domain_alloc(struct device *dev, struct mm_struct *mm); /* Allocate a PASID for the mm within range (inclusive) */ static struct iommu_mm_data *iommu_alloc_mm_data(struct mm_struct *mm, struct device *dev) { struct iommu_mm_data *iommu_mm; ioasid_t pasid; lockdep_assert_held(&iommu_sva_lock); if (!arch_pgtable_dma_compat(mm)) return ERR_PTR(-EBUSY); iommu_mm = 
mm->iommu_mm; /* Is a PASID already associated with this mm? */ if (iommu_mm) { if (iommu_mm->pasid >= dev->iommu->max_pasids) return ERR_PTR(-EOVERFLOW); return iommu_mm; } iommu_mm = kzalloc(sizeof(struct iommu_mm_data), GFP_KERNEL); if (!iommu_mm) return ERR_PTR(-ENOMEM); pasid = iommu_alloc_global_pasid(dev); if (pasid == IOMMU_PASID_INVALID) { kfree(iommu_mm); return ERR_PTR(-ENOSPC); } iommu_mm->pasid = pasid; iommu_mm->mm = mm; INIT_LIST_HEAD(&iommu_mm->sva_domains); /* * Make sure the write to mm->iommu_mm is not reordered in front of * initialization to iommu_mm fields. If it does, readers may see a * valid iommu_mm with uninitialized values. */ smp_store_release(&mm->iommu_mm, iommu_mm); return iommu_mm; } /** * iommu_sva_bind_device() - Bind a process address space to a device * @dev: the device * @mm: the mm to bind, caller must hold a reference to mm_users * * Create a bond between device and address space, allowing the device to * access the mm using the PASID returned by iommu_sva_get_pasid(). If a * bond already exists between @device and @mm, an additional internal * reference is taken. Caller must call iommu_sva_unbind_device() * to release each reference. * * On error, returns an ERR_PTR value. */ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm) { struct iommu_group *group = dev->iommu_group; struct iommu_attach_handle *attach_handle; struct iommu_mm_data *iommu_mm; struct iommu_domain *domain; struct iommu_sva *handle; int ret; if (!group) return ERR_PTR(-ENODEV); mutex_lock(&iommu_sva_lock); /* Allocate mm->pasid if necessary. */ iommu_mm = iommu_alloc_mm_data(mm, dev); if (IS_ERR(iommu_mm)) { ret = PTR_ERR(iommu_mm); goto out_unlock; } /* A bond already exists, just take a reference`. 
*/ attach_handle = iommu_attach_handle_get(group, iommu_mm->pasid, IOMMU_DOMAIN_SVA); if (!IS_ERR(attach_handle)) { handle = container_of(attach_handle, struct iommu_sva, handle); if (attach_handle->domain->mm != mm) { ret = -EBUSY; goto out_unlock; } refcount_inc(&handle->users); mutex_unlock(&iommu_sva_lock); return handle; } if (PTR_ERR(attach_handle) != -ENOENT) { ret = PTR_ERR(attach_handle); goto out_unlock; } handle = kzalloc(sizeof(*handle), GFP_KERNEL); if (!handle) { ret = -ENOMEM; goto out_unlock; } /* Search for an existing domain. */ list_for_each_entry(domain, &mm->iommu_mm->sva_domains, next) { ret = iommu_attach_device_pasid(domain, dev, iommu_mm->pasid, &handle->handle); if (!ret) { domain->users++; goto out; } } /* Allocate a new domain and set it on device pasid. */ domain = iommu_sva_domain_alloc(dev, mm); if (IS_ERR(domain)) { ret = PTR_ERR(domain); goto out_free_handle; } ret = iommu_attach_device_pasid(domain, dev, iommu_mm->pasid, &handle->handle); if (ret) goto out_free_domain; domain->users = 1; if (list_empty(&iommu_mm->sva_domains)) { if (list_empty(&iommu_sva_mms)) iommu_sva_present = true; list_add(&iommu_mm->mm_list_elm, &iommu_sva_mms); } list_add(&domain->next, &iommu_mm->sva_domains); out: refcount_set(&handle->users, 1); mutex_unlock(&iommu_sva_lock); handle->dev = dev; return handle; out_free_domain: iommu_domain_free(domain); out_free_handle: kfree(handle); out_unlock: mutex_unlock(&iommu_sva_lock); return ERR_PTR(ret); } EXPORT_SYMBOL_GPL(iommu_sva_bind_device); /** * iommu_sva_unbind_device() - Remove a bond created with iommu_sva_bind_device * @handle: the handle returned by iommu_sva_bind_device() * * Put reference to a bond between device and address space. The device should * not be issuing any more transaction for this PASID. All outstanding page * requests for this PASID must have been flushed to the IOMMU. 
*/ void iommu_sva_unbind_device(struct iommu_sva *handle) { struct iommu_domain *domain = handle->handle.domain; struct iommu_mm_data *iommu_mm = domain->mm->iommu_mm; struct device *dev = handle->dev; mutex_lock(&iommu_sva_lock); if (!refcount_dec_and_test(&handle->users)) { mutex_unlock(&iommu_sva_lock); return; } iommu_detach_device_pasid(domain, dev, iommu_mm->pasid); if (--domain->users == 0) { list_del(&domain->next); iommu_domain_free(domain); } if (list_empty(&iommu_mm->sva_domains)) { list_del(&iommu_mm->mm_list_elm); if (list_empty(&iommu_sva_mms)) iommu_sva_present = false; } mutex_unlock(&iommu_sva_lock); kfree(handle); } EXPORT_SYMBOL_GPL(iommu_sva_unbind_device); u32 iommu_sva_get_pasid(struct iommu_sva *handle) { struct iommu_domain *domain = handle->handle.domain; return mm_get_enqcmd_pasid(domain->mm); } EXPORT_SYMBOL_GPL(iommu_sva_get_pasid); void mm_pasid_drop(struct mm_struct *mm) { struct iommu_mm_data *iommu_mm = mm->iommu_mm; if (!iommu_mm) return; iommu_free_global_pasid(iommu_mm->pasid); kfree(iommu_mm); } /* * I/O page fault handler for SVA */ static enum iommu_page_response_code iommu_sva_handle_mm(struct iommu_fault *fault, struct mm_struct *mm) { vm_fault_t ret; struct vm_area_struct *vma; unsigned int access_flags = 0; unsigned int fault_flags = FAULT_FLAG_REMOTE; struct iommu_fault_page_request *prm = &fault->prm; enum iommu_page_response_code status = IOMMU_PAGE_RESP_INVALID; if (!(prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID)) return status; if (!mmget_not_zero(mm)) return status; mmap_read_lock(mm); vma = vma_lookup(mm, prm->addr); if (!vma) /* Unmapped area */ goto out_put_mm; if (prm->perm & IOMMU_FAULT_PERM_READ) access_flags |= VM_READ; if (prm->perm & IOMMU_FAULT_PERM_WRITE) { access_flags |= VM_WRITE; fault_flags |= FAULT_FLAG_WRITE; } if (prm->perm & IOMMU_FAULT_PERM_EXEC) { access_flags |= VM_EXEC; fault_flags |= FAULT_FLAG_INSTRUCTION; } if (!(prm->perm & IOMMU_FAULT_PERM_PRIV)) fault_flags |= FAULT_FLAG_USER; if 
(access_flags & ~vma->vm_flags) /* Access fault */ goto out_put_mm; ret = handle_mm_fault(vma, prm->addr, fault_flags, NULL); status = ret & VM_FAULT_ERROR ? IOMMU_PAGE_RESP_INVALID : IOMMU_PAGE_RESP_SUCCESS; out_put_mm: mmap_read_unlock(mm); mmput(mm); return status; } static void iommu_sva_handle_iopf(struct work_struct *work) { struct iopf_fault *iopf; struct iopf_group *group; enum iommu_page_response_code status = IOMMU_PAGE_RESP_SUCCESS; group = container_of(work, struct iopf_group, work); list_for_each_entry(iopf, &group->faults, list) { /* * For the moment, errors are sticky: don't handle subsequent * faults in the group if there is an error. */ if (status != IOMMU_PAGE_RESP_SUCCESS) break; status = iommu_sva_handle_mm(&iopf->fault, group->attach_handle->domain->mm); } iopf_group_response(group, status); iopf_free_group(group); } static int iommu_sva_iopf_handler(struct iopf_group *group) { struct iommu_fault_param *fault_param = group->fault_param; INIT_WORK(&group->work, iommu_sva_handle_iopf); if (!queue_work(fault_param->queue->wq, &group->work)) return -EBUSY; return 0; } static struct iommu_domain *iommu_sva_domain_alloc(struct device *dev, struct mm_struct *mm) { const struct iommu_ops *ops = dev_iommu_ops(dev); struct iommu_domain *domain; if (!ops->domain_alloc_sva) return ERR_PTR(-EOPNOTSUPP); domain = ops->domain_alloc_sva(dev, mm); if (IS_ERR(domain)) return domain; domain->type = IOMMU_DOMAIN_SVA; domain->cookie_type = IOMMU_COOKIE_SVA; mmgrab(mm); domain->mm = mm; domain->owner = ops; domain->iopf_handler = iommu_sva_iopf_handler; return domain; } void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end) { struct iommu_mm_data *iommu_mm; guard(mutex)(&iommu_sva_lock); if (!iommu_sva_present) return; list_for_each_entry(iommu_mm, &iommu_sva_mms, mm_list_elm) mmu_notifier_arch_invalidate_secondary_tlbs(iommu_mm->mm, start, end); } |
| 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 | // SPDX-License-Identifier: GPL-2.0-or-later /* * HID driver for SiGma Micro-based keyboards * * Copyright (c) 2016 Kinglong Mee * Copyright (c) 2021 Desmond Lim */ #include <linux/device.h> #include <linux/hid.h> #include <linux/module.h> #include "hid-ids.h" static const __u8 sm_0059_rdesc[] = { 0x05, 0x0c, /* Usage Page (Consumer Devices) 0 */ 0x09, 0x01, /* Usage (Consumer Control) 2 */ 0xa1, 0x01, /* Collection (Application) 4 */ 0x85, 0x01, /* Report ID (1) 6 */ 0x19, 0x00, /* Usage Minimum (0) 8 */ 0x2a, 0x3c, 0x02, /* Usage Maximum (572) 10 */ 0x15, 0x00, /* Logical Minimum (0) 13 */ 0x26, 0x3c, 0x02, /* Logical Maximum (572) 15 */ 0x95, 0x01, /* Report Count (1) 18 */ 0x75, 0x10, /* Report Size (16) 20 */ 0x81, 0x00, /* Input (Data,Arr,Abs) 22 */ 0xc0, /* End Collection 24 */ 0x05, 0x01, /* Usage Page (Generic Desktop) 25 */ 0x09, 0x80, /* Usage (System Control) 27 */ 0xa1, 0x01, /* Collection (Application) 29 */ 0x85, 0x02, /* Report ID (2) 31 */ 0x19, 0x81, /* Usage Minimum (129) 33 */ 0x29, 0x83, /* Usage Maximum (131) 35 */ 0x25, 0x01, /* Logical Maximum (1) 37 */ 0x75, 0x01, /* Report Size (1) 39 */ 0x95, 0x03, /* Report Count (3) 41 */ 0x81, 0x02, /* Input (Data,Var,Abs) 43 */ 0x95, 0x05, /* Report Count (5) 45 */ 0x81, 0x01, /* Input (Cnst,Arr,Abs) 47 */ 0xc0, /* End Collection 49 */ 0x06, 0x00, 0xff, /* Usage Page (Vendor Defined Page 1) 50 */ 0x09, 0x01, /* Usage (Vendor Usage 1) 53 */ 0xa1, 0x01, /* Collection (Application) 55 */ 0x85, 0x03, /* Report ID (3) 57 */ 0x1a, 0xf1, 0x00, /* Usage Minimum (241) 59 */ 0x2a, 0xf8, 0x00, /* 
Usage Maximum (248) 62 */ 0x15, 0x00, /* Logical Minimum (0) 65 */ 0x25, 0x01, /* Logical Maximum (1) 67 */ 0x75, 0x01, /* Report Size (1) 69 */ 0x95, 0x08, /* Report Count (8) 71 */ 0x81, 0x02, /* Input (Data,Var,Abs) 73 */ 0xc0, /* End Collection 75 */ 0x05, 0x01, /* Usage Page (Generic Desktop) 76 */ 0x09, 0x06, /* Usage (Keyboard) 78 */ 0xa1, 0x01, /* Collection (Application) 80 */ 0x85, 0x04, /* Report ID (4) 82 */ 0x05, 0x07, /* Usage Page (Keyboard) 84 */ 0x19, 0xe0, /* Usage Minimum (224) 86 */ 0x29, 0xe7, /* Usage Maximum (231) 88 */ 0x15, 0x00, /* Logical Minimum (0) 90 */ 0x25, 0x01, /* Logical Maximum (1) 92 */ 0x75, 0x01, /* Report Size (1) 94 */ 0x95, 0x08, /* Report Count (8) 96 */ 0x81, 0x00, /* Input (Data,Arr,Abs) 98 */ 0x95, 0x30, /* Report Count (48) 100 */ 0x75, 0x01, /* Report Size (1) 102 */ 0x15, 0x00, /* Logical Minimum (0) 104 */ 0x25, 0x01, /* Logical Maximum (1) 106 */ 0x05, 0x07, /* Usage Page (Keyboard) 108 */ 0x19, 0x00, /* Usage Minimum (0) 110 */ 0x29, 0x2f, /* Usage Maximum (47) 112 */ 0x81, 0x02, /* Input (Data,Var,Abs) 114 */ 0xc0, /* End Collection 116 */ 0x05, 0x01, /* Usage Page (Generic Desktop) 117 */ 0x09, 0x06, /* Usage (Keyboard) 119 */ 0xa1, 0x01, /* Collection (Application) 121 */ 0x85, 0x05, /* Report ID (5) 123 */ 0x95, 0x38, /* Report Count (56) 125 */ 0x75, 0x01, /* Report Size (1) 127 */ 0x15, 0x00, /* Logical Minimum (0) 129 */ 0x25, 0x01, /* Logical Maximum (1) 131 */ 0x05, 0x07, /* Usage Page (Keyboard) 133 */ 0x19, 0x30, /* Usage Minimum (48) 135 */ 0x29, 0x67, /* Usage Maximum (103) 137 */ 0x81, 0x02, /* Input (Data,Var,Abs) 139 */ 0xc0, /* End Collection 141 */ 0x05, 0x01, /* Usage Page (Generic Desktop) 142 */ 0x09, 0x06, /* Usage (Keyboard) 144 */ 0xa1, 0x01, /* Collection (Application) 146 */ 0x85, 0x06, /* Report ID (6) 148 */ 0x95, 0x38, /* Report Count (56) 150 */ 0x75, 0x01, /* Report Size (1) 152 */ 0x15, 0x00, /* Logical Minimum (0) 154 */ 0x25, 0x01, /* Logical Maximum (1) 156 */ 0x05, 0x07, /* 
Usage Page (Keyboard) 158 */ 0x19, 0x68, /* Usage Minimum (104) 160 */ 0x29, 0x9f, /* Usage Maximum (159) 162 */ 0x81, 0x02, /* Input (Data,Var,Abs) 164 */ 0xc0, /* End Collection 166 */ }; static const __u8 *sm_report_fixup(struct hid_device *hdev, __u8 *rdesc, unsigned int *rsize) { if (*rsize == sizeof(sm_0059_rdesc) && !memcmp(sm_0059_rdesc, rdesc, *rsize)) { hid_info(hdev, "Fixing up SiGma Micro report descriptor\n"); rdesc[99] = 0x02; } return rdesc; } static const struct hid_device_id sm_devices[] = { { HID_USB_DEVICE(USB_VENDOR_ID_SIGMA_MICRO, USB_DEVICE_ID_SIGMA_MICRO_KEYBOARD2) }, { } }; MODULE_DEVICE_TABLE(hid, sm_devices); static struct hid_driver sm_driver = { .name = "sigmamicro", .id_table = sm_devices, .report_fixup = sm_report_fixup, }; module_hid_driver(sm_driver); MODULE_AUTHOR("Kinglong Mee <kinglongmee@gmail.com>"); MODULE_AUTHOR("Desmond Lim <peckishrine@gmail.com>"); MODULE_DESCRIPTION("SiGma Micro HID driver"); MODULE_LICENSE("GPL"); |
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * AppArmor security module * * This file contains AppArmor auditing function definitions. * * Copyright (C) 1998-2008 Novell/SUSE * Copyright 2009-2010 Canonical Ltd. */ #ifndef __AA_AUDIT_H #define __AA_AUDIT_H #include <linux/audit.h> #include <linux/fs.h> #include <linux/lsm_audit.h> #include <linux/sched.h> #include <linux/slab.h> #include "file.h" #include "label.h" extern const char *const audit_mode_names[]; #define AUDIT_MAX_INDEX 5 enum audit_mode { AUDIT_NORMAL, /* follow normal auditing of accesses */ AUDIT_QUIET_DENIED, /* quiet all denied access messages */ AUDIT_QUIET, /* quiet all messages */ AUDIT_NOQUIET, /* do not quiet audit messages */ AUDIT_ALL /* audit all accesses */ }; enum audit_type { AUDIT_APPARMOR_AUDIT, AUDIT_APPARMOR_ALLOWED, AUDIT_APPARMOR_DENIED, AUDIT_APPARMOR_HINT, AUDIT_APPARMOR_STATUS, AUDIT_APPARMOR_ERROR, AUDIT_APPARMOR_KILL, AUDIT_APPARMOR_AUTO }; #define OP_NULL NULL #define OP_SYSCTL "sysctl" #define OP_CAPABLE "capable" #define OP_UNLINK "unlink" #define OP_MKDIR "mkdir" #define OP_RMDIR "rmdir" #define OP_MKNOD "mknod" #define OP_TRUNC "truncate" #define OP_LINK "link" #define OP_SYMLINK "symlink" #define OP_RENAME_SRC "rename_src" #define OP_RENAME_DEST 
"rename_dest" #define OP_CHMOD "chmod" #define OP_CHOWN "chown" #define OP_GETATTR "getattr" #define OP_OPEN "open" #define OP_FRECEIVE "file_receive" #define OP_FPERM "file_perm" #define OP_FLOCK "file_lock" #define OP_FMMAP "file_mmap" #define OP_FMPROT "file_mprotect" #define OP_INHERIT "file_inherit" #define OP_PIVOTROOT "pivotroot" #define OP_MOUNT "mount" #define OP_UMOUNT "umount" #define OP_CREATE "create" #define OP_POST_CREATE "post_create" #define OP_BIND "bind" #define OP_CONNECT "connect" #define OP_LISTEN "listen" #define OP_ACCEPT "accept" #define OP_SENDMSG "sendmsg" #define OP_RECVMSG "recvmsg" #define OP_GETSOCKNAME "getsockname" #define OP_GETPEERNAME "getpeername" #define OP_GETSOCKOPT "getsockopt" #define OP_SETSOCKOPT "setsockopt" #define OP_SHUTDOWN "socket_shutdown" #define OP_PTRACE "ptrace" #define OP_SIGNAL "signal" #define OP_EXEC "exec" #define OP_CHANGE_HAT "change_hat" #define OP_CHANGE_PROFILE "change_profile" #define OP_CHANGE_ONEXEC "change_onexec" #define OP_STACK "stack" #define OP_STACK_ONEXEC "stack_onexec" #define OP_SETPROCATTR "setprocattr" #define OP_SETRLIMIT "setrlimit" #define OP_PROF_REPL "profile_replace" #define OP_PROF_LOAD "profile_load" #define OP_PROF_RM "profile_remove" #define OP_USERNS_CREATE "userns_create" #define OP_URING_OVERRIDE "uring_override" #define OP_URING_SQPOLL "uring_sqpoll" struct apparmor_audit_data { int error; int type; u16 class; const char *op; const struct cred *subj_cred; struct aa_label *subj_label; const char *name; const char *info; u32 request; u32 denied; union { /* these entries require a custom callback fn */ struct { struct aa_label *peer; union { struct { const char *target; kuid_t ouid; } fs; struct { int rlim; unsigned long max; } rlim; struct { int signal; int unmappedsig; }; struct { int type, protocol; void *addr; int addrlen; struct { void *addr; int addrlen; } peer; } net; }; }; struct { struct aa_profile *profile; const char *ns; long pos; } iface; struct { const char 
*src_name; const char *type; const char *trans; const char *data; unsigned long flags; } mnt; struct { struct aa_label *target; } uring; }; struct common_audit_data common; }; /* macros for dealing with apparmor_audit_data structure */ #define aad(SA) (container_of(SA, struct apparmor_audit_data, common)) #define aad_of_va(VA) aad((struct common_audit_data *)(VA)) #define DEFINE_AUDIT_DATA(NAME, T, C, X) \ /* TODO: cleanup audit init so we don't need _aad = {0,} */ \ struct apparmor_audit_data NAME = { \ .class = (C), \ .op = (X), \ .common.type = (T), \ .common.u.tsk = NULL, \ .common.apparmor_audit_data = &NAME, \ }; void aa_audit_msg(int type, struct apparmor_audit_data *ad, void (*cb) (struct audit_buffer *, void *)); int aa_audit(int type, struct aa_profile *profile, struct apparmor_audit_data *ad, void (*cb) (struct audit_buffer *, void *)); #define aa_audit_error(ERROR, AD, CB) \ ({ \ (AD)->error = (ERROR); \ aa_audit_msg(AUDIT_APPARMOR_ERROR, (AD), (CB)); \ (AD)->error; \ }) static inline int complain_error(int error) { if (error == -EPERM || error == -EACCES) return 0; return error; } void aa_audit_rule_free(void *vrule); int aa_audit_rule_init(u32 field, u32 op, char *rulestr, void **vrule, gfp_t gfp); int aa_audit_rule_known(struct audit_krule *rule); int aa_audit_rule_match(struct lsm_prop *prop, u32 field, u32 op, void *vrule); #endif /* __AA_AUDIT_H */ |
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 
527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 
1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 
1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _FS_CEPH_SUPER_H #define _FS_CEPH_SUPER_H #include <linux/ceph/ceph_debug.h> #include <linux/ceph/osd_client.h> #include <linux/unaligned.h> #include <linux/backing-dev.h> #include <linux/completion.h> #include <linux/exportfs.h> #include <linux/fs.h> #include <linux/mempool.h> #include <linux/pagemap.h> #include <linux/wait.h> #include <linux/writeback.h> #include <linux/slab.h> #include <linux/posix_acl.h> #include <linux/refcount.h> #include <linux/security.h> #include <linux/netfs.h> #include <linux/fscache.h> #include <linux/hashtable.h> #include <linux/ceph/libceph.h> #include "crypto.h" /* large granularity for statfs utilization stats to facilitate * large volume sizes on 32-bit machines. */ #define CEPH_BLOCK_SHIFT 22 /* 4 MB */ #define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT) #define CEPH_4K_BLOCK_SHIFT 12 /* 4 KB */ #define CEPH_MOUNT_OPT_CLEANRECOVER (1<<1) /* auto reonnect (clean mode) after blocklisted */ #define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */ #define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */ #define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */ #define CEPH_MOUNT_OPT_INO32 (1<<8) /* 32 bit inos */ #define CEPH_MOUNT_OPT_DCACHE (1<<9) /* use dcache for readdir etc */ #define CEPH_MOUNT_OPT_FSCACHE (1<<10) /* use fscache */ #define CEPH_MOUNT_OPT_NOPOOLPERM (1<<11) /* no pool permission check */ #define CEPH_MOUNT_OPT_MOUNTWAIT (1<<12) /* mount waits if no mds is up */ #define CEPH_MOUNT_OPT_NOQUOTADF (1<<13) /* no root dir quota in statfs */ #define CEPH_MOUNT_OPT_NOCOPYFROM (1<<14) /* don't use RADOS 'copy-from' op */ #define CEPH_MOUNT_OPT_ASYNC_DIROPS (1<<15) /* allow async directory ops */ #define CEPH_MOUNT_OPT_NOPAGECACHE (1<<16) /* bypass pagecache altogether */ #define CEPH_MOUNT_OPT_SPARSEREAD 
(1<<17) /* always do sparse reads */ #define CEPH_MOUNT_OPT_DEFAULT \ (CEPH_MOUNT_OPT_DCACHE | \ CEPH_MOUNT_OPT_NOCOPYFROM | \ CEPH_MOUNT_OPT_ASYNC_DIROPS) #define ceph_set_mount_opt(fsc, opt) \ (fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt #define ceph_clear_mount_opt(fsc, opt) \ (fsc)->mount_options->flags &= ~CEPH_MOUNT_OPT_##opt #define ceph_test_mount_opt(fsc, opt) \ (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt)) /* max size of osd read request, limited by libceph */ #define CEPH_MAX_READ_SIZE CEPH_MSG_MAX_DATA_LEN /* osd has a configurable limitation of max write size. * CEPH_MSG_MAX_DATA_LEN should be small enough. */ #define CEPH_MAX_WRITE_SIZE CEPH_MSG_MAX_DATA_LEN #define CEPH_RASIZE_DEFAULT (8192*1024) /* max readahead */ #define CEPH_MAX_READDIR_DEFAULT 1024 #define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024) #define CEPH_SNAPDIRNAME_DEFAULT ".snap" /* * Delay telling the MDS we no longer want caps, in case we reopen * the file. Delay a minimum amount of time, even if we send a cap * message for some other reason. Otherwise, take the oppotunity to * update the mds to avoid sending another message later. 
*/
#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT      5  /* cap release delay */
#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT     60  /* cap release delay */

/*
 * Parsed mount options.  The members are split into two groups by the
 * comment in the middle of the structure: plain values that can be
 * compared with memcmp(), followed by pointer/policy members that are
 * compared in compare_mount_options().
 */
struct ceph_mount_options {
	unsigned int flags;	/* CEPH_MOUNT_OPT_* bits, see ceph_test_mount_opt() */

	unsigned int wsize;            /* max write size */
	unsigned int rsize;            /* max read size */
	unsigned int rasize;           /* max readahead */
	unsigned int congestion_kb;    /* max writeback in flight */
	unsigned int caps_wanted_delay_min, caps_wanted_delay_max;
	int caps_max;
	unsigned int max_readdir;       /* max readdir result (entries) */
	unsigned int max_readdir_bytes; /* max readdir result (bytes) */

	bool new_dev_syntax;

	/*
	 * everything above this point can be memcmp'd; everything below
	 * is handled in compare_mount_options()
	 */

	char *snapdir_name;   /* default ".snap" */
	char *mds_namespace;  /* default NULL */
	char *server_path;    /* default NULL (means "/") */
	char *fscache_uniq;   /* default NULL */
	char *mon_addr;
	struct fscrypt_dummy_policy dummy_enc_policy;
};

/*
 * Check if the mds namespace in ceph_mount_options matches
 * the passed in namespace string. First time match (when
 * ->mds_namespace is NULL) is treated specially, since
 * ->mds_namespace needs to be initialized by the caller.
*/ static inline int namespace_equals(struct ceph_mount_options *fsopt, const char *namespace, size_t len) { return !(fsopt->mds_namespace && (strlen(fsopt->mds_namespace) != len || strncmp(fsopt->mds_namespace, namespace, len))); } /* mount state */ enum { CEPH_MOUNT_MOUNTING, CEPH_MOUNT_MOUNTED, CEPH_MOUNT_UNMOUNTING, CEPH_MOUNT_UNMOUNTED, CEPH_MOUNT_SHUTDOWN, CEPH_MOUNT_RECOVER, CEPH_MOUNT_FENCE_IO, }; #define CEPH_ASYNC_CREATE_CONFLICT_BITS 8 struct ceph_fs_client { struct super_block *sb; struct list_head metric_wakeup; struct ceph_mount_options *mount_options; struct ceph_client *client; int mount_state; bool blocklisted; bool have_copy_from2; u32 filp_gen; loff_t max_file_size; struct ceph_mds_client *mdsc; atomic_long_t writeback_count; bool write_congested; struct workqueue_struct *inode_wq; struct workqueue_struct *cap_wq; DECLARE_HASHTABLE(async_unlink_conflict, CEPH_ASYNC_CREATE_CONFLICT_BITS); spinlock_t async_unlink_conflict_lock; #ifdef CONFIG_DEBUG_FS struct dentry *debugfs_dentry_lru, *debugfs_caps; struct dentry *debugfs_congestion_kb; struct dentry *debugfs_bdi; struct dentry *debugfs_mdsc, *debugfs_mdsmap; struct dentry *debugfs_status; struct dentry *debugfs_mds_sessions; struct dentry *debugfs_metrics_dir; #endif #ifdef CONFIG_CEPH_FSCACHE struct fscache_volume *fscache; #endif #ifdef CONFIG_FS_ENCRYPTION struct fscrypt_dummy_policy fsc_dummy_enc_policy; #endif }; /* * File i/o capability. This tracks shared state with the metadata * server that allows us to cache or writeback attributes or to read * and write data. For any given inode, we should have one or more * capabilities, one issued by each metadata server, and our * cumulative access is the OR of all issued capabilities. * * Each cap is referenced by the inode's i_caps rbtree and by per-mds * session capability lists. 
*/ struct ceph_cap { struct ceph_inode_info *ci; struct rb_node ci_node; /* per-ci cap tree */ struct ceph_mds_session *session; struct list_head session_caps; /* per-session caplist */ u64 cap_id; /* unique cap id (mds provided) */ union { /* in-use caps */ struct { int issued; /* latest, from the mds */ int implemented; /* implemented superset of issued (for revocation) */ int mds; /* mds index for this cap */ int mds_wanted; /* caps wanted from this mds */ }; /* caps to release */ struct { u64 cap_ino; int queue_release; }; }; u32 seq, issue_seq, mseq; u32 cap_gen; /* active/stale cycle */ unsigned long last_used; struct list_head caps_item; }; #define CHECK_CAPS_AUTHONLY 1 /* only check auth cap */ #define CHECK_CAPS_FLUSH 2 /* flush any dirty caps */ #define CHECK_CAPS_NOINVAL 4 /* don't invalidate pagecache */ #define CHECK_CAPS_FLUSH_FORCE 8 /* force flush any caps */ struct ceph_cap_flush { u64 tid; int caps; bool wake; /* wake up flush waiters when finish ? */ bool is_capsnap; /* true means capsnap */ struct list_head g_list; // global struct list_head i_list; // per inode }; /* * Snapped cap state that is pending flush to mds. When a snapshot occurs, * we first complete any in-process sync writes and writeback any dirty * data before flushing the snapped state (tracked here) back to the MDS. 
*/ struct ceph_cap_snap { refcount_t nref; struct list_head ci_item; struct ceph_cap_flush cap_flush; u64 follows; int issued, dirty; struct ceph_snap_context *context; umode_t mode; kuid_t uid; kgid_t gid; struct ceph_buffer *xattr_blob; u64 xattr_version; u64 size; u64 change_attr; struct timespec64 mtime, atime, ctime, btime; u64 time_warp_seq; u64 truncate_size; u32 truncate_seq; int writing; /* a sync write is still in progress */ int dirty_pages; /* dirty pages awaiting writeback */ bool inline_data; bool need_flush; }; static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap) { if (refcount_dec_and_test(&capsnap->nref)) { if (capsnap->xattr_blob) ceph_buffer_put(capsnap->xattr_blob); kmem_cache_free(ceph_cap_snap_cachep, capsnap); } } /* * The frag tree describes how a directory is fragmented, potentially across * multiple metadata servers. It is also used to indicate points where * metadata authority is delegated, and whether/where metadata is replicated. * * A _leaf_ frag will be present in the i_fragtree IFF there is * delegation info. That is, if mds >= 0 || ndist > 0. */ #define CEPH_MAX_DIRFRAG_REP 4 struct ceph_inode_frag { struct rb_node node; /* fragtree state */ u32 frag; int split_by; /* i.e. 2^(split_by) children */ /* delegation and replication info */ int mds; /* -1 if same authority as parent */ int ndist; /* >0 if replicated */ int dist[CEPH_MAX_DIRFRAG_REP]; }; /* * We cache inode xattrs as an encoded blob until they are first used, * at which point we parse them into an rbtree. 
*/
struct ceph_inode_xattr {
	struct rb_node node;

	const char *name;
	int name_len;
	const char *val;
	int val_len;
	int dirty;

	/* NOTE(review): presumably set when name/val are owned by this entry
	 * and must be freed with it - confirm against the xattr code. */
	int should_free_name;
	int should_free_val;
};

/*
 * Ceph dentry state
 */
struct ceph_dentry_info {
	struct dentry *dentry;
	struct ceph_mds_session *lease_session;
	struct list_head lease_list;
	struct hlist_node hnode;
	unsigned long flags;	/* CEPH_DENTRY_* bits defined below */
	int lease_shared_gen;
	u32 lease_gen;
	u32 lease_seq;
	unsigned long lease_renew_after, lease_renew_from;
	unsigned long time;
	u64 offset;
};

/*
 * Flag bits.  The *_BIT macros are bit numbers; the matching unsuffixed
 * macro is the corresponding mask (1 << bit).
 */
#define CEPH_DENTRY_REFERENCED		(1 << 0)
#define CEPH_DENTRY_LEASE_LIST		(1 << 1)
#define CEPH_DENTRY_SHRINK_LIST		(1 << 2)
#define CEPH_DENTRY_PRIMARY_LINK	(1 << 3)
#define CEPH_DENTRY_ASYNC_UNLINK_BIT	(4)
#define CEPH_DENTRY_ASYNC_UNLINK	(1 << CEPH_DENTRY_ASYNC_UNLINK_BIT)
#define CEPH_DENTRY_ASYNC_CREATE_BIT	(5)
#define CEPH_DENTRY_ASYNC_CREATE	(1 << CEPH_DENTRY_ASYNC_CREATE_BIT)

struct ceph_inode_xattrs_info {
	/*
	 * (still encoded) xattr blob. we avoid the overhead of parsing
	 * this until someone actually calls getxattr, etc.
	 *
	 * blob->vec.iov_len == 4 implies there are no xattrs; blob ==
	 * NULL means we don't know.
	 */
	struct ceph_buffer *blob, *prealloc_blob;

	struct rb_root index;
	bool dirty;
	int count;
	int names_size;
	int vals_size;
	u64 version, index_version;
};

/*
 * Ceph inode.
*/ struct ceph_inode_info { struct netfs_inode netfs; /* Netfslib context and vfs inode */ struct ceph_vino i_vino; /* ceph ino + snap */ spinlock_t i_ceph_lock; u64 i_version; u64 i_inline_version; u32 i_time_warp_seq; unsigned long i_ceph_flags; atomic64_t i_release_count; atomic64_t i_ordered_count; atomic64_t i_complete_seq[2]; struct ceph_dir_layout i_dir_layout; struct ceph_file_layout i_layout; struct ceph_file_layout i_cached_layout; // for async creates char *i_symlink; /* for dirs */ struct timespec64 i_rctime; u64 i_rbytes, i_rfiles, i_rsubdirs, i_rsnaps; u64 i_files, i_subdirs; /* quotas */ u64 i_max_bytes, i_max_files; s32 i_dir_pin; struct rb_root i_fragtree; int i_fragtree_nsplits; struct mutex i_fragtree_mutex; struct ceph_inode_xattrs_info i_xattrs; /* capabilities. protected _both_ by i_ceph_lock and cap->session's * s_mutex. */ struct rb_root i_caps; /* cap list */ struct ceph_cap *i_auth_cap; /* authoritative cap, if any */ unsigned i_dirty_caps, i_flushing_caps; /* mask of dirtied fields */ /* * Link to the auth cap's session's s_cap_dirty list. s_cap_dirty * is protected by the mdsc->cap_dirty_lock, but each individual item * is also protected by the inode's i_ceph_lock. Walking s_cap_dirty * requires the mdsc->cap_dirty_lock. List presence for an item can * be tested under the i_ceph_lock. Changing anything requires both. */ struct list_head i_dirty_item; /* * Link to session's s_cap_flushing list. Protected in a similar * fashion to i_dirty_item, but also by the s_mutex for changes. The * s_cap_flushing list can be walked while holding either the s_mutex * or msdc->cap_dirty_lock. List presence can also be checked while * holding the i_ceph_lock for this inode. */ struct list_head i_flushing_item; /* we need to track cap writeback on a per-cap-bit basis, to allow * overlapping, pipelined cap flushes to the mds. we can probably * reduce the tid to 8 bits if we're concerned about inode size. 
*/ struct ceph_cap_flush *i_prealloc_cap_flush; struct list_head i_cap_flush_list; wait_queue_head_t i_cap_wq; /* threads waiting on a capability */ unsigned long i_hold_caps_max; /* jiffies */ struct list_head i_cap_delay_list; /* for delayed cap release to mds */ struct ceph_cap_reservation i_cap_migration_resv; struct list_head i_cap_snaps; /* snapped state pending flush to mds */ struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 or dirty|flushing caps */ unsigned i_snap_caps; /* cap bits for snapped files */ unsigned long i_last_rd; unsigned long i_last_wr; int i_nr_by_mode[CEPH_FILE_MODE_BITS]; /* open file counts */ struct mutex i_truncate_mutex; u32 i_truncate_seq; /* last truncate to smaller size */ u64 i_truncate_size; /* and the size we last truncated down to */ int i_truncate_pending; /* still need to call vmtruncate */ /* * For none fscrypt case it equals to i_truncate_size or it will * equals to fscrypt_file_size */ u64 i_truncate_pagecache_size; u64 i_max_size; /* max file size authorized by mds */ u64 i_reported_size; /* (max_)size reported to or requested of mds */ u64 i_wanted_max_size; /* offset we'd like to write too */ u64 i_requested_max_size; /* max_size we've requested */ /* held references to caps */ int i_pin_ref; int i_rd_ref, i_rdcache_ref, i_wr_ref, i_wb_ref, i_fx_ref; int i_wrbuffer_ref, i_wrbuffer_ref_head; atomic_t i_filelock_ref; atomic_t i_shared_gen; /* increment each time we get FILE_SHARED */ u32 i_rdcache_gen; /* incremented each time we get FILE_CACHE. 
*/ u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */ struct list_head i_unsafe_dirops; /* uncommitted mds dir ops */ struct list_head i_unsafe_iops; /* uncommitted mds inode ops */ spinlock_t i_unsafe_lock; union { struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */ struct ceph_snapid_map *i_snapid_map; /* snapid -> dev_t */ }; struct list_head i_snap_realm_item; struct list_head i_snap_flush_item; struct timespec64 i_btime; struct timespec64 i_snap_btime; struct work_struct i_work; unsigned long i_work_mask; #ifdef CONFIG_FS_ENCRYPTION struct fscrypt_inode_info *i_crypt_info; u32 fscrypt_auth_len; u32 fscrypt_file_len; u8 *fscrypt_auth; u8 *fscrypt_file; #endif }; struct ceph_netfs_request_data { int caps; /* * Maximum size of a file readahead request. * The fadvise could update the bdi's default ra_pages. */ unsigned int file_ra_pages; /* Set it if fadvise disables file readahead entirely */ bool file_ra_disabled; }; static inline struct ceph_inode_info * ceph_inode(const struct inode *inode) { return container_of(inode, struct ceph_inode_info, netfs.inode); } static inline struct ceph_fs_client * ceph_inode_to_fs_client(const struct inode *inode) { return (struct ceph_fs_client *)inode->i_sb->s_fs_info; } static inline struct ceph_fs_client * ceph_sb_to_fs_client(const struct super_block *sb) { return (struct ceph_fs_client *)sb->s_fs_info; } static inline struct ceph_mds_client * ceph_sb_to_mdsc(const struct super_block *sb) { return (struct ceph_mds_client *)ceph_sb_to_fs_client(sb)->mdsc; } static inline struct ceph_client * ceph_inode_to_client(const struct inode *inode) { return (struct ceph_client *)ceph_inode_to_fs_client(inode)->client; } static inline struct ceph_vino ceph_vino(const struct inode *inode) { return ceph_inode(inode)->i_vino; } static inline u32 ceph_ino_to_ino32(u64 vino) { u32 ino = vino & 0xffffffff; ino ^= vino >> 32; if (!ino) ino = 2; return ino; } /* * Inode numbers in cephfs are 64 bits, but 
inode->i_ino is 32-bits on * some arches. We generally do not use this value inside the ceph driver, but * we do want to set it to something, so that generic vfs code has an * appropriate value for tracepoints and the like. */ static inline ino_t ceph_vino_to_ino_t(struct ceph_vino vino) { if (sizeof(ino_t) == sizeof(u32)) return ceph_ino_to_ino32(vino.ino); return (ino_t)vino.ino; } /* for printf-style formatting */ #define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap static inline u64 ceph_ino(struct inode *inode) { return ceph_inode(inode)->i_vino.ino; } static inline u64 ceph_snap(struct inode *inode) { return ceph_inode(inode)->i_vino.snap; } /** * ceph_present_ino - format an inode number for presentation to userland * @sb: superblock where the inode lives * @ino: inode number to (possibly) convert * * If the user mounted with the ino32 option, then the 64-bit value needs * to be converted to something that can fit inside 32 bits. Note that * internal kernel code never uses this value, so this is entirely for * userland consumption. */ static inline u64 ceph_present_ino(struct super_block *sb, u64 ino) { if (unlikely(ceph_test_mount_opt(ceph_sb_to_fs_client(sb), INO32))) return ceph_ino_to_ino32(ino); return ino; } static inline u64 ceph_present_inode(struct inode *inode) { return ceph_present_ino(inode->i_sb, ceph_ino(inode)); } static inline int ceph_ino_compare(struct inode *inode, void *data) { struct ceph_vino *pvino = (struct ceph_vino *)data; struct ceph_inode_info *ci = ceph_inode(inode); return ci->i_vino.ino == pvino->ino && ci->i_vino.snap == pvino->snap; } /* * The MDS reserves a set of inodes for its own usage. These should never * be accessible by clients, and so the MDS has no reason to ever hand these * out. The range is CEPH_MDS_INO_MDSDIR_OFFSET..CEPH_INO_SYSTEM_BASE. * * These come from src/mds/mdstypes.h in the ceph sources. 
*/ #define CEPH_MAX_MDS 0x100 #define CEPH_NUM_STRAY 10 #define CEPH_MDS_INO_MDSDIR_OFFSET (1 * CEPH_MAX_MDS) #define CEPH_MDS_INO_LOG_OFFSET (2 * CEPH_MAX_MDS) #define CEPH_INO_SYSTEM_BASE ((6*CEPH_MAX_MDS) + (CEPH_MAX_MDS * CEPH_NUM_STRAY)) static inline bool ceph_vino_is_reserved(const struct ceph_vino vino) { if (vino.ino >= CEPH_INO_SYSTEM_BASE || vino.ino < CEPH_MDS_INO_MDSDIR_OFFSET) return false; /* Don't warn on mdsdirs */ WARN_RATELIMIT(vino.ino >= CEPH_MDS_INO_LOG_OFFSET, "Attempt to access reserved inode number 0x%llx", vino.ino); return true; } static inline struct inode *ceph_find_inode(struct super_block *sb, struct ceph_vino vino) { if (ceph_vino_is_reserved(vino)) return NULL; /* * NB: The hashval will be run through the fs/inode.c hash function * anyway, so there is no need to squash the inode number down to * 32-bits first. Just use low-order bits on arches with 32-bit long. */ return ilookup5(sb, (unsigned long)vino.ino, ceph_ino_compare, &vino); } /* * Ceph inode. */ #define CEPH_I_DIR_ORDERED (1 << 0) /* dentries in dir are ordered */ #define CEPH_I_FLUSH (1 << 2) /* do not delay flush of dirty metadata */ #define CEPH_I_POOL_PERM (1 << 3) /* pool rd/wr bits are valid */ #define CEPH_I_POOL_RD (1 << 4) /* can read from pool */ #define CEPH_I_POOL_WR (1 << 5) /* can write to pool */ #define CEPH_I_SEC_INITED (1 << 6) /* security initialized */ #define CEPH_I_KICK_FLUSH (1 << 7) /* kick flushing caps */ #define CEPH_I_FLUSH_SNAPS (1 << 8) /* need flush snapss */ #define CEPH_I_ERROR_WRITE (1 << 9) /* have seen write errors */ #define CEPH_I_ERROR_FILELOCK (1 << 10) /* have seen file lock errors */ #define CEPH_I_ODIRECT_BIT (11) /* inode in direct I/O mode */ #define CEPH_I_ODIRECT (1 << CEPH_I_ODIRECT_BIT) #define CEPH_ASYNC_CREATE_BIT (12) /* async create in flight for this */ #define CEPH_I_ASYNC_CREATE (1 << CEPH_ASYNC_CREATE_BIT) #define CEPH_I_SHUTDOWN (1 << 13) /* inode is no longer usable */ #define CEPH_I_ASYNC_CHECK_CAPS (1 << 14) /* 
check caps immediately after async creating finishes */ /* * Masks of ceph inode work. */ #define CEPH_I_WORK_WRITEBACK 0 #define CEPH_I_WORK_INVALIDATE_PAGES 1 #define CEPH_I_WORK_VMTRUNCATE 2 #define CEPH_I_WORK_CHECK_CAPS 3 #define CEPH_I_WORK_FLUSH_SNAPS 4 /* * We set the ERROR_WRITE bit when we start seeing write errors on an inode * and then clear it when they start succeeding. Note that we do a lockless * check first, and only take the lock if it looks like it needs to be changed. * The write submission code just takes this as a hint, so we're not too * worried if a few slip through in either direction. */ static inline void ceph_set_error_write(struct ceph_inode_info *ci) { if (!(READ_ONCE(ci->i_ceph_flags) & CEPH_I_ERROR_WRITE)) { spin_lock(&ci->i_ceph_lock); ci->i_ceph_flags |= CEPH_I_ERROR_WRITE; spin_unlock(&ci->i_ceph_lock); } } static inline void ceph_clear_error_write(struct ceph_inode_info *ci) { if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ERROR_WRITE) { spin_lock(&ci->i_ceph_lock); ci->i_ceph_flags &= ~CEPH_I_ERROR_WRITE; spin_unlock(&ci->i_ceph_lock); } } static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci, long long release_count, long long ordered_count) { /* * Makes sure operations that setup readdir cache (update page * cache and i_size) are strongly ordered w.r.t. the following * atomic64_set() operations. 
*/ smp_mb(); atomic64_set(&ci->i_complete_seq[0], release_count); atomic64_set(&ci->i_complete_seq[1], ordered_count); } static inline void __ceph_dir_clear_complete(struct ceph_inode_info *ci) { atomic64_inc(&ci->i_release_count); } static inline void __ceph_dir_clear_ordered(struct ceph_inode_info *ci) { atomic64_inc(&ci->i_ordered_count); } static inline bool __ceph_dir_is_complete(struct ceph_inode_info *ci) { return atomic64_read(&ci->i_complete_seq[0]) == atomic64_read(&ci->i_release_count); } static inline bool __ceph_dir_is_complete_ordered(struct ceph_inode_info *ci) { return atomic64_read(&ci->i_complete_seq[0]) == atomic64_read(&ci->i_release_count) && atomic64_read(&ci->i_complete_seq[1]) == atomic64_read(&ci->i_ordered_count); } static inline void ceph_dir_clear_complete(struct inode *inode) { __ceph_dir_clear_complete(ceph_inode(inode)); } static inline void ceph_dir_clear_ordered(struct inode *inode) { __ceph_dir_clear_ordered(ceph_inode(inode)); } static inline bool ceph_dir_is_complete_ordered(struct inode *inode) { bool ret = __ceph_dir_is_complete_ordered(ceph_inode(inode)); smp_rmb(); return ret; } /* find a specific frag @f */ extern struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, u32 f); /* * choose fragment for value @v. 
copy frag content to pfrag, if leaf * exists */ extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v, struct ceph_inode_frag *pfrag, int *found); static inline struct ceph_dentry_info *ceph_dentry(const struct dentry *dentry) { return (struct ceph_dentry_info *)dentry->d_fsdata; } /* * caps helpers */ static inline bool __ceph_is_any_real_caps(struct ceph_inode_info *ci) { return !RB_EMPTY_ROOT(&ci->i_caps); } extern int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented); extern int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int t); extern int __ceph_caps_issued_mask_metric(struct ceph_inode_info *ci, int mask, int t); extern int __ceph_caps_issued_other(struct ceph_inode_info *ci, struct ceph_cap *cap); static inline int ceph_caps_issued(struct ceph_inode_info *ci) { int issued; spin_lock(&ci->i_ceph_lock); issued = __ceph_caps_issued(ci, NULL); spin_unlock(&ci->i_ceph_lock); return issued; } static inline int ceph_caps_issued_mask_metric(struct ceph_inode_info *ci, int mask, int touch) { int r; spin_lock(&ci->i_ceph_lock); r = __ceph_caps_issued_mask_metric(ci, mask, touch); spin_unlock(&ci->i_ceph_lock); return r; } static inline int __ceph_caps_dirty(struct ceph_inode_info *ci) { return ci->i_dirty_caps | ci->i_flushing_caps; } extern struct ceph_cap_flush *ceph_alloc_cap_flush(void); extern void ceph_free_cap_flush(struct ceph_cap_flush *cf); extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask, struct ceph_cap_flush **pcf); extern int __ceph_caps_revoking_other(struct ceph_inode_info *ci, struct ceph_cap *ocap, int mask); extern int __ceph_caps_used(struct ceph_inode_info *ci); static inline bool __ceph_is_file_opened(struct ceph_inode_info *ci) { return ci->i_nr_by_mode[0]; } extern int __ceph_caps_file_wanted(struct ceph_inode_info *ci); extern int __ceph_caps_wanted(struct ceph_inode_info *ci); /* what the mds thinks we want */ extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool 
check); extern void ceph_caps_init(struct ceph_mds_client *mdsc); extern void ceph_caps_finalize(struct ceph_mds_client *mdsc); extern void ceph_adjust_caps_max_min(struct ceph_mds_client *mdsc, struct ceph_mount_options *fsopt); extern int ceph_reserve_caps(struct ceph_mds_client *mdsc, struct ceph_cap_reservation *ctx, int need); extern void ceph_unreserve_caps(struct ceph_mds_client *mdsc, struct ceph_cap_reservation *ctx); extern void ceph_reservation_status(struct ceph_fs_client *client, int *total, int *avail, int *used, int *reserved, int *min); extern void change_auth_cap_ses(struct ceph_inode_info *ci, struct ceph_mds_session *session); /* * we keep buffered readdir results attached to file->private_data */ #define CEPH_F_SYNC 1 #define CEPH_F_ATEND 2 struct ceph_file_info { short fmode; /* initialized on open */ short flags; /* CEPH_F_* */ spinlock_t rw_contexts_lock; struct list_head rw_contexts; u32 filp_gen; }; struct ceph_dir_file_info { struct ceph_file_info file_info; /* readdir: position within the dir */ u32 frag; struct ceph_mds_request *last_readdir; /* readdir: position within a frag */ unsigned next_offset; /* offset of next chunk (last_name's + 1) */ char *last_name; /* last entry in previous chunk */ long long dir_release_count; long long dir_ordered_count; int readdir_cache_idx; /* used for -o dirstat read() on directory thing */ char *dir_info; int dir_info_len; }; struct ceph_rw_context { struct list_head list; struct task_struct *thread; int caps; }; #define CEPH_DEFINE_RW_CONTEXT(_name, _caps) \ struct ceph_rw_context _name = { \ .thread = current, \ .caps = _caps, \ } static inline void ceph_add_rw_context(struct ceph_file_info *cf, struct ceph_rw_context *ctx) { spin_lock(&cf->rw_contexts_lock); list_add(&ctx->list, &cf->rw_contexts); spin_unlock(&cf->rw_contexts_lock); } static inline void ceph_del_rw_context(struct ceph_file_info *cf, struct ceph_rw_context *ctx) { spin_lock(&cf->rw_contexts_lock); list_del(&ctx->list); 
spin_unlock(&cf->rw_contexts_lock); } static inline struct ceph_rw_context* ceph_find_rw_context(struct ceph_file_info *cf) { struct ceph_rw_context *ctx, *found = NULL; spin_lock(&cf->rw_contexts_lock); list_for_each_entry(ctx, &cf->rw_contexts, list) { if (ctx->thread == current) { found = ctx; break; } } spin_unlock(&cf->rw_contexts_lock); return found; } struct ceph_readdir_cache_control { struct folio *folio; struct dentry **dentries; int index; }; /* * A "snap realm" describes a subset of the file hierarchy sharing * the same set of snapshots that apply to it. The realms themselves * are organized into a hierarchy, such that children inherit (some of) * the snapshots of their parents. * * All inodes within the realm that have capabilities are linked into a * per-realm list. */ struct ceph_snap_realm { u64 ino; struct inode *inode; atomic_t nref; struct rb_node node; u64 created, seq; u64 parent_ino; u64 parent_since; /* snapid when our current parent became so */ u64 *prior_parent_snaps; /* snaps inherited from any parents we */ u32 num_prior_parent_snaps; /* had prior to parent_since */ u64 *snaps; /* snaps specific to this realm */ u32 num_snaps; struct ceph_snap_realm *parent; struct list_head children; /* list of child realms */ struct list_head child_item; struct list_head empty_item; /* if i have ref==0 */ struct list_head dirty_item; /* if realm needs new context */ struct list_head rebuild_item; /* rebuild snap realms _downward_ in hierarchy */ /* the current set of snaps for this realm */ struct ceph_snap_context *cached_context; struct list_head inodes_with_caps; spinlock_t inodes_with_caps_lock; }; static inline int default_congestion_kb(void) { int congestion_kb; /* * Copied from NFS * * congestion size, scale with available memory. * * 64MB: 8192k * 128MB: 11585k * 256MB: 16384k * 512MB: 23170k * 1GB: 32768k * 2GB: 46340k * 4GB: 65536k * 8GB: 92681k * 16GB: 131072k * * This allows larger machines to have larger/more transfers. 
* Limit the default to 256M */ congestion_kb = (16*int_sqrt(totalram_pages())) << (PAGE_SHIFT-10); if (congestion_kb > 256*1024) congestion_kb = 256*1024; return congestion_kb; } /* super.c */ extern int ceph_force_reconnect(struct super_block *sb); /* snap.c */ struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc, u64 ino); extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc, struct ceph_snap_realm *realm); extern void ceph_put_snap_realm(struct ceph_mds_client *mdsc, struct ceph_snap_realm *realm); extern int ceph_update_snap_trace(struct ceph_mds_client *m, void *p, void *e, bool deletion, struct ceph_snap_realm **realm_ret); void ceph_change_snap_realm(struct inode *inode, struct ceph_snap_realm *realm); extern void ceph_handle_snap(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, struct ceph_msg *msg); extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci, struct ceph_cap_snap *capsnap); extern void ceph_cleanup_global_and_empty_realms(struct ceph_mds_client *mdsc); extern struct ceph_snapid_map *ceph_get_snapid_map(struct ceph_mds_client *mdsc, u64 snap); extern void ceph_put_snapid_map(struct ceph_mds_client* mdsc, struct ceph_snapid_map *sm); extern void ceph_trim_snapid_map(struct ceph_mds_client *mdsc); extern void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc); void ceph_umount_begin(struct super_block *sb); /* * a cap_snap is "pending" if it is still awaiting an in-progress * sync write (that may/may not still update size, mtime, etc.). 
*/ static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci) { return !list_empty(&ci->i_cap_snaps) && list_last_entry(&ci->i_cap_snaps, struct ceph_cap_snap, ci_item)->writing; } /* inode.c */ struct ceph_mds_reply_info_in; struct ceph_mds_reply_dirfrag; struct ceph_acl_sec_ctx; extern const struct inode_operations ceph_file_iops; extern struct inode *ceph_alloc_inode(struct super_block *sb); extern void ceph_evict_inode(struct inode *inode); extern void ceph_free_inode(struct inode *inode); struct inode *ceph_new_inode(struct inode *dir, struct dentry *dentry, umode_t *mode, struct ceph_acl_sec_ctx *as_ctx); void ceph_as_ctx_to_req(struct ceph_mds_request *req, struct ceph_acl_sec_ctx *as_ctx); extern struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino, struct inode *newino); extern struct inode *ceph_get_snapdir(struct inode *parent); extern int ceph_fill_file_size(struct inode *inode, int issued, u32 truncate_seq, u64 truncate_size, u64 size); extern void ceph_fill_file_time(struct inode *inode, int issued, u64 time_warp_seq, struct timespec64 *ctime, struct timespec64 *mtime, struct timespec64 *atime); extern int ceph_fill_inode(struct inode *inode, struct page *locked_page, struct ceph_mds_reply_info_in *iinfo, struct ceph_mds_reply_dirfrag *dirinfo, struct ceph_mds_session *session, int cap_fmode, struct ceph_cap_reservation *caps_reservation); extern int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req); extern int ceph_readdir_prepopulate(struct ceph_mds_request *req, struct ceph_mds_session *session); extern bool ceph_inode_set_size(struct inode *inode, loff_t size); extern void __ceph_do_pending_vmtruncate(struct inode *inode); void ceph_queue_inode_work(struct inode *inode, int work_bit); static inline void ceph_queue_vmtruncate(struct inode *inode) { ceph_queue_inode_work(inode, CEPH_I_WORK_VMTRUNCATE); } static inline void ceph_queue_invalidate(struct inode *inode) { 
ceph_queue_inode_work(inode, CEPH_I_WORK_INVALIDATE_PAGES); } static inline void ceph_queue_writeback(struct inode *inode) { ceph_queue_inode_work(inode, CEPH_I_WORK_WRITEBACK); } static inline void ceph_queue_check_caps(struct inode *inode) { ceph_queue_inode_work(inode, CEPH_I_WORK_CHECK_CAPS); } static inline void ceph_queue_flush_snaps(struct inode *inode) { ceph_queue_inode_work(inode, CEPH_I_WORK_FLUSH_SNAPS); } extern int ceph_try_to_choose_auth_mds(struct inode *inode, int mask); extern int __ceph_do_getattr(struct inode *inode, struct page *locked_page, int mask, bool force); static inline int ceph_do_getattr(struct inode *inode, int mask, bool force) { return __ceph_do_getattr(inode, NULL, mask, force); } extern int ceph_permission(struct mnt_idmap *idmap, struct inode *inode, int mask); struct ceph_iattr { struct ceph_fscrypt_auth *fscrypt_auth; }; extern int __ceph_setattr(struct mnt_idmap *idmap, struct inode *inode, struct iattr *attr, struct ceph_iattr *cia); extern int ceph_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); extern int ceph_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags); void ceph_inode_shutdown(struct inode *inode); static inline bool ceph_inode_is_shutdown(struct inode *inode) { unsigned long flags = READ_ONCE(ceph_inode(inode)->i_ceph_flags); struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); int state = READ_ONCE(fsc->mount_state); return (flags & CEPH_I_SHUTDOWN) || state >= CEPH_MOUNT_SHUTDOWN; } /* xattr.c */ int __ceph_setxattr(struct inode *, const char *, const void *, size_t, int); int ceph_do_getvxattr(struct inode *inode, const char *name, void *value, size_t size); ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t); extern ssize_t ceph_listxattr(struct dentry *, char *, size_t); extern struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci); extern void __ceph_destroy_xattrs(struct 
ceph_inode_info *ci); extern const struct xattr_handler * const ceph_xattr_handlers[]; struct ceph_acl_sec_ctx { #ifdef CONFIG_CEPH_FS_POSIX_ACL void *default_acl; void *acl; #endif #ifdef CONFIG_CEPH_FS_SECURITY_LABEL struct lsm_context lsmctx; #endif #ifdef CONFIG_FS_ENCRYPTION struct ceph_fscrypt_auth *fscrypt_auth; #endif struct ceph_pagelist *pagelist; }; #ifdef CONFIG_SECURITY extern bool ceph_security_xattr_deadlock(struct inode *in); extern bool ceph_security_xattr_wanted(struct inode *in); #else static inline bool ceph_security_xattr_deadlock(struct inode *in) { return false; } static inline bool ceph_security_xattr_wanted(struct inode *in) { return false; } #endif #ifdef CONFIG_CEPH_FS_SECURITY_LABEL extern int ceph_security_init_secctx(struct dentry *dentry, umode_t mode, struct ceph_acl_sec_ctx *ctx); static inline void ceph_security_invalidate_secctx(struct inode *inode) { security_inode_invalidate_secctx(inode); } #else static inline int ceph_security_init_secctx(struct dentry *dentry, umode_t mode, struct ceph_acl_sec_ctx *ctx) { return 0; } static inline void ceph_security_invalidate_secctx(struct inode *inode) { } #endif void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx); /* acl.c */ #ifdef CONFIG_CEPH_FS_POSIX_ACL struct posix_acl *ceph_get_acl(struct inode *, int, bool); int ceph_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); int ceph_pre_init_acls(struct inode *dir, umode_t *mode, struct ceph_acl_sec_ctx *as_ctx); void ceph_init_inode_acls(struct inode *inode, struct ceph_acl_sec_ctx *as_ctx); static inline void ceph_forget_all_cached_acls(struct inode *inode) { forget_all_cached_acls(inode); } #else #define ceph_get_acl NULL #define ceph_set_acl NULL static inline int ceph_pre_init_acls(struct inode *dir, umode_t *mode, struct ceph_acl_sec_ctx *as_ctx) { return 0; } static inline void ceph_init_inode_acls(struct inode *inode, struct ceph_acl_sec_ctx *as_ctx) { } static inline void 
ceph_forget_all_cached_acls(struct inode *inode) { } #endif /* caps.c */ extern const char *ceph_cap_string(int c); extern void ceph_handle_caps(struct ceph_mds_session *session, struct ceph_msg *msg); extern struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc, struct ceph_cap_reservation *ctx); extern void ceph_add_cap(struct inode *inode, struct ceph_mds_session *session, u64 cap_id, unsigned issued, unsigned wanted, unsigned cap, unsigned seq, u64 realmino, int flags, struct ceph_cap **new_cap); extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release); extern void ceph_remove_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, bool queue_release); extern void __ceph_remove_caps(struct ceph_inode_info *ci); extern void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap); extern int ceph_is_any_caps(struct inode *inode); extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc); extern int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync); extern void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session); extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session); void ceph_kick_flushing_inode_caps(struct ceph_mds_session *session, struct ceph_inode_info *ci); extern struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds); extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds); extern void ceph_take_cap_refs(struct ceph_inode_info *ci, int caps, bool snap_rwsem_locked); extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps); extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had); extern void ceph_put_cap_refs_async(struct ceph_inode_info *ci, int had); extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, struct ceph_snap_context *snapc); extern void __ceph_remove_capsnap(struct inode *inode, struct ceph_cap_snap 
*capsnap, bool *wake_ci, bool *wake_mdsc); extern void ceph_remove_capsnap(struct inode *inode, struct ceph_cap_snap *capsnap, bool *wake_ci, bool *wake_mdsc); extern void ceph_flush_snaps(struct ceph_inode_info *ci, struct ceph_mds_session **psession); extern bool __ceph_should_report_size(struct ceph_inode_info *ci); extern void ceph_check_caps(struct ceph_inode_info *ci, int flags); extern unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc); extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc); extern void ceph_flush_cap_releases(struct ceph_mds_client *mdsc); extern int ceph_drop_caps_for_unlink(struct inode *inode); extern int ceph_encode_inode_release(void **p, struct inode *inode, int mds, int drop, int unless, int force); extern int ceph_encode_dentry_release(void **p, struct dentry *dn, struct inode *dir, int mds, int drop, int unless); extern int __ceph_get_caps(struct inode *inode, struct ceph_file_info *fi, int need, int want, loff_t endoff, int *got); extern int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got); extern int ceph_try_get_caps(struct inode *inode, int need, int want, bool nonblock, int *got); /* for counting open files by mode */ extern void ceph_get_fmode(struct ceph_inode_info *ci, int mode, int count); extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode, int count); extern void __ceph_touch_fmode(struct ceph_inode_info *ci, struct ceph_mds_client *mdsc, int fmode); /* addr.c */ extern const struct address_space_operations ceph_aops; extern const struct netfs_request_ops ceph_netfs_ops; int ceph_mmap_prepare(struct vm_area_desc *desc); extern int ceph_uninline_data(struct file *file); extern int ceph_pool_perm_check(struct inode *inode, int need); extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc); int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invalidate); static inline bool ceph_has_inline_data(struct ceph_inode_info *ci) { if 
(ci->i_inline_version == CEPH_INLINE_NONE || ci->i_inline_version == 1) /* initial version, no data */ return false; return true; } /* file.c */ extern const struct file_operations ceph_file_fops; extern int ceph_renew_caps(struct inode *inode, int fmode); extern int ceph_open(struct inode *inode, struct file *file); extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry, struct file *file, unsigned flags, umode_t mode); extern ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, struct iov_iter *to, int *retry_op, u64 *last_objver); extern int ceph_release(struct inode *inode, struct file *filp); extern void ceph_fill_inline_data(struct inode *inode, struct page *locked_page, char *data, size_t len); /* dir.c */ extern const struct file_operations ceph_dir_fops; extern const struct file_operations ceph_snapdir_fops; extern const struct inode_operations ceph_dir_iops; extern const struct inode_operations ceph_snapdir_iops; extern const struct dentry_operations ceph_dentry_ops; extern loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order); extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry); extern struct dentry *ceph_handle_snapdir(struct ceph_mds_request *req, struct dentry *dentry); extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, struct dentry *dentry, int err); extern void __ceph_dentry_lease_touch(struct ceph_dentry_info *di); extern void __ceph_dentry_dir_lease_touch(struct ceph_dentry_info *di); extern void ceph_invalidate_dentry_lease(struct dentry *dentry); extern int ceph_trim_dentries(struct ceph_mds_client *mdsc); extern unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn); extern void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl); /* ioctl.c */ extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg); /* export.c */ extern const struct export_operations ceph_export_ops; struct inode *ceph_lookup_inode(struct 
super_block *sb, u64 ino); /* locks.c */ extern __init void ceph_flock_init(void); extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl); extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl); extern void ceph_count_locks(struct inode *inode, int *p_num, int *f_num); extern int ceph_encode_locks_to_buffer(struct inode *inode, struct ceph_filelock *flocks, int num_fcntl_locks, int num_flock_locks); extern int ceph_locks_to_pagelist(struct ceph_filelock *flocks, struct ceph_pagelist *pagelist, int num_fcntl_locks, int num_flock_locks); /* debugfs.c */ extern void ceph_fs_debugfs_init(struct ceph_fs_client *client); extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client); /* quota.c */ enum quota_get_realm { QUOTA_GET_MAX_FILES, QUOTA_GET_MAX_BYTES, QUOTA_GET_ANY }; static inline bool __ceph_has_quota(struct ceph_inode_info *ci, enum quota_get_realm which) { bool has_quota = false; switch (which) { case QUOTA_GET_MAX_BYTES: has_quota = !!ci->i_max_bytes; break; case QUOTA_GET_MAX_FILES: has_quota = !!ci->i_max_files; break; default: has_quota = !!(ci->i_max_files || ci->i_max_bytes); } return has_quota; } extern void ceph_adjust_quota_realms_count(struct inode *inode, bool inc); static inline void __ceph_update_quota(struct ceph_inode_info *ci, u64 max_bytes, u64 max_files) { bool had_quota, has_quota; had_quota = __ceph_has_quota(ci, QUOTA_GET_ANY); ci->i_max_bytes = max_bytes; ci->i_max_files = max_files; has_quota = __ceph_has_quota(ci, QUOTA_GET_ANY); if (had_quota != has_quota) ceph_adjust_quota_realms_count(&ci->netfs.inode, has_quota); } static inline int __ceph_sparse_read_ext_count(struct inode *inode, u64 len) { int cnt = 0; if (IS_ENCRYPTED(inode)) { cnt = len >> CEPH_FSCRYPT_BLOCK_SHIFT; if (cnt > CEPH_SPARSE_EXT_ARRAY_INITIAL) cnt = 0; } return cnt; } extern void ceph_handle_quota(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, struct ceph_msg *msg); extern bool 
ceph_quota_is_max_files_exceeded(struct inode *inode); extern bool ceph_quota_is_same_realm(struct inode *old, struct inode *new); extern bool ceph_quota_is_max_bytes_exceeded(struct inode *inode, loff_t newlen); extern bool ceph_quota_is_max_bytes_approaching(struct inode *inode, loff_t newlen); extern bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf); extern void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc); bool ceph_inc_mds_stopping_blocker(struct ceph_mds_client *mdsc, struct ceph_mds_session *session); void ceph_dec_mds_stopping_blocker(struct ceph_mds_client *mdsc); bool ceph_inc_osd_stopping_blocker(struct ceph_mds_client *mdsc); void ceph_dec_osd_stopping_blocker(struct ceph_mds_client *mdsc); #endif /* _FS_CEPH_SUPER_H */ |
| 262 262 4 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 | // SPDX-License-Identifier: GPL-2.0 #include <linux/cache.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/pid_namespace.h> #include "internal.h" /* * /proc/self: */ static const char *proc_self_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { struct pid_namespace *ns = proc_pid_ns(inode->i_sb); pid_t tgid = task_tgid_nr_ns(current, ns); char *name; if (!tgid) return ERR_PTR(-ENOENT); /* max length of unsigned int in decimal + NULL term */ name = kmalloc(10 + 1, dentry ? GFP_KERNEL : GFP_ATOMIC); if (unlikely(!name)) return dentry ? ERR_PTR(-ENOMEM) : ERR_PTR(-ECHILD); sprintf(name, "%u", tgid); set_delayed_call(done, kfree_link, name); return name; } static const struct inode_operations proc_self_inode_operations = { .get_link = proc_self_get_link, }; unsigned self_inum __ro_after_init; int proc_setup_self(struct super_block *s) { struct inode *root_inode = d_inode(s->s_root); struct dentry *self; int ret = -ENOMEM; inode_lock(root_inode); self = d_alloc_name(s->s_root, "self"); if (self) { struct inode *inode = new_inode(s); if (inode) { inode->i_ino = self_inum; simple_inode_init_ts(inode); inode->i_mode = S_IFLNK | S_IRWXUGO; inode->i_uid = GLOBAL_ROOT_UID; inode->i_gid = GLOBAL_ROOT_GID; inode->i_op = &proc_self_inode_operations; d_make_persistent(self, inode); ret = 0; } dput(self); } inode_unlock(root_inode); if (ret) pr_err("proc_fill_super: can't allocate /proc/self\n"); return ret; } void __init proc_self_init(void) { proc_alloc_inum(&self_inum); } |
| 515 521 521 1 3 2 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 | /* SPDX-License-Identifier: GPL-2.0-only */ #ifndef __NET_PSP_HELPERS_H #define __NET_PSP_HELPERS_H #include <linux/skbuff.h> #include <linux/rcupdate.h> #include <linux/udp.h> #include <net/sock.h> #include <net/tcp.h> #include <net/psp/types.h> struct inet_timewait_sock; /* Driver-facing API */ struct psp_dev * psp_dev_create(struct net_device *netdev, struct psp_dev_ops *psd_ops, struct psp_dev_caps *psd_caps, void *priv_ptr); void psp_dev_unregister(struct psp_dev *psd); bool psp_dev_encapsulate(struct net *net, struct sk_buff *skb, __be32 spi, u8 ver, __be16 sport); int psp_dev_rcv(struct sk_buff *skb, u16 dev_id, u8 generation, bool strip_icv); /* Kernel-facing API */ void psp_assoc_put(struct psp_assoc *pas); static inline void *psp_assoc_drv_data(struct psp_assoc *pas) { return pas->drv_data; } #if IS_ENABLED(CONFIG_INET_PSP) unsigned int psp_key_size(u32 version); void psp_sk_assoc_free(struct sock *sk); void psp_twsk_init(struct inet_timewait_sock *tw, const struct sock *sk); void psp_twsk_assoc_free(struct inet_timewait_sock *tw); void psp_reply_set_decrypted(const struct sock *sk, struct sk_buff *skb); static inline struct psp_assoc *psp_sk_assoc(const struct sock *sk) { return rcu_dereference_check(sk->psp_assoc, 
lockdep_sock_is_held(sk)); } static inline void psp_enqueue_set_decrypted(struct sock *sk, struct sk_buff *skb) { struct psp_assoc *pas; pas = psp_sk_assoc(sk); if (pas && pas->tx.spi) skb->decrypted = 1; } static inline unsigned long __psp_skb_coalesce_diff(const struct sk_buff *one, const struct sk_buff *two, unsigned long diffs) { struct psp_skb_ext *a, *b; a = skb_ext_find(one, SKB_EXT_PSP); b = skb_ext_find(two, SKB_EXT_PSP); diffs |= (!!a) ^ (!!b); if (!diffs && unlikely(a)) diffs |= memcmp(a, b, sizeof(*a)); return diffs; } static inline bool psp_is_allowed_nondata(struct sk_buff *skb, struct psp_assoc *pas) { bool fin = !!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN); u32 end_seq = TCP_SKB_CB(skb)->end_seq; u32 seq = TCP_SKB_CB(skb)->seq; bool pure_fin; pure_fin = fin && end_seq - seq == 1; return seq == end_seq || (pure_fin && seq == pas->upgrade_seq); } static inline bool psp_pse_matches_pas(struct psp_skb_ext *pse, struct psp_assoc *pas) { return pse && pas->rx.spi == pse->spi && pas->generation == pse->generation && pas->version == pse->version && pas->dev_id == pse->dev_id; } static inline enum skb_drop_reason __psp_sk_rx_policy_check(struct sk_buff *skb, struct psp_assoc *pas) { struct psp_skb_ext *pse = skb_ext_find(skb, SKB_EXT_PSP); if (!pas) return pse ? 
SKB_DROP_REASON_PSP_INPUT : 0; if (likely(psp_pse_matches_pas(pse, pas))) { if (unlikely(!pas->peer_tx)) pas->peer_tx = 1; return 0; } if (!pse) { if (!pas->tx.spi || (!pas->peer_tx && psp_is_allowed_nondata(skb, pas))) return 0; } return SKB_DROP_REASON_PSP_INPUT; } static inline enum skb_drop_reason psp_sk_rx_policy_check(struct sock *sk, struct sk_buff *skb) { return __psp_sk_rx_policy_check(skb, psp_sk_assoc(sk)); } static inline enum skb_drop_reason psp_twsk_rx_policy_check(struct inet_timewait_sock *tw, struct sk_buff *skb) { return __psp_sk_rx_policy_check(skb, rcu_dereference(tw->psp_assoc)); } static inline struct psp_assoc *psp_sk_get_assoc_rcu(const struct sock *sk) { struct psp_assoc *pas; int state; state = READ_ONCE(sk->sk_state); if (!sk_is_inet(sk) || state == TCP_NEW_SYN_RECV) return NULL; pas = state == TCP_TIME_WAIT ? rcu_dereference(inet_twsk(sk)->psp_assoc) : rcu_dereference(sk->psp_assoc); return pas; } static inline struct psp_assoc *psp_skb_get_assoc_rcu(struct sk_buff *skb) { if (!skb->decrypted || !skb->sk) return NULL; return psp_sk_get_assoc_rcu(skb->sk); } static inline unsigned int psp_sk_overhead(const struct sock *sk) { int psp_encap = sizeof(struct udphdr) + PSP_HDR_SIZE + PSP_TRL_SIZE; bool has_psp = rcu_access_pointer(sk->psp_assoc); return has_psp ? 
psp_encap : 0; } #else static inline void psp_sk_assoc_free(struct sock *sk) { } static inline void psp_twsk_init(struct inet_timewait_sock *tw, const struct sock *sk) { } static inline void psp_twsk_assoc_free(struct inet_timewait_sock *tw) { } static inline void psp_reply_set_decrypted(const struct sock *sk, struct sk_buff *skb) { } static inline struct psp_assoc *psp_sk_assoc(const struct sock *sk) { return NULL; } static inline void psp_enqueue_set_decrypted(struct sock *sk, struct sk_buff *skb) { } static inline unsigned long __psp_skb_coalesce_diff(const struct sk_buff *one, const struct sk_buff *two, unsigned long diffs) { return diffs; } static inline enum skb_drop_reason psp_sk_rx_policy_check(struct sock *sk, struct sk_buff *skb) { return 0; } static inline enum skb_drop_reason psp_twsk_rx_policy_check(struct inet_timewait_sock *tw, struct sk_buff *skb) { return 0; } static inline struct psp_assoc *psp_skb_get_assoc_rcu(struct sk_buff *skb) { return NULL; } static inline unsigned int psp_sk_overhead(const struct sock *sk) { return 0; } #endif static inline unsigned long psp_skb_coalesce_diff(const struct sk_buff *one, const struct sk_buff *two) { return __psp_skb_coalesce_diff(one, two, 0); } #endif /* __NET_PSP_HELPERS_H */ |
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2008-2010, 2013 Dave Chinner
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_icreate_item.h"
#include "xfs_log.h"
#include "xfs_log_priv.h"
#include "xfs_log_recover.h"
#include "xfs_ialloc.h"
#include "xfs_trace.h"

struct kmem_cache	*xfs_icreate_cache;		/* inode create item */

/* Map a generic log item back to the icreate item that embeds it. */
static inline struct xfs_icreate_item *ICR_ITEM(struct xfs_log_item *lip)
{
	return container_of(lip, struct xfs_icreate_item, ic_item);
}

/*
 * This returns the number of iovecs needed to log the given inode item.
 *
 * We only need one iovec for the icreate log structure.
 *
 * @nvecs and @nbytes are accumulators: one vector and
 * sizeof(struct xfs_icreate_log) bytes are added to the values passed in.
 * @lip itself is not examined.
 */
STATIC void
xfs_icreate_item_size(
	struct xfs_log_item	*lip,
	int			*nvecs,
	int			*nbytes)
{
	*nvecs += 1;
	*nbytes += sizeof(struct xfs_icreate_log);
}

/*
 * This is called to fill in the vector of log iovecs for the
 * given inode create log item.
 *
 * A single region of type XLOG_REG_TYPE_ICREATE is copied into @lv from the
 * item's ic_format structure.
 */
STATIC void
xfs_icreate_item_format(
	struct xfs_log_item	*lip,
	struct xfs_log_vec	*lv)
{
	struct xfs_icreate_item	*icp = ICR_ITEM(lip);
	struct xfs_log_iovec	*vecp = NULL;

	xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ICREATE,
			&icp->ic_format,
			sizeof(struct xfs_icreate_log));
}

/*
 * Release the icreate item: free its shadow log vector buffer with kvfree()
 * and return the item itself to xfs_icreate_cache.
 */
STATIC void
xfs_icreate_item_release(
	struct xfs_log_item	*lip)
{
	kvfree(ICR_ITEM(lip)->ic_item.li_lv_shadow);
	kmem_cache_free(xfs_icreate_cache, ICR_ITEM(lip));
}

/* Log item operations for icreate items. */
static const struct xfs_item_ops xfs_icreate_item_ops = {
	.flags		= XFS_ITEM_RELEASE_WHEN_COMMITTED,
	.iop_size	= xfs_icreate_item_size,
	.iop_format	= xfs_icreate_item_format,
	.iop_release	= xfs_icreate_item_release,
};

/*
 * Initialize the inode log item for a newly allocated (in-core) inode.
 *
 * Inode extents can only reside within an AG. Hence specify the starting
 * block for the inode chunk by offset within an AG as well as the
 * length of the allocated extent.
 *
 * This joins the item to the transaction and marks it dirty so
 * that we don't need a separate call to do this, nor does the
 * caller need to know anything about the icreate item.
 *
 * The item is allocated with __GFP_NOFAIL, so the allocation result is not
 * checked. All numeric fields except icl_type and icl_size are stored in the
 * log format structure as big-endian 32-bit values.
 */
void
xfs_icreate_log(
	struct xfs_trans	*tp,
	xfs_agnumber_t		agno,
	xfs_agblock_t		agbno,
	unsigned int		count,
	unsigned int		inode_size,
	xfs_agblock_t		length,
	unsigned int		generation)
{
	struct xfs_icreate_item	*icp;

	icp = kmem_cache_zalloc(xfs_icreate_cache, GFP_KERNEL | __GFP_NOFAIL);

	xfs_log_item_init(tp->t_mountp, &icp->ic_item, XFS_LI_ICREATE,
			  &xfs_icreate_item_ops);

	icp->ic_format.icl_type = XFS_LI_ICREATE;
	icp->ic_format.icl_size = 1;	/* single vector */
	icp->ic_format.icl_ag = cpu_to_be32(agno);
	icp->ic_format.icl_agbno = cpu_to_be32(agbno);
	icp->ic_format.icl_count = cpu_to_be32(count);
	icp->ic_format.icl_isize = cpu_to_be32(inode_size);
	icp->ic_format.icl_length = cpu_to_be32(length);
	icp->ic_format.icl_gen = cpu_to_be32(generation);

	/* Join the item to the transaction and mark both dirty. */
	xfs_trans_add_item(tp, &icp->ic_item);
	tp->t_flags |= XFS_TRANS_DIRTY;
	set_bit(XFS_LI_DIRTY, &icp->ic_item.li_flags);
}

/* Recovery ordering hook: icreate items go on the buffer list. */
static enum xlog_recover_reorder
xlog_recover_icreate_reorder(
		struct xlog_recover_item *item)
{
	/*
	 * Inode allocation buffers must be replayed before subsequent inode
	 * items try to modify those buffers.  ICREATE items are the logical
	 * equivalent of logging a newly initialized inode buffer, so recover
	 * these at the same time that we recover logged buffers.
	 */
	return XLOG_REORDER_BUFFER_LIST;
}

/*
 * This routine is called when an inode create format structure is found in a
 * committed transaction in the log.  Its purpose is to initialise the inodes
 * being allocated on disk. This requires us to get inode cluster buffers that
 * match the range to be initialised, stamped with inode templates and written
 * by delayed write so that subsequent modifications will hit the cached buffer
 * and only need writing out at the end of recovery.
 *
 * Returns -EINVAL (after a warning) if any field of the logged structure
 * fails validation, 0 if replay is skipped because at least one covered
 * buffer was cancelled, and otherwise the result of xfs_ialloc_inode_init().
 */
STATIC int
xlog_recover_icreate_commit_pass2(
	struct xlog			*log,
	struct list_head		*buffer_list,
	struct xlog_recover_item	*item,
	xfs_lsn_t			lsn)
{
	struct xfs_mount		*mp = log->l_mp;
	struct xfs_icreate_log		*icl;
	struct xfs_ino_geometry		*igeo = M_IGEO(mp);
	xfs_agnumber_t			agno;
	xfs_agblock_t			agbno;
	unsigned int			count;
	unsigned int			isize;
	xfs_agblock_t			length;
	int				bb_per_cluster;
	int				cancel_count;
	int				nbufs;
	int				i;

	/* The logged structure is the first (and only) region of the item. */
	icl = (struct xfs_icreate_log *)item->ri_buf[0].iov_base;
	if (icl->icl_type != XFS_LI_ICREATE) {
		xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad type");
		return -EINVAL;
	}

	if (icl->icl_size != 1) {
		xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad icl size");
		return -EINVAL;
	}

	/* Range-check each decoded field against the superblock geometry. */
	agno = be32_to_cpu(icl->icl_ag);
	if (agno >= mp->m_sb.sb_agcount) {
		xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agno");
		return -EINVAL;
	}
	agbno = be32_to_cpu(icl->icl_agbno);
	if (!agbno || agbno == NULLAGBLOCK || agbno >= mp->m_sb.sb_agblocks) {
		xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agbno");
		return -EINVAL;
	}
	isize = be32_to_cpu(icl->icl_isize);
	if (isize != mp->m_sb.sb_inodesize) {
		xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad isize");
		return -EINVAL;
	}
	count = be32_to_cpu(icl->icl_count);
	if (!count) {
		xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count");
		return -EINVAL;
	}
	length = be32_to_cpu(icl->icl_length);
	if (!length || length >= mp->m_sb.sb_agblocks) {
		xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad length");
		return -EINVAL;
	}

	/*
	 * The inode chunk is either full or sparse and we only support
	 * m_ino_geo.ialloc_min_blks sized sparse allocations at this time.
	 */
	if (length != igeo->ialloc_blks &&
	    length != igeo->ialloc_min_blks) {
		xfs_warn(log->l_mp,
			 "%s: unsupported chunk length", __func__);
		return -EINVAL;
	}

	/* verify inode count is consistent with extent length */
	if ((count >> mp->m_sb.sb_inopblog) != length) {
		xfs_warn(log->l_mp,
			 "%s: inconsistent inode count and chunk length",
			 __func__);
		return -EINVAL;
	}

	/*
	 * The icreate transaction can cover multiple cluster buffers and these
	 * buffers could have been freed and reused. Check the individual
	 * buffers for cancellation so we don't overwrite anything written after
	 * a cancellation.
	 */
	bb_per_cluster = XFS_FSB_TO_BB(mp, igeo->blocks_per_cluster);
	nbufs = length / igeo->blocks_per_cluster;
	for (i = 0, cancel_count = 0; i < nbufs; i++) {
		xfs_daddr_t	daddr;

		daddr = XFS_AGB_TO_DADDR(mp, agno,
				agbno + i * igeo->blocks_per_cluster);
		if (xlog_is_buffer_cancelled(log, daddr, bb_per_cluster))
			cancel_count++;
	}

	/*
	 * We currently only use icreate for a single allocation at a time. This
	 * means we should expect either all or none of the buffers to be
	 * cancelled. Be conservative and skip replay if at least one buffer is
	 * cancelled, but warn the user that something is awry if the buffers
	 * are not consistent.
	 *
	 * XXX: This must be refined to only skip cancelled clusters once we use
	 * icreate for multiple chunk allocations.
	 */
	ASSERT(!cancel_count || cancel_count == nbufs);
	if (cancel_count) {
		if (cancel_count != nbufs)
			xfs_warn(mp,
	"WARNING: partial inode chunk cancellation, skipped icreate.");
		trace_xfs_log_recover_icreate_cancel(log, icl);
		return 0;
	}

	trace_xfs_log_recover_icreate_recover(log, icl);
	return xfs_ialloc_inode_init(mp, NULL, buffer_list, count, agno, agbno,
				     length, be32_to_cpu(icl->icl_gen));
}

/* Log recovery operations for XFS_LI_ICREATE items. */
const struct xlog_recover_item_ops xlog_icreate_item_ops = {
	.item_type		= XFS_LI_ICREATE,
	.reorder		= xlog_recover_icreate_reorder,
	.commit_pass2		= xlog_recover_icreate_commit_pass2,
};
| 5 19 19 19 10 7 14 14 13 1 1 1 1 1 37 37 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 
516 517 | // SPDX-License-Identifier: GPL-2.0-only /* Helper handling for netfilter. */ /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> * (C) 2006-2012 Patrick McHardy <kaber@trash.net> */ #include <linux/types.h> #include <linux/netfilter.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/vmalloc.h> #include <linux/stddef.h> #include <linux/random.h> #include <linux/err.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/rculist.h> #include <linux/rtnetlink.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_extend.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_seqadj.h> #include <net/netfilter/nf_log.h> #include <net/ip.h> static DEFINE_MUTEX(nf_ct_helper_mutex); struct hlist_head *nf_ct_helper_hash __read_mostly; EXPORT_SYMBOL_GPL(nf_ct_helper_hash); unsigned int nf_ct_helper_hsize __read_mostly; EXPORT_SYMBOL_GPL(nf_ct_helper_hsize); static unsigned int nf_ct_helper_count __read_mostly; static DEFINE_MUTEX(nf_ct_nat_helpers_mutex); static struct list_head nf_ct_nat_helpers __read_mostly; /* Stupid hash, but collision free for the default registrations of the * helpers currently in the kernel. 
*/ static unsigned int helper_hash(const struct nf_conntrack_tuple *tuple) { return (((tuple->src.l3num << 8) | tuple->dst.protonum) ^ (__force __u16)tuple->src.u.all) % nf_ct_helper_hsize; } struct nf_conntrack_helper * __nf_conntrack_helper_find(const char *name, u16 l3num, u8 protonum) { struct nf_conntrack_helper *h; unsigned int i; for (i = 0; i < nf_ct_helper_hsize; i++) { hlist_for_each_entry_rcu(h, &nf_ct_helper_hash[i], hnode) { if (strcmp(h->name, name)) continue; if (h->tuple.src.l3num != NFPROTO_UNSPEC && h->tuple.src.l3num != l3num) continue; if (h->tuple.dst.protonum == protonum) return h; } } return NULL; } EXPORT_SYMBOL_GPL(__nf_conntrack_helper_find); struct nf_conntrack_helper * nf_conntrack_helper_try_module_get(const char *name, u16 l3num, u8 protonum) { struct nf_conntrack_helper *h; rcu_read_lock(); h = __nf_conntrack_helper_find(name, l3num, protonum); #ifdef CONFIG_MODULES if (h == NULL) { rcu_read_unlock(); if (request_module("nfct-helper-%s", name) == 0) { rcu_read_lock(); h = __nf_conntrack_helper_find(name, l3num, protonum); } else { return h; } } #endif if (h != NULL && !try_module_get(h->me)) h = NULL; if (h != NULL && !refcount_inc_not_zero(&h->refcnt)) { module_put(h->me); h = NULL; } rcu_read_unlock(); return h; } EXPORT_SYMBOL_GPL(nf_conntrack_helper_try_module_get); void nf_conntrack_helper_put(struct nf_conntrack_helper *helper) { refcount_dec(&helper->refcnt); module_put(helper->me); } EXPORT_SYMBOL_GPL(nf_conntrack_helper_put); static struct nf_conntrack_nat_helper * nf_conntrack_nat_helper_find(const char *mod_name) { struct nf_conntrack_nat_helper *cur; bool found = false; list_for_each_entry_rcu(cur, &nf_ct_nat_helpers, list) { if (!strcmp(cur->mod_name, mod_name)) { found = true; break; } } return found ? 
cur : NULL; } int nf_nat_helper_try_module_get(const char *name, u16 l3num, u8 protonum) { struct nf_conntrack_helper *h; struct nf_conntrack_nat_helper *nat; char mod_name[NF_CT_HELPER_NAME_LEN]; int ret = 0; rcu_read_lock(); h = __nf_conntrack_helper_find(name, l3num, protonum); if (!h) { rcu_read_unlock(); return -ENOENT; } nat = nf_conntrack_nat_helper_find(h->nat_mod_name); if (!nat) { snprintf(mod_name, sizeof(mod_name), "%s", h->nat_mod_name); rcu_read_unlock(); request_module("%s", mod_name); rcu_read_lock(); nat = nf_conntrack_nat_helper_find(mod_name); if (!nat) { rcu_read_unlock(); return -ENOENT; } } if (!try_module_get(nat->module)) ret = -ENOENT; rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(nf_nat_helper_try_module_get); void nf_nat_helper_put(struct nf_conntrack_helper *helper) { struct nf_conntrack_nat_helper *nat; nat = nf_conntrack_nat_helper_find(helper->nat_mod_name); if (WARN_ON_ONCE(!nat)) return; module_put(nat->module); } EXPORT_SYMBOL_GPL(nf_nat_helper_put); struct nf_conn_help * nf_ct_helper_ext_add(struct nf_conn *ct, gfp_t gfp) { struct nf_conn_help *help; help = nf_ct_ext_add(ct, NF_CT_EXT_HELPER, gfp); if (help) INIT_HLIST_HEAD(&help->expectations); else pr_debug("failed to add helper extension area"); return help; } EXPORT_SYMBOL_GPL(nf_ct_helper_ext_add); int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl, gfp_t flags) { struct nf_conntrack_helper *helper = NULL; struct nf_conn_help *help; /* We already got a helper explicitly attached (e.g. 
nft_ct) */ if (test_bit(IPS_HELPER_BIT, &ct->status)) return 0; if (WARN_ON_ONCE(!tmpl)) return 0; help = nfct_help(tmpl); if (help != NULL) { helper = rcu_dereference(help->helper); set_bit(IPS_HELPER_BIT, &ct->status); } help = nfct_help(ct); if (helper == NULL) { if (help) RCU_INIT_POINTER(help->helper, NULL); return 0; } if (help == NULL) { help = nf_ct_helper_ext_add(ct, flags); if (help == NULL) return -ENOMEM; } else { /* We only allow helper re-assignment of the same sort since * we cannot reallocate the helper extension area. */ struct nf_conntrack_helper *tmp = rcu_dereference(help->helper); if (tmp && tmp->help != helper->help) { RCU_INIT_POINTER(help->helper, NULL); return 0; } } rcu_assign_pointer(help->helper, helper); return 0; } EXPORT_SYMBOL_GPL(__nf_ct_try_assign_helper); /* appropriate ct lock protecting must be taken by caller */ static int unhelp(struct nf_conn *ct, void *me) { struct nf_conn_help *help = nfct_help(ct); if (help && rcu_dereference_raw(help->helper) == me) { nf_conntrack_event(IPCT_HELPER, ct); RCU_INIT_POINTER(help->helper, NULL); } /* We are not intended to delete this conntrack. 
*/ return 0; } void nf_ct_helper_destroy(struct nf_conn *ct) { struct nf_conn_help *help = nfct_help(ct); struct nf_conntrack_helper *helper; if (help) { rcu_read_lock(); helper = rcu_dereference(help->helper); if (helper && helper->destroy) helper->destroy(ct); rcu_read_unlock(); } } static LIST_HEAD(nf_ct_helper_expectfn_list); void nf_ct_helper_expectfn_register(struct nf_ct_helper_expectfn *n) { spin_lock_bh(&nf_conntrack_expect_lock); list_add_rcu(&n->head, &nf_ct_helper_expectfn_list); spin_unlock_bh(&nf_conntrack_expect_lock); } EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_register); void nf_ct_helper_expectfn_unregister(struct nf_ct_helper_expectfn *n) { spin_lock_bh(&nf_conntrack_expect_lock); list_del_rcu(&n->head); spin_unlock_bh(&nf_conntrack_expect_lock); } EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_unregister); /* Caller should hold the rcu lock */ struct nf_ct_helper_expectfn * nf_ct_helper_expectfn_find_by_name(const char *name) { struct nf_ct_helper_expectfn *cur; bool found = false; list_for_each_entry_rcu(cur, &nf_ct_helper_expectfn_list, head) { if (!strcmp(cur->name, name)) { found = true; break; } } return found ? cur : NULL; } EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_find_by_name); /* Caller should hold the rcu lock */ struct nf_ct_helper_expectfn * nf_ct_helper_expectfn_find_by_symbol(const void *symbol) { struct nf_ct_helper_expectfn *cur; bool found = false; list_for_each_entry_rcu(cur, &nf_ct_helper_expectfn_list, head) { if (cur->expectfn == symbol) { found = true; break; } } return found ? cur : NULL; } EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_find_by_symbol); __printf(3, 4) void nf_ct_helper_log(struct sk_buff *skb, const struct nf_conn *ct, const char *fmt, ...) 
{ const struct nf_conn_help *help; const struct nf_conntrack_helper *helper; struct va_format vaf; va_list args; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; /* Called from the helper function, this call never fails */ help = nfct_help(ct); /* rcu_read_lock()ed by nf_hook_thresh */ helper = rcu_dereference(help->helper); nf_log_packet(nf_ct_net(ct), nf_ct_l3num(ct), 0, skb, NULL, NULL, NULL, "nf_ct_%s: dropping packet: %pV ", helper->name, &vaf); va_end(args); } EXPORT_SYMBOL_GPL(nf_ct_helper_log); int nf_conntrack_helper_register(struct nf_conntrack_helper *me) { struct nf_conntrack_tuple_mask mask = { .src.u.all = htons(0xFFFF) }; unsigned int h = helper_hash(&me->tuple); struct nf_conntrack_helper *cur; int ret = 0, i; BUG_ON(me->expect_policy == NULL); BUG_ON(me->expect_class_max >= NF_CT_MAX_EXPECT_CLASSES); BUG_ON(strlen(me->name) > NF_CT_HELPER_NAME_LEN - 1); if (!nf_ct_helper_hash) return -ENOENT; if (me->expect_policy->max_expected > NF_CT_EXPECT_MAX_CNT) return -EINVAL; mutex_lock(&nf_ct_helper_mutex); for (i = 0; i < nf_ct_helper_hsize; i++) { hlist_for_each_entry(cur, &nf_ct_helper_hash[i], hnode) { if (!strcmp(cur->name, me->name) && (cur->tuple.src.l3num == NFPROTO_UNSPEC || cur->tuple.src.l3num == me->tuple.src.l3num) && cur->tuple.dst.protonum == me->tuple.dst.protonum) { ret = -EBUSY; goto out; } } } /* avoid unpredictable behaviour for auto_assign_helper */ if (!(me->flags & NF_CT_HELPER_F_USERSPACE)) { hlist_for_each_entry(cur, &nf_ct_helper_hash[h], hnode) { if (nf_ct_tuple_src_mask_cmp(&cur->tuple, &me->tuple, &mask)) { ret = -EBUSY; goto out; } } } refcount_set(&me->refcnt, 1); hlist_add_head_rcu(&me->hnode, &nf_ct_helper_hash[h]); nf_ct_helper_count++; out: mutex_unlock(&nf_ct_helper_mutex); return ret; } EXPORT_SYMBOL_GPL(nf_conntrack_helper_register); static bool expect_iter_me(struct nf_conntrack_expect *exp, void *data) { struct nf_conn_help *help = nfct_help(exp->master); const struct nf_conntrack_helper *me = data; const struct 
nf_conntrack_helper *this; if (exp->helper == me) return true; this = rcu_dereference_protected(help->helper, lockdep_is_held(&nf_conntrack_expect_lock)); return this == me; } void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me) { mutex_lock(&nf_ct_helper_mutex); hlist_del_rcu(&me->hnode); nf_ct_helper_count--; mutex_unlock(&nf_ct_helper_mutex); /* Make sure every nothing is still using the helper unless its a * connection in the hash. */ synchronize_rcu(); nf_ct_expect_iterate_destroy(expect_iter_me, NULL); nf_ct_iterate_destroy(unhelp, me); } EXPORT_SYMBOL_GPL(nf_conntrack_helper_unregister); void nf_ct_helper_init(struct nf_conntrack_helper *helper, u16 l3num, u16 protonum, const char *name, u16 default_port, u16 spec_port, u32 id, const struct nf_conntrack_expect_policy *exp_pol, u32 expect_class_max, int (*help)(struct sk_buff *skb, unsigned int protoff, struct nf_conn *ct, enum ip_conntrack_info ctinfo), int (*from_nlattr)(struct nlattr *attr, struct nf_conn *ct), struct module *module) { helper->tuple.src.l3num = l3num; helper->tuple.dst.protonum = protonum; helper->tuple.src.u.all = htons(spec_port); helper->expect_policy = exp_pol; helper->expect_class_max = expect_class_max; helper->help = help; helper->from_nlattr = from_nlattr; helper->me = module; snprintf(helper->nat_mod_name, sizeof(helper->nat_mod_name), NF_NAT_HELPER_PREFIX "%s", name); if (spec_port == default_port) snprintf(helper->name, sizeof(helper->name), "%s", name); else snprintf(helper->name, sizeof(helper->name), "%s-%u", name, id); } EXPORT_SYMBOL_GPL(nf_ct_helper_init); int nf_conntrack_helpers_register(struct nf_conntrack_helper *helper, unsigned int n) { unsigned int i; int err = 0; for (i = 0; i < n; i++) { err = nf_conntrack_helper_register(&helper[i]); if (err < 0) goto err; } return err; err: if (i > 0) nf_conntrack_helpers_unregister(helper, i); return err; } EXPORT_SYMBOL_GPL(nf_conntrack_helpers_register); void nf_conntrack_helpers_unregister(struct 
nf_conntrack_helper *helper, unsigned int n) { while (n-- > 0) nf_conntrack_helper_unregister(&helper[n]); } EXPORT_SYMBOL_GPL(nf_conntrack_helpers_unregister); void nf_nat_helper_register(struct nf_conntrack_nat_helper *nat) { mutex_lock(&nf_ct_nat_helpers_mutex); list_add_rcu(&nat->list, &nf_ct_nat_helpers); mutex_unlock(&nf_ct_nat_helpers_mutex); } EXPORT_SYMBOL_GPL(nf_nat_helper_register); void nf_nat_helper_unregister(struct nf_conntrack_nat_helper *nat) { mutex_lock(&nf_ct_nat_helpers_mutex); list_del_rcu(&nat->list); mutex_unlock(&nf_ct_nat_helpers_mutex); } EXPORT_SYMBOL_GPL(nf_nat_helper_unregister); int nf_conntrack_helper_init(void) { nf_ct_helper_hsize = 1; /* gets rounded up to use one page */ nf_ct_helper_hash = nf_ct_alloc_hashtable(&nf_ct_helper_hsize, 0); if (!nf_ct_helper_hash) return -ENOMEM; INIT_LIST_HEAD(&nf_ct_nat_helpers); return 0; } void nf_conntrack_helper_fini(void) { kvfree(nf_ct_helper_hash); nf_ct_helper_hash = NULL; } |
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef IOCONTEXT_H
#define IOCONTEXT_H

#include <linux/radix-tree.h>
#include <linux/rcupdate.h>
#include <linux/workqueue.h>

/*
 * icq state bits.
 * NOTE(review): presumably stored in io_cq->flags - confirm against users.
 */
enum {
	ICQ_EXITED		= 1 << 2,
	ICQ_DESTROYED		= 1 << 3,
};

/*
 * An io_cq (icq) is an association between an io_context (ioc) and a
 * request_queue (q).  This is used by elevators which need to track
 * information per ioc - q pair.
 *
 * Elevator can request use of icq by setting elevator_type->icq_size and
 * ->icq_align.  Both size and align must be larger than that of struct
 * io_cq and elevator can use the tail area for private information.  The
 * recommended way to do this is defining a struct which contains io_cq as
 * the first member followed by private members and using its size and
 * align.  For example,
 *
 *	struct snail_io_cq {
 *		struct io_cq	icq;
 *		int		poke_snail;
 *		int		feed_snail;
 *	};
 *
 *	struct elevator_type snail_elv_type {
 *		.ops =		{ ... },
 *		.icq_size =	sizeof(struct snail_io_cq),
 *		.icq_align =	__alignof__(struct snail_io_cq),
 *		...
 *	};
 *
 * If icq_size is set, block core will manage icq's.  All requests will
 * have its ->elv.icq field set before elevator_ops->elevator_set_req_fn()
 * is called and be holding a reference to the associated io_context.
 *
 * Whenever a new icq is created, elevator_ops->elevator_init_icq_fn() is
 * called and, on destruction, ->elevator_exit_icq_fn().  Both functions
 * are called with both the associated io_context and queue locks held.
 *
 * Elevator is allowed to lookup icq using ioc_lookup_icq() while holding
 * queue lock but the returned icq is valid only until the queue lock is
 * released.  Elevators can not and should not try to create or destroy
 * icq's.
 *
 * As icq's are linked from both ioc and q, the locking rules are a bit
 * complex.
 *
 * - ioc lock nests inside q lock.
 *
 * - ioc->icq_list and icq->ioc_node are protected by ioc lock.
 *   q->icq_list and icq->q_node by q lock.
 *
 * - ioc->icq_tree and ioc->icq_hint are protected by ioc lock, while icq
 *   itself is protected by q lock.  However, both the indexes and icq
 *   itself are also RCU managed and lookup can be performed holding only
 *   the q lock.
 *
 * - icq's are not reference counted.  They are destroyed when either the
 *   ioc or q goes away.  Each request with icq set holds an extra
 *   reference to ioc to ensure it stays until the request is completed.
 *
 * - Linking and unlinking icq's are performed while holding both ioc and q
 *   locks.  Due to the lock ordering, q exit is simple but ioc exit
 *   requires reverse-order double lock dance.
 */
struct io_cq {
	struct request_queue	*q;
	struct io_context	*ioc;

	/*
	 * q_node and ioc_node link io_cq through icq_list of q and ioc
	 * respectively.  Both fields are unused once ioc_exit_icq() is
	 * called and shared with __rcu_icq_cache and __rcu_head which are
	 * used for RCU free of io_cq.
	 */
	union {
		struct list_head	q_node;
		struct kmem_cache	*__rcu_icq_cache;
	};
	union {
		struct hlist_node	ioc_node;
		struct rcu_head		__rcu_head;
	};

	unsigned int		flags;
};

/*
 * I/O subsystem state of the associated processes.  It is refcounted
 * and kmalloc'ed. These could be shared between processes.
 *
 * The icq bookkeeping fields exist only when CONFIG_BLK_ICQ is enabled.
 */
struct io_context {
	atomic_long_t refcount;
	atomic_t active_ref;

	unsigned short ioprio;

#ifdef CONFIG_BLK_ICQ
	/* all the fields below are protected by this lock */
	spinlock_t lock;

	struct radix_tree_root	icq_tree;
	struct io_cq __rcu	*icq_hint;
	struct hlist_head	icq_list;

	struct work_struct release_work;
#endif /* CONFIG_BLK_ICQ */
};

struct task_struct;
#ifdef CONFIG_BLOCK
void put_io_context(struct io_context *ioc);
void exit_io_context(struct task_struct *task);
int __copy_io(u64 clone_flags, struct task_struct *tsk);

/*
 * Fast path for task creation: return 0 without calling __copy_io() when the
 * current task has no io_context; otherwise return __copy_io()'s result.
 */
static inline int copy_io(u64 clone_flags, struct task_struct *tsk)
{
	if (!current->io_context)
		return 0;
	return __copy_io(clone_flags, tsk);
}
#else
/* !CONFIG_BLOCK: no-op stubs; copy_io() always reports success (0). */
struct io_context;
static inline void put_io_context(struct io_context *ioc) { }
static inline void exit_io_context(struct task_struct *task) { }
static inline int copy_io(u64 clone_flags, struct task_struct *tsk)
{
	return 0;
}
#endif /* CONFIG_BLOCK */

#endif /* IOCONTEXT_H */
| 35 6 7 7 3 6 7 7 5 4 2 4 4 2 2 4 7 3 6 1 3 2 5 2 1 4 5 2 2 6 5 4 5 6 6 6 7 7 6 6 6 4 5 5 5 5 2 1 2 2 4 3 1 4 4 19 25 5 1 19 6 6 6 3 1 4 2 2 2 1 1 1 2 4 5 5 41 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 
487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 
987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 | // SPDX-License-Identifier: 
GPL-2.0 #include <linux/err.h> #include <linux/slab.h> #include <linux/spinlock.h> #include "messages.h" #include "ctree.h" #include "extent_map.h" #include "compression.h" #include "btrfs_inode.h" #include "disk-io.h" static struct kmem_cache *extent_map_cache; int __init btrfs_extent_map_init(void) { extent_map_cache = kmem_cache_create("btrfs_extent_map", sizeof(struct extent_map), 0, 0, NULL); if (!extent_map_cache) return -ENOMEM; return 0; } void __cold btrfs_extent_map_exit(void) { kmem_cache_destroy(extent_map_cache); } /* * Initialize the extent tree @tree. Should be called for each new inode or * other user of the extent_map interface. */ void btrfs_extent_map_tree_init(struct extent_map_tree *tree) { tree->root = RB_ROOT; INIT_LIST_HEAD(&tree->modified_extents); rwlock_init(&tree->lock); } /* * Allocate a new extent_map structure. The new structure is returned with a * reference count of one and needs to be freed using free_extent_map() */ struct extent_map *btrfs_alloc_extent_map(void) { struct extent_map *em; em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS); if (!em) return NULL; RB_CLEAR_NODE(&em->rb_node); refcount_set(&em->refs, 1); INIT_LIST_HEAD(&em->list); return em; } /* * Drop the reference out on @em by one and free the structure if the reference * count hits zero. */ void btrfs_free_extent_map(struct extent_map *em) { if (!em) return; if (refcount_dec_and_test(&em->refs)) { WARN_ON(btrfs_extent_map_in_tree(em)); WARN_ON(!list_empty(&em->list)); kmem_cache_free(extent_map_cache, em); } } /* Do the math around the end of an extent, handling wrapping. 
*/
static u64 range_end(u64 start, u64 len)
{
	/* The unsigned sum wrapped around: clamp to the largest u64 value. */
	if (start + len < start)
		return (u64)-1;
	return start + len;
}

/*
 * Unlink @em from the rb-tree of @inode's extent map tree and mark its node as
 * cleared. The evictable extent maps counter is decremented unless this is a
 * test filesystem or btrfs_is_fstree() is false for the inode's root.
 *
 * No reference is dropped here and em->list is left untouched.
 */
static void remove_em(struct btrfs_inode *inode, struct extent_map *em)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;

	rb_erase(&em->rb_node, &inode->extent_tree.root);
	RB_CLEAR_NODE(&em->rb_node);

	if (!btrfs_is_testing(fs_info) && btrfs_is_fstree(btrfs_root_id(inode->root)))
		percpu_counter_dec(&fs_info->evictable_extent_maps);
}

/*
 * Link @em into the rb-tree @root, which is ordered by file offset.
 *
 * Returns 0 on success, or -EEXIST if the range of @em intersects an extent
 * map that is already in the tree. Nothing is modified in the failure case.
 */
static int tree_insert(struct rb_root *root, struct extent_map *em)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct extent_map *entry = NULL;
	struct rb_node *orig_parent = NULL;
	u64 end = range_end(em->start, em->len);

	/*
	 * Descend to the leaf slot for em->start. Hitting an entry that
	 * contains em->start is an immediate conflict.
	 */
	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct extent_map, rb_node);

		if (em->start < entry->start)
			p = &(*p)->rb_left;
		else if (em->start >= btrfs_extent_map_end(entry))
			p = &(*p)->rb_right;
		else
			return -EEXIST;
	}

	/*
	 * Walk forward from the leaf's parent to the first entry that ends
	 * after em->start and make sure @em does not run into it. Note that
	 * @entry is recomputed even when rb_next() returns NULL; it is only
	 * dereferenced after @parent has been checked.
	 */
	orig_parent = parent;
	while (parent && em->start >= btrfs_extent_map_end(entry)) {
		parent = rb_next(parent);
		entry = rb_entry(parent, struct extent_map, rb_node);
	}
	if (parent)
		if (end > entry->start && em->start < btrfs_extent_map_end(entry))
			return -EEXIST;

	/*
	 * Same check in the other direction: walk backwards to the first entry
	 * that does not start after em->start.
	 */
	parent = orig_parent;
	entry = rb_entry(parent, struct extent_map, rb_node);
	while (parent && em->start < entry->start) {
		parent = rb_prev(parent);
		entry = rb_entry(parent, struct extent_map, rb_node);
	}
	if (parent)
		if (end > entry->start && em->start < btrfs_extent_map_end(entry))
			return -EEXIST;

	/* No overlap found, link at the slot located by the first loop. */
	rb_link_node(&em->rb_node, orig_parent, p);
	rb_insert_color(&em->rb_node, root);
	return 0;
}

/*
 * Search through the tree for an extent_map with a given offset.
If it can't
 * be found, try to find some neighboring extents
 */
/*
 * Returns the node whose extent map contains @offset. Otherwise returns NULL
 * and stores in @prev_or_next_ret a neighboring node: the first one found
 * walking forward that ends after @offset or, if there is none, the first one
 * found walking backwards that does not start after @offset. NULL is stored if
 * the tree is empty. @prev_or_next_ret must not be NULL.
 */
static struct rb_node *tree_search(struct rb_root *root, u64 offset,
				   struct rb_node **prev_or_next_ret)
{
	struct rb_node *n = root->rb_node;
	struct rb_node *prev = NULL;
	struct rb_node *orig_prev = NULL;
	struct extent_map *entry;
	struct extent_map *prev_entry = NULL;

	ASSERT(prev_or_next_ret);

	/* Regular descent, remembering the last node visited. */
	while (n) {
		entry = rb_entry(n, struct extent_map, rb_node);
		prev = n;
		prev_entry = entry;

		if (offset < entry->start)
			n = n->rb_left;
		else if (offset >= btrfs_extent_map_end(entry))
			n = n->rb_right;
		else
			return n;
	}

	/*
	 * Walk forward from the last visited node until an extent map ending
	 * after @offset is found. @prev_entry is recomputed even when rb_next()
	 * returns NULL but is only dereferenced after @prev has been checked.
	 */
	orig_prev = prev;
	while (prev && offset >= btrfs_extent_map_end(prev_entry)) {
		prev = rb_next(prev);
		prev_entry = rb_entry(prev, struct extent_map, rb_node);
	}

	/*
	 * Previous extent map found, return as in this case the caller does not
	 * care about the next one.
	 */
	if (prev) {
		*prev_or_next_ret = prev;
		return NULL;
	}

	/* Nothing ahead of @offset: walk backwards from the last visited node. */
	prev = orig_prev;
	prev_entry = rb_entry(prev, struct extent_map, rb_node);
	while (prev && offset < prev_entry->start) {
		prev = rb_prev(prev);
		prev_entry = rb_entry(prev, struct extent_map, rb_node);
	}

	*prev_or_next_ret = prev;

	return NULL;
}

/*
 * Block length of @em: disk_num_bytes for a compressed extent map, the logical
 * length otherwise.
 */
static inline u64 extent_map_block_len(const struct extent_map *em)
{
	if (btrfs_extent_map_is_compressed(em))
		return em->disk_num_bytes;
	return em->len;
}

/*
 * Exclusive end of the block range of @em (block start plus block length),
 * clamped to the largest u64 value if the sum wraps around.
 */
static inline u64 extent_map_block_end(const struct extent_map *em)
{
	const u64 block_start = btrfs_extent_map_block_start(em);
	const u64 block_end = block_start + extent_map_block_len(em);

	if (block_end < block_start)
		return (u64)-1;

	return block_end;
}

/*
 * Check whether @em, looked at in isolation, is in a state that allows it to
 * be merged with a neighbor. Adjacency with the neighbor is checked separately
 * by mergeable_maps().
 */
static bool can_merge_extent_map(const struct extent_map *em)
{
	if (em->flags & EXTENT_FLAG_PINNED)
		return false;

	/* Don't merge compressed extents, we need to know their actual size. */
	if (btrfs_extent_map_is_compressed(em))
		return false;

	if (em->flags & EXTENT_FLAG_LOGGING)
		return false;

	/*
	 * We don't want to merge stuff that hasn't been written to the log yet
	 * since it may not reflect exactly what is on disk, and that would be
	 * bad.
	 */
	if (!list_empty(&em->list))
		return false;

	return true;
}

/* Check to see if two extent_map structs are adjacent and safe to merge. */
static bool mergeable_maps(const struct extent_map *prev, const struct extent_map *next)
{
	/* The file ranges must be contiguous. */
	if (btrfs_extent_map_end(prev) != next->start)
		return false;

	/*
	 * The merged flag is not an on-disk flag, it just indicates we had the
	 * extent maps of 2 (or more) adjacent extents merged, so factor it out.
	 */
	if ((prev->flags & ~EXTENT_FLAG_MERGED) !=
	    (next->flags & ~EXTENT_FLAG_MERGED))
		return false;

	/* For extents with a disk location the block ranges must be contiguous too. */
	if (next->disk_bytenr < EXTENT_MAP_LAST_BYTE - 1)
		return btrfs_extent_map_block_start(next) == extent_map_block_end(prev);

	/* HOLES and INLINE extents. */
	return next->disk_bytenr == prev->disk_bytenr;
}

/*
 * Handle the on-disk data extents merge for @prev and @next.
 *
 * @prev:    left extent to merge
 * @next:    right extent to merge
 * @merged:  the extent we will not discard after the merge; updated with new values
 *
 * After this, one of the two extents is the new merged extent and the other is
 * removed from the tree and likely freed. Note that @merged is one of @prev/@next
 * so there is const/non-const aliasing occurring here.
 *
 * Only touches disk_bytenr/disk_num_bytes/offset/ram_bytes.
 * For now only uncompressed regular extent can be merged.
 */
static void merge_ondisk_extents(const struct extent_map *prev, const struct extent_map *next,
				 struct extent_map *merged)
{
	u64 new_disk_bytenr;
	u64 new_disk_num_bytes;
	u64 new_offset;

	/* @prev and @next should not be compressed. */
	ASSERT(!btrfs_extent_map_is_compressed(prev));
	ASSERT(!btrfs_extent_map_is_compressed(next));

	/*
	 * There are two different cases where @prev and @next can be merged.
	 *
	 * 1) They are referring to the same data extent:
	 *
	 * |<----- data extent A ----->|
	 *    |<- prev ->|<- next ->|
	 *
	 * 2) They are referring to different data extents but still adjacent:
	 *
	 * |<-- data extent A -->|<-- data extent B -->|
	 *            |<- prev ->|<- next ->|
	 *
	 * The calculation here always merges the data extents first, then updates
	 * @offset using the new data extents.
	 *
	 * For case 1), the merged data extent would be the same.
	 * For case 2), we just merge the two data extents into one.
	 */
	new_disk_bytenr = min(prev->disk_bytenr, next->disk_bytenr);
	new_disk_num_bytes = max(prev->disk_bytenr + prev->disk_num_bytes,
				 next->disk_bytenr + next->disk_num_bytes) -
			     new_disk_bytenr;
	new_offset = prev->disk_bytenr + prev->offset - new_disk_bytenr;

	/*
	 * All values are computed before anything is stored because @merged
	 * aliases @prev or @next.
	 */
	merged->disk_bytenr = new_disk_bytenr;
	merged->disk_num_bytes = new_disk_num_bytes;
	merged->ram_bytes = new_disk_num_bytes;
	merged->offset = new_offset;
}

/*
 * Print the fields of @em at critical log level, prefixed with @prefix, and
 * then trigger ASSERT(0). Does nothing unless CONFIG_BTRFS_DEBUG is enabled.
 */
static void dump_extent_map(struct btrfs_fs_info *fs_info, const char *prefix,
			    struct extent_map *em)
{
	if (!IS_ENABLED(CONFIG_BTRFS_DEBUG))
		return;
	btrfs_crit(fs_info,
"%s, start=%llu len=%llu disk_bytenr=%llu disk_num_bytes=%llu ram_bytes=%llu offset=%llu flags=0x%x",
		prefix, em->start, em->len, em->disk_bytenr, em->disk_num_bytes,
		em->ram_bytes, em->offset, em->flags);
	ASSERT(0);
}

/* Internal sanity checks for btrfs debug builds.
*/
static void validate_extent_map(struct btrfs_fs_info *fs_info, struct extent_map *em)
{
	if (!IS_ENABLED(CONFIG_BTRFS_DEBUG))
		return;
	if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) {
		/* Extent map with a disk location. */
		if (em->disk_num_bytes == 0)
			dump_extent_map(fs_info, "zero disk_num_bytes", em);
		if (em->offset + em->len > em->ram_bytes)
			dump_extent_map(fs_info, "ram_bytes too small", em);
		if (em->offset + em->len > em->disk_num_bytes &&
		    !btrfs_extent_map_is_compressed(em))
			dump_extent_map(fs_info, "disk_num_bytes too small", em);
		if (!btrfs_extent_map_is_compressed(em) &&
		    em->ram_bytes != em->disk_num_bytes)
			dump_extent_map(fs_info,
		"ram_bytes mismatch with disk_num_bytes for non-compressed em",
					em);
	} else if (em->offset) {
		dump_extent_map(fs_info, "non-zero offset for hole/inline", em);
	}
}

/*
 * Try to merge @em, which is in @inode's extent map tree, with the extent map
 * right before it and with the one right after it. On a successful merge @em
 * is grown to cover the neighbor, flagged with EXTENT_FLAG_MERGED, and the
 * neighbor is removed from the tree and has its tree reference dropped.
 */
static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct extent_map *merge = NULL;
	struct rb_node *rb;

	/*
	 * We can't modify an extent map that is in the tree and that is being
	 * used by another task, as it can cause that other task to see it in
	 * inconsistent state during the merging. We always have 1 reference for
	 * the tree and 1 for this task (which is unpinning the extent map or
	 * clearing the logging flag), so anything > 2 means it's being used by
	 * other tasks too.
	 */
	if (refcount_read(&em->refs) > 2)
		return;

	if (!can_merge_extent_map(em))
		return;

	/* Backward merge; skipped when @em already starts at file offset 0. */
	if (em->start != 0) {
		rb = rb_prev(&em->rb_node);
		merge = rb_entry_safe(rb, struct extent_map, rb_node);

		if (rb && can_merge_extent_map(merge) && mergeable_maps(merge, em)) {
			em->start = merge->start;
			em->len += merge->len;
			em->generation = max(em->generation, merge->generation);

			if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE)
				merge_ondisk_extents(merge, em, em);
			em->flags |= EXTENT_FLAG_MERGED;

			validate_extent_map(fs_info, em);
			remove_em(inode, merge);
			btrfs_free_extent_map(merge);
		}
	}

	/* Forward merge with the extent map that follows @em. */
	rb = rb_next(&em->rb_node);
	merge = rb_entry_safe(rb, struct extent_map, rb_node);

	if (rb && can_merge_extent_map(merge) && mergeable_maps(em, merge)) {
		em->len += merge->len;
		if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE)
			merge_ondisk_extents(em, merge, em);
		validate_extent_map(fs_info, em);
		em->generation = max(em->generation, merge->generation);
		em->flags |= EXTENT_FLAG_MERGED;
		remove_em(inode, merge);
		btrfs_free_extent_map(merge);
	}
}

/*
 * Unpin an extent from the cache.
 *
 * @inode:	the inode from which we are unpinning an extent range
 * @start:	logical offset in the file
 * @len:	length of the extent
 * @gen:	generation that this extent has been modified in
 *
 * Called after an extent has been written to disk properly.  Set the generation
 * to the generation that actually added the file item to the inode so we know
 * we need to sync this extent when we call fsync().
* * Returns: 0 on success * -ENOENT when the extent is not found in the tree * -EUCLEAN if the found extent does not match the expected start */ int btrfs_unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_map_tree *tree = &inode->extent_tree; int ret = 0; struct extent_map *em; write_lock(&tree->lock); em = btrfs_lookup_extent_mapping(tree, start, len); if (WARN_ON(!em)) { btrfs_warn(fs_info, "no extent map found for inode %llu (root %lld) when unpinning extent range [%llu, %llu), generation %llu", btrfs_ino(inode), btrfs_root_id(inode->root), start, start + len, gen); ret = -ENOENT; goto out; } if (WARN_ON(em->start != start)) { btrfs_warn(fs_info, "found extent map for inode %llu (root %lld) with unexpected start offset %llu when unpinning extent range [%llu, %llu), generation %llu", btrfs_ino(inode), btrfs_root_id(inode->root), em->start, start, start + len, gen); ret = -EUCLEAN; goto out; } em->generation = gen; em->flags &= ~EXTENT_FLAG_PINNED; try_merge_map(inode, em); out: write_unlock(&tree->lock); btrfs_free_extent_map(em); return ret; } void btrfs_clear_em_logging(struct btrfs_inode *inode, struct extent_map *em) { lockdep_assert_held_write(&inode->extent_tree.lock); em->flags &= ~EXTENT_FLAG_LOGGING; if (btrfs_extent_map_in_tree(em)) try_merge_map(inode, em); } static inline void setup_extent_mapping(struct btrfs_inode *inode, struct extent_map *em, bool modified) { refcount_inc(&em->refs); ASSERT(list_empty(&em->list)); if (modified) list_add(&em->list, &inode->extent_tree.modified_extents); else try_merge_map(inode, em); } /* * Add a new extent map to an inode's extent map tree. 
* * @inode: the target inode * @em: map to insert * @modified: indicate whether the given @em should be added to the * modified list, which indicates the extent needs to be logged * * Insert @em into the @inode's extent map tree or perform a simple * forward/backward merge with existing mappings. The extent_map struct passed * in will be inserted into the tree directly, with an additional reference * taken, or a reference dropped if the merge attempt was successful. */ static int add_extent_mapping(struct btrfs_inode *inode, struct extent_map *em, bool modified) { struct extent_map_tree *tree = &inode->extent_tree; struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; int ret; lockdep_assert_held_write(&tree->lock); validate_extent_map(fs_info, em); ret = tree_insert(&tree->root, em); if (ret) return ret; setup_extent_mapping(inode, em, modified); if (!btrfs_is_testing(fs_info) && btrfs_is_fstree(btrfs_root_id(root))) percpu_counter_inc(&fs_info->evictable_extent_maps); return 0; } static struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, u64 start, u64 len, bool strict) { struct extent_map *em; struct rb_node *rb_node; struct rb_node *prev_or_next = NULL; u64 end = range_end(start, len); rb_node = tree_search(&tree->root, start, &prev_or_next); if (!rb_node) { if (prev_or_next) rb_node = prev_or_next; else return NULL; } em = rb_entry(rb_node, struct extent_map, rb_node); if (strict && !(end > em->start && start < btrfs_extent_map_end(em))) return NULL; refcount_inc(&em->refs); return em; } /* * Lookup extent_map that intersects @start + @len range. * * @tree: tree to lookup in * @start: byte offset to start the search * @len: length of the lookup range * * Find and return the first extent_map struct in @tree that intersects the * [start, len] range. There may be additional objects in the tree that * intersect, so check the object returned carefully to make sure that no * additional lookups are needed. 
*/ struct extent_map *btrfs_lookup_extent_mapping(struct extent_map_tree *tree, u64 start, u64 len) { return lookup_extent_mapping(tree, start, len, true); } /* * Find a nearby extent map intersecting @start + @len (not an exact search). * * @tree: tree to lookup in * @start: byte offset to start the search * @len: length of the lookup range * * Find and return the first extent_map struct in @tree that intersects the * [start, len] range. * * If one can't be found, any nearby extent may be returned */ struct extent_map *btrfs_search_extent_mapping(struct extent_map_tree *tree, u64 start, u64 len) { return lookup_extent_mapping(tree, start, len, false); } /* * Remove an extent_map from its inode's extent tree. * * @inode: the inode the extent map belongs to * @em: extent map being removed * * Remove @em from the extent tree of @inode. No reference counts are dropped, * and no checks are done to see if the range is in use. */ void btrfs_remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em) { struct extent_map_tree *tree = &inode->extent_tree; lockdep_assert_held_write(&tree->lock); WARN_ON(em->flags & EXTENT_FLAG_PINNED); if (!(em->flags & EXTENT_FLAG_LOGGING)) list_del_init(&em->list); remove_em(inode, em); } static void replace_extent_mapping(struct btrfs_inode *inode, struct extent_map *cur, struct extent_map *new, bool modified) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_map_tree *tree = &inode->extent_tree; lockdep_assert_held_write(&tree->lock); validate_extent_map(fs_info, new); WARN_ON(cur->flags & EXTENT_FLAG_PINNED); ASSERT(btrfs_extent_map_in_tree(cur)); if (!(cur->flags & EXTENT_FLAG_LOGGING)) list_del_init(&cur->list); rb_replace_node(&cur->rb_node, &new->rb_node, &tree->root); RB_CLEAR_NODE(&cur->rb_node); setup_extent_mapping(inode, new, modified); } static struct extent_map *next_extent_map(const struct extent_map *em) { struct rb_node *next; next = rb_next(&em->rb_node); if (!next) return NULL; return 
container_of(next, struct extent_map, rb_node); } static struct extent_map *prev_extent_map(struct extent_map *em) { struct rb_node *prev; prev = rb_prev(&em->rb_node); if (!prev) return NULL; return container_of(prev, struct extent_map, rb_node); } /* * Helper for btrfs_get_extent. Given an existing extent in the tree, * the existing extent is the nearest extent to map_start, * and an extent that you want to insert, deal with overlap and insert * the best fitted new extent into the tree. */ static noinline int merge_extent_mapping(struct btrfs_inode *inode, struct extent_map *existing, struct extent_map *em, u64 map_start) { struct extent_map *prev; struct extent_map *next; u64 start; u64 end; u64 start_diff; if (map_start < em->start || map_start >= btrfs_extent_map_end(em)) return -EINVAL; if (existing->start > map_start) { next = existing; prev = prev_extent_map(next); } else { prev = existing; next = next_extent_map(prev); } start = prev ? btrfs_extent_map_end(prev) : em->start; start = max_t(u64, start, em->start); end = next ? next->start : btrfs_extent_map_end(em); end = min_t(u64, end, btrfs_extent_map_end(em)); start_diff = start - em->start; em->start = start; em->len = end - start; if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) em->offset += start_diff; return add_extent_mapping(inode, em, false); } /* * Add extent mapping into an inode's extent map tree. * * @inode: target inode * @em_in: extent we are inserting * @start: start of the logical range btrfs_get_extent() is requesting * @len: length of the logical range btrfs_get_extent() is requesting * * Note that @em_in's range may be different from [start, start+len), * but they must be overlapped. * * Insert @em_in into the inode's extent map tree. In case there is an * overlapping range, handle the -EEXIST by either: * a) Returning the existing extent in @em_in if @start is within the * existing em. * b) Merge the existing extent with @em_in passed in. * * Return 0 on success, otherwise -EEXIST. 
*
 */
int btrfs_add_extent_mapping(struct btrfs_inode *inode,
			     struct extent_map **em_in, u64 start, u64 len)
{
	int ret;
	struct extent_map *em = *em_in;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;

	/*
	 * Tree-checker should have rejected any inline extent with non-zero
	 * file offset. Here just do a sanity check.
	 */
	if (em->disk_bytenr == EXTENT_MAP_INLINE)
		ASSERT(em->start == 0);

	ret = add_extent_mapping(inode, em, false);
	/*
	 * It is possible that someone inserted the extent into the tree
	 * while we had the lock dropped.  It is also possible that
	 * an overlapping map exists in the tree.
	 */
	if (ret == -EEXIST) {
		struct extent_map *existing;

		/* Non-strict search, returns @existing with a reference held. */
		existing = btrfs_search_extent_mapping(&inode->extent_tree, start, len);

		trace_btrfs_handle_em_exist(fs_info, existing, em, start, len);

		/*
		 * existing will always be non-NULL, since there must be
		 * extent causing the -EEXIST.
		 */
		if (start >= existing->start &&
		    start < btrfs_extent_map_end(existing)) {
			/*
			 * The existing extent map covers @start: hand it to the
			 * caller (with the search reference) instead of @em.
			 */
			btrfs_free_extent_map(em);
			*em_in = existing;
			ret = 0;
		} else {
			/* Saved for the warning, the merge may change @em. */
			u64 orig_start = em->start;
			u64 orig_len = em->len;

			/*
			 * The existing extent map is the one nearest to
			 * the [start, start + len) range which overlaps
			 */
			ret = merge_extent_mapping(inode, existing, em, start);
			/*
			 * NOTE(review): merge_extent_mapping() can also return
			 * -EINVAL, which the ASSERT at the end of this function
			 * does not allow - confirm whether that is intended.
			 */
			if (WARN_ON(ret)) {
				btrfs_free_extent_map(em);
				*em_in = NULL;
				btrfs_warn(fs_info,
"extent map merge error existing [%llu, %llu) with em [%llu, %llu) start %llu",
					   existing->start, btrfs_extent_map_end(existing),
					   orig_start, orig_start + orig_len, start);
			}
			/* Drop the reference taken by the search above. */
			btrfs_free_extent_map(existing);
		}
	}

	ASSERT(ret == 0 || ret == -EEXIST);
	return ret;
}

/*
 * Drop all extent maps from a tree in the fastest possible way, rescheduling
 * if needed. This avoids searching the tree, from the root down to the first
 * extent map, before each deletion.
*/
static void drop_all_extent_maps_fast(struct btrfs_inode *inode)
{
	struct extent_map_tree *tree = &inode->extent_tree;
	struct rb_node *node;

	write_lock(&tree->lock);
	node = rb_first(&tree->root);
	while (node) {
		struct extent_map *em;
		/* Fetched before @em is removed from the tree below. */
		struct rb_node *next = rb_next(node);

		em = rb_entry(node, struct extent_map, rb_node);
		/* Clear both flags so the removal unlinks the list and doesn't warn. */
		em->flags &= ~(EXTENT_FLAG_PINNED | EXTENT_FLAG_LOGGING);
		btrfs_remove_extent_mapping(inode, em);
		/* Drop the reference for the tree. */
		btrfs_free_extent_map(em);

		/*
		 * A non-zero return from cond_resched_rwlock_write() makes us
		 * restart from the first node instead of trusting @next.
		 */
		if (cond_resched_rwlock_write(&tree->lock))
			node = rb_first(&tree->root);
		else
			node = next;
	}
	write_unlock(&tree->lock);
}

/*
 * Drop all extent maps in a given range.
 *
 * @inode:       The target inode.
 * @start:       Start offset of the range.
 * @end:         End offset of the range (inclusive value).
 * @skip_pinned: Indicate if pinned extent maps should be ignored or not.
 *
 * This drops all the extent maps that intersect the given range [@start, @end].
 * Extent maps that partially overlap the range and extend behind or beyond it,
 * are split.
 * The caller should have locked an appropriate file range in the inode's io
 * tree before calling this function.
 */
void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
				 bool skip_pinned)
{
	struct extent_map *split;
	struct extent_map *split2;
	struct extent_map *em;
	struct extent_map_tree *em_tree = &inode->extent_tree;
	u64 len = end - start + 1;

	WARN_ON(end < start);
	if (end == (u64)-1) {
		/* Whole file and nothing to skip: use the fast path. */
		if (start == 0 && !skip_pinned) {
			drop_all_extent_maps_fast(inode);
			return;
		}
		len = (u64)-1;
	} else {
		/* Make end offset exclusive for use in the loop below. */
		end++;
	}

	/*
	 * It's ok if we fail to allocate the extent maps, see the comment near
	 * the bottom of the loop below. We only need two spare extent maps in
	 * the worst case, where the first extent map that intersects our range
	 * starts before the range and the last extent map that intersects our
	 * range ends after our range (and they might be the same extent map),
	 * because we need to split those two extent maps at the boundaries.
	 */
	split = btrfs_alloc_extent_map();
	split2 = btrfs_alloc_extent_map();

	write_lock(&em_tree->lock);
	em = btrfs_lookup_extent_mapping(em_tree, start, len);

	while (em) {
		/* btrfs_extent_map_end() returns exclusive value (last byte + 1). */
		const u64 em_end = btrfs_extent_map_end(em);
		struct extent_map *next_em = NULL;
		u64 gen;
		unsigned long flags;
		bool modified;

		/*
		 * Grab a reference on the next extent map, if it is still
		 * inside our range, before the current one is modified.
		 */
		if (em_end < end) {
			next_em = next_extent_map(em);
			if (next_em) {
				if (next_em->start < end)
					refcount_inc(&next_em->refs);
				else
					next_em = NULL;
			}
		}

		if (skip_pinned && (em->flags & EXTENT_FLAG_PINNED)) {
			start = em_end;
			goto next;
		}

		flags = em->flags;
		/*
		 * In case we split the extent map, we want to preserve the
		 * EXTENT_FLAG_LOGGING flag on our extent map, but we don't want
		 * it on the new extent maps.
		 */
		em->flags &= ~(EXTENT_FLAG_PINNED | EXTENT_FLAG_LOGGING);
		modified = !list_empty(&em->list);

		/*
		 * The extent map does not cross our target range, so no need to
		 * split it, we can remove it directly.
		 */
		if (em->start >= start && em_end <= end)
			goto remove_em;

		gen = em->generation;

		/* Keep the part of the extent map that precedes our range. */
		if (em->start < start) {
			if (!split) {
				split = split2;
				split2 = NULL;
				if (!split)
					goto remove_em;
			}
			split->start = em->start;
			split->len = start - em->start;

			if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) {
				split->disk_bytenr = em->disk_bytenr;
				split->disk_num_bytes = em->disk_num_bytes;
				split->offset = em->offset;
				split->ram_bytes = em->ram_bytes;
			} else {
				split->disk_bytenr = em->disk_bytenr;
				split->disk_num_bytes = 0;
				split->offset = 0;
				split->ram_bytes = split->len;
			}

			split->generation = gen;
			split->flags = flags;
			replace_extent_mapping(inode, em, split, modified);
			/* Drop our allocation reference, the tree has its own. */
			btrfs_free_extent_map(split);
			split = split2;
			split2 = NULL;
		}
		/* Keep the part of the extent map that follows our range. */
		if (em_end > end) {
			if (!split) {
				split = split2;
				split2 = NULL;
				if (!split)
					goto remove_em;
			}
			split->start = end;
			split->len = em_end - end;
			split->disk_bytenr = em->disk_bytenr;
			split->flags = flags;
			split->generation = gen;

			if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) {
				split->disk_num_bytes = em->disk_num_bytes;
				split->offset = em->offset + end - em->start;
				split->ram_bytes = em->ram_bytes;
			} else {
				split->disk_num_bytes = 0;
				split->offset = 0;
				split->ram_bytes = split->len;
			}

			/*
			 * @em is no longer in the tree if the front split above
			 * already replaced it, in which case we insert instead.
			 */
			if (btrfs_extent_map_in_tree(em)) {
				replace_extent_mapping(inode, em, split, modified);
			} else {
				int ret;

				ret = add_extent_mapping(inode, split, modified);
				/* Logic error, shouldn't happen. */
				ASSERT(ret == 0);
				if (WARN_ON(ret != 0) && modified)
					btrfs_set_inode_full_sync(inode);
			}
			btrfs_free_extent_map(split);
			split = NULL;
		}
remove_em:
		if (btrfs_extent_map_in_tree(em)) {
			/*
			 * If the extent map is still in the tree it means that
			 * either of the following is true:
			 *
			 * 1) It fits entirely in our range (doesn't end beyond
			 *    it or starts before it);
			 *
			 * 2) It starts before our range and/or ends after our
			 *    range, and we were not able to allocate the extent
			 *    maps for split operations, @split and @split2.
			 *
			 * If we are at case 2) then we just remove the entire
			 * extent map - this is fine since if anyone needs it to
			 * access the subranges outside our range, will just
			 * load it again from the subvolume tree's file extent
			 * item. However if the extent map was in the list of
			 * modified extents, then we must mark the inode for a
			 * full fsync, otherwise a fast fsync will miss this
			 * extent if it's new and needs to be logged.
			 */
			if ((em->start < start || em_end > end) && modified) {
				ASSERT(!split);
				btrfs_set_inode_full_sync(inode);
			}
			btrfs_remove_extent_mapping(inode, em);
		}

		/*
		 * Once for the tree reference (we replaced or removed the
		 * extent map from the tree).
		 */
		btrfs_free_extent_map(em);
next:
		/* Once for us (for our lookup reference). */
		btrfs_free_extent_map(em);

		em = next_em;
	}

	write_unlock(&em_tree->lock);

	/* Release the spare extent maps that were not consumed (may be NULL). */
	btrfs_free_extent_map(split);
	btrfs_free_extent_map(split2);
}

/*
 * Replace a range in the inode's extent map tree with a new extent map.
 *
 * @inode:      The target inode.
 * @new_em:     The new extent map to add to the inode's extent map tree.
* @modified: Indicate if the new extent map should be added to the list of * modified extents (for fast fsync tracking). * * Drops all the extent maps in the inode's extent map tree that intersect the * range of the new extent map and adds the new extent map to the tree. * The caller should have locked an appropriate file range in the inode's io * tree before calling this function. */ int btrfs_replace_extent_map_range(struct btrfs_inode *inode, struct extent_map *new_em, bool modified) { const u64 end = new_em->start + new_em->len - 1; struct extent_map_tree *tree = &inode->extent_tree; int ret; ASSERT(!btrfs_extent_map_in_tree(new_em)); /* * The caller has locked an appropriate file range in the inode's io * tree, but getting -EEXIST when adding the new extent map can still * happen in case there are extents that partially cover the range, and * this is due to two tasks operating on different parts of the extent. * See commit 18e83ac75bfe67 ("Btrfs: fix unexpected EEXIST from * btrfs_get_extent") for an example and details. */ do { btrfs_drop_extent_map_range(inode, new_em->start, end, false); write_lock(&tree->lock); ret = add_extent_mapping(inode, new_em, modified); write_unlock(&tree->lock); } while (ret == -EEXIST); return ret; } /* * Split off the first pre bytes from the extent_map at [start, start + len], * and set the block_start for it to new_logical. * * This function is used when an ordered_extent needs to be split. 
*/ int btrfs_split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, u64 new_logical) { struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_map *em; struct extent_map *split_pre = NULL; struct extent_map *split_mid = NULL; int ret = 0; unsigned long flags; ASSERT(pre != 0); ASSERT(pre < len); split_pre = btrfs_alloc_extent_map(); if (!split_pre) return -ENOMEM; split_mid = btrfs_alloc_extent_map(); if (!split_mid) { ret = -ENOMEM; goto out_free_pre; } btrfs_lock_extent(&inode->io_tree, start, start + len - 1, NULL); write_lock(&em_tree->lock); em = btrfs_lookup_extent_mapping(em_tree, start, len); if (unlikely(!em)) { ret = -EIO; goto out_unlock; } ASSERT(em->len == len); ASSERT(!btrfs_extent_map_is_compressed(em)); ASSERT(em->disk_bytenr < EXTENT_MAP_LAST_BYTE); ASSERT(em->flags & EXTENT_FLAG_PINNED); ASSERT(!(em->flags & EXTENT_FLAG_LOGGING)); ASSERT(!list_empty(&em->list)); flags = em->flags; em->flags &= ~EXTENT_FLAG_PINNED; /* First, replace the em with a new extent_map starting from * em->start */ split_pre->start = em->start; split_pre->len = pre; split_pre->disk_bytenr = new_logical; split_pre->disk_num_bytes = split_pre->len; split_pre->offset = 0; split_pre->ram_bytes = split_pre->len; split_pre->flags = flags; split_pre->generation = em->generation; replace_extent_mapping(inode, em, split_pre, true); /* * Now we only have an extent_map at: * [em->start, em->start + pre] */ /* Insert the middle extent_map. 
*/
	split_mid->start = em->start + pre;
	split_mid->len = em->len - pre;
	split_mid->disk_bytenr = btrfs_extent_map_block_start(em) + pre;
	split_mid->disk_num_bytes = split_mid->len;
	split_mid->offset = 0;
	split_mid->ram_bytes = split_mid->len;
	split_mid->flags = flags;
	split_mid->generation = em->generation;
	add_extent_mapping(inode, split_mid, true);

	/* Once for us */
	btrfs_free_extent_map(em);
	/* Once for the tree */
	btrfs_free_extent_map(em);

out_unlock:
	write_unlock(&em_tree->lock);
	btrfs_unlock_extent(&inode->io_tree, start, start + len - 1, NULL);
	btrfs_free_extent_map(split_mid);
out_free_pre:
	btrfs_free_extent_map(split_pre);
	return ret;
}

/*
 * Scan budget shared by the shrinker helpers for one run of the worker.
 *
 * @nr_to_scan: Upper bound on the number of extent maps to visit in this run.
 * @scanned:    Number of extent maps visited so far (incremented once per
 *              extent map looked at, whether or not it was dropped).
 */
struct btrfs_em_shrink_ctx {
	long nr_to_scan;
	long scanned;
};

/*
 * Walk the extent map tree of @inode and drop every extent map that is not
 * pinned, until the scan budget in @ctx is exhausted, a reschedule is needed,
 * the tree lock is contended or the filesystem is closing.
 *
 * The caller must hold the inode's extent map tree lock in write mode.
 *
 * Returns the number of extent maps removed from the tree. Returns 0 without
 * scanning anything if the inode's mmap lock could not be taken.
 */
static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_ctx *ctx)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	const u64 cur_fs_gen = btrfs_get_fs_generation(fs_info);
	struct extent_map_tree *tree = &inode->extent_tree;
	long nr_dropped = 0;
	struct rb_node *node;

	lockdep_assert_held_write(&tree->lock);

	/*
	 * Take the mmap lock so that we serialize with the inode logging phase
	 * of fsync because we may need to set the full sync flag on the inode,
	 * in case we have to remove extent maps in the tree's list of modified
	 * extents. If we set the full sync flag in the inode while an fsync is
	 * in progress, we may risk missing new extents because before the flag
	 * is set, fsync decides to only wait for writeback to complete and then
	 * during inode logging it sees the flag set and uses the subvolume tree
	 * to find new extents, which may not be there yet because ordered
	 * extents haven't completed yet.
	 *
	 * We also do a try lock because we don't want to block for too long and
	 * we are holding the extent map tree's lock in write mode.
	 */
	if (!down_read_trylock(&inode->i_mmap_lock))
		return 0;

	node = rb_first(&tree->root);
	while (node) {
		/* Fetch the successor first, the current node may be removed. */
		struct rb_node *next = rb_next(node);
		struct extent_map *em;

		em = rb_entry(node, struct extent_map, rb_node);
		ctx->scanned++;

		if (em->flags & EXTENT_FLAG_PINNED)
			goto next;

		/*
		 * If the extent map is in the list of modified extents (new)
		 * and its generation is the same (or is greater than) the
		 * current fs generation, it means it was not yet persisted so
		 * we have to set the full sync flag so that the next fsync
		 * will not miss it.
		 */
		if (!list_empty(&em->list) && em->generation >= cur_fs_gen)
			btrfs_set_inode_full_sync(inode);

		btrfs_remove_extent_mapping(inode, em);
		trace_btrfs_extent_map_shrinker_remove_em(inode, em);
		/* Drop the reference for the tree. */
		btrfs_free_extent_map(em);
		nr_dropped++;
next:
		if (ctx->scanned >= ctx->nr_to_scan)
			break;

		/*
		 * Stop if we need to reschedule or there's contention on the
		 * lock. This is to avoid slowing other tasks trying to take the
		 * lock.
		 */
		if (need_resched() || rwlock_needbreak(&tree->lock) ||
		    btrfs_fs_closing(fs_info))
			break;
		node = next;
	}
	up_read(&inode->i_mmap_lock);

	return nr_dropped;
}

/*
 * Find the first inode of @root, with an inode number of at least @min_ino,
 * that has loaded extent maps and whose extent map tree lock could be taken
 * without blocking.
 *
 * On success the inode is returned with a reference taken through igrab() and
 * with its extent map tree lock held in write mode; the caller is responsible
 * for unlocking the tree and dropping the reference. Returns NULL if no such
 * inode exists.
 */
static struct btrfs_inode *find_first_inode_to_shrink(struct btrfs_root *root,
						      u64 min_ino)
{
	struct btrfs_inode *inode;
	unsigned long from = min_ino;

	xa_lock(&root->inodes);
	while (true) {
		struct extent_map_tree *tree;

		inode = xa_find(&root->inodes, &from, ULONG_MAX, XA_PRESENT);
		if (!inode)
			break;

		tree = &inode->extent_tree;

		/*
		 * We want to be fast so if the lock is busy we don't want to
		 * spend time waiting for it (some task is about to do IO for
		 * the inode).
		 */
		if (!write_trylock(&tree->lock))
			goto next;

		/*
		 * Skip inode if it doesn't have loaded extent maps, so we avoid
		 * getting a reference and doing an iput later. This includes
		 * cases like files that were opened for things like stat(2), or
		 * files with all extent maps previously released through the
		 * release folio callback (btrfs_release_folio()) or released in
		 * a previous run, or directories which never have extent maps.
		 */
		if (RB_EMPTY_ROOT(&tree->root)) {
			write_unlock(&tree->lock);
			goto next;
		}

		/* Success: leave the loop with the tree lock still held. */
		if (igrab(&inode->vfs_inode))
			break;

		write_unlock(&tree->lock);
next:
		from = btrfs_ino(inode) + 1;
		cond_resched_lock(&root->inodes.xa_lock);
	}
	xa_unlock(&root->inodes);

	return inode;
}

/*
 * Scan the inodes of @root, resuming after the inode number recorded in
 * fs_info->em_shrinker_last_ino, and drop extent maps until the scan budget in
 * @ctx is used up, the filesystem is closing or the root has no more
 * candidate inodes. The resume position (last root and last inode) is updated
 * in fs_info before returning.
 *
 * Returns the number of extent maps dropped.
 */
static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_inode *inode;
	long nr_dropped = 0;
	u64 min_ino = fs_info->em_shrinker_last_ino + 1;

	inode = find_first_inode_to_shrink(root, min_ino);
	while (inode) {
		nr_dropped += btrfs_scan_inode(inode, ctx);
		/* The tree lock was taken by find_first_inode_to_shrink(). */
		write_unlock(&inode->extent_tree.lock);

		min_ino = btrfs_ino(inode) + 1;
		fs_info->em_shrinker_last_ino = btrfs_ino(inode);
		iput(&inode->vfs_inode);

		if (ctx->scanned >= ctx->nr_to_scan || btrfs_fs_closing(fs_info))
			break;

		cond_resched();

		inode = find_first_inode_to_shrink(root, min_ino);
	}

	if (inode) {
		/*
		 * There are still inodes in this root or we happened to process
		 * the last one and reached the scan limit. In either case set
		 * the current root to this one, so we'll resume from the next
		 * inode if there is one or we will find out this was the last
		 * one and move to the next root.
		 */
		fs_info->em_shrinker_last_root = btrfs_root_id(root);
	} else {
		/*
		 * No more inodes in this root, set em_shrinker_last_ino to 0 so
		 * that when processing the next root we start from its first
		 * inode.
		 */
		fs_info->em_shrinker_last_ino = 0;
		fs_info->em_shrinker_last_root = btrfs_root_id(root) + 1;
	}

	return nr_dropped;
}

/*
 * Work function of the extent map shrinker.
 *
 * Iterates over the roots in fs_info->fs_roots_radix, starting at the root id
 * recorded in fs_info->em_shrinker_last_root, and scans every root accepted by
 * btrfs_is_fstree() until the number of extent maps requested through
 * fs_info->em_shrinker_nr_to_scan has been visited or the filesystem is
 * closing. If the end of the radix tree is reached and the scan did not start
 * at root id 0, it wraps around once to the beginning. The request counter is
 * reset to 0 when done.
 */
static void btrfs_extent_map_shrinker_worker(struct work_struct *work)
{
	struct btrfs_fs_info *fs_info;
	struct btrfs_em_shrink_ctx ctx;
	u64 start_root_id;
	u64 next_root_id;
	bool cycled = false;
	long nr_dropped = 0;

	fs_info = container_of(work, struct btrfs_fs_info, em_shrinker_work);

	ctx.scanned = 0;
	ctx.nr_to_scan = atomic64_read(&fs_info->em_shrinker_nr_to_scan);

	start_root_id = fs_info->em_shrinker_last_root;
	next_root_id = fs_info->em_shrinker_last_root;

	if (trace_btrfs_extent_map_shrinker_scan_enter_enabled()) {
		s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);

		trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr);
	}

	while (ctx.scanned < ctx.nr_to_scan && !btrfs_fs_closing(fs_info)) {
		struct btrfs_root *root;
		unsigned long count;

		cond_resched();

		spin_lock(&fs_info->fs_roots_radix_lock);
		count = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
					       (void **)&root,
					       (unsigned long)next_root_id, 1);
		if (count == 0) {
			spin_unlock(&fs_info->fs_roots_radix_lock);
			/* Wrap around to the first root, but only once per run. */
			if (start_root_id > 0 && !cycled) {
				next_root_id = 0;
				fs_info->em_shrinker_last_root = 0;
				fs_info->em_shrinker_last_ino = 0;
				cycled = true;
				continue;
			}
			break;
		}
		next_root_id = btrfs_root_id(root) + 1;
		root = btrfs_grab_root(root);
		spin_unlock(&fs_info->fs_roots_radix_lock);

		/* Could not get a reference on the root, try the next one. */
		if (!root)
			continue;

		if (btrfs_is_fstree(btrfs_root_id(root)))
			nr_dropped += btrfs_scan_root(root, &ctx);

		btrfs_put_root(root);
	}

	if (trace_btrfs_extent_map_shrinker_scan_exit_enabled()) {
		s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);

		trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped, nr);
	}

	/* Allow btrfs_free_extent_maps() to queue a new run. */
	atomic64_set(&fs_info->em_shrinker_nr_to_scan, 0);
}

/*
 * Request an asynchronous scan of up to @nr_to_scan extent maps.
 *
 * The request is recorded and the shrinker work queued only if no request is
 * currently pending (the counter is zero); otherwise the call is a no-op.
 */
void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
{
	/*
	 * Do nothing if the shrinker is already running. In case of high memory
	 * pressure we can have a lot of tasks calling us and all passing the
	 * same nr_to_scan value, but in reality we may need only to free
	 * nr_to_scan extent maps (or less). In case we need to free more than
	 * that, we will be called again by the fs shrinker, so no worries about
	 * not doing enough work to reclaim memory from extent maps.
	 * We can also be repeatedly called with the same nr_to_scan value
	 * simply because the shrinker runs asynchronously and multiple calls
	 * to this function are made before the shrinker does enough progress.
	 *
	 * That's why we set the atomic counter to nr_to_scan only if its
	 * current value is zero, instead of incrementing the counter by
	 * nr_to_scan.
	 */
	if (atomic64_cmpxchg(&fs_info->em_shrinker_nr_to_scan, 0, nr_to_scan) != 0)
		return;

	queue_work(system_dfl_wq, &fs_info->em_shrinker_work);
}

/*
 * Initialize the extent map shrinker state of @fs_info: no pending scan
 * request and the work item bound to btrfs_extent_map_shrinker_worker().
 */
void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info)
{
	atomic64_set(&fs_info->em_shrinker_nr_to_scan, 0);
	INIT_WORK(&fs_info->em_shrinker_work, btrfs_extent_map_shrinker_worker);
}
|
| 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2023 Bootlin * */ #include "common.h" #include "netlink.h" #include <linux/phy.h> #include <linux/phy_link_topology.h> #include <linux/sfp.h> #include <net/netdev_lock.h> struct phy_req_info { struct ethnl_req_info base; }; struct phy_reply_data { struct ethnl_reply_data base; u32 phyindex; char *drvname; char *name; unsigned int upstream_type; char *upstream_sfp_name; unsigned int upstream_index; char *downstream_sfp_name; }; #define PHY_REPDATA(__reply_base) \ container_of(__reply_base, struct phy_reply_data, base) const struct nla_policy ethnl_phy_get_policy[ETHTOOL_A_PHY_HEADER + 1] = { [ETHTOOL_A_PHY_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), }; static int phy_reply_size(const struct ethnl_req_info *req_info, const struct ethnl_reply_data *reply_data) { struct phy_reply_data *rep_data = PHY_REPDATA(reply_data); size_t size = 0; /* ETHTOOL_A_PHY_INDEX */ size += nla_total_size(sizeof(u32)); /* ETHTOOL_A_DRVNAME */ if (rep_data->drvname) size += nla_total_size(strlen(rep_data->drvname) + 1); /* ETHTOOL_A_NAME */ size += nla_total_size(strlen(rep_data->name) + 1); /* ETHTOOL_A_PHY_UPSTREAM_TYPE */ size += nla_total_size(sizeof(u32)); /* ETHTOOL_A_PHY_UPSTREAM_SFP_NAME */ if (rep_data->upstream_sfp_name) size += nla_total_size(strlen(rep_data->upstream_sfp_name) + 1); /* ETHTOOL_A_PHY_UPSTREAM_INDEX */ if (rep_data->upstream_index) size += 
nla_total_size(sizeof(u32)); /* ETHTOOL_A_PHY_DOWNSTREAM_SFP_NAME */ if (rep_data->downstream_sfp_name) size += nla_total_size(strlen(rep_data->downstream_sfp_name) + 1); return size; } static int phy_prepare_data(const struct ethnl_req_info *req_info, struct ethnl_reply_data *reply_data, const struct genl_info *info) { struct phy_link_topology *topo = reply_data->dev->link_topo; struct phy_reply_data *rep_data = PHY_REPDATA(reply_data); struct nlattr **tb = info->attrs; struct phy_device_node *pdn; struct phy_device *phydev; /* RTNL is held by the caller */ phydev = ethnl_req_get_phydev(req_info, tb, ETHTOOL_A_PHY_HEADER, info->extack); if (IS_ERR_OR_NULL(phydev)) return -EOPNOTSUPP; pdn = xa_load(&topo->phys, phydev->phyindex); if (!pdn) return -EOPNOTSUPP; rep_data->phyindex = phydev->phyindex; rep_data->name = kstrdup(dev_name(&phydev->mdio.dev), GFP_KERNEL); rep_data->drvname = kstrdup(phydev->drv->name, GFP_KERNEL); rep_data->upstream_type = pdn->upstream_type; if (pdn->upstream_type == PHY_UPSTREAM_PHY) { struct phy_device *upstream = pdn->upstream.phydev; rep_data->upstream_index = upstream->phyindex; } if (pdn->parent_sfp_bus) rep_data->upstream_sfp_name = kstrdup(sfp_get_name(pdn->parent_sfp_bus), GFP_KERNEL); if (phydev->sfp_bus) rep_data->downstream_sfp_name = kstrdup(sfp_get_name(phydev->sfp_bus), GFP_KERNEL); return 0; } static int phy_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_info, const struct ethnl_reply_data *reply_data) { struct phy_reply_data *rep_data = PHY_REPDATA(reply_data); if (nla_put_u32(skb, ETHTOOL_A_PHY_INDEX, rep_data->phyindex) || nla_put_string(skb, ETHTOOL_A_PHY_NAME, rep_data->name) || nla_put_u32(skb, ETHTOOL_A_PHY_UPSTREAM_TYPE, rep_data->upstream_type)) return -EMSGSIZE; if (rep_data->drvname && nla_put_string(skb, ETHTOOL_A_PHY_DRVNAME, rep_data->drvname)) return -EMSGSIZE; if (rep_data->upstream_index && nla_put_u32(skb, ETHTOOL_A_PHY_UPSTREAM_INDEX, rep_data->upstream_index)) return -EMSGSIZE; if 
(rep_data->upstream_sfp_name && nla_put_string(skb, ETHTOOL_A_PHY_UPSTREAM_SFP_NAME, rep_data->upstream_sfp_name)) return -EMSGSIZE; if (rep_data->downstream_sfp_name && nla_put_string(skb, ETHTOOL_A_PHY_DOWNSTREAM_SFP_NAME, rep_data->downstream_sfp_name)) return -EMSGSIZE; return 0; } static void phy_cleanup_data(struct ethnl_reply_data *reply_data) { struct phy_reply_data *rep_data = PHY_REPDATA(reply_data); kfree(rep_data->drvname); kfree(rep_data->name); kfree(rep_data->upstream_sfp_name); kfree(rep_data->downstream_sfp_name); } const struct ethnl_request_ops ethnl_phy_request_ops = { .request_cmd = ETHTOOL_MSG_PHY_GET, .reply_cmd = ETHTOOL_MSG_PHY_GET_REPLY, .hdr_attr = ETHTOOL_A_PHY_HEADER, .req_info_size = sizeof(struct phy_req_info), .reply_data_size = sizeof(struct phy_reply_data), .prepare_data = phy_prepare_data, .reply_size = phy_reply_size, .fill_reply = phy_fill_reply, .cleanup_data = phy_cleanup_data, }; |
| 55 55 55 55 55 55 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) ST-Ericsson AB 2010 * Author: Sjur Brendeland */ #define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ #include <linux/stddef.h> #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/rculist.h> #include <net/caif/cfpkt.h> #include <net/caif/cfmuxl.h> #include <net/caif/cfsrvl.h> #include <net/caif/cffrml.h> #define container_obj(layr) container_of(layr, struct cfmuxl, layer) #define CAIF_CTRL_CHANNEL 0 #define UP_CACHE_SIZE 8 #define DN_CACHE_SIZE 8 struct cfmuxl { struct cflayer layer; struct list_head srvl_list; struct list_head frml_list; struct cflayer *up_cache[UP_CACHE_SIZE]; struct cflayer *dn_cache[DN_CACHE_SIZE]; /* * Set when inserting or removing downwards layers. */ spinlock_t transmit_lock; /* * Set when inserting or removing upwards layers. 
*/ spinlock_t receive_lock; }; static int cfmuxl_receive(struct cflayer *layr, struct cfpkt *pkt); static int cfmuxl_transmit(struct cflayer *layr, struct cfpkt *pkt); static void cfmuxl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, int phyid); static struct cflayer *get_up(struct cfmuxl *muxl, u16 id); struct cflayer *cfmuxl_create(void) { struct cfmuxl *this = kzalloc(sizeof(struct cfmuxl), GFP_ATOMIC); if (!this) return NULL; this->layer.receive = cfmuxl_receive; this->layer.transmit = cfmuxl_transmit; this->layer.ctrlcmd = cfmuxl_ctrlcmd; INIT_LIST_HEAD(&this->srvl_list); INIT_LIST_HEAD(&this->frml_list); spin_lock_init(&this->transmit_lock); spin_lock_init(&this->receive_lock); snprintf(this->layer.name, CAIF_LAYER_NAME_SZ, "mux"); return &this->layer; } int cfmuxl_set_dnlayer(struct cflayer *layr, struct cflayer *dn, u8 phyid) { struct cfmuxl *muxl = (struct cfmuxl *) layr; spin_lock_bh(&muxl->transmit_lock); list_add_rcu(&dn->node, &muxl->frml_list); spin_unlock_bh(&muxl->transmit_lock); return 0; } static struct cflayer *get_from_id(struct list_head *list, u16 id) { struct cflayer *lyr; list_for_each_entry_rcu(lyr, list, node) { if (lyr->id == id) return lyr; } return NULL; } int cfmuxl_set_uplayer(struct cflayer *layr, struct cflayer *up, u8 linkid) { struct cfmuxl *muxl = container_obj(layr); struct cflayer *old; spin_lock_bh(&muxl->receive_lock); /* Two entries with same id is wrong, so remove old layer from mux */ old = get_from_id(&muxl->srvl_list, linkid); if (old != NULL) list_del_rcu(&old->node); list_add_rcu(&up->node, &muxl->srvl_list); spin_unlock_bh(&muxl->receive_lock); return 0; } struct cflayer *cfmuxl_remove_dnlayer(struct cflayer *layr, u8 phyid) { struct cfmuxl *muxl = container_obj(layr); struct cflayer *dn; int idx = phyid % DN_CACHE_SIZE; spin_lock_bh(&muxl->transmit_lock); RCU_INIT_POINTER(muxl->dn_cache[idx], NULL); dn = get_from_id(&muxl->frml_list, phyid); if (dn == NULL) goto out; list_del_rcu(&dn->node); caif_assert(dn != 
NULL); out: spin_unlock_bh(&muxl->transmit_lock); return dn; } static struct cflayer *get_up(struct cfmuxl *muxl, u16 id) { struct cflayer *up; int idx = id % UP_CACHE_SIZE; up = rcu_dereference(muxl->up_cache[idx]); if (up == NULL || up->id != id) { spin_lock_bh(&muxl->receive_lock); up = get_from_id(&muxl->srvl_list, id); rcu_assign_pointer(muxl->up_cache[idx], up); spin_unlock_bh(&muxl->receive_lock); } return up; } static struct cflayer *get_dn(struct cfmuxl *muxl, struct dev_info *dev_info) { struct cflayer *dn; int idx = dev_info->id % DN_CACHE_SIZE; dn = rcu_dereference(muxl->dn_cache[idx]); if (dn == NULL || dn->id != dev_info->id) { spin_lock_bh(&muxl->transmit_lock); dn = get_from_id(&muxl->frml_list, dev_info->id); rcu_assign_pointer(muxl->dn_cache[idx], dn); spin_unlock_bh(&muxl->transmit_lock); } return dn; } struct cflayer *cfmuxl_remove_uplayer(struct cflayer *layr, u8 id) { struct cflayer *up; struct cfmuxl *muxl = container_obj(layr); int idx = id % UP_CACHE_SIZE; if (id == 0) { pr_warn("Trying to remove control layer\n"); return NULL; } spin_lock_bh(&muxl->receive_lock); up = get_from_id(&muxl->srvl_list, id); if (up == NULL) goto out; RCU_INIT_POINTER(muxl->up_cache[idx], NULL); list_del_rcu(&up->node); out: spin_unlock_bh(&muxl->receive_lock); return up; } static int cfmuxl_receive(struct cflayer *layr, struct cfpkt *pkt) { int ret; struct cfmuxl *muxl = container_obj(layr); u8 id; struct cflayer *up; if (cfpkt_extr_head(pkt, &id, 1) < 0) { pr_err("erroneous Caif Packet\n"); cfpkt_destroy(pkt); return -EPROTO; } rcu_read_lock(); up = get_up(muxl, id); if (up == NULL) { pr_debug("Received data on unknown link ID = %d (0x%x)" " up == NULL", id, id); cfpkt_destroy(pkt); /* * Don't return ERROR, since modem misbehaves and sends out * flow on before linksetup response. 
*/ rcu_read_unlock(); return /* CFGLU_EPROT; */ 0; } /* We can't hold rcu_lock during receive, so take a ref count instead */ cfsrvl_get(up); rcu_read_unlock(); ret = up->receive(up, pkt); cfsrvl_put(up); return ret; } static int cfmuxl_transmit(struct cflayer *layr, struct cfpkt *pkt) { struct cfmuxl *muxl = container_obj(layr); int err; u8 linkid; struct cflayer *dn; struct caif_payload_info *info = cfpkt_info(pkt); BUG_ON(!info); rcu_read_lock(); dn = get_dn(muxl, info->dev_info); if (dn == NULL) { pr_debug("Send data on unknown phy ID = %d (0x%x)\n", info->dev_info->id, info->dev_info->id); rcu_read_unlock(); cfpkt_destroy(pkt); return -ENOTCONN; } info->hdr_len += 1; linkid = info->channel_id; cfpkt_add_head(pkt, &linkid, 1); /* We can't hold rcu_lock during receive, so take a ref count instead */ cffrml_hold(dn); rcu_read_unlock(); err = dn->transmit(dn, pkt); cffrml_put(dn); return err; } static void cfmuxl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, int phyid) { struct cfmuxl *muxl = container_obj(layr); struct cflayer *layer; rcu_read_lock(); list_for_each_entry_rcu(layer, &muxl->srvl_list, node) { if (cfsrvl_phyid_match(layer, phyid) && layer->ctrlcmd) { if ((ctrl == _CAIF_CTRLCMD_PHYIF_DOWN_IND || ctrl == CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND) && layer->id != 0) cfmuxl_remove_uplayer(layr, layer->id); /* NOTE: ctrlcmd is not allowed to block */ layer->ctrlcmd(layer, ctrl, phyid); } } rcu_read_unlock(); } |
| 6 6 39 57 102 102 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 
522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 
1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 
1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 | // SPDX-License-Identifier: GPL-2.0-or-later /* * ldm - Support for Windows Logical Disk Manager (Dynamic Disks) * * Copyright (C) 2001,2002 Richard Russon <ldm@flatcap.org> * Copyright (c) 2001-2012 Anton Altaparmakov * Copyright (C) 2001,2002 Jakob Kemi <jakob.kemi@telia.com> * * Documentation is available at http://www.linux-ntfs.org/doku.php?id=downloads */ #include <linux/slab.h> #include <linux/pagemap.h> #include <linux/stringify.h> #include <linux/kernel.h> #include <linux/uuid.h> #include <linux/msdos_partition.h> #include "ldm.h" #include "check.h" /* * ldm_debug/info/error/crit - Output an error message * @f: A printf format string containing the message * @...: Variables to substitute into @f * * ldm_debug() writes a DEBUG level message to the syslog but only if the * driver was compiled with debug enabled. Otherwise, the call turns into a NOP. */ #ifndef CONFIG_LDM_DEBUG #define ldm_debug(...) do {} while (0) #else #define ldm_debug(f, a...) _ldm_printk (KERN_DEBUG, __func__, f, ##a) #endif #define ldm_crit(f, a...) _ldm_printk (KERN_CRIT, __func__, f, ##a) #define ldm_error(f, a...) _ldm_printk (KERN_ERR, __func__, f, ##a) #define ldm_info(f, a...) _ldm_printk (KERN_INFO, __func__, f, ##a) static __printf(3, 4) void _ldm_printk(const char *level, const char *function, const char *fmt, ...) 
{ struct va_format vaf; va_list args; va_start (args, fmt); vaf.fmt = fmt; vaf.va = &args; printk("%s%s(): %pV\n", level, function, &vaf); va_end(args); } /** * ldm_parse_privhead - Read the LDM Database PRIVHEAD structure * @data: Raw database PRIVHEAD structure loaded from the device * @ph: In-memory privhead structure in which to return parsed information * * This parses the LDM database PRIVHEAD structure supplied in @data and * sets up the in-memory privhead structure @ph with the obtained information. * * Return: 'true' @ph contains the PRIVHEAD data * 'false' @ph contents are undefined */ static bool ldm_parse_privhead(const u8 *data, struct privhead *ph) { bool is_vista = false; BUG_ON(!data || !ph); if (MAGIC_PRIVHEAD != get_unaligned_be64(data)) { ldm_error("Cannot find PRIVHEAD structure. LDM database is" " corrupt. Aborting."); return false; } ph->ver_major = get_unaligned_be16(data + 0x000C); ph->ver_minor = get_unaligned_be16(data + 0x000E); ph->logical_disk_start = get_unaligned_be64(data + 0x011B); ph->logical_disk_size = get_unaligned_be64(data + 0x0123); ph->config_start = get_unaligned_be64(data + 0x012B); ph->config_size = get_unaligned_be64(data + 0x0133); /* Version 2.11 is Win2k/XP and version 2.12 is Vista. */ if (ph->ver_major == 2 && ph->ver_minor == 12) is_vista = true; if (!is_vista && (ph->ver_major != 2 || ph->ver_minor != 11)) { ldm_error("Expected PRIVHEAD version 2.11 or 2.12, got %d.%d." " Aborting.", ph->ver_major, ph->ver_minor); return false; } ldm_debug("PRIVHEAD version %d.%d (Windows %s).", ph->ver_major, ph->ver_minor, is_vista ? "Vista" : "2000/XP"); if (ph->config_size != LDM_DB_SIZE) { /* 1 MiB in sectors. */ /* Warn the user and continue, carefully. 
*/ ldm_info("Database is normally %u bytes, it claims to " "be %llu bytes.", LDM_DB_SIZE, (unsigned long long)ph->config_size); } if ((ph->logical_disk_size == 0) || (ph->logical_disk_start + ph->logical_disk_size > ph->config_start)) { ldm_error("PRIVHEAD disk size doesn't match real disk size"); return false; } if (uuid_parse(data + 0x0030, &ph->disk_id)) { ldm_error("PRIVHEAD contains an invalid GUID."); return false; } ldm_debug("Parsed PRIVHEAD successfully."); return true; } /** * ldm_parse_tocblock - Read the LDM Database TOCBLOCK structure * @data: Raw database TOCBLOCK structure loaded from the device * @toc: In-memory toc structure in which to return parsed information * * This parses the LDM Database TOCBLOCK (table of contents) structure supplied * in @data and sets up the in-memory tocblock structure @toc with the obtained * information. * * N.B. The *_start and *_size values returned in @toc are not range-checked. * * Return: 'true' @toc contains the TOCBLOCK data * 'false' @toc contents are undefined */ static bool ldm_parse_tocblock (const u8 *data, struct tocblock *toc) { BUG_ON (!data || !toc); if (MAGIC_TOCBLOCK != get_unaligned_be64(data)) { ldm_crit ("Cannot find TOCBLOCK, database may be corrupt."); return false; } strscpy_pad(toc->bitmap1_name, data + 0x24, sizeof(toc->bitmap1_name)); toc->bitmap1_start = get_unaligned_be64(data + 0x2E); toc->bitmap1_size = get_unaligned_be64(data + 0x36); if (strncmp (toc->bitmap1_name, TOC_BITMAP1, sizeof (toc->bitmap1_name)) != 0) { ldm_crit ("TOCBLOCK's first bitmap is '%s', should be '%s'.", TOC_BITMAP1, toc->bitmap1_name); return false; } strscpy_pad(toc->bitmap2_name, data + 0x46, sizeof(toc->bitmap2_name)); toc->bitmap2_start = get_unaligned_be64(data + 0x50); toc->bitmap2_size = get_unaligned_be64(data + 0x58); if (strncmp (toc->bitmap2_name, TOC_BITMAP2, sizeof (toc->bitmap2_name)) != 0) { ldm_crit ("TOCBLOCK's second bitmap is '%s', should be '%s'.", TOC_BITMAP2, toc->bitmap2_name); return false; } 
ldm_debug ("Parsed TOCBLOCK successfully."); return true; } /** * ldm_parse_vmdb - Read the LDM Database VMDB structure * @data: Raw database VMDB structure loaded from the device * @vm: In-memory vmdb structure in which to return parsed information * * This parses the LDM Database VMDB structure supplied in @data and sets up * the in-memory vmdb structure @vm with the obtained information. * * N.B. The *_start, *_size and *_seq values will be range-checked later. * * Return: 'true' @vm contains VMDB info * 'false' @vm contents are undefined */ static bool ldm_parse_vmdb (const u8 *data, struct vmdb *vm) { BUG_ON (!data || !vm); if (MAGIC_VMDB != get_unaligned_be32(data)) { ldm_crit ("Cannot find the VMDB, database may be corrupt."); return false; } vm->ver_major = get_unaligned_be16(data + 0x12); vm->ver_minor = get_unaligned_be16(data + 0x14); if ((vm->ver_major != 4) || (vm->ver_minor != 10)) { ldm_error ("Expected VMDB version %d.%d, got %d.%d. " "Aborting.", 4, 10, vm->ver_major, vm->ver_minor); return false; } vm->vblk_size = get_unaligned_be32(data + 0x08); if (vm->vblk_size == 0) { ldm_error ("Illegal VBLK size"); return false; } vm->vblk_offset = get_unaligned_be32(data + 0x0C); vm->last_vblk_seq = get_unaligned_be32(data + 0x04); ldm_debug ("Parsed VMDB successfully."); return true; } /** * ldm_compare_privheads - Compare two privhead objects * @ph1: First privhead * @ph2: Second privhead * * This compares the two privhead structures @ph1 and @ph2. 
* * Return: 'true' Identical * 'false' Different */ static bool ldm_compare_privheads (const struct privhead *ph1, const struct privhead *ph2) { BUG_ON (!ph1 || !ph2); return ((ph1->ver_major == ph2->ver_major) && (ph1->ver_minor == ph2->ver_minor) && (ph1->logical_disk_start == ph2->logical_disk_start) && (ph1->logical_disk_size == ph2->logical_disk_size) && (ph1->config_start == ph2->config_start) && (ph1->config_size == ph2->config_size) && uuid_equal(&ph1->disk_id, &ph2->disk_id)); } /** * ldm_compare_tocblocks - Compare two tocblock objects * @toc1: First toc * @toc2: Second toc * * This compares the two tocblock structures @toc1 and @toc2. * * Return: 'true' Identical * 'false' Different */ static bool ldm_compare_tocblocks (const struct tocblock *toc1, const struct tocblock *toc2) { BUG_ON (!toc1 || !toc2); return ((toc1->bitmap1_start == toc2->bitmap1_start) && (toc1->bitmap1_size == toc2->bitmap1_size) && (toc1->bitmap2_start == toc2->bitmap2_start) && (toc1->bitmap2_size == toc2->bitmap2_size) && !strncmp (toc1->bitmap1_name, toc2->bitmap1_name, sizeof (toc1->bitmap1_name)) && !strncmp (toc1->bitmap2_name, toc2->bitmap2_name, sizeof (toc1->bitmap2_name))); } /** * ldm_validate_privheads - Compare the primary privhead with its backups * @state: Partition check state including device holding the LDM Database * @ph1: Memory struct to fill with ph contents * * Read and compare all three privheads from disk. * * The privheads on disk show the size and location of the main disk area and * the configuration area (the database). The values are range-checked against * @hd, which contains the real size of the disk. 
* * Return: 'true' Success * 'false' Error */ static bool ldm_validate_privheads(struct parsed_partitions *state, struct privhead *ph1) { static const int off[3] = { OFF_PRIV1, OFF_PRIV2, OFF_PRIV3 }; struct privhead *ph[3] = { ph1 }; Sector sect; u8 *data; bool result = false; long num_sects; int i; BUG_ON (!state || !ph1); ph[1] = kmalloc (sizeof (*ph[1]), GFP_KERNEL); ph[2] = kmalloc (sizeof (*ph[2]), GFP_KERNEL); if (!ph[1] || !ph[2]) { ldm_crit ("Out of memory."); goto out; } /* off[1 & 2] are relative to ph[0]->config_start */ ph[0]->config_start = 0; /* Read and parse privheads */ for (i = 0; i < 3; i++) { data = read_part_sector(state, ph[0]->config_start + off[i], §); if (!data) { ldm_crit ("Disk read failed."); goto out; } result = ldm_parse_privhead (data, ph[i]); put_dev_sector (sect); if (!result) { ldm_error ("Cannot find PRIVHEAD %d.", i+1); /* Log again */ if (i < 2) goto out; /* Already logged */ else break; /* FIXME ignore for now, 3rd PH can fail on odd-sized disks */ } } num_sects = get_capacity(state->disk); if ((ph[0]->config_start > num_sects) || ((ph[0]->config_start + ph[0]->config_size) > num_sects)) { ldm_crit ("Database extends beyond the end of the disk."); goto out; } if ((ph[0]->logical_disk_start > ph[0]->config_start) || ((ph[0]->logical_disk_start + ph[0]->logical_disk_size) > ph[0]->config_start)) { ldm_crit ("Disk and database overlap."); goto out; } if (!ldm_compare_privheads (ph[0], ph[1])) { ldm_crit ("Primary and backup PRIVHEADs don't match."); goto out; } /* FIXME ignore this for now if (!ldm_compare_privheads (ph[0], ph[2])) { ldm_crit ("Primary and backup PRIVHEADs don't match."); goto out; }*/ ldm_debug ("Validated PRIVHEADs successfully."); result = true; out: kfree (ph[1]); kfree (ph[2]); return result; } /** * ldm_validate_tocblocks - Validate the table of contents and its backups * @state: Partition check state including device holding the LDM Database * @base: Offset, into @state->disk, of the database * @ldb: Cache 
of the database structures * * Find and compare the four tables of contents of the LDM Database stored on * @state->disk and return the parsed information into @toc1. * * The offsets and sizes of the configs are range-checked against a privhead. * * Return: 'true' @toc1 contains validated TOCBLOCK info * 'false' @toc1 contents are undefined */ static bool ldm_validate_tocblocks(struct parsed_partitions *state, unsigned long base, struct ldmdb *ldb) { static const int off[4] = { OFF_TOCB1, OFF_TOCB2, OFF_TOCB3, OFF_TOCB4}; struct tocblock *tb[4]; struct privhead *ph; Sector sect; u8 *data; int i, nr_tbs; bool result = false; BUG_ON(!state || !ldb); ph = &ldb->ph; tb[0] = &ldb->toc; tb[1] = kmalloc_array(3, sizeof(*tb[1]), GFP_KERNEL); if (!tb[1]) { ldm_crit("Out of memory."); goto err; } tb[2] = (struct tocblock*)((u8*)tb[1] + sizeof(*tb[1])); tb[3] = (struct tocblock*)((u8*)tb[2] + sizeof(*tb[2])); /* * Try to read and parse all four TOCBLOCKs. * * Windows Vista LDM v2.12 does not always have all four TOCBLOCKs so * skip any that fail as long as we get at least one valid TOCBLOCK. */ for (nr_tbs = i = 0; i < 4; i++) { data = read_part_sector(state, base + off[i], §); if (!data) { ldm_error("Disk read failed for TOCBLOCK %d.", i); continue; } if (ldm_parse_tocblock(data, tb[nr_tbs])) nr_tbs++; put_dev_sector(sect); } if (!nr_tbs) { ldm_crit("Failed to find a valid TOCBLOCK."); goto err; } /* Range check the TOCBLOCK against a privhead. */ if (((tb[0]->bitmap1_start + tb[0]->bitmap1_size) > ph->config_size) || ((tb[0]->bitmap2_start + tb[0]->bitmap2_size) > ph->config_size)) { ldm_crit("The bitmaps are out of range. Giving up."); goto err; } /* Compare all loaded TOCBLOCKs. 
*/ for (i = 1; i < nr_tbs; i++) { if (!ldm_compare_tocblocks(tb[0], tb[i])) { ldm_crit("TOCBLOCKs 0 and %d do not match.", i); goto err; } } ldm_debug("Validated %d TOCBLOCKs successfully.", nr_tbs); result = true; err: kfree(tb[1]); return result; } /** * ldm_validate_vmdb - Read the VMDB and validate it * @state: Partition check state including device holding the LDM Database * @base: Offset, into @bdev, of the database * @ldb: Cache of the database structures * * Find the vmdb of the LDM Database stored on @bdev and return the parsed * information in @ldb. * * Return: 'true' @ldb contains validated VBDB info * 'false' @ldb contents are undefined */ static bool ldm_validate_vmdb(struct parsed_partitions *state, unsigned long base, struct ldmdb *ldb) { Sector sect; u8 *data; bool result = false; struct vmdb *vm; struct tocblock *toc; BUG_ON (!state || !ldb); vm = &ldb->vm; toc = &ldb->toc; data = read_part_sector(state, base + OFF_VMDB, §); if (!data) { ldm_crit ("Disk read failed."); return false; } if (!ldm_parse_vmdb (data, vm)) goto out; /* Already logged */ /* Are there uncommitted transactions? */ if (get_unaligned_be16(data + 0x10) != 0x01) { ldm_crit ("Database is not in a consistent state. Aborting."); goto out; } if (vm->vblk_offset != 512) ldm_info ("VBLKs start at offset 0x%04x.", vm->vblk_offset); /* * The last_vblkd_seq can be before the end of the vmdb, just make sure * it is not out of bounds. */ if ((vm->vblk_size * vm->last_vblk_seq) > (toc->bitmap1_size << 9)) { ldm_crit ("VMDB exceeds allowed size specified by TOCBLOCK. " "Database is corrupt. Aborting."); goto out; } result = true; out: put_dev_sector (sect); return result; } /** * ldm_validate_partition_table - Determine whether bdev might be a dynamic disk * @state: Partition check state including device holding the LDM Database * * This function provides a weak test to decide whether the device is a dynamic * disk or not. 
It looks for an MS-DOS-style partition table containing at * least one partition of type 0x42 (formerly SFS, now used by Windows for * dynamic disks). * * N.B. The only possible error can come from the read_part_sector and that is * only likely to happen if the underlying device is strange. If that IS * the case we should return zero to let someone else try. * * Return: 'true' @state->disk is a dynamic disk * 'false' @state->disk is not a dynamic disk, or an error occurred */ static bool ldm_validate_partition_table(struct parsed_partitions *state) { Sector sect; u8 *data; struct msdos_partition *p; int i; bool result = false; BUG_ON(!state); data = read_part_sector(state, 0, §); if (!data) { ldm_info ("Disk read failed."); return false; } if (*(__le16*) (data + 0x01FE) != cpu_to_le16 (MSDOS_LABEL_MAGIC)) goto out; p = (struct msdos_partition *)(data + 0x01BE); for (i = 0; i < 4; i++, p++) if (p->sys_ind == LDM_PARTITION) { result = true; break; } if (result) ldm_debug ("Found W2K dynamic disk partition type."); out: put_dev_sector (sect); return result; } /** * ldm_get_disk_objid - Search a linked list of vblk's for a given Disk Id * @ldb: Cache of the database structures * * The LDM Database contains a list of all partitions on all dynamic disks. * The primary PRIVHEAD, at the beginning of the physical disk, tells us * the GUID of this disk. This function searches for the GUID in a linked * list of vblk's. 
* * Return: Pointer, A matching vblk was found * NULL, No match, or an error */ static struct vblk * ldm_get_disk_objid (const struct ldmdb *ldb) { struct list_head *item; BUG_ON (!ldb); list_for_each (item, &ldb->v_disk) { struct vblk *v = list_entry (item, struct vblk, list); if (uuid_equal(&v->vblk.disk.disk_id, &ldb->ph.disk_id)) return v; } return NULL; } /** * ldm_create_data_partitions - Create data partitions for this device * @pp: List of the partitions parsed so far * @ldb: Cache of the database structures * * The database contains ALL the partitions for ALL disk groups, so we need to * filter out this specific disk. Using the disk's object id, we can find all * the partitions in the database that belong to this disk. * * Add each partition in our database, to the parsed_partitions structure. * * N.B. This function creates the partitions in the order it finds partition * objects in the linked list. * * Return: 'true' Partition created * 'false' Error, probably a range checking problem */ static bool ldm_create_data_partitions (struct parsed_partitions *pp, const struct ldmdb *ldb) { struct list_head *item; struct vblk *vb; struct vblk *disk; struct vblk_part *part; int part_num = 1; BUG_ON (!pp || !ldb); disk = ldm_get_disk_objid (ldb); if (!disk) { ldm_crit ("Can't find the ID of this disk in the database."); return false; } strlcat(pp->pp_buf, " [LDM]", PAGE_SIZE); /* Create the data partitions */ list_for_each (item, &ldb->v_part) { vb = list_entry (item, struct vblk, list); part = &vb->vblk.part; if (part->disk_id != disk->obj_id) continue; put_partition (pp, part_num, ldb->ph.logical_disk_start + part->start, part->size); part_num++; } strlcat(pp->pp_buf, "\n", PAGE_SIZE); return true; } /** * ldm_relative - Calculate the next relative offset * @buffer: Block of data being worked on * @buflen: Size of the block of data * @base: Size of the previous fixed width fields * @offset: Cumulative size of the previous variable-width fields * * Because many of 
the VBLK fields are variable-width, it's necessary * to calculate each offset based on the previous one and the length * of the field it pointed to. * * Return: -1 Error, the calculated offset exceeded the size of the buffer * n OK, a range-checked offset into buffer */ static int ldm_relative(const u8 *buffer, int buflen, int base, int offset) { base += offset; if (!buffer || offset < 0 || base > buflen) { if (!buffer) ldm_error("!buffer"); if (offset < 0) ldm_error("offset (%d) < 0", offset); if (base > buflen) ldm_error("base (%d) > buflen (%d)", base, buflen); return -1; } if (base + buffer[base] >= buflen) { ldm_error("base (%d) + buffer[base] (%d) >= buflen (%d)", base, buffer[base], buflen); return -1; } return buffer[base] + offset + 1; } /** * ldm_get_vnum - Convert a variable-width, big endian number, into cpu order * @block: Pointer to the variable-width number to convert * * Large numbers in the LDM Database are often stored in a packed format. Each * number is prefixed by a one byte width marker. All numbers in the database * are stored in big-endian byte order. This function reads one of these * numbers and returns the result * * N.B. This function DOES NOT perform any range checking, though the most * it will read is eight bytes. * * Return: n A number * 0 Zero, or an error occurred */ static u64 ldm_get_vnum (const u8 *block) { u64 tmp = 0; u8 length; BUG_ON (!block); length = *block++; if (length && length <= 8) while (length--) tmp = (tmp << 8) | *block++; else ldm_error ("Illegal length %d.", length); return tmp; } /** * ldm_get_vstr - Read a length-prefixed string into a buffer * @block: Pointer to the length marker * @buffer: Location to copy string to * @buflen: Size of the output buffer * * Many of the strings in the LDM Database are not NULL terminated. Instead * they are prefixed by a one byte length marker. This function copies one of * these strings into a buffer. * * N.B. This function DOES NOT perform any range checking on the input. 
* If the buffer is too small, the output will be truncated. * * Return: 0, Error and @buffer contents are undefined * n, String length in characters (excluding NULL) * buflen-1, String was truncated. */ static int ldm_get_vstr (const u8 *block, u8 *buffer, int buflen) { int length; BUG_ON (!block || !buffer); length = block[0]; if (length >= buflen) { ldm_error ("Truncating string %d -> %d.", length, buflen); length = buflen - 1; } memcpy (buffer, block + 1, length); buffer[length] = 0; return length; } /** * ldm_parse_cmp3 - Read a raw VBLK Component object into a vblk structure * @buffer: Block of data being worked on * @buflen: Size of the block of data * @vb: In-memory vblk in which to return information * * Read a raw VBLK Component object (version 3) into a vblk structure. * * Return: 'true' @vb contains a Component VBLK * 'false' @vb contents are not defined */ static bool ldm_parse_cmp3 (const u8 *buffer, int buflen, struct vblk *vb) { int r_objid, r_name, r_vstate, r_child, r_parent, r_stripe, r_cols, len; struct vblk_comp *comp; BUG_ON (!buffer || !vb); r_objid = ldm_relative (buffer, buflen, 0x18, 0); r_name = ldm_relative (buffer, buflen, 0x18, r_objid); r_vstate = ldm_relative (buffer, buflen, 0x18, r_name); r_child = ldm_relative (buffer, buflen, 0x1D, r_vstate); r_parent = ldm_relative (buffer, buflen, 0x2D, r_child); if (buffer[0x12] & VBLK_FLAG_COMP_STRIPE) { r_stripe = ldm_relative (buffer, buflen, 0x2E, r_parent); r_cols = ldm_relative (buffer, buflen, 0x2E, r_stripe); len = r_cols; } else { r_stripe = 0; len = r_parent; } if (len < 0) return false; len += VBLK_SIZE_CMP3; if (len != get_unaligned_be32(buffer + 0x14)) return false; comp = &vb->vblk.comp; ldm_get_vstr (buffer + 0x18 + r_name, comp->state, sizeof (comp->state)); comp->type = buffer[0x18 + r_vstate]; comp->children = ldm_get_vnum (buffer + 0x1D + r_vstate); comp->parent_id = ldm_get_vnum (buffer + 0x2D + r_child); comp->chunksize = r_stripe ? 
ldm_get_vnum (buffer+r_parent+0x2E) : 0; return true; } /** * ldm_parse_dgr3 - Read a raw VBLK Disk Group object into a vblk structure * @buffer: Block of data being worked on * @buflen: Size of the block of data * @vb: In-memory vblk in which to return information * * Read a raw VBLK Disk Group object (version 3) into a vblk structure. * * Return: 'true' @vb contains a Disk Group VBLK * 'false' @vb contents are not defined */ static int ldm_parse_dgr3 (const u8 *buffer, int buflen, struct vblk *vb) { int r_objid, r_name, r_diskid, r_id1, r_id2, len; struct vblk_dgrp *dgrp; BUG_ON (!buffer || !vb); r_objid = ldm_relative (buffer, buflen, 0x18, 0); r_name = ldm_relative (buffer, buflen, 0x18, r_objid); r_diskid = ldm_relative (buffer, buflen, 0x18, r_name); if (buffer[0x12] & VBLK_FLAG_DGR3_IDS) { r_id1 = ldm_relative (buffer, buflen, 0x24, r_diskid); r_id2 = ldm_relative (buffer, buflen, 0x24, r_id1); len = r_id2; } else len = r_diskid; if (len < 0) return false; len += VBLK_SIZE_DGR3; if (len != get_unaligned_be32(buffer + 0x14)) return false; dgrp = &vb->vblk.dgrp; ldm_get_vstr (buffer + 0x18 + r_name, dgrp->disk_id, sizeof (dgrp->disk_id)); return true; } /** * ldm_parse_dgr4 - Read a raw VBLK Disk Group object into a vblk structure * @buffer: Block of data being worked on * @buflen: Size of the block of data * @vb: In-memory vblk in which to return information * * Read a raw VBLK Disk Group object (version 4) into a vblk structure. 
* * Return: 'true' @vb contains a Disk Group VBLK * 'false' @vb contents are not defined */ static bool ldm_parse_dgr4 (const u8 *buffer, int buflen, struct vblk *vb) { char buf[64]; int r_objid, r_name, r_id1, r_id2, len; BUG_ON (!buffer || !vb); r_objid = ldm_relative (buffer, buflen, 0x18, 0); r_name = ldm_relative (buffer, buflen, 0x18, r_objid); if (buffer[0x12] & VBLK_FLAG_DGR4_IDS) { r_id1 = ldm_relative (buffer, buflen, 0x44, r_name); r_id2 = ldm_relative (buffer, buflen, 0x44, r_id1); len = r_id2; } else len = r_name; if (len < 0) return false; len += VBLK_SIZE_DGR4; if (len != get_unaligned_be32(buffer + 0x14)) return false; ldm_get_vstr (buffer + 0x18 + r_objid, buf, sizeof (buf)); return true; } /** * ldm_parse_dsk3 - Read a raw VBLK Disk object into a vblk structure * @buffer: Block of data being worked on * @buflen: Size of the block of data * @vb: In-memory vblk in which to return information * * Read a raw VBLK Disk object (version 3) into a vblk structure. * * Return: 'true' @vb contains a Disk VBLK * 'false' @vb contents are not defined */ static bool ldm_parse_dsk3 (const u8 *buffer, int buflen, struct vblk *vb) { int r_objid, r_name, r_diskid, r_altname, len; struct vblk_disk *disk; BUG_ON (!buffer || !vb); r_objid = ldm_relative (buffer, buflen, 0x18, 0); r_name = ldm_relative (buffer, buflen, 0x18, r_objid); r_diskid = ldm_relative (buffer, buflen, 0x18, r_name); r_altname = ldm_relative (buffer, buflen, 0x18, r_diskid); len = r_altname; if (len < 0) return false; len += VBLK_SIZE_DSK3; if (len != get_unaligned_be32(buffer + 0x14)) return false; disk = &vb->vblk.disk; ldm_get_vstr (buffer + 0x18 + r_diskid, disk->alt_name, sizeof (disk->alt_name)); if (uuid_parse(buffer + 0x19 + r_name, &disk->disk_id)) return false; return true; } /** * ldm_parse_dsk4 - Read a raw VBLK Disk object into a vblk structure * @buffer: Block of data being worked on * @buflen: Size of the block of data * @vb: In-memory vblk in which to return information * * Read a 
raw VBLK Disk object (version 4) into a vblk structure. * * Return: 'true' @vb contains a Disk VBLK * 'false' @vb contents are not defined */ static bool ldm_parse_dsk4 (const u8 *buffer, int buflen, struct vblk *vb) { int r_objid, r_name, len; struct vblk_disk *disk; BUG_ON (!buffer || !vb); r_objid = ldm_relative (buffer, buflen, 0x18, 0); r_name = ldm_relative (buffer, buflen, 0x18, r_objid); len = r_name; if (len < 0) return false; len += VBLK_SIZE_DSK4; if (len != get_unaligned_be32(buffer + 0x14)) return false; disk = &vb->vblk.disk; import_uuid(&disk->disk_id, buffer + 0x18 + r_name); return true; } /** * ldm_parse_prt3 - Read a raw VBLK Partition object into a vblk structure * @buffer: Block of data being worked on * @buflen: Size of the block of data * @vb: In-memory vblk in which to return information * * Read a raw VBLK Partition object (version 3) into a vblk structure. * * Return: 'true' @vb contains a Partition VBLK * 'false' @vb contents are not defined */ static bool ldm_parse_prt3(const u8 *buffer, int buflen, struct vblk *vb) { int r_objid, r_name, r_size, r_parent, r_diskid, r_index, len; struct vblk_part *part; BUG_ON(!buffer || !vb); r_objid = ldm_relative(buffer, buflen, 0x18, 0); if (r_objid < 0) { ldm_error("r_objid %d < 0", r_objid); return false; } r_name = ldm_relative(buffer, buflen, 0x18, r_objid); if (r_name < 0) { ldm_error("r_name %d < 0", r_name); return false; } r_size = ldm_relative(buffer, buflen, 0x34, r_name); if (r_size < 0) { ldm_error("r_size %d < 0", r_size); return false; } r_parent = ldm_relative(buffer, buflen, 0x34, r_size); if (r_parent < 0) { ldm_error("r_parent %d < 0", r_parent); return false; } r_diskid = ldm_relative(buffer, buflen, 0x34, r_parent); if (r_diskid < 0) { ldm_error("r_diskid %d < 0", r_diskid); return false; } if (buffer[0x12] & VBLK_FLAG_PART_INDEX) { r_index = ldm_relative(buffer, buflen, 0x34, r_diskid); if (r_index < 0) { ldm_error("r_index %d < 0", r_index); return false; } len = r_index; } else 
len = r_diskid; if (len < 0) { ldm_error("len %d < 0", len); return false; } len += VBLK_SIZE_PRT3; if (len > get_unaligned_be32(buffer + 0x14)) { ldm_error("len %d > BE32(buffer + 0x14) %d", len, get_unaligned_be32(buffer + 0x14)); return false; } part = &vb->vblk.part; part->start = get_unaligned_be64(buffer + 0x24 + r_name); part->volume_offset = get_unaligned_be64(buffer + 0x2C + r_name); part->size = ldm_get_vnum(buffer + 0x34 + r_name); part->parent_id = ldm_get_vnum(buffer + 0x34 + r_size); part->disk_id = ldm_get_vnum(buffer + 0x34 + r_parent); if (vb->flags & VBLK_FLAG_PART_INDEX) part->partnum = buffer[0x35 + r_diskid]; else part->partnum = 0; return true; } /** * ldm_parse_vol5 - Read a raw VBLK Volume object into a vblk structure * @buffer: Block of data being worked on * @buflen: Size of the block of data * @vb: In-memory vblk in which to return information * * Read a raw VBLK Volume object (version 5) into a vblk structure. * * Return: 'true' @vb contains a Volume VBLK * 'false' @vb contents are not defined */ static bool ldm_parse_vol5(const u8 *buffer, int buflen, struct vblk *vb) { int r_objid, r_name, r_vtype, r_disable_drive_letter, r_child, r_size; int r_id1, r_id2, r_size2, r_drive, len; struct vblk_volu *volu; BUG_ON(!buffer || !vb); r_objid = ldm_relative(buffer, buflen, 0x18, 0); if (r_objid < 0) { ldm_error("r_objid %d < 0", r_objid); return false; } r_name = ldm_relative(buffer, buflen, 0x18, r_objid); if (r_name < 0) { ldm_error("r_name %d < 0", r_name); return false; } r_vtype = ldm_relative(buffer, buflen, 0x18, r_name); if (r_vtype < 0) { ldm_error("r_vtype %d < 0", r_vtype); return false; } r_disable_drive_letter = ldm_relative(buffer, buflen, 0x18, r_vtype); if (r_disable_drive_letter < 0) { ldm_error("r_disable_drive_letter %d < 0", r_disable_drive_letter); return false; } r_child = ldm_relative(buffer, buflen, 0x2D, r_disable_drive_letter); if (r_child < 0) { ldm_error("r_child %d < 0", r_child); return false; } r_size = 
ldm_relative(buffer, buflen, 0x3D, r_child); if (r_size < 0) { ldm_error("r_size %d < 0", r_size); return false; } if (buffer[0x12] & VBLK_FLAG_VOLU_ID1) { r_id1 = ldm_relative(buffer, buflen, 0x52, r_size); if (r_id1 < 0) { ldm_error("r_id1 %d < 0", r_id1); return false; } } else r_id1 = r_size; if (buffer[0x12] & VBLK_FLAG_VOLU_ID2) { r_id2 = ldm_relative(buffer, buflen, 0x52, r_id1); if (r_id2 < 0) { ldm_error("r_id2 %d < 0", r_id2); return false; } } else r_id2 = r_id1; if (buffer[0x12] & VBLK_FLAG_VOLU_SIZE) { r_size2 = ldm_relative(buffer, buflen, 0x52, r_id2); if (r_size2 < 0) { ldm_error("r_size2 %d < 0", r_size2); return false; } } else r_size2 = r_id2; if (buffer[0x12] & VBLK_FLAG_VOLU_DRIVE) { r_drive = ldm_relative(buffer, buflen, 0x52, r_size2); if (r_drive < 0) { ldm_error("r_drive %d < 0", r_drive); return false; } } else r_drive = r_size2; len = r_drive; if (len < 0) { ldm_error("len %d < 0", len); return false; } len += VBLK_SIZE_VOL5; if (len > get_unaligned_be32(buffer + 0x14)) { ldm_error("len %d > BE32(buffer + 0x14) %d", len, get_unaligned_be32(buffer + 0x14)); return false; } volu = &vb->vblk.volu; ldm_get_vstr(buffer + 0x18 + r_name, volu->volume_type, sizeof(volu->volume_type)); memcpy(volu->volume_state, buffer + 0x18 + r_disable_drive_letter, sizeof(volu->volume_state)); volu->size = ldm_get_vnum(buffer + 0x3D + r_child); volu->partition_type = buffer[0x41 + r_size]; memcpy(volu->guid, buffer + 0x42 + r_size, sizeof(volu->guid)); if (buffer[0x12] & VBLK_FLAG_VOLU_DRIVE) { ldm_get_vstr(buffer + 0x52 + r_size, volu->drive_hint, sizeof(volu->drive_hint)); } return true; } /** * ldm_parse_vblk - Read a raw VBLK object into a vblk structure * @buf: Block of data being worked on * @len: Size of the block of data * @vb: In-memory vblk in which to return information * * Read a raw VBLK object into a vblk structure. 
This function just reads the * information common to all VBLK types, then delegates the rest of the work to * helper functions: ldm_parse_*. * * Return: 'true' @vb contains a VBLK * 'false' @vb contents are not defined */ static bool ldm_parse_vblk (const u8 *buf, int len, struct vblk *vb) { bool result = false; int r_objid; BUG_ON (!buf || !vb); r_objid = ldm_relative (buf, len, 0x18, 0); if (r_objid < 0) { ldm_error ("VBLK header is corrupt."); return false; } vb->flags = buf[0x12]; vb->type = buf[0x13]; vb->obj_id = ldm_get_vnum (buf + 0x18); ldm_get_vstr (buf+0x18+r_objid, vb->name, sizeof (vb->name)); switch (vb->type) { case VBLK_CMP3: result = ldm_parse_cmp3 (buf, len, vb); break; case VBLK_DSK3: result = ldm_parse_dsk3 (buf, len, vb); break; case VBLK_DSK4: result = ldm_parse_dsk4 (buf, len, vb); break; case VBLK_DGR3: result = ldm_parse_dgr3 (buf, len, vb); break; case VBLK_DGR4: result = ldm_parse_dgr4 (buf, len, vb); break; case VBLK_PRT3: result = ldm_parse_prt3 (buf, len, vb); break; case VBLK_VOL5: result = ldm_parse_vol5 (buf, len, vb); break; } if (result) ldm_debug ("Parsed VBLK 0x%llx (type: 0x%02x) ok.", (unsigned long long) vb->obj_id, vb->type); else ldm_error ("Failed to parse VBLK 0x%llx (type: 0x%02x).", (unsigned long long) vb->obj_id, vb->type); return result; } /** * ldm_ldmdb_add - Adds a raw VBLK entry to the ldmdb database * @data: Raw VBLK to add to the database * @len: Size of the raw VBLK * @ldb: Cache of the database structures * * The VBLKs are sorted into categories. Partitions are also sorted by offset. * * N.B. This function does not check the validity of the VBLKs. 
* * Return: 'true' The VBLK was added * 'false' An error occurred */ static bool ldm_ldmdb_add (u8 *data, int len, struct ldmdb *ldb) { struct vblk *vb; struct list_head *item; BUG_ON (!data || !ldb); vb = kmalloc (sizeof (*vb), GFP_KERNEL); if (!vb) { ldm_crit ("Out of memory."); return false; } if (!ldm_parse_vblk (data, len, vb)) { kfree(vb); return false; /* Already logged */ } /* Put vblk into the correct list. */ switch (vb->type) { case VBLK_DGR3: case VBLK_DGR4: list_add (&vb->list, &ldb->v_dgrp); break; case VBLK_DSK3: case VBLK_DSK4: list_add (&vb->list, &ldb->v_disk); break; case VBLK_VOL5: list_add (&vb->list, &ldb->v_volu); break; case VBLK_CMP3: list_add (&vb->list, &ldb->v_comp); break; case VBLK_PRT3: /* Sort by the partition's start sector. */ list_for_each (item, &ldb->v_part) { struct vblk *v = list_entry (item, struct vblk, list); if ((v->vblk.part.disk_id == vb->vblk.part.disk_id) && (v->vblk.part.start > vb->vblk.part.start)) { list_add_tail (&vb->list, &v->list); return true; } } list_add_tail (&vb->list, &ldb->v_part); break; } return true; } /** * ldm_frag_add - Add a VBLK fragment to a list * @data: Raw fragment to be added to the list * @size: Size of the raw fragment * @frags: Linked list of VBLK fragments * * Fragmented VBLKs may not be consecutive in the database, so they are placed * in a list so they can be pieced together later. 
* * Return: 'true' Success, the VBLK was added to the list * 'false' Error, a problem occurred */ static bool ldm_frag_add (const u8 *data, int size, struct list_head *frags) { struct frag *f; struct list_head *item; int rec, num, group; BUG_ON (!data || !frags); if (size < 2 * VBLK_SIZE_HEAD) { ldm_error("Value of size is too small."); return false; } group = get_unaligned_be32(data + 0x08); rec = get_unaligned_be16(data + 0x0C); num = get_unaligned_be16(data + 0x0E); if ((num < 1) || (num > 4)) { ldm_error ("A VBLK claims to have %d parts.", num); return false; } if (rec >= num) { ldm_error("REC value (%d) exceeds NUM value (%d)", rec, num); return false; } list_for_each (item, frags) { f = list_entry (item, struct frag, list); if (f->group == group) goto found; } f = kmalloc (sizeof (*f) + size*num, GFP_KERNEL); if (!f) { ldm_crit ("Out of memory."); return false; } f->group = group; f->num = num; f->rec = rec; f->map = 0xFF << num; list_add_tail (&f->list, frags); found: if (rec >= f->num) { ldm_error("REC value (%d) exceeds NUM value (%d)", rec, f->num); return false; } if (f->map & (1 << rec)) { ldm_error ("Duplicate VBLK, part %d.", rec); f->map &= 0x7F; /* Mark the group as broken */ return false; } f->map |= (1 << rec); if (!rec) memcpy(f->data, data, VBLK_SIZE_HEAD); data += VBLK_SIZE_HEAD; size -= VBLK_SIZE_HEAD; memcpy(f->data + VBLK_SIZE_HEAD + rec * size, data, size); return true; } /** * ldm_frag_free - Free a linked list of VBLK fragments * @list: Linked list of fragments * * Free a linked list of VBLK fragments * * Return: none */ static void ldm_frag_free (struct list_head *list) { struct list_head *item, *tmp; BUG_ON (!list); list_for_each_safe (item, tmp, list) kfree (list_entry (item, struct frag, list)); } /** * ldm_frag_commit - Validate fragmented VBLKs and add them to the database * @frags: Linked list of VBLK fragments * @ldb: Cache of the database structures * * Now that all the fragmented VBLKs have been collected, they must be added to 
* the database for later use. * * Return: 'true' All the fragments we added successfully * 'false' One or more of the fragments we invalid */ static bool ldm_frag_commit (struct list_head *frags, struct ldmdb *ldb) { struct frag *f; struct list_head *item; BUG_ON (!frags || !ldb); list_for_each (item, frags) { f = list_entry (item, struct frag, list); if (f->map != 0xFF) { ldm_error ("VBLK group %d is incomplete (0x%02x).", f->group, f->map); return false; } if (!ldm_ldmdb_add (f->data, f->num*ldb->vm.vblk_size, ldb)) return false; /* Already logged */ } return true; } /** * ldm_get_vblks - Read the on-disk database of VBLKs into memory * @state: Partition check state including device holding the LDM Database * @base: Offset, into @state->disk, of the database * @ldb: Cache of the database structures * * To use the information from the VBLKs, they need to be read from the disk, * unpacked and validated. We cache them in @ldb according to their type. * * Return: 'true' All the VBLKs were read successfully * 'false' An error occurred */ static bool ldm_get_vblks(struct parsed_partitions *state, unsigned long base, struct ldmdb *ldb) { int size, perbuf, skip, finish, s, v, recs; u8 *data = NULL; Sector sect; bool result = false; LIST_HEAD (frags); BUG_ON(!state || !ldb); size = ldb->vm.vblk_size; perbuf = 512 / size; skip = ldb->vm.vblk_offset >> 9; /* Bytes to sectors */ finish = (size * ldb->vm.last_vblk_seq) >> 9; for (s = skip; s < finish; s++) { /* For each sector */ data = read_part_sector(state, base + OFF_VMDB + s, §); if (!data) { ldm_crit ("Disk read failed."); goto out; } for (v = 0; v < perbuf; v++, data+=size) { /* For each vblk */ if (MAGIC_VBLK != get_unaligned_be32(data)) { ldm_error ("Expected to find a VBLK."); goto out; } recs = get_unaligned_be16(data + 0x0E); /* Number of records */ if (recs == 1) { if (!ldm_ldmdb_add (data, size, ldb)) goto out; /* Already logged */ } else if (recs > 1) { if (!ldm_frag_add (data, size, &frags)) goto out; /* 
Already logged */ } /* else Record is not in use, ignore it. */ } put_dev_sector (sect); data = NULL; } result = ldm_frag_commit (&frags, ldb); /* Failures, already logged */ out: if (data) put_dev_sector (sect); ldm_frag_free (&frags); return result; } /** * ldm_free_vblks - Free a linked list of vblk's * @lh: Head of a linked list of struct vblk * * Free a list of vblk's and free the memory used to maintain the list. * * Return: none */ static void ldm_free_vblks (struct list_head *lh) { struct list_head *item, *tmp; BUG_ON (!lh); list_for_each_safe (item, tmp, lh) kfree (list_entry (item, struct vblk, list)); } /** * ldm_partition - Find out whether a device is a dynamic disk and handle it * @state: Partition check state including device holding the LDM Database * * This determines whether the device @bdev is a dynamic disk and if so creates * the partitions necessary in the gendisk structure pointed to by @hd. * * We create a dummy device 1, which contains the LDM database, and then create * each partition described by the LDM database in sequence as devices 2+. For * example, if the device is hda, we would have: hda1: LDM database, hda2, hda3, * and so on: the actual data containing partitions. * * Return: 1 Success, @state->disk is a dynamic disk and we handled it * 0 Success, @state->disk is not a dynamic disk * -1 An error occurred before enough information had been read * Or @state->disk is a dynamic disk, but it may be corrupted */ int ldm_partition(struct parsed_partitions *state) { struct ldmdb *ldb; unsigned long base; int result = -1; BUG_ON(!state); /* Look for signs of a Dynamic Disk */ if (!ldm_validate_partition_table(state)) return 0; ldb = kmalloc (sizeof (*ldb), GFP_KERNEL); if (!ldb) { ldm_crit ("Out of memory."); goto out; } /* Parse and check privheads. */ if (!ldm_validate_privheads(state, &ldb->ph)) goto out; /* Already logged */ /* All further references are relative to base (database start). 
*/ base = ldb->ph.config_start; /* Parse and check tocs and vmdb. */ if (!ldm_validate_tocblocks(state, base, ldb) || !ldm_validate_vmdb(state, base, ldb)) goto out; /* Already logged */ /* Initialize vblk lists in ldmdb struct */ INIT_LIST_HEAD (&ldb->v_dgrp); INIT_LIST_HEAD (&ldb->v_disk); INIT_LIST_HEAD (&ldb->v_volu); INIT_LIST_HEAD (&ldb->v_comp); INIT_LIST_HEAD (&ldb->v_part); if (!ldm_get_vblks(state, base, ldb)) { ldm_crit ("Failed to read the VBLKs from the database."); goto cleanup; } /* Finally, create the data partition devices. */ if (ldm_create_data_partitions(state, ldb)) { ldm_debug ("Parsed LDM database successfully."); result = 1; } /* else Already logged */ cleanup: ldm_free_vblks (&ldb->v_dgrp); ldm_free_vblks (&ldb->v_disk); ldm_free_vblks (&ldb->v_volu); ldm_free_vblks (&ldb->v_comp); ldm_free_vblks (&ldb->v_part); out: kfree (ldb); return result; } |
| 2 6847 2691 508 229 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 
522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_JIFFIES_H #define _LINUX_JIFFIES_H #include <linux/cache.h> #include <linux/limits.h> #include <linux/math64.h> #include <linux/minmax.h> #include <linux/types.h> #include <linux/time.h> #include <linux/timex.h> #include <vdso/jiffies.h> #include <asm/param.h> /* for HZ */ #include <generated/timeconst.h> /* * The following defines establish the engineering parameters of the PLL * model. The HZ variable establishes the timer interrupt frequency, 100 Hz * for the SunOS kernel, 256 Hz for the Ultrix kernel and 1024 Hz for the * OSF/1 kernel. The SHIFT_HZ define expresses the same value as the * nearest power of two in order to avoid hardware multiply operations. */ #if HZ >= 12 && HZ < 24 # define SHIFT_HZ 4 #elif HZ >= 24 && HZ < 48 # define SHIFT_HZ 5 #elif HZ >= 48 && HZ < 96 # define SHIFT_HZ 6 #elif HZ >= 96 && HZ < 192 # define SHIFT_HZ 7 #elif HZ >= 192 && HZ < 384 # define SHIFT_HZ 8 #elif HZ >= 384 && HZ < 768 # define SHIFT_HZ 9 #elif HZ >= 768 && HZ < 1536 # define SHIFT_HZ 10 #elif HZ >= 1536 && HZ < 3072 # define SHIFT_HZ 11 #elif HZ >= 3072 && HZ < 6144 # define SHIFT_HZ 12 #elif HZ >= 6144 && HZ < 12288 # define SHIFT_HZ 13 #else # error Invalid value of HZ. #endif /* Suppose we want to divide two numbers NOM and DEN: NOM/DEN, then we can * improve accuracy by shifting LSH bits, hence calculating: * (NOM << LSH) / DEN * This however means trouble for large NOM, because (NOM << LSH) may no * longer fit in 32 bits. 
The following way of calculating this gives us * some slack, under the following conditions: * - (NOM / DEN) fits in (32 - LSH) bits. * - (NOM % DEN) fits in (32 - LSH) bits. */ #define SH_DIV(NOM,DEN,LSH) ( (((NOM) / (DEN)) << (LSH)) \ + ((((NOM) % (DEN)) << (LSH)) + (DEN) / 2) / (DEN)) /* LATCH is used in the interval timer and ftape setup. */ #define LATCH ((CLOCK_TICK_RATE + HZ/2) / HZ) /* For divider */ extern void register_refined_jiffies(long clock_tick_rate); /* TICK_USEC is the time between ticks in usec */ #define TICK_USEC ((USEC_PER_SEC + HZ/2) / HZ) /* USER_TICK_USEC is the time between ticks in usec assuming fake USER_HZ */ #define USER_TICK_USEC ((1000000UL + USER_HZ/2) / USER_HZ) #ifndef __jiffy_arch_data #define __jiffy_arch_data #endif /* * The 64-bit value is not atomic on 32-bit systems - you MUST NOT read it * without sampling the sequence number in jiffies_lock. * get_jiffies_64() will do this for you as appropriate. * * jiffies and jiffies_64 are at the same address for little-endian systems * and for 64-bit big-endian systems. * On 32-bit big-endian systems, jiffies is the lower 32 bits of jiffies_64 * (i.e., at address @jiffies_64 + 4). * See arch/ARCH/kernel/vmlinux.lds.S */ extern u64 __cacheline_aligned_in_smp jiffies_64; extern unsigned long volatile __cacheline_aligned_in_smp __jiffy_arch_data jiffies; #if (BITS_PER_LONG < 64) u64 get_jiffies_64(void); #else /** * get_jiffies_64 - read the 64-bit non-atomic jiffies_64 value * * When BITS_PER_LONG < 64, this uses sequence number sampling using * jiffies_lock to protect the 64-bit read. * * Return: current 64-bit jiffies value */ static inline u64 get_jiffies_64(void) { return (u64)jiffies; } #endif /** * DOC: General information about time_* inlines * * These inlines deal with timer wrapping correctly. You are strongly encouraged * to use them: * * #. Because people otherwise forget * #. Because if the timer wrap changes in future you won't have to alter your * driver code. 
*/ /** * time_after - returns true if the time a is after time b. * @a: first comparable as unsigned long * @b: second comparable as unsigned long * * Do this with "<0" and ">=0" to only test the sign of the result. A * good compiler would generate better code (and a really good compiler * wouldn't care). Gcc is currently neither. * * Return: %true is time a is after time b, otherwise %false. */ #define time_after(a,b) \ (typecheck(unsigned long, a) && \ typecheck(unsigned long, b) && \ ((long)((b) - (a)) < 0)) /** * time_before - returns true if the time a is before time b. * @a: first comparable as unsigned long * @b: second comparable as unsigned long * * Return: %true is time a is before time b, otherwise %false. */ #define time_before(a,b) time_after(b,a) /** * time_after_eq - returns true if the time a is after or the same as time b. * @a: first comparable as unsigned long * @b: second comparable as unsigned long * * Return: %true is time a is after or the same as time b, otherwise %false. */ #define time_after_eq(a,b) \ (typecheck(unsigned long, a) && \ typecheck(unsigned long, b) && \ ((long)((a) - (b)) >= 0)) /** * time_before_eq - returns true if the time a is before or the same as time b. * @a: first comparable as unsigned long * @b: second comparable as unsigned long * * Return: %true is time a is before or the same as time b, otherwise %false. */ #define time_before_eq(a,b) time_after_eq(b,a) /** * time_in_range - Calculate whether a is in the range of [b, c]. * @a: time to test * @b: beginning of the range * @c: end of the range * * Return: %true is time a is in the range [b, c], otherwise %false. */ #define time_in_range(a,b,c) \ (time_after_eq(a,b) && \ time_before_eq(a,c)) /** * time_in_range_open - Calculate whether a is in the range of [b, c). * @a: time to test * @b: beginning of the range * @c: end of the range * * Return: %true is time a is in the range [b, c), otherwise %false. 
*/ #define time_in_range_open(a,b,c) \ (time_after_eq(a,b) && \ time_before(a,c)) /* Same as above, but does so with platform independent 64bit types. * These must be used when utilizing jiffies_64 (i.e. return value of * get_jiffies_64()). */ /** * time_after64 - returns true if the time a is after time b. * @a: first comparable as __u64 * @b: second comparable as __u64 * * This must be used when utilizing jiffies_64 (i.e. return value of * get_jiffies_64()). * * Return: %true is time a is after time b, otherwise %false. */ #define time_after64(a,b) \ (typecheck(__u64, a) && \ typecheck(__u64, b) && \ ((__s64)((b) - (a)) < 0)) /** * time_before64 - returns true if the time a is before time b. * @a: first comparable as __u64 * @b: second comparable as __u64 * * This must be used when utilizing jiffies_64 (i.e. return value of * get_jiffies_64()). * * Return: %true is time a is before time b, otherwise %false. */ #define time_before64(a,b) time_after64(b,a) /** * time_after_eq64 - returns true if the time a is after or the same as time b. * @a: first comparable as __u64 * @b: second comparable as __u64 * * This must be used when utilizing jiffies_64 (i.e. return value of * get_jiffies_64()). * * Return: %true is time a is after or the same as time b, otherwise %false. */ #define time_after_eq64(a,b) \ (typecheck(__u64, a) && \ typecheck(__u64, b) && \ ((__s64)((a) - (b)) >= 0)) /** * time_before_eq64 - returns true if the time a is before or the same as time b. * @a: first comparable as __u64 * @b: second comparable as __u64 * * This must be used when utilizing jiffies_64 (i.e. return value of * get_jiffies_64()). * * Return: %true is time a is before or the same as time b, otherwise %false. */ #define time_before_eq64(a,b) time_after_eq64(b,a) /** * time_in_range64 - Calculate whether a is in the range of [b, c]. * @a: time to test * @b: beginning of the range * @c: end of the range * * Return: %true is time a is in the range [b, c], otherwise %false. 
*/ #define time_in_range64(a, b, c) \ (time_after_eq64(a, b) && \ time_before_eq64(a, c)) /* * These eight macros compare jiffies[_64] and 'a' for convenience. */ /** * time_is_before_jiffies - return true if a is before jiffies * @a: time (unsigned long) to compare to jiffies * * Return: %true is time a is before jiffies, otherwise %false. */ #define time_is_before_jiffies(a) time_after(jiffies, a) /** * time_is_before_jiffies64 - return true if a is before jiffies_64 * @a: time (__u64) to compare to jiffies_64 * * Return: %true is time a is before jiffies_64, otherwise %false. */ #define time_is_before_jiffies64(a) time_after64(get_jiffies_64(), a) /** * time_is_after_jiffies - return true if a is after jiffies * @a: time (unsigned long) to compare to jiffies * * Return: %true is time a is after jiffies, otherwise %false. */ #define time_is_after_jiffies(a) time_before(jiffies, a) /** * time_is_after_jiffies64 - return true if a is after jiffies_64 * @a: time (__u64) to compare to jiffies_64 * * Return: %true is time a is after jiffies_64, otherwise %false. */ #define time_is_after_jiffies64(a) time_before64(get_jiffies_64(), a) /** * time_is_before_eq_jiffies - return true if a is before or equal to jiffies * @a: time (unsigned long) to compare to jiffies * * Return: %true is time a is before or the same as jiffies, otherwise %false. */ #define time_is_before_eq_jiffies(a) time_after_eq(jiffies, a) /** * time_is_before_eq_jiffies64 - return true if a is before or equal to jiffies_64 * @a: time (__u64) to compare to jiffies_64 * * Return: %true is time a is before or the same jiffies_64, otherwise %false. */ #define time_is_before_eq_jiffies64(a) time_after_eq64(get_jiffies_64(), a) /** * time_is_after_eq_jiffies - return true if a is after or equal to jiffies * @a: time (unsigned long) to compare to jiffies * * Return: %true is time a is after or the same as jiffies, otherwise %false. 
*/ #define time_is_after_eq_jiffies(a) time_before_eq(jiffies, a) /** * time_is_after_eq_jiffies64 - return true if a is after or equal to jiffies_64 * @a: time (__u64) to compare to jiffies_64 * * Return: %true is time a is after or the same as jiffies_64, otherwise %false. */ #define time_is_after_eq_jiffies64(a) time_before_eq64(get_jiffies_64(), a) /* * Have the 32-bit jiffies value wrap 5 minutes after boot * so jiffies wrap bugs show up earlier. */ #define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-300*HZ)) /* * Change timeval to jiffies, trying to avoid the * most obvious overflows.. * * And some not so obvious. * * Note that we don't want to return LONG_MAX, because * for various timeout reasons we often end up having * to wait "jiffies+1" in order to guarantee that we wait * at _least_ "jiffies" - so "jiffies+1" had better still * be positive. */ #define MAX_JIFFY_OFFSET ((LONG_MAX >> 1)-1) extern unsigned long preset_lpj; /* * We want to do realistic conversions of time so we need to use the same * values the update wall clock code uses as the jiffies size. This value * is: TICK_NSEC (which is defined in timex.h). This * is a constant and is in nanoseconds. We will use scaled math * with a set of scales defined here as SEC_JIFFIE_SC, USEC_JIFFIE_SC and * NSEC_JIFFIE_SC. Note that these defines contain nothing but * constants and so are computed at compile time. SHIFT_HZ (computed in * timex.h) adjusts the scaling for different HZ values. * Scaled math??? What is that? * * Scaled math is a way to do integer math on values that would, * otherwise, either overflow, underflow, or cause undesired div * instructions to appear in the execution path. In short, we "scale" * up the operands so they take more bits (more precision, less * underflow), do the desired operation and then "scale" the result back * by the same amount. If we do the scaling by shifting we avoid the * costly mpy and the dastardly div instructions. 
* Suppose, for example, we want to convert from seconds to jiffies * where jiffies is defined in nanoseconds as NSEC_PER_JIFFIE. The * simple math is: jiff = (sec * NSEC_PER_SEC) / NSEC_PER_JIFFIE; We * observe that (NSEC_PER_SEC / NSEC_PER_JIFFIE) is a constant which we * might calculate at compile time, however, the result will only have * about 3-4 bits of precision (less for smaller values of HZ). * * So, we scale as follows: * jiff = (sec) * (NSEC_PER_SEC / NSEC_PER_JIFFIE); * jiff = ((sec) * ((NSEC_PER_SEC * SCALE)/ NSEC_PER_JIFFIE)) / SCALE; * Then we make SCALE a power of two so: * jiff = ((sec) * ((NSEC_PER_SEC << SCALE)/ NSEC_PER_JIFFIE)) >> SCALE; * Now we define: * #define SEC_CONV ((NSEC_PER_SEC << SCALE) / NSEC_PER_JIFFIE) * jiff = (sec * SEC_CONV) >> SCALE; * * Often the math we use will expand beyond 32-bits so we tell C how to * do this and pass the 64-bit result of the mpy through the ">> SCALE" * which should take the result back to 32-bits. We want this expansion * to capture as much precision as possible. At the same time we don't * want to overflow so we pick the SCALE to avoid this. In this file, * that means using a different scale for each range of HZ values (as * defined in timex.h). * * For those who want to know, gcc will give a 64-bit result from a "*" * operator if the result is a long long AND at least one of the * operands is cast to long long (usually just prior to the "*" so as * not to confuse it into thinking it really has a 64-bit operand, * which, by the way, it can do, but it takes more code and at least 2 * mpys). * We also need to be aware that one second in nanoseconds is only a * couple of bits away from overflowing a 32-bit word, so we MUST use * 64-bits to get the full range time in nanoseconds. */ /* * Here are the scales we will use. One for seconds, nanoseconds and * microseconds. * * Within the limits of cpp we do a rough cut at the SEC_JIFFIE_SC and * check if the sign bit is set.
If not, we bump the shift count by 1. * (Gets an extra bit of precision where we can use it.) * We know it is set for HZ = 1024 and HZ = 100 not for 1000. * Haven't tested others. * Limits of cpp (for #if expressions) only long (no long long), but * then we only need the most significant bit. */ #define SEC_JIFFIE_SC (31 - SHIFT_HZ) #if !((((NSEC_PER_SEC << 2) / TICK_NSEC) << (SEC_JIFFIE_SC - 2)) & 0x80000000) #undef SEC_JIFFIE_SC #define SEC_JIFFIE_SC (32 - SHIFT_HZ) #endif #define NSEC_JIFFIE_SC (SEC_JIFFIE_SC + 29) #define SEC_CONVERSION ((unsigned long)((((u64)NSEC_PER_SEC << SEC_JIFFIE_SC) +\ TICK_NSEC -1) / (u64)TICK_NSEC)) #define NSEC_CONVERSION ((unsigned long)((((u64)1 << NSEC_JIFFIE_SC) +\ TICK_NSEC -1) / (u64)TICK_NSEC)) /* * The maximum jiffy value is (MAX_INT >> 1). Here we translate that * into seconds. The 64-bit case will overflow if we are not careful, * so use the messy SH_DIV macro to do it. Still all constants. */ #if BITS_PER_LONG < 64 # define MAX_SEC_IN_JIFFIES \ (long)((u64)((u64)MAX_JIFFY_OFFSET * TICK_NSEC) / NSEC_PER_SEC) #else /* take care of overflow on 64-bit machines */ # define MAX_SEC_IN_JIFFIES \ (SH_DIV((MAX_JIFFY_OFFSET >> SEC_JIFFIE_SC) * TICK_NSEC, NSEC_PER_SEC, 1) - 1) #endif /* * Convert various time units to each other: */ extern unsigned int jiffies_to_msecs(const unsigned long j); extern unsigned int jiffies_to_usecs(const unsigned long j); /** * jiffies_to_nsecs - Convert jiffies to nanoseconds * @j: jiffies value * * Return: nanoseconds value */ static inline u64 jiffies_to_nsecs(const unsigned long j) { return (u64)jiffies_to_usecs(j) * NSEC_PER_USEC; } extern u64 jiffies64_to_nsecs(u64 j); extern u64 jiffies64_to_msecs(u64 j); extern unsigned long __msecs_to_jiffies(const unsigned int m); #if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) /* * HZ is equal to or smaller than 1000, and 1000 is a nice round * multiple of HZ, divide with the factor between them, but round * upwards: */ static inline unsigned long
_msecs_to_jiffies(const unsigned int m) { return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ); } #elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) /* * HZ is larger than 1000, and HZ is a nice round multiple of 1000 - * simply multiply with the factor between them. * * But first make sure the multiplication result cannot overflow: */ static inline unsigned long _msecs_to_jiffies(const unsigned int m) { if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) return MAX_JIFFY_OFFSET; return m * (HZ / MSEC_PER_SEC); } #else /* * Generic case - multiply, round and divide. But first check that if * we are doing a net multiplication, that we wouldn't overflow: */ static inline unsigned long _msecs_to_jiffies(const unsigned int m) { if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) return MAX_JIFFY_OFFSET; return (MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32) >> MSEC_TO_HZ_SHR32; } #endif /** * msecs_to_jiffies: - convert milliseconds to jiffies * @m: time in milliseconds * * conversion is done as follows: * * - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET) * * - 'too large' values [that would result in larger than * MAX_JIFFY_OFFSET values] mean 'infinite timeout' too. * * - all other values are converted to jiffies by either multiplying * the input value by a factor or dividing it with a factor and * handling any 32-bit overflows. * for the details see _msecs_to_jiffies() * * msecs_to_jiffies() checks for the passed in value being a constant * via __builtin_constant_p() allowing gcc to eliminate most of the * code. __msecs_to_jiffies() is called if the value passed does not * allow constant folding and the actual conversion must be done at * runtime. * The HZ range specific helpers _msecs_to_jiffies() are called both * directly here and from __msecs_to_jiffies() in the case where * constant folding is not possible. 
* * Return: jiffies value */ static __always_inline unsigned long msecs_to_jiffies(const unsigned int m) { if (__builtin_constant_p(m)) { if ((int)m < 0) return MAX_JIFFY_OFFSET; return _msecs_to_jiffies(m); } else { return __msecs_to_jiffies(m); } } /** * secs_to_jiffies: - convert seconds to jiffies * @_secs: time in seconds * * Conversion is done by simple multiplication with HZ * * secs_to_jiffies() is defined as a macro rather than a static inline * function so it can be used in static initializers. * * Return: jiffies value */ #define secs_to_jiffies(_secs) (unsigned long)((_secs) * HZ) extern unsigned long __usecs_to_jiffies(const unsigned int u); #if !(USEC_PER_SEC % HZ) static inline unsigned long _usecs_to_jiffies(const unsigned int u) { return (u + (USEC_PER_SEC / HZ) - 1) / (USEC_PER_SEC / HZ); } #else static inline unsigned long _usecs_to_jiffies(const unsigned int u) { return (USEC_TO_HZ_MUL32 * u + USEC_TO_HZ_ADJ32) >> USEC_TO_HZ_SHR32; } #endif /** * usecs_to_jiffies: - convert microseconds to jiffies * @u: time in microseconds * * conversion is done as follows: * * - 'too large' values [that would result in larger than * MAX_JIFFY_OFFSET values] mean 'infinite timeout'. * * - all other values are converted to jiffies by either multiplying * the input value by a factor or dividing it with a factor and * handling any 32-bit overflows as for msecs_to_jiffies. * * usecs_to_jiffies() checks for the passed in value being a constant * via __builtin_constant_p() allowing gcc to eliminate most of the * code. __usecs_to_jiffies() is called if the value passed does not * allow constant folding and the actual conversion must be done at * runtime. * The HZ range specific helpers _usecs_to_jiffies() are called both * directly here and from __usecs_to_jiffies() in the case where * constant folding is not possible.
* * Return: jiffies value */ static __always_inline unsigned long usecs_to_jiffies(const unsigned int u) { if (__builtin_constant_p(u)) { if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET)) return MAX_JIFFY_OFFSET; return _usecs_to_jiffies(u); } else { return __usecs_to_jiffies(u); } } extern unsigned long timespec64_to_jiffies(const struct timespec64 *value); extern void jiffies_to_timespec64(const unsigned long jiffies, struct timespec64 *value); extern clock_t jiffies_to_clock_t(unsigned long x); static inline clock_t jiffies_delta_to_clock_t(long delta) { return jiffies_to_clock_t(max(0L, delta)); } static inline unsigned int jiffies_delta_to_msecs(long delta) { return jiffies_to_msecs(max(0L, delta)); } extern unsigned long clock_t_to_jiffies(unsigned long x); extern u64 jiffies_64_to_clock_t(u64 x); extern u64 nsec_to_clock_t(u64 x); extern u64 nsecs_to_jiffies64(u64 n); extern unsigned long nsecs_to_jiffies(u64 n); #define TIMESTAMP_SIZE 30 struct ctl_table; int proc_dointvec_jiffies(const struct ctl_table *table, int dir, void *buffer, size_t *lenp, loff_t *ppos); int proc_dointvec_ms_jiffies_minmax(const struct ctl_table *table, int dir, void *buffer, size_t *lenp, loff_t *ppos); int proc_dointvec_userhz_jiffies(const struct ctl_table *table, int dir, void *buffer, size_t *lenp, loff_t *ppos); int proc_dointvec_ms_jiffies(const struct ctl_table *table, int dir, void *buffer, size_t *lenp, loff_t *ppos); int proc_doulongvec_ms_jiffies_minmax(const struct ctl_table *table, int dir, void *buffer, size_t *lenp, loff_t *ppos); #endif |
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 | /* IPv6-specific defines for netfilter. * (C)1998 Rusty Russell -- This code is GPL. * (C)1999 David Jeffery * this header was blatantly ripped from netfilter_ipv4.h * it's amazing what adding a bunch of 6s can do =8^) */ #ifndef __LINUX_IP6_NETFILTER_H #define __LINUX_IP6_NETFILTER_H #include <uapi/linux/netfilter_ipv6.h> #include <net/tcp.h> /* Check for an extension */ static inline int nf_ip6_ext_hdr(u8 nexthdr) { return (nexthdr == IPPROTO_HOPOPTS) || (nexthdr == IPPROTO_ROUTING) || (nexthdr == IPPROTO_FRAGMENT) || (nexthdr == IPPROTO_ESP) || (nexthdr == IPPROTO_AH) || (nexthdr == IPPROTO_NONE) || (nexthdr == IPPROTO_DSTOPTS); } /* Extra routing may needed on local out, as the QUEUE target never returns * control to the table. */ struct ip6_rt_info { struct in6_addr daddr; struct in6_addr saddr; u_int32_t mark; }; struct nf_queue_entry; struct nf_bridge_frag_data; /* * Hook functions for ipv6 to allow xt_* modules to be built-in even * if IPv6 is a module. 
*/ struct nf_ipv6_ops { #if IS_MODULE(CONFIG_IPV6) int (*chk_addr)(struct net *net, const struct in6_addr *addr, const struct net_device *dev, int strict); int (*route_me_harder)(struct net *net, struct sock *sk, struct sk_buff *skb); int (*dev_get_saddr)(struct net *net, const struct net_device *dev, const struct in6_addr *daddr, unsigned int srcprefs, struct in6_addr *saddr); int (*route)(struct net *net, struct dst_entry **dst, struct flowi *fl, bool strict); u32 (*cookie_init_sequence)(const struct ipv6hdr *iph, const struct tcphdr *th, u16 *mssp); int (*cookie_v6_check)(const struct ipv6hdr *iph, const struct tcphdr *th); #endif void (*route_input)(struct sk_buff *skb); int (*fragment)(struct net *net, struct sock *sk, struct sk_buff *skb, int (*output)(struct net *, struct sock *, struct sk_buff *)); int (*reroute)(struct sk_buff *skb, const struct nf_queue_entry *entry); #if IS_MODULE(CONFIG_IPV6) int (*br_fragment)(struct net *net, struct sock *sk, struct sk_buff *skb, struct nf_bridge_frag_data *data, int (*output)(struct net *, struct sock *sk, const struct nf_bridge_frag_data *data, struct sk_buff *)); #endif }; #ifdef CONFIG_NETFILTER #include <net/addrconf.h> extern const struct nf_ipv6_ops __rcu *nf_ipv6_ops; static inline const struct nf_ipv6_ops *nf_get_ipv6_ops(void) { return rcu_dereference(nf_ipv6_ops); } static inline int nf_ipv6_chk_addr(struct net *net, const struct in6_addr *addr, const struct net_device *dev, int strict) { #if IS_MODULE(CONFIG_IPV6) const struct nf_ipv6_ops *v6_ops = nf_get_ipv6_ops(); if (!v6_ops) return 1; return v6_ops->chk_addr(net, addr, dev, strict); #elif IS_BUILTIN(CONFIG_IPV6) return ipv6_chk_addr(net, addr, dev, strict); #else return 1; #endif } int __nf_ip6_route(struct net *net, struct dst_entry **dst, struct flowi *fl, bool strict); static inline int nf_ip6_route(struct net *net, struct dst_entry **dst, struct flowi *fl, bool strict) { #if IS_MODULE(CONFIG_IPV6) const struct nf_ipv6_ops *v6ops = 
nf_get_ipv6_ops(); if (v6ops) return v6ops->route(net, dst, fl, strict); return -EHOSTUNREACH; #endif #if IS_BUILTIN(CONFIG_IPV6) return __nf_ip6_route(net, dst, fl, strict); #else return -EHOSTUNREACH; #endif } #include <net/netfilter/ipv6/nf_defrag_ipv6.h> int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, struct nf_bridge_frag_data *data, int (*output)(struct net *, struct sock *sk, const struct nf_bridge_frag_data *data, struct sk_buff *)); static inline int nf_br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, struct nf_bridge_frag_data *data, int (*output)(struct net *, struct sock *sk, const struct nf_bridge_frag_data *data, struct sk_buff *)) { #if IS_MODULE(CONFIG_IPV6) const struct nf_ipv6_ops *v6_ops = nf_get_ipv6_ops(); if (!v6_ops) return 1; return v6_ops->br_fragment(net, sk, skb, data, output); #elif IS_BUILTIN(CONFIG_IPV6) return br_ip6_fragment(net, sk, skb, data, output); #else return 1; #endif } int ip6_route_me_harder(struct net *net, struct sock *sk, struct sk_buff *skb); static inline int nf_ip6_route_me_harder(struct net *net, struct sock *sk, struct sk_buff *skb) { #if IS_MODULE(CONFIG_IPV6) const struct nf_ipv6_ops *v6_ops = nf_get_ipv6_ops(); if (!v6_ops) return -EHOSTUNREACH; return v6_ops->route_me_harder(net, sk, skb); #elif IS_BUILTIN(CONFIG_IPV6) return ip6_route_me_harder(net, sk, skb); #else return -EHOSTUNREACH; #endif } static inline u32 nf_ipv6_cookie_init_sequence(const struct ipv6hdr *iph, const struct tcphdr *th, u16 *mssp) { #if IS_ENABLED(CONFIG_SYN_COOKIES) #if IS_MODULE(CONFIG_IPV6) const struct nf_ipv6_ops *v6_ops = nf_get_ipv6_ops(); if (v6_ops) return v6_ops->cookie_init_sequence(iph, th, mssp); #elif IS_BUILTIN(CONFIG_IPV6) return __cookie_v6_init_sequence(iph, th, mssp); #endif #endif return 0; } static inline int nf_cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th) { #if IS_ENABLED(CONFIG_SYN_COOKIES) #if IS_MODULE(CONFIG_IPV6) const struct nf_ipv6_ops 
*v6_ops = nf_get_ipv6_ops(); if (v6_ops) return v6_ops->cookie_v6_check(iph, th); #elif IS_BUILTIN(CONFIG_IPV6) return __cookie_v6_check(iph, th); #endif #endif return 0; } __sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook, unsigned int dataoff, u_int8_t protocol); int nf_ip6_check_hbh_len(struct sk_buff *skb, u32 *plen); int ipv6_netfilter_init(void); void ipv6_netfilter_fini(void); #else /* CONFIG_NETFILTER */ static inline int ipv6_netfilter_init(void) { return 0; } static inline void ipv6_netfilter_fini(void) { return; } static inline const struct nf_ipv6_ops *nf_get_ipv6_ops(void) { return NULL; } #endif /* CONFIG_NETFILTER */ #endif /*__LINUX_IP6_NETFILTER_H*/ |
| 18 4 90 89 90 74 3 71 1 2 2 10 3 1 5 4 7 8 11 14 7 36 2 5 7 39 39 38 2 37 37 14 22 1 1 1 1 1 29 30 30 29 2 28 3 3 1 1 19 1 18 1 1 1 2 2 1 3 2 2 2 2 2 2 1 1 1 6 55 55 52 54 53 31 22 9 9 4 2 10 11 10 9 13 44 46 23 23 20 3 23 23 22 22 3 2 1 2 1 2 1 3 31 31 36 10 2 11 2 11 2 10 4 31 8 31 31 31 30 31 31 30 30 31 31 31 31 75 58 9 7 956 918 17 18 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 
441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 
941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 
1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Extension Header handling for IPv6 * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * Andi Kleen <ak@muc.de> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> */ /* Changes: * yoshfuji : ensure not to overrun while parsing * tlv options. * Mitsuru KANDA @USAGI and: Remove ipv6_parse_exthdrs(). * YOSHIFUJI Hideaki @USAGI Register inbound extension header * handlers as inet6_protocol{}. */ #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/in6.h> #include <linux/icmpv6.h> #include <linux/slab.h> #include <linux/export.h> #include <net/dst.h> #include <net/sock.h> #include <net/snmp.h> #include <net/ipv6.h> #include <net/protocol.h> #include <net/transp_v6.h> #include <net/rawv6.h> #include <net/ndisc.h> #include <net/ip6_route.h> #include <net/addrconf.h> #include <net/calipso.h> #if IS_ENABLED(CONFIG_IPV6_MIP6) #include <net/xfrm.h> #endif #include <linux/seg6.h> #include <net/seg6.h> #ifdef CONFIG_IPV6_SEG6_HMAC #include <net/seg6_hmac.h> #endif #include <net/rpl.h> #include <linux/ioam6.h> #include <linux/ioam6_genl.h> #include <net/ioam6.h> #include <net/dst_metadata.h> #include <linux/uaccess.h> /********************* Generic functions *********************/ /* An unknown option is detected, decide what to do */ static bool ip6_tlvopt_unknown(struct sk_buff *skb, int optoff, bool disallow_unknowns) { if (disallow_unknowns) { /* If unknown TLVs are disallowed by configuration * then always silently drop packet. Note this also * means no ICMP parameter problem is sent which * could be a good property to mitigate a reflection DOS * attack. 
*/ goto drop; } switch ((skb_network_header(skb)[optoff] & 0xC0) >> 6) { case 0: /* ignore */ return true; case 1: /* drop packet */ break; case 3: /* Send ICMP if not a multicast address and drop packet */ /* Actually, it is redundant check. icmp_send will recheck in any case. */ if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) break; fallthrough; case 2: /* send ICMP PARM PROB regardless and drop packet */ icmpv6_param_prob_reason(skb, ICMPV6_UNK_OPTION, optoff, SKB_DROP_REASON_UNHANDLED_PROTO); return false; } drop: kfree_skb_reason(skb, SKB_DROP_REASON_UNHANDLED_PROTO); return false; } static bool ipv6_hop_ra(struct sk_buff *skb, int optoff); static bool ipv6_hop_ioam(struct sk_buff *skb, int optoff); static bool ipv6_hop_jumbo(struct sk_buff *skb, int optoff); static bool ipv6_hop_calipso(struct sk_buff *skb, int optoff); #if IS_ENABLED(CONFIG_IPV6_MIP6) static bool ipv6_dest_hao(struct sk_buff *skb, int optoff); #endif /* Parse tlv encoded option header (hop-by-hop or destination) */ static bool ip6_parse_tlv(bool hopbyhop, struct sk_buff *skb, int max_count) { int len = (skb_transport_header(skb)[1] + 1) << 3; const unsigned char *nh = skb_network_header(skb); int off = skb_network_header_len(skb); bool disallow_unknowns = false; int tlv_count = 0; int padlen = 0; if (unlikely(max_count < 0)) { disallow_unknowns = true; max_count = -max_count; } off += 2; len -= 2; while (len > 0) { int optlen, i; if (nh[off] == IPV6_TLV_PAD1) { padlen++; if (padlen > 7) goto bad; off++; len--; continue; } if (len < 2) goto bad; optlen = nh[off + 1] + 2; if (optlen > len) goto bad; if (nh[off] == IPV6_TLV_PADN) { /* RFC 2460 states that the purpose of PadN is * to align the containing header to multiples * of 8. 7 is therefore the highest valid value. * See also RFC 4942, Section 2.1.9.5. */ padlen += optlen; if (padlen > 7) goto bad; /* RFC 4942 recommends receiving hosts to * actively check PadN payload to contain * only zeroes. 
*/ for (i = 2; i < optlen; i++) { if (nh[off + i] != 0) goto bad; } } else { tlv_count++; if (tlv_count > max_count) goto bad; if (hopbyhop) { switch (nh[off]) { case IPV6_TLV_ROUTERALERT: if (!ipv6_hop_ra(skb, off)) return false; break; case IPV6_TLV_IOAM: if (!ipv6_hop_ioam(skb, off)) return false; nh = skb_network_header(skb); break; case IPV6_TLV_JUMBO: if (!ipv6_hop_jumbo(skb, off)) return false; break; case IPV6_TLV_CALIPSO: if (!ipv6_hop_calipso(skb, off)) return false; break; default: if (!ip6_tlvopt_unknown(skb, off, disallow_unknowns)) return false; break; } } else { switch (nh[off]) { #if IS_ENABLED(CONFIG_IPV6_MIP6) case IPV6_TLV_HAO: if (!ipv6_dest_hao(skb, off)) return false; break; #endif default: if (!ip6_tlvopt_unknown(skb, off, disallow_unknowns)) return false; break; } } padlen = 0; } off += optlen; len -= optlen; } if (len == 0) return true; bad: kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR); return false; } /***************************** Destination options header. 
*****************************/ #if IS_ENABLED(CONFIG_IPV6_MIP6) static bool ipv6_dest_hao(struct sk_buff *skb, int optoff) { struct ipv6_destopt_hao *hao; struct inet6_skb_parm *opt = IP6CB(skb); struct ipv6hdr *ipv6h = ipv6_hdr(skb); SKB_DR(reason); int ret; if (opt->dsthao) { net_dbg_ratelimited("hao duplicated\n"); goto discard; } opt->dsthao = opt->dst1; opt->dst1 = 0; hao = (struct ipv6_destopt_hao *)(skb_network_header(skb) + optoff); if (hao->length != 16) { net_dbg_ratelimited("hao invalid option length = %d\n", hao->length); SKB_DR_SET(reason, IP_INHDR); goto discard; } if (!(ipv6_addr_type(&hao->addr) & IPV6_ADDR_UNICAST)) { net_dbg_ratelimited("hao is not an unicast addr: %pI6\n", &hao->addr); SKB_DR_SET(reason, INVALID_PROTO); goto discard; } ret = xfrm6_input_addr(skb, (xfrm_address_t *)&ipv6h->daddr, (xfrm_address_t *)&hao->addr, IPPROTO_DSTOPTS); if (unlikely(ret < 0)) { SKB_DR_SET(reason, XFRM_POLICY); goto discard; } if (skb_cloned(skb)) { if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) goto discard; /* update all variable using below by copied skbuff */ hao = (struct ipv6_destopt_hao *)(skb_network_header(skb) + optoff); ipv6h = ipv6_hdr(skb); } if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_NONE; swap(ipv6h->saddr, hao->addr); if (skb->tstamp == 0) __net_timestamp(skb); return true; discard: kfree_skb_reason(skb, reason); return false; } #endif static int ipv6_destopt_rcv(struct sk_buff *skb) { struct inet6_dev *idev = __in6_dev_get(skb->dev); struct inet6_skb_parm *opt = IP6CB(skb); #if IS_ENABLED(CONFIG_IPV6_MIP6) __u16 dstbuf; #endif struct dst_entry *dst = skb_dst(skb); struct net *net = dev_net(skb->dev); int extlen; if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) || !pskb_may_pull(skb, (skb_transport_offset(skb) + ((skb_transport_header(skb)[1] + 1) << 3)))) { __IP6_INC_STATS(dev_net(dst_dev(dst)), idev, IPSTATS_MIB_INHDRERRORS); fail_and_free: kfree_skb(skb); return -1; } extlen = (skb_transport_header(skb)[1] + 1) 
<< 3; if (extlen > net->ipv6.sysctl.max_dst_opts_len) goto fail_and_free; opt->lastopt = opt->dst1 = skb_network_header_len(skb); #if IS_ENABLED(CONFIG_IPV6_MIP6) dstbuf = opt->dst1; #endif if (ip6_parse_tlv(false, skb, net->ipv6.sysctl.max_dst_opts_cnt)) { skb->transport_header += extlen; opt = IP6CB(skb); #if IS_ENABLED(CONFIG_IPV6_MIP6) opt->nhoff = dstbuf; #else opt->nhoff = opt->dst1; #endif return 1; } __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); return -1; } static void seg6_update_csum(struct sk_buff *skb) { struct ipv6_sr_hdr *hdr; struct in6_addr *addr; __be32 from, to; /* srh is at transport offset and seg_left is already decremented * but daddr is not yet updated with next segment */ hdr = (struct ipv6_sr_hdr *)skb_transport_header(skb); addr = hdr->segments + hdr->segments_left; hdr->segments_left++; from = *(__be32 *)hdr; hdr->segments_left--; to = *(__be32 *)hdr; /* update skb csum with diff resulting from seg_left decrement */ update_csum_diff4(skb, from, to); /* compute csum diff between current and next segment and update */ update_csum_diff16(skb, (__be32 *)(&ipv6_hdr(skb)->daddr), (__be32 *)addr); } static int ipv6_srh_rcv(struct sk_buff *skb) { struct inet6_skb_parm *opt = IP6CB(skb); struct net *net = dev_net(skb->dev); struct ipv6_sr_hdr *hdr; struct inet6_dev *idev; struct in6_addr *addr; int accept_seg6; hdr = (struct ipv6_sr_hdr *)skb_transport_header(skb); idev = __in6_dev_get(skb->dev); accept_seg6 = min(READ_ONCE(net->ipv6.devconf_all->seg6_enabled), READ_ONCE(idev->cnf.seg6_enabled)); if (!accept_seg6) { kfree_skb(skb); return -1; } #ifdef CONFIG_IPV6_SEG6_HMAC if (!seg6_hmac_validate_skb(skb)) { kfree_skb(skb); return -1; } #endif looped_back: if (hdr->segments_left == 0) { if (hdr->nexthdr == NEXTHDR_IPV6 || hdr->nexthdr == NEXTHDR_IPV4) { int offset = (hdr->hdrlen + 1) << 3; skb_postpull_rcsum(skb, skb_network_header(skb), skb_network_header_len(skb)); skb_pull(skb, offset); skb_postpull_rcsum(skb, skb_transport_header(skb), 
offset); skb_reset_network_header(skb); skb_reset_transport_header(skb); skb->encapsulation = 0; if (hdr->nexthdr == NEXTHDR_IPV4) skb->protocol = htons(ETH_P_IP); __skb_tunnel_rx(skb, skb->dev, net); netif_rx(skb); return -1; } opt->srcrt = skb_network_header_len(skb); opt->lastopt = opt->srcrt; skb->transport_header += (hdr->hdrlen + 1) << 3; opt->nhoff = (&hdr->nexthdr) - skb_network_header(skb); return 1; } if (hdr->segments_left >= (hdr->hdrlen >> 1)) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, ((&hdr->segments_left) - skb_network_header(skb))); return -1; } if (skb_cloned(skb)) { if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) { __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUTDISCARDS); kfree_skb(skb); return -1; } hdr = (struct ipv6_sr_hdr *)skb_transport_header(skb); } hdr->segments_left--; addr = hdr->segments + hdr->segments_left; skb_push(skb, sizeof(struct ipv6hdr)); if (skb->ip_summed == CHECKSUM_COMPLETE) seg6_update_csum(skb); ipv6_hdr(skb)->daddr = *addr; ip6_route_input(skb); if (skb_dst(skb)->error) { dst_input(skb); return -1; } if (skb_dst_dev(skb)->flags & IFF_LOOPBACK) { if (ipv6_hdr(skb)->hop_limit <= 1) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); kfree_skb(skb); return -1; } ipv6_hdr(skb)->hop_limit--; skb_pull(skb, sizeof(struct ipv6hdr)); goto looped_back; } dst_input(skb); return -1; } static int ipv6_rpl_srh_rcv(struct sk_buff *skb) { struct ipv6_rpl_sr_hdr *hdr, *ohdr, *chdr; struct inet6_skb_parm *opt = IP6CB(skb); struct net *net = dev_net(skb->dev); struct inet6_dev *idev; struct ipv6hdr *oldhdr; unsigned char *buf; int accept_rpl_seg; int i, err; u64 n = 0; u32 r; idev = __in6_dev_get(skb->dev); accept_rpl_seg = min(READ_ONCE(net->ipv6.devconf_all->rpl_seg_enabled), READ_ONCE(idev->cnf.rpl_seg_enabled)); if (!accept_rpl_seg) { kfree_skb(skb); return -1; } looped_back: hdr = (struct ipv6_rpl_sr_hdr 
*)skb_transport_header(skb); if (hdr->segments_left == 0) { if (hdr->nexthdr == NEXTHDR_IPV6) { int offset = (hdr->hdrlen + 1) << 3; skb_postpull_rcsum(skb, skb_network_header(skb), skb_network_header_len(skb)); skb_pull(skb, offset); skb_postpull_rcsum(skb, skb_transport_header(skb), offset); skb_reset_network_header(skb); skb_reset_transport_header(skb); skb->encapsulation = 0; __skb_tunnel_rx(skb, skb->dev, net); netif_rx(skb); return -1; } opt->srcrt = skb_network_header_len(skb); opt->lastopt = opt->srcrt; skb->transport_header += (hdr->hdrlen + 1) << 3; opt->nhoff = (&hdr->nexthdr) - skb_network_header(skb); return 1; } n = (hdr->hdrlen << 3) - hdr->pad - (16 - hdr->cmpre); r = do_div(n, (16 - hdr->cmpri)); /* checks if calculation was without remainder and n fits into * unsigned char which is segments_left field. Should not be * higher than that. */ if (r || (n + 1) > 255) { kfree_skb(skb); return -1; } if (hdr->segments_left > n + 1) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, ((&hdr->segments_left) - skb_network_header(skb))); return -1; } hdr->segments_left--; i = n - hdr->segments_left; buf = kcalloc(struct_size(hdr, segments.addr, n + 2), 2, GFP_ATOMIC); if (unlikely(!buf)) { kfree_skb(skb); return -1; } ohdr = (struct ipv6_rpl_sr_hdr *)buf; ipv6_rpl_srh_decompress(ohdr, hdr, &ipv6_hdr(skb)->daddr, n); chdr = (struct ipv6_rpl_sr_hdr *)(buf + ((ohdr->hdrlen + 1) << 3)); if (ipv6_addr_is_multicast(&ohdr->rpl_segaddr[i])) { kfree_skb(skb); kfree(buf); return -1; } err = ipv6_chk_rpl_srh_loop(net, ohdr->rpl_segaddr, n + 1); if (err) { icmpv6_send(skb, ICMPV6_PARAMPROB, 0, 0); kfree_skb(skb); kfree(buf); return -1; } swap(ipv6_hdr(skb)->daddr, ohdr->rpl_segaddr[i]); ipv6_rpl_srh_compress(chdr, ohdr, &ipv6_hdr(skb)->daddr, n); oldhdr = ipv6_hdr(skb); skb_pull(skb, ((hdr->hdrlen + 1) << 3)); skb_postpull_rcsum(skb, oldhdr, sizeof(struct ipv6hdr) + ((hdr->hdrlen + 1) << 3)); if (unlikely(!hdr->segments_left)) 
{ if (pskb_expand_head(skb, sizeof(struct ipv6hdr) + ((chdr->hdrlen + 1) << 3), 0, GFP_ATOMIC)) { __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUTDISCARDS); kfree_skb(skb); kfree(buf); return -1; } oldhdr = ipv6_hdr(skb); } skb_push(skb, ((chdr->hdrlen + 1) << 3) + sizeof(struct ipv6hdr)); skb_reset_network_header(skb); skb_mac_header_rebuild(skb); skb_set_transport_header(skb, sizeof(struct ipv6hdr)); memmove(ipv6_hdr(skb), oldhdr, sizeof(struct ipv6hdr)); memcpy(skb_transport_header(skb), chdr, (chdr->hdrlen + 1) << 3); ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); skb_postpush_rcsum(skb, ipv6_hdr(skb), sizeof(struct ipv6hdr) + ((chdr->hdrlen + 1) << 3)); kfree(buf); ip6_route_input(skb); if (skb_dst(skb)->error) { dst_input(skb); return -1; } if (skb_dst_dev(skb)->flags & IFF_LOOPBACK) { if (ipv6_hdr(skb)->hop_limit <= 1) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); kfree_skb(skb); return -1; } ipv6_hdr(skb)->hop_limit--; skb_pull(skb, sizeof(struct ipv6hdr)); goto looped_back; } dst_input(skb); return -1; } /******************************** Routing header. 
********************************/ /* called with rcu_read_lock() */ static int ipv6_rthdr_rcv(struct sk_buff *skb) { struct inet6_dev *idev = __in6_dev_get(skb->dev); struct inet6_skb_parm *opt = IP6CB(skb); struct in6_addr *addr = NULL; int n, i; struct ipv6_rt_hdr *hdr; struct rt0_hdr *rthdr; struct net *net = dev_net(skb->dev); int accept_source_route; accept_source_route = READ_ONCE(net->ipv6.devconf_all->accept_source_route); if (idev) accept_source_route = min(accept_source_route, READ_ONCE(idev->cnf.accept_source_route)); if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) || !pskb_may_pull(skb, (skb_transport_offset(skb) + ((skb_transport_header(skb)[1] + 1) << 3)))) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); kfree_skb(skb); return -1; } hdr = (struct ipv6_rt_hdr *)skb_transport_header(skb); if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr) || skb->pkt_type != PACKET_HOST) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; } switch (hdr->type) { case IPV6_SRCRT_TYPE_4: /* segment routing */ return ipv6_srh_rcv(skb); case IPV6_SRCRT_TYPE_3: /* rpl segment routing */ return ipv6_rpl_srh_rcv(skb); default: break; } looped_back: if (hdr->segments_left == 0) { switch (hdr->type) { #if IS_ENABLED(CONFIG_IPV6_MIP6) case IPV6_SRCRT_TYPE_2: /* Silently discard type 2 header unless it was * processed by own */ if (!addr) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; } break; #endif default: break; } opt->lastopt = opt->srcrt = skb_network_header_len(skb); skb->transport_header += (hdr->hdrlen + 1) << 3; opt->dst0 = opt->dst1; opt->dst1 = 0; opt->nhoff = (&hdr->nexthdr) - skb_network_header(skb); return 1; } switch (hdr->type) { #if IS_ENABLED(CONFIG_IPV6_MIP6) case IPV6_SRCRT_TYPE_2: if (accept_source_route < 0) goto unknown_rh; /* Silently discard invalid RTH type 2 */ if (hdr->hdrlen != 2 || hdr->segments_left != 1) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); 
kfree_skb(skb); return -1; } break; #endif default: goto unknown_rh; } /* * This is the routing header forwarding algorithm from * RFC 2460, page 16. */ n = hdr->hdrlen >> 1; if (hdr->segments_left > n) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, ((&hdr->segments_left) - skb_network_header(skb))); return -1; } /* We are about to mangle packet header. Be careful! Do not damage packets queued somewhere. */ if (skb_cloned(skb)) { /* the copy is a forwarded packet */ if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) { __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUTDISCARDS); kfree_skb(skb); return -1; } hdr = (struct ipv6_rt_hdr *)skb_transport_header(skb); } if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_NONE; i = n - --hdr->segments_left; rthdr = (struct rt0_hdr *) hdr; addr = rthdr->addr; addr += i - 1; switch (hdr->type) { #if IS_ENABLED(CONFIG_IPV6_MIP6) case IPV6_SRCRT_TYPE_2: if (xfrm6_input_addr(skb, (xfrm_address_t *)addr, (xfrm_address_t *)&ipv6_hdr(skb)->saddr, IPPROTO_ROUTING) < 0) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; } if (!ipv6_chk_home_addr(skb_dst_dev_net(skb), addr)) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; } break; #endif default: break; } if (ipv6_addr_is_multicast(addr)) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; } swap(*addr, ipv6_hdr(skb)->daddr); ip6_route_input(skb); if (skb_dst(skb)->error) { skb_push(skb, -skb_network_offset(skb)); dst_input(skb); return -1; } if (skb_dst_dev(skb)->flags & IFF_LOOPBACK) { if (ipv6_hdr(skb)->hop_limit <= 1) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); kfree_skb(skb); return -1; } ipv6_hdr(skb)->hop_limit--; goto looped_back; } skb_push(skb, -skb_network_offset(skb)); dst_input(skb); return -1; unknown_rh: __IP6_INC_STATS(net, idev, 
IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, (&hdr->type) - skb_network_header(skb)); return -1; } static const struct inet6_protocol rthdr_protocol = { .handler = ipv6_rthdr_rcv, .flags = INET6_PROTO_NOPOLICY, }; static const struct inet6_protocol destopt_protocol = { .handler = ipv6_destopt_rcv, .flags = INET6_PROTO_NOPOLICY, }; static const struct inet6_protocol nodata_protocol = { .handler = dst_discard, .flags = INET6_PROTO_NOPOLICY, }; int __init ipv6_exthdrs_init(void) { int ret; ret = inet6_add_protocol(&rthdr_protocol, IPPROTO_ROUTING); if (ret) goto out; ret = inet6_add_protocol(&destopt_protocol, IPPROTO_DSTOPTS); if (ret) goto out_rthdr; ret = inet6_add_protocol(&nodata_protocol, IPPROTO_NONE); if (ret) goto out_destopt; out: return ret; out_destopt: inet6_del_protocol(&destopt_protocol, IPPROTO_DSTOPTS); out_rthdr: inet6_del_protocol(&rthdr_protocol, IPPROTO_ROUTING); goto out; }; void ipv6_exthdrs_exit(void) { inet6_del_protocol(&nodata_protocol, IPPROTO_NONE); inet6_del_protocol(&destopt_protocol, IPPROTO_DSTOPTS); inet6_del_protocol(&rthdr_protocol, IPPROTO_ROUTING); } /********************************** Hop-by-hop options. 
**********************************/ /* Router Alert as of RFC 2711 */ static bool ipv6_hop_ra(struct sk_buff *skb, int optoff) { const unsigned char *nh = skb_network_header(skb); if (nh[optoff + 1] == 2) { IP6CB(skb)->flags |= IP6SKB_ROUTERALERT; memcpy(&IP6CB(skb)->ra, nh + optoff + 2, sizeof(IP6CB(skb)->ra)); return true; } net_dbg_ratelimited("ipv6_hop_ra: wrong RA length %d\n", nh[optoff + 1]); kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR); return false; } /* IOAM */ static bool ipv6_hop_ioam(struct sk_buff *skb, int optoff) { struct ioam6_trace_hdr *trace; struct ioam6_namespace *ns; struct ioam6_hdr *hdr; /* Bad alignment (must be 4n-aligned) */ if (optoff & 3) goto drop; /* Ignore if IOAM is not enabled on ingress */ if (!READ_ONCE(__in6_dev_get(skb->dev)->cnf.ioam6_enabled)) goto ignore; /* Truncated Option header */ hdr = (struct ioam6_hdr *)(skb_network_header(skb) + optoff); if (hdr->opt_len < 2) goto drop; switch (hdr->type) { case IOAM6_TYPE_PREALLOC: /* Truncated Pre-allocated Trace header */ if (hdr->opt_len < 2 + sizeof(*trace)) goto drop; /* Malformed Pre-allocated Trace header */ trace = (struct ioam6_trace_hdr *)((u8 *)hdr + sizeof(*hdr)); if (hdr->opt_len < 2 + sizeof(*trace) + trace->remlen * 4) goto drop; /* Ignore if the IOAM namespace is unknown */ ns = ioam6_namespace(dev_net(skb->dev), trace->namespace_id); if (!ns) goto ignore; if (!skb_valid_dst(skb)) ip6_route_input(skb); /* About to mangle packet header */ if (skb_ensure_writable(skb, optoff + 2 + hdr->opt_len)) goto drop; /* Trace pointer may have changed */ trace = (struct ioam6_trace_hdr *)(skb_network_header(skb) + optoff + sizeof(*hdr)); ioam6_fill_trace_data(skb, ns, trace, true); ioam6_event(IOAM6_EVENT_TRACE, dev_net(skb->dev), GFP_ATOMIC, (void *)trace, hdr->opt_len - 2); break; default: break; } ignore: return true; drop: kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR); return false; } /* Jumbo payload */ static bool ipv6_hop_jumbo(struct sk_buff *skb, int optoff) { const 
unsigned char *nh = skb_network_header(skb); SKB_DR(reason); u32 pkt_len; if (nh[optoff + 1] != 4 || (optoff & 3) != 2) { net_dbg_ratelimited("ipv6_hop_jumbo: wrong jumbo opt length/alignment %d\n", nh[optoff+1]); SKB_DR_SET(reason, IP_INHDR); goto drop; } pkt_len = ntohl(*(__be32 *)(nh + optoff + 2)); if (pkt_len <= IPV6_MAXPLEN) { icmpv6_param_prob_reason(skb, ICMPV6_HDR_FIELD, optoff + 2, SKB_DROP_REASON_IP_INHDR); return false; } if (ipv6_hdr(skb)->payload_len) { icmpv6_param_prob_reason(skb, ICMPV6_HDR_FIELD, optoff, SKB_DROP_REASON_IP_INHDR); return false; } if (pkt_len > skb->len - sizeof(struct ipv6hdr)) { SKB_DR_SET(reason, PKT_TOO_SMALL); goto drop; } if (pskb_trim_rcsum(skb, pkt_len + sizeof(struct ipv6hdr))) goto drop; IP6CB(skb)->flags |= IP6SKB_JUMBOGRAM; return true; drop: kfree_skb_reason(skb, reason); return false; } /* CALIPSO RFC 5570 */ static bool ipv6_hop_calipso(struct sk_buff *skb, int optoff) { const unsigned char *nh = skb_network_header(skb); if (nh[optoff + 1] < 8) goto drop; if (nh[optoff + 6] * 4 + 8 > nh[optoff + 1]) goto drop; if (!calipso_validate(skb, nh + optoff)) goto drop; return true; drop: kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR); return false; } int ipv6_parse_hopopts(struct sk_buff *skb) { struct inet6_skb_parm *opt = IP6CB(skb); struct net *net = dev_net(skb->dev); int extlen; /* * skb_network_header(skb) is equal to skb->data, and * skb_network_header_len(skb) is always equal to * sizeof(struct ipv6hdr) by definition of * hop-by-hop options. 
*/ if (!pskb_may_pull(skb, sizeof(struct ipv6hdr) + 8) || !pskb_may_pull(skb, (sizeof(struct ipv6hdr) + ((skb_transport_header(skb)[1] + 1) << 3)))) { fail_and_free: kfree_skb(skb); return -1; } extlen = (skb_transport_header(skb)[1] + 1) << 3; if (extlen > net->ipv6.sysctl.max_hbh_opts_len) goto fail_and_free; opt->flags |= IP6SKB_HOPBYHOP; if (ip6_parse_tlv(true, skb, net->ipv6.sysctl.max_hbh_opts_cnt)) { skb->transport_header += extlen; opt = IP6CB(skb); opt->nhoff = sizeof(struct ipv6hdr); return 1; } return -1; } /* * Creating outbound headers. * * "build" functions work when skb is filled from head to tail (datagram) * "push" functions work when headers are added from tail to head (tcp) * * In both cases we assume, that caller reserved enough room * for headers. */ static void ipv6_push_rthdr0(struct sk_buff *skb, u8 *proto, struct ipv6_rt_hdr *opt, struct in6_addr **addr_p, struct in6_addr *saddr) { struct rt0_hdr *phdr, *ihdr; int hops; ihdr = (struct rt0_hdr *) opt; phdr = skb_push(skb, (ihdr->rt_hdr.hdrlen + 1) << 3); memcpy(phdr, ihdr, sizeof(struct rt0_hdr)); hops = ihdr->rt_hdr.hdrlen >> 1; if (hops > 1) memcpy(phdr->addr, ihdr->addr + 1, (hops - 1) * sizeof(struct in6_addr)); phdr->addr[hops - 1] = **addr_p; *addr_p = ihdr->addr; phdr->rt_hdr.nexthdr = *proto; *proto = NEXTHDR_ROUTING; } static void ipv6_push_rthdr4(struct sk_buff *skb, u8 *proto, struct ipv6_rt_hdr *opt, struct in6_addr **addr_p, struct in6_addr *saddr) { struct ipv6_sr_hdr *sr_phdr, *sr_ihdr; int plen, hops; sr_ihdr = (struct ipv6_sr_hdr *)opt; plen = (sr_ihdr->hdrlen + 1) << 3; sr_phdr = skb_push(skb, plen); memcpy(sr_phdr, sr_ihdr, sizeof(struct ipv6_sr_hdr)); hops = sr_ihdr->first_segment + 1; memcpy(sr_phdr->segments + 1, sr_ihdr->segments + 1, (hops - 1) * sizeof(struct in6_addr)); sr_phdr->segments[0] = **addr_p; *addr_p = &sr_ihdr->segments[sr_ihdr->segments_left]; if (sr_ihdr->hdrlen > hops * 2) { int tlvs_offset, tlvs_length; tlvs_offset = (1 + hops * 2) << 3; tlvs_length = 
(sr_ihdr->hdrlen - hops * 2) << 3; memcpy((char *)sr_phdr + tlvs_offset, (char *)sr_ihdr + tlvs_offset, tlvs_length); } #ifdef CONFIG_IPV6_SEG6_HMAC if (sr_has_hmac(sr_phdr)) { struct net *net = NULL; if (skb->dev) net = dev_net(skb->dev); else if (skb->sk) net = sock_net(skb->sk); WARN_ON(!net); if (net) seg6_push_hmac(net, saddr, sr_phdr); } #endif sr_phdr->nexthdr = *proto; *proto = NEXTHDR_ROUTING; } static void ipv6_push_rthdr(struct sk_buff *skb, u8 *proto, struct ipv6_rt_hdr *opt, struct in6_addr **addr_p, struct in6_addr *saddr) { switch (opt->type) { case IPV6_SRCRT_TYPE_0: case IPV6_SRCRT_STRICT: case IPV6_SRCRT_TYPE_2: ipv6_push_rthdr0(skb, proto, opt, addr_p, saddr); break; case IPV6_SRCRT_TYPE_4: ipv6_push_rthdr4(skb, proto, opt, addr_p, saddr); break; default: break; } } static void ipv6_push_exthdr(struct sk_buff *skb, u8 *proto, u8 type, struct ipv6_opt_hdr *opt) { struct ipv6_opt_hdr *h = skb_push(skb, ipv6_optlen(opt)); memcpy(h, opt, ipv6_optlen(opt)); h->nexthdr = *proto; *proto = type; } void ipv6_push_nfrag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, u8 *proto, struct in6_addr **daddr, struct in6_addr *saddr) { if (opt->srcrt) { ipv6_push_rthdr(skb, proto, opt->srcrt, daddr, saddr); /* * IPV6_RTHDRDSTOPTS is ignored * unless IPV6_RTHDR is set (RFC3542). 
*/ if (opt->dst0opt) ipv6_push_exthdr(skb, proto, NEXTHDR_DEST, opt->dst0opt); } if (opt->hopopt) ipv6_push_exthdr(skb, proto, NEXTHDR_HOP, opt->hopopt); } void ipv6_push_frag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, u8 *proto) { if (opt->dst1opt) ipv6_push_exthdr(skb, proto, NEXTHDR_DEST, opt->dst1opt); } EXPORT_SYMBOL(ipv6_push_frag_opts); struct ipv6_txoptions * ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt) { struct ipv6_txoptions *opt2; opt2 = sock_kmemdup(sk, opt, opt->tot_len, GFP_ATOMIC); if (opt2) { long dif = (char *)opt2 - (char *)opt; if (opt2->hopopt) *((char **)&opt2->hopopt) += dif; if (opt2->dst0opt) *((char **)&opt2->dst0opt) += dif; if (opt2->dst1opt) *((char **)&opt2->dst1opt) += dif; if (opt2->srcrt) *((char **)&opt2->srcrt) += dif; refcount_set(&opt2->refcnt, 1); } return opt2; } EXPORT_SYMBOL_GPL(ipv6_dup_options); static void ipv6_renew_option(int renewtype, struct ipv6_opt_hdr **dest, struct ipv6_opt_hdr *old, struct ipv6_opt_hdr *new, int newtype, char **p) { struct ipv6_opt_hdr *src; src = (renewtype == newtype ? new : old); if (!src) return; memcpy(*p, src, ipv6_optlen(src)); *dest = (struct ipv6_opt_hdr *)*p; *p += CMSG_ALIGN(ipv6_optlen(*dest)); } /** * ipv6_renew_options - replace a specific ext hdr with a new one. * * @sk: sock from which to allocate memory * @opt: original options * @newtype: option type to replace in @opt * @newopt: new option of type @newtype to replace (user-mem) * * Returns a new set of options which is a copy of @opt with the * option type @newtype replaced with @newopt. * * @opt may be NULL, in which case a new set of options is returned * containing just @newopt. * * @newopt may be NULL, in which case the specified option type is * not copied into the new set of options. * * The new set of options is allocated from the socket option memory * buffer of @sk. 
*/ struct ipv6_txoptions * ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt, int newtype, struct ipv6_opt_hdr *newopt) { int tot_len = 0; char *p; struct ipv6_txoptions *opt2; if (opt) { if (newtype != IPV6_HOPOPTS && opt->hopopt) tot_len += CMSG_ALIGN(ipv6_optlen(opt->hopopt)); if (newtype != IPV6_RTHDRDSTOPTS && opt->dst0opt) tot_len += CMSG_ALIGN(ipv6_optlen(opt->dst0opt)); if (newtype != IPV6_RTHDR && opt->srcrt) tot_len += CMSG_ALIGN(ipv6_optlen(opt->srcrt)); if (newtype != IPV6_DSTOPTS && opt->dst1opt) tot_len += CMSG_ALIGN(ipv6_optlen(opt->dst1opt)); } if (newopt) tot_len += CMSG_ALIGN(ipv6_optlen(newopt)); if (!tot_len) return NULL; tot_len += sizeof(*opt2); opt2 = sock_kmalloc(sk, tot_len, GFP_ATOMIC); if (!opt2) return ERR_PTR(-ENOBUFS); memset(opt2, 0, tot_len); refcount_set(&opt2->refcnt, 1); opt2->tot_len = tot_len; p = (char *)(opt2 + 1); ipv6_renew_option(IPV6_HOPOPTS, &opt2->hopopt, (opt ? opt->hopopt : NULL), newopt, newtype, &p); ipv6_renew_option(IPV6_RTHDRDSTOPTS, &opt2->dst0opt, (opt ? opt->dst0opt : NULL), newopt, newtype, &p); ipv6_renew_option(IPV6_RTHDR, (struct ipv6_opt_hdr **)&opt2->srcrt, (opt ? (struct ipv6_opt_hdr *)opt->srcrt : NULL), newopt, newtype, &p); ipv6_renew_option(IPV6_DSTOPTS, &opt2->dst1opt, (opt ? opt->dst1opt : NULL), newopt, newtype, &p); opt2->opt_nflen = (opt2->hopopt ? ipv6_optlen(opt2->hopopt) : 0) + (opt2->dst0opt ? ipv6_optlen(opt2->dst0opt) : 0) + (opt2->srcrt ? ipv6_optlen(opt2->srcrt) : 0); opt2->opt_flen = (opt2->dst1opt ? ipv6_optlen(opt2->dst1opt) : 0); return opt2; } struct ipv6_txoptions *__ipv6_fixup_options(struct ipv6_txoptions *opt_space, struct ipv6_txoptions *opt) { /* * ignore the dest before srcrt unless srcrt is being included. 
* --yoshfuji */ if (opt->dst0opt && !opt->srcrt) { if (opt_space != opt) { memcpy(opt_space, opt, sizeof(*opt_space)); opt = opt_space; } opt->opt_nflen -= ipv6_optlen(opt->dst0opt); opt->dst0opt = NULL; } return opt; } EXPORT_SYMBOL_GPL(__ipv6_fixup_options); /** * fl6_update_dst - update flowi destination address with info given * by srcrt option, if any. * * @fl6: flowi6 for which daddr is to be updated * @opt: struct ipv6_txoptions in which to look for srcrt opt * @orig: copy of original daddr address if modified * * Returns NULL if no txoptions or no srcrt, otherwise returns orig * and initial value of fl6->daddr set in orig */ struct in6_addr *fl6_update_dst(struct flowi6 *fl6, const struct ipv6_txoptions *opt, struct in6_addr *orig) { if (!opt || !opt->srcrt) return NULL; *orig = fl6->daddr; switch (opt->srcrt->type) { case IPV6_SRCRT_TYPE_0: case IPV6_SRCRT_STRICT: case IPV6_SRCRT_TYPE_2: fl6->daddr = *((struct rt0_hdr *)opt->srcrt)->addr; break; case IPV6_SRCRT_TYPE_4: { struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *)opt->srcrt; fl6->daddr = srh->segments[srh->segments_left]; break; } default: return NULL; } return orig; } EXPORT_SYMBOL_GPL(fl6_update_dst); |
| 20 67 18 8 112 2 111 113 113 3 1 6 55 55 13 8 2 7 58 58 4 58 2 3 3 3 3 3 4 4 4 4 6 6 5 2 3 22 6 5 1 1 5 13 18 11 1 5 9 5 5 3 8 6 9 5 5 9 13 3 3 2 2 2 10 1 1 6 5 1 2 7 7 2 5 7 7 5 2 3 3 3 3 1859 1858 481 482 76 2 13 13 55 55 55 55 55 14 1 12 13 6 2 1 1 2 12 3 9 12 12 12 12 12 12 20 5 15 4 8 3 17 4 6 6 6 6 11 9 2 1 14 5 6 1 2 112 112 73 2 3 67 22 2 3 2 2 1 1 10 1 1 1 3 1 5 14 2 1 16 1 1 1 1 1 1 1 1 1 1 1 4 14 3 4 5 3 3 4 1 3 4 1 4 1 1 1 1 1 122 121 3 3 19 19 1 18 18 1 18 17 4 5 10 2 6 3 26 19 7 2 3 3 2 5 4 4 1 26 4 7 7 5 4 1 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 
394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 
894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 
1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 
1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 
2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 
2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Linux IPv6 multicast routing support for BSD pim6sd * Based on net/ipv4/ipmr.c. 
*
 * (c) 2004 Mickael Hoerdt, <hoerdt@clarinet.u-strasbg.fr>
 *	LSIIT Laboratory, Strasbourg, France
 * (c) 2004 Jean-Philippe Andriot, <jean-philippe.andriot@6WIND.com>
 *	6WIND, Paris, France
 * Copyright (C)2007,2008 USAGI/WIDE Project
 *	YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
 */

#include <linux/uaccess.h>
#include <linux/types.h>
#include <linux/sched.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/kernel.h>
#include <linux/fcntl.h>
#include <linux/stat.h>
#include <linux/socket.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/rhashtable.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/raw.h>
#include <linux/notifier.h>
#include <linux/if_arp.h>
#include <net/checksum.h>
#include <net/netlink.h>
#include <net/fib_rules.h>
#include <net/ipv6.h>
#include <net/ip6_route.h>
#include <linux/mroute6.h>
#include <linux/pim.h>
#include <net/addrconf.h>
#include <linux/netfilter_ipv6.h>
#include <linux/export.h>
#include <net/ip6_checksum.h>
#include <linux/netconf.h>
#include <net/ip_tunnels.h>
#include <linux/nospec.h>

/* fib rule type of this family; it only embeds the generic fib_rule. */
struct ip6mr_rule {
	struct fib_rule		common;
};

/* Result of a rules lookup: the multicast routing table that was selected. */
struct ip6mr_result {
	struct mr_table	*mrt;
};

/* Big lock, protecting vif table, mrt cache and mroute socket state.
   Note that the changes are semaphored via rtnl_lock.
 */

static DEFINE_SPINLOCK(mrt_lock);

/* Return the net_device attached to @vif, read with rcu_dereference(). */
static struct net_device *vif_dev_read(const struct vif_device *vif)
{
	return rcu_dereference(vif->dev);
}

/* Multicast router control variables */

/* Special spinlock for queue of unresolved entries */
static DEFINE_SPINLOCK(mfc_unres_lock);

/* We return to original Alan's scheme. Hash table of resolved
   entries is changed only in process context and protected
   with weak lock mrt_lock. Queue of unresolved entries is protected
   with strong spinlock mfc_unres_lock.
In this case data path is free of exclusive locks at all.
 */

/* Slab cache from which the mfc6_cache entries of this file are allocated. */
static struct kmem_cache *mrt_cachep __read_mostly;

/* Prototypes of static helpers that are used before they are defined. */
static struct mr_table *ip6mr_new_table(struct net *net, u32 id);
static void ip6mr_free_table(struct mr_table *mrt);

static void ip6_mr_forward(struct net *net, struct mr_table *mrt,
			   struct net_device *dev, struct sk_buff *skb,
			   struct mfc6_cache *cache);
static int ip6mr_cache_report(const struct mr_table *mrt, struct sk_buff *pkt,
			      mifi_t mifi, int assert);
static void mr6_netlink_event(struct mr_table *mrt, struct mfc6_cache *mfc,
			      int cmd);
static void mrt6msg_netlink_event(const struct mr_table *mrt, struct sk_buff *pkt);
static int ip6mr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack);
static int ip6mr_rtm_dumproute(struct sk_buff *skb,
			       struct netlink_callback *cb);
static void mroute_clean_tables(struct mr_table *mrt, int flags);
static void ipmr_expire_process(struct timer_list *t);

#ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES
/*
 * Walk every table on net->ipv6.mr6_tables with the RCU list iterator.
 * The extra condition passed to the iterator is true when RTNL is held or
 * when the list is empty.
 */
#define ip6mr_for_each_table(mrt, net) \
	list_for_each_entry_rcu(mrt, &net->ipv6.mr6_tables, list, \
				lockdep_rtnl_is_held() || \
				list_empty(&net->ipv6.mr6_tables))

/*
 * Table iterator: with @mrt == NULL return the first table of @net,
 * otherwise the table following @mrt.  Returns NULL once the list head
 * is reached, i.e. when there is no further table.
 */
static struct mr_table *ip6mr_mr_table_iter(struct net *net,
					    struct mr_table *mrt)
{
	struct mr_table *ret;

	if (!mrt)
		ret = list_entry_rcu(net->ipv6.mr6_tables.next,
				     struct mr_table, list);
	else
		ret = list_entry_rcu(mrt->list.next,
				     struct mr_table, list);

	/* Wrapped around to the list head: no more tables. */
	if (&ret->list == &net->ipv6.mr6_tables)
		return NULL;
	return ret;
}

/*
 * Find the table of @net whose id equals @id; NULL when there is none.
 * Does not take rcu_read_lock() itself (see ip6mr_get_table()).
 */
static struct mr_table *__ip6mr_get_table(struct net *net, u32 id)
{
	struct mr_table *mrt;

	ip6mr_for_each_table(mrt, net) {
		if (mrt->id == id)
			return mrt;
	}
	return NULL;
}

/* Same lookup as __ip6mr_get_table(), wrapped in an RCU read-side section. */
static struct mr_table *ip6mr_get_table(struct net *net, u32 id)
{
	struct mr_table *mrt;

	rcu_read_lock();
	mrt = __ip6mr_get_table(net, id);
	rcu_read_unlock();
	return mrt;
}

/*
 * Select the multicast routing table for flow @flp6 through the fib rules
 * of @net.  On success 0 is returned and *@mrt is set; a negative value
 * from fib_rules_lookup() is passed back unchanged.
 */
static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6,
			    struct mr_table **mrt)
{
	int err;
	struct ip6mr_result res;
	struct fib_lookup_arg arg = {
		.result =
&res,
		.flags = FIB_LOOKUP_NOREF,
	};

	/* update flow if oif or iif point to device enslaved to l3mdev */
	l3mdev_update_flow(net, flowi6_to_flowi(flp6));

	err = fib_rules_lookup(net->ipv6.mr6_rules_ops,
			       flowi6_to_flowi(flp6), 0, &arg);
	if (err < 0)
		return err;
	/* res.mrt was filled in by the matching rule's action callback. */
	*mrt = res.mrt;
	return 0;
}

/*
 * fib_rules ->action callback.
 *
 * FR_ACT_TO_TBL resolves the rule's table and stores it in the
 * ip6mr_result behind arg->result; -EAGAIN is returned when that table
 * does not exist.  The other actions map to an error code:
 * UNREACHABLE -> -ENETUNREACH, PROHIBIT -> -EACCES, BLACKHOLE and anything
 * else -> -EINVAL.
 */
static int ip6mr_rule_action(struct fib_rule *rule, struct flowi *flp,
			     int flags, struct fib_lookup_arg *arg)
{
	struct ip6mr_result *res = arg->result;
	struct mr_table *mrt;

	switch (rule->action) {
	case FR_ACT_TO_TBL:
		break;
	case FR_ACT_UNREACHABLE:
		return -ENETUNREACH;
	case FR_ACT_PROHIBIT:
		return -EACCES;
	case FR_ACT_BLACKHOLE:
	default:
		return -EINVAL;
	}

	arg->table = fib_rule_get_table(rule, arg);

	mrt = __ip6mr_get_table(rule->fr_net, arg->table);
	if (!mrt)
		return -EAGAIN;
	res->mrt = mrt;
	return 0;
}

/* fib_rules ->match callback: every flow matches (always returns 1). */
static int ip6mr_rule_match(struct fib_rule *rule, struct flowi *flp, int flags)
{
	return 1;
}

/* fib_rules ->configure callback: nothing to configure, always returns 0. */
static int ip6mr_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
				struct fib_rule_hdr *frh, struct nlattr **tb,
				struct netlink_ext_ack *extack)
{
	return 0;
}

/* fib_rules ->compare callback: always returns 1. */
static int ip6mr_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
			      struct nlattr **tb)
{
	return 1;
}

/* fib_rules ->fill callback: reports zero dst_len, src_len and tos. */
static int ip6mr_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
			   struct fib_rule_hdr *frh)
{
	frh->dst_len = 0;
	frh->src_len = 0;
	frh->tos     = 0;
	return 0;
}

/* Template handed to fib_rules_register() by ip6mr_rules_init(). */
static const struct fib_rules_ops __net_initconst ip6mr_rules_ops_template = {
	.family		= RTNL_FAMILY_IP6MR,
	.rule_size	= sizeof(struct ip6mr_rule),
	.addr_size	= sizeof(struct in6_addr),
	.action		= ip6mr_rule_action,
	.match		= ip6mr_rule_match,
	.configure	= ip6mr_rule_configure,
	.compare	= ip6mr_rule_compare,
	.fill		= ip6mr_rule_fill,
	.nlgroup	= RTNLGRP_IPV6_RULE,
	.owner		= THIS_MODULE,
};

/*
 * Per-netns setup for the multiple-tables configuration: register the
 * rules ops, create the default table and add the default rule.
 * Returns 0 or a negative errno; on failure the steps already done are
 * undone.
 */
static int __net_init ip6mr_rules_init(struct net *net)
{
	struct fib_rules_ops *ops;
	struct mr_table *mrt;
	int err;

	ops = fib_rules_register(&ip6mr_rules_ops_template, net);
	if (IS_ERR(ops))
		return PTR_ERR(ops);

	INIT_LIST_HEAD(&net->ipv6.mr6_tables);

	mrt = ip6mr_new_table(net,
RT6_TABLE_DFLT);
	if (IS_ERR(mrt)) {
		err = PTR_ERR(mrt);
		goto err1;
	}

	/* Default rule: preference 0x7fff, pointing at the default table. */
	err = fib_default_rule_add(ops, 0x7fff, RT6_TABLE_DFLT);
	if (err < 0)
		goto err2;

	net->ipv6.mr6_rules_ops = ops;
	return 0;

err2:
	/* The table is freed with RTNL held. */
	rtnl_lock();
	ip6mr_free_table(mrt);
	rtnl_unlock();
err1:
	fib_rules_unregister(ops);
	return err;
}

/*
 * Per-netns teardown (multiple tables): unlink and free every table, then
 * unregister the rules ops.  Asserts that RTNL is held.
 */
static void __net_exit ip6mr_rules_exit(struct net *net)
{
	struct mr_table *mrt, *next;

	ASSERT_RTNL();
	list_for_each_entry_safe(mrt, next, &net->ipv6.mr6_tables, list) {
		list_del(&mrt->list);
		ip6mr_free_table(mrt);
	}
	fib_rules_unregister(net->ipv6.mr6_rules_ops);
}

/* Dump the RTNL_FAMILY_IP6MR rules of @net to notifier block @nb. */
static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb,
			    struct netlink_ext_ack *extack)
{
	return fib_rules_dump(net, nb, RTNL_FAMILY_IP6MR, extack);
}

/* Return the fib rules sequence counter for RTNL_FAMILY_IP6MR. */
static unsigned int ip6mr_rules_seq_read(const struct net *net)
{
	return fib_rules_seq_read(net, RTNL_FAMILY_IP6MR);
}

/*
 * True when @rule is a match-all FR_ACT_TO_TBL rule that targets
 * RT6_TABLE_DFLT and has no l3mdev flag set.
 */
bool ip6mr_rule_default(const struct fib_rule *rule)
{
	return fib_rule_matchall(rule) && rule->action == FR_ACT_TO_TBL &&
	       rule->table == RT6_TABLE_DFLT && !rule->l3mdev;
}
EXPORT_SYMBOL(ip6mr_rule_default);
#else
/*
 * Single-table configuration: the only table is net->ipv6.mrt6, so the
 * "loop" body runs at most once.
 */
#define ip6mr_for_each_table(mrt, net) \
	for (mrt = net->ipv6.mrt6; mrt; mrt = NULL)

/* Table iterator: yields net->ipv6.mrt6 first, then NULL. */
static struct mr_table *ip6mr_mr_table_iter(struct net *net,
					    struct mr_table *mrt)
{
	if (!mrt)
		return net->ipv6.mrt6;
	return NULL;
}

/* @id is ignored: there is only one table. */
static struct mr_table *ip6mr_get_table(struct net *net, u32 id)
{
	return net->ipv6.mrt6;
}

#define __ip6mr_get_table ip6mr_get_table

/* Every flow resolves to the single table; always returns 0. */
static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6,
			    struct mr_table **mrt)
{
	*mrt = net->ipv6.mrt6;
	return 0;
}

/* Per-netns setup: create the table and remember it in net->ipv6.mrt6. */
static int __net_init ip6mr_rules_init(struct net *net)
{
	struct mr_table *mrt;

	mrt = ip6mr_new_table(net, RT6_TABLE_DFLT);
	if (IS_ERR(mrt))
		return PTR_ERR(mrt);
	net->ipv6.mrt6 = mrt;
	return 0;
}

/* Per-netns teardown: free the table and clear the pointer (RTNL asserted). */
static void __net_exit ip6mr_rules_exit(struct net *net)
{
	ASSERT_RTNL();
	ip6mr_free_table(net->ipv6.mrt6);
	net->ipv6.mrt6 = NULL;
}

/* No rules exist in this configuration; nothing to dump. */
static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb,
			    struct netlink_ext_ack *extack)
{
	return 0;
}

static
unsigned int ip6mr_rules_seq_read(const struct net *net)
{
	return 0;
}
#endif

/*
 * rhashtable compare callback: returns 0 when both the multicast group and
 * the origin of the cached entry equal the lookup key, non-zero otherwise.
 */
static int ip6mr_hash_cmp(struct rhashtable_compare_arg *arg,
			  const void *ptr)
{
	const struct mfc6_cache_cmp_arg *cmparg = arg->key;
	struct mfc6_cache *c = (struct mfc6_cache *)ptr;

	return !ipv6_addr_equal(&c->mf6c_mcastgrp, &cmparg->mf6c_mcastgrp) ||
	       !ipv6_addr_equal(&c->mf6c_origin, &cmparg->mf6c_origin);
}

/* Hash table parameters: keyed by the mfc6_cache_cmp_arg of each entry. */
static const struct rhashtable_params ip6mr_rht_params = {
	.head_offset = offsetof(struct mr_mfc, mnode),
	.key_offset = offsetof(struct mfc6_cache, cmparg),
	.key_len = sizeof(struct mfc6_cache_cmp_arg),
	.nelem_hint = 3,
	.obj_cmpfn = ip6mr_hash_cmp,
	.automatic_shrinking = true,
};

/*
 * Callback passed to mr_table_alloc(): with multiple tables enabled the
 * new table is appended to net->ipv6.mr6_tables, otherwise this is a no-op.
 */
static void ip6mr_new_table_set(struct mr_table *mrt,
				struct net *net)
{
#ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES
	list_add_tail_rcu(&mrt->list, &net->ipv6.mr6_tables);
#endif
}

/* Wildcard key: both origin and group are the unspecified address. */
static struct mfc6_cache_cmp_arg ip6mr_mr_table_ops_cmparg_any = {
	.mf6c_origin = IN6ADDR_ANY_INIT,
	.mf6c_mcastgrp = IN6ADDR_ANY_INIT,
};

static struct mr_table_ops ip6mr_mr_table_ops = {
	.rht_params = &ip6mr_rht_params,
	.cmparg_any = &ip6mr_mr_table_ops_cmparg_any,
};

/*
 * Return the table with id @id, creating it through mr_table_alloc() when
 * it does not exist yet.  Callers in this file check the result with
 * IS_ERR().
 */
static struct mr_table *ip6mr_new_table(struct net *net, u32 id)
{
	struct mr_table *mrt;

	mrt = __ip6mr_get_table(net, id);
	if (mrt)
		return mrt;

	return mr_table_alloc(net, id, &ip6mr_mr_table_ops,
			      ipmr_expire_process, ip6mr_new_table_set);
}

/*
 * Tear down @mrt: stop its expire timer, flush all MIFs and MFC entries
 * (static ones included), destroy the hash table and free the structure.
 * Warns once if mr_can_free_table() says the table may not be freed.
 */
static void ip6mr_free_table(struct mr_table *mrt)
{
	struct net *net = read_pnet(&mrt->net);

	WARN_ON_ONCE(!mr_can_free_table(net));

	timer_shutdown_sync(&mrt->ipmr_expire_timer);
	mroute_clean_tables(mrt, MRT6_FLUSH_MIFS | MRT6_FLUSH_MIFS_STATIC |
				 MRT6_FLUSH_MFC | MRT6_FLUSH_MFC_STATIC);
	rhltable_destroy(&mrt->mfc_hash);
	kfree(mrt);
}

#ifdef CONFIG_PROC_FS
/* The /proc interfaces to multicast routing
 * /proc/ip6_mr_cache /proc/ip6_mr_vif
 */

/*
 * seq_file ->start for the vif listing.  Takes rcu_read_lock(); it is
 * dropped again right here when the default table is missing (the return
 * value is then ERR_PTR(-ENOENT)), otherwise by ip6mr_vif_seq_stop().
 */
static void *ip6mr_vif_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	struct mr_vif_iter *iter = seq->private;
	struct net *net = seq_file_net(seq);
	struct mr_table *mrt;

rcu_read_lock();
	mrt = __ip6mr_get_table(net, RT6_TABLE_DFLT);
	if (!mrt) {
		rcu_read_unlock();
		return ERR_PTR(-ENOENT);
	}

	iter->mrt = mrt;

	return mr_vif_seq_start(seq, pos);
}

/* seq_file ->stop for the vif listing: leaves the RCU read-side section. */
static void ip6mr_vif_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}

/*
 * seq_file ->show for the vif listing: prints the column header for
 * SEQ_START_TOKEN, otherwise one line for the vif_device @v (its index in
 * the table, device name or "none", byte/packet counters and flags).
 * Always returns 0.
 */
static int ip6mr_vif_seq_show(struct seq_file *seq, void *v)
{
	struct mr_vif_iter *iter = seq->private;
	struct mr_table *mrt = iter->mrt;

	if (v == SEQ_START_TOKEN) {
		seq_puts(seq,
			 "Interface BytesIn PktsIn BytesOut PktsOut Flags\n");
	} else {
		const struct vif_device *vif = v;
		const struct net_device *vif_dev;
		const char *name;

		vif_dev = vif_dev_read(vif);
		/* The slot may have no device attached. */
		name = vif_dev ? vif_dev->name : "none";

		seq_printf(seq,
			   "%2td %-10s %8ld %7ld %8ld %7ld %05X\n",
			   vif - mrt->vif_table,
			   name, vif->bytes_in, vif->pkt_in,
			   vif->bytes_out, vif->pkt_out,
			   vif->flags);
	}
	return 0;
}

static const struct seq_operations ip6mr_vif_seq_ops = {
	.start = ip6mr_vif_seq_start,
	.next  = mr_vif_seq_next,
	.stop  = ip6mr_vif_seq_stop,
	.show  = ip6mr_vif_seq_show,
};

/*
 * seq_file ->start for the MFC cache listing of the default table.
 * Returns ERR_PTR(-ENOENT) when that table does not exist.
 */
static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct net *net = seq_file_net(seq);
	struct mr_table *mrt;

	mrt = ip6mr_get_table(net, RT6_TABLE_DFLT);
	if (!mrt)
		return ERR_PTR(-ENOENT);

	return mr_mfc_seq_start(seq, pos, mrt, &mfc_unres_lock);
}

/*
 * seq_file ->show for the MFC cache listing: header line for
 * SEQ_START_TOKEN, otherwise group, origin and parent of the entry,
 * followed by counters and the per-vif TTL list for resolved entries.
 */
static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
{
	int n;

	if (v == SEQ_START_TOKEN) {
		seq_puts(seq,
			 "Group "
			 "Origin "
			 "Iif Pkts Bytes Wrong Oifs\n");
	} else {
		const struct mfc6_cache *mfc = v;
		const struct mr_mfc_iter *it = seq->private;
		struct mr_table *mrt = it->mrt;

		seq_printf(seq, "%pI6 %pI6 %-3hd",
			   &mfc->mf6c_mcastgrp, &mfc->mf6c_origin,
			   mfc->_c.mfc_parent);

		/* Entries not taken from the unresolved queue carry counters. */
		if (it->cache != &mrt->mfc_unres_queue) {
			seq_printf(seq, " %8lu %8lu %8lu",
				   atomic_long_read(&mfc->_c.mfc_un.res.pkt),
				   atomic_long_read(&mfc->_c.mfc_un.res.bytes),
				   atomic_long_read(&mfc->_c.mfc_un.res.wrong_if));
			for (n = mfc->_c.mfc_un.res.minvif;
			     n < mfc->_c.mfc_un.res.maxvif; n++) {
				if (VIF_EXISTS(mrt, n) &&
mfc->_c.mfc_un.res.ttls[n] < 255)
					seq_printf(seq,
						   " %2d:%-3d", n,
						   mfc->_c.mfc_un.res.ttls[n]);
			}
		} else {
			/* unresolved mfc_caches don't contain
			 * pkt, bytes and wrong_if values
			 */
			seq_printf(seq, " %8lu %8lu %8lu", 0ul, 0ul, 0ul);
		}
		seq_putc(seq, '\n');
	}
	return 0;
}

static const struct seq_operations ipmr_mfc_seq_ops = {
	.start = ipmr_mfc_seq_start,
	.next  = mr_mfc_seq_next,
	.stop  = mr_mfc_seq_stop,
	.show  = ipmr_mfc_seq_show,
};
#endif

#ifdef CONFIG_IPV6_PIMSM_V2

/*
 * Receive handler for IPPROTO_PIM (installed through pim6_protocol).
 *
 * Accepts only PIM Register messages that are not Null-Registers, pass the
 * checksum test and encapsulate an IPv6 packet addressed to a multicast
 * group.  The outer headers are then pulled off and the inner packet is
 * fed back into the stack via netif_rx() as if received on the table's
 * register vif device.  Anything else is freed.  Always returns 0.
 */
static int pim6_rcv(struct sk_buff *skb)
{
	struct pimreghdr *pim;
	struct ipv6hdr   *encap;
	struct net_device  *reg_dev = NULL;
	struct net *net = dev_net(skb->dev);
	struct mr_table *mrt;
	struct flowi6 fl6 = {
		.flowi6_iif	= skb->dev->ifindex,
		.flowi6_mark	= skb->mark,
	};
	int reg_vif_num;

	/* Both the PIM register header and the inner IPv6 header are needed. */
	if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
		goto drop;

	pim = (struct pimreghdr *)skb_transport_header(skb);
	/*
	 * The checksum is accepted if it verifies over the PIM header alone
	 * or over the whole packet; only when both tests fail is it dropped.
	 */
	if (pim->type != ((PIM_VERSION << 4) | PIM_TYPE_REGISTER) ||
	    (pim->flags & PIM_NULL_REGISTER) ||
	    (csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr,
			     sizeof(*pim), IPPROTO_PIM,
			     csum_partial((void *)pim, sizeof(*pim), 0)) &&
	     csum_fold(skb_checksum(skb, 0, skb->len, 0))))
		goto drop;

	/* check if the inner packet is destined to mcast group */
	encap = (struct ipv6hdr *)(skb_transport_header(skb) +
				   sizeof(*pim));

	if (!ipv6_addr_is_multicast(&encap->daddr) ||
	    encap->payload_len == 0 ||
	    ntohs(encap->payload_len) + sizeof(*pim) > skb->len)
		goto drop;

	if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0)
		goto drop;

	/* Pairs with WRITE_ONCE() in mif6_add()/mif6_delete() */
	reg_vif_num = READ_ONCE(mrt->mroute_reg_vif_num);
	if (reg_vif_num >= 0)
		reg_dev = vif_dev_read(&mrt->vif_table[reg_vif_num]);

	/* No register vif configured (or no device behind it). */
	if (!reg_dev)
		goto drop;

	/* Strip everything in front of the inner IPv6 header. */
	skb->mac_header = skb->network_header;
	skb_pull(skb, (u8 *)encap - skb->data);
	skb_reset_network_header(skb);
	skb->protocol = htons(ETH_P_IPV6);
	skb->ip_summed = CHECKSUM_NONE;

	skb_tunnel_rx(skb, reg_dev, dev_net(reg_dev));

	netif_rx(skb);

	return 0;
 drop:
	kfree_skb(skb);
	return 0;
}

static const struct
inet6_protocol pim6_protocol = { .handler = pim6_rcv, }; /* Service routines creating virtual interfaces: PIMREG */ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev) { struct net *net = dev_net(dev); struct mr_table *mrt; struct flowi6 fl6 = { .flowi6_oif = dev->ifindex, .flowi6_iif = skb->skb_iif ? : LOOPBACK_IFINDEX, .flowi6_mark = skb->mark, }; if (!pskb_inet_may_pull(skb)) goto tx_err; if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0) goto tx_err; DEV_STATS_ADD(dev, tx_bytes, skb->len); DEV_STATS_INC(dev, tx_packets); rcu_read_lock(); ip6mr_cache_report(mrt, skb, READ_ONCE(mrt->mroute_reg_vif_num), MRT6MSG_WHOLEPKT); rcu_read_unlock(); kfree_skb(skb); return NETDEV_TX_OK; tx_err: DEV_STATS_INC(dev, tx_errors); kfree_skb(skb); return NETDEV_TX_OK; } static int reg_vif_get_iflink(const struct net_device *dev) { return 0; } static const struct net_device_ops reg_vif_netdev_ops = { .ndo_start_xmit = reg_vif_xmit, .ndo_get_iflink = reg_vif_get_iflink, }; static void reg_vif_setup(struct net_device *dev) { dev->type = ARPHRD_PIMREG; dev->mtu = 1500 - sizeof(struct ipv6hdr) - 8; dev->flags = IFF_NOARP; dev->netdev_ops = ®_vif_netdev_ops; dev->needs_free_netdev = true; dev->netns_immutable = true; } static struct net_device *ip6mr_reg_vif(struct net *net, struct mr_table *mrt) { struct net_device *dev; char name[IFNAMSIZ]; if (mrt->id == RT6_TABLE_DFLT) sprintf(name, "pim6reg"); else sprintf(name, "pim6reg%u", mrt->id); dev = alloc_netdev(0, name, NET_NAME_UNKNOWN, reg_vif_setup); if (!dev) return NULL; dev_net_set(dev, net); if (register_netdevice(dev)) { free_netdev(dev); return NULL; } if (dev_open(dev, NULL)) goto failure; dev_hold(dev); return dev; failure: unregister_netdevice(dev); return NULL; } #endif static int call_ip6mr_vif_entry_notifiers(struct net *net, enum fib_event_type event_type, struct vif_device *vif, struct net_device *vif_dev, mifi_t vif_index, u32 tb_id) { return mr_call_vif_notifiers(net, RTNL_FAMILY_IP6MR, event_type, vif, 
vif_dev, vif_index, tb_id,
				     &net->ipv6.ipmr_seq);
}

/*
 * Forward an MFC entry event for table @tb_id to the generic
 * mr_call_mfc_notifiers() with this family and the netns sequence counter.
 */
static int call_ip6mr_mfc_entry_notifiers(struct net *net,
					  enum fib_event_type event_type,
					  struct mfc6_cache *mfc, u32 tb_id)
{
	return mr_call_mfc_notifiers(net, RTNL_FAMILY_IP6MR, event_type,
				     &mfc->_c, tb_id, &net->ipv6.ipmr_seq);
}

/* Delete a VIF entry */
/*
 * @mrt:    table owning the vif
 * @vifi:   index of the vif to remove
 * @notify: when zero, a register vif's device is queued for unregistration
 * @head:   list passed to unregister_netdevice_queue()
 *
 * Returns 0 on success, -EADDRNOTAVAIL when @vifi is out of range or the
 * slot has no device.  The device pointer is read with rtnl_dereference().
 */
static int mif6_delete(struct mr_table *mrt, int vifi, int notify,
		       struct list_head *head)
{
	struct vif_device *v;
	struct net_device *dev;
	struct inet6_dev *in6_dev;

	if (vifi < 0 || vifi >= mrt->maxvif)
		return -EADDRNOTAVAIL;

	v = &mrt->vif_table[vifi];

	dev = rtnl_dereference(v->dev);
	if (!dev)
		return -EADDRNOTAVAIL;

	/* Notifiers are called before the slot is cleared. */
	call_ip6mr_vif_entry_notifiers(read_pnet(&mrt->net),
				       FIB_EVENT_VIF_DEL, v, dev,
				       vifi, mrt->id);
	spin_lock(&mrt_lock);
	RCU_INIT_POINTER(v->dev, NULL);

#ifdef CONFIG_IPV6_PIMSM_V2
	if (vifi == mrt->mroute_reg_vif_num) {
		/* Pairs with READ_ONCE() in ip6mr_cache_report() and reg_vif_xmit() */
		WRITE_ONCE(mrt->mroute_reg_vif_num, -1);
	}
#endif

	/* Removing the highest vif: shrink maxvif to the next one in use. */
	if (vifi + 1 == mrt->maxvif) {
		int tmp;
		for (tmp = vifi - 1; tmp >= 0; tmp--) {
			if (VIF_EXISTS(mrt, tmp))
				break;
		}
		WRITE_ONCE(mrt->maxvif, tmp + 1);
	}

	spin_unlock(&mrt_lock);

	/* Undo the allmulti count and mc_forwarding increment of mif6_add(). */
	dev_set_allmulti(dev, -1);

	in6_dev = __in6_dev_get(dev);
	if (in6_dev) {
		atomic_dec(&in6_dev->cnf.mc_forwarding);
		inet6_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF,
					     NETCONFA_MC_FORWARDING,
					     dev->ifindex, &in6_dev->cnf);
	}

	if ((v->flags & MIFF_REGISTER) && !notify)
		unregister_netdevice_queue(dev, head);

	netdev_put(dev, &v->dev_tracker);
	return 0;
}

/* RCU callback: return the cache entry embedding @head to mrt_cachep. */
static inline void ip6mr_cache_free_rcu(struct rcu_head *head)
{
	struct mr_mfc *c = container_of(head, struct mr_mfc, rcu);

	kmem_cache_free(mrt_cachep, (struct mfc6_cache *)c);
}

/* Free @c through call_rcu(), i.e. via ip6mr_cache_free_rcu(). */
static inline void ip6mr_cache_free(struct mfc6_cache *c)
{
	call_rcu(&c->_c.rcu, ip6mr_cache_free_rcu);
}

/* Destroy an unresolved cache entry, killing queued skbs
   and reporting error to netlink readers.
*/ static void ip6mr_destroy_unres(struct mr_table *mrt, struct mfc6_cache *c) { struct net *net = read_pnet(&mrt->net); struct sk_buff *skb; atomic_dec(&mrt->cache_resolve_queue_len); while ((skb = skb_dequeue(&c->_c.mfc_un.unres.unresolved)) != NULL) { if (ipv6_hdr(skb)->version == 0) { struct nlmsghdr *nlh = skb_pull(skb, sizeof(struct ipv6hdr)); nlh->nlmsg_type = NLMSG_ERROR; nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr)); skb_trim(skb, nlh->nlmsg_len); ((struct nlmsgerr *)nlmsg_data(nlh))->error = -ETIMEDOUT; rtnl_unicast(skb, net, NETLINK_CB(skb).portid); } else kfree_skb(skb); } ip6mr_cache_free(c); } /* Timer process for all the unresolved queue. */ static void ipmr_do_expire_process(struct mr_table *mrt) { unsigned long now = jiffies; unsigned long expires = 10 * HZ; struct mr_mfc *c, *next; list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) { if (time_after(c->mfc_un.unres.expires, now)) { /* not yet... */ unsigned long interval = c->mfc_un.unres.expires - now; if (interval < expires) expires = interval; continue; } list_del(&c->list); mr6_netlink_event(mrt, (struct mfc6_cache *)c, RTM_DELROUTE); ip6mr_destroy_unres(mrt, (struct mfc6_cache *)c); } if (!list_empty(&mrt->mfc_unres_queue)) mod_timer(&mrt->ipmr_expire_timer, jiffies + expires); } static void ipmr_expire_process(struct timer_list *t) { struct mr_table *mrt = timer_container_of(mrt, t, ipmr_expire_timer); if (!spin_trylock(&mfc_unres_lock)) { mod_timer(&mrt->ipmr_expire_timer, jiffies + 1); return; } if (!list_empty(&mrt->mfc_unres_queue)) ipmr_do_expire_process(mrt); spin_unlock(&mfc_unres_lock); } /* Fill oifs list. It is called under locked mrt_lock. 
*/

/*
 * Rebuild the per-vif TTL thresholds of @cache from @ttls.
 *
 * All thresholds are first reset to 255; a vif gets the value from @ttls
 * only if it exists in @mrt and the value is in 1..254.  minvif/maxvif are
 * set to the half-open range [minvif, maxvif) spanning the vifs that were
 * given a threshold.  lastuse is refreshed to the current jiffies.
 */
static void ip6mr_update_thresholds(struct mr_table *mrt,
				    struct mr_mfc *cache,
				    unsigned char *ttls)
{
	int vifi;

	/* Start from "no output vif at all". */
	cache->mfc_un.res.minvif = MAXMIFS;
	cache->mfc_un.res.maxvif = 0;
	memset(cache->mfc_un.res.ttls, 255, MAXMIFS);

	for (vifi = 0; vifi < mrt->maxvif; vifi++) {
		if (VIF_EXISTS(mrt, vifi) &&
		    ttls[vifi] && ttls[vifi] < 255) {
			cache->mfc_un.res.ttls[vifi] = ttls[vifi];
			if (cache->mfc_un.res.minvif > vifi)
				cache->mfc_un.res.minvif = vifi;
			if (cache->mfc_un.res.maxvif <= vifi)
				cache->mfc_un.res.maxvif = vifi + 1;
		}
	}
	WRITE_ONCE(cache->mfc_un.res.lastuse, jiffies);
}

/*
 * Add the vif described by @vifc to @mrt.
 *
 * @mrtsock: when zero the vif is additionally flagged VIFF_STATIC.
 *
 * Returns 0 on success; -EADDRINUSE if the slot (or, for MIFF_REGISTER,
 * the register vif) is already taken, -ENOBUFS if the register device
 * cannot be created, -EADDRNOTAVAIL if the physical interface does not
 * exist, -EINVAL for unsupported flags, or the error from
 * dev_set_allmulti().
 * NOTE(review): vifc->mif6c_mifi indexes vif_table without a range check
 * here - presumably validated by the caller; confirm.
 */
static int mif6_add(struct net *net, struct mr_table *mrt,
		    struct mif6ctl *vifc, int mrtsock)
{
	int vifi = vifc->mif6c_mifi;
	struct vif_device *v = &mrt->vif_table[vifi];
	struct net_device *dev;
	struct inet6_dev *in6_dev;
	int err;

	/* Is vif busy ? */
	if (VIF_EXISTS(mrt, vifi))
		return -EADDRINUSE;

	switch (vifc->mif6c_flags) {
#ifdef CONFIG_IPV6_PIMSM_V2
	case MIFF_REGISTER:
		/*
		 * Special Purpose VIF in PIM
		 * All the packets will be sent to the daemon
		 */
		if (mrt->mroute_reg_vif_num >= 0)
			return -EADDRINUSE;
		dev = ip6mr_reg_vif(net, mrt);
		if (!dev)
			return -ENOBUFS;
		err = dev_set_allmulti(dev, 1);
		if (err) {
			unregister_netdevice(dev);
			dev_put(dev);
			return err;
		}
		break;
#endif
	case 0:
		dev = dev_get_by_index(net, vifc->mif6c_pifi);
		if (!dev)
			return -EADDRNOTAVAIL;
		err = dev_set_allmulti(dev, 1);
		if (err) {
			dev_put(dev);
			return err;
		}
		break;
	default:
		return -EINVAL;
	}

	in6_dev = __in6_dev_get(dev);
	if (in6_dev) {
		atomic_inc(&in6_dev->cnf.mc_forwarding);
		inet6_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF,
					     NETCONFA_MC_FORWARDING,
					     dev->ifindex, &in6_dev->cnf);
	}

	/* Fill in the VIF structures */
	vif_device_init(v, dev, vifc->vifc_rate_limit, vifc->vifc_threshold,
			vifc->mif6c_flags | (!mrtsock ?
VIFF_STATIC : 0), MIFF_REGISTER); /* And finish update writing critical data */ spin_lock(&mrt_lock); rcu_assign_pointer(v->dev, dev); netdev_tracker_alloc(dev, &v->dev_tracker, GFP_ATOMIC); #ifdef CONFIG_IPV6_PIMSM_V2 if (v->flags & MIFF_REGISTER) WRITE_ONCE(mrt->mroute_reg_vif_num, vifi); #endif if (vifi + 1 > mrt->maxvif) WRITE_ONCE(mrt->maxvif, vifi + 1); spin_unlock(&mrt_lock); call_ip6mr_vif_entry_notifiers(net, FIB_EVENT_VIF_ADD, v, dev, vifi, mrt->id); return 0; } static struct mfc6_cache *ip6mr_cache_find(struct mr_table *mrt, const struct in6_addr *origin, const struct in6_addr *mcastgrp) { struct mfc6_cache_cmp_arg arg = { .mf6c_origin = *origin, .mf6c_mcastgrp = *mcastgrp, }; return mr_mfc_find(mrt, &arg); } /* Look for a (*,G) entry */ static struct mfc6_cache *ip6mr_cache_find_any(struct mr_table *mrt, struct in6_addr *mcastgrp, mifi_t mifi) { struct mfc6_cache_cmp_arg arg = { .mf6c_origin = in6addr_any, .mf6c_mcastgrp = *mcastgrp, }; if (ipv6_addr_any(mcastgrp)) return mr_mfc_find_any_parent(mrt, mifi); return mr_mfc_find_any(mrt, mifi, &arg); } /* Look for a (S,G,iif) entry if parent != -1 */ static struct mfc6_cache * ip6mr_cache_find_parent(struct mr_table *mrt, const struct in6_addr *origin, const struct in6_addr *mcastgrp, int parent) { struct mfc6_cache_cmp_arg arg = { .mf6c_origin = *origin, .mf6c_mcastgrp = *mcastgrp, }; return mr_mfc_find_parent(mrt, &arg, parent); } /* Allocate a multicast cache entry */ static struct mfc6_cache *ip6mr_cache_alloc(void) { struct mfc6_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL); if (!c) return NULL; c->_c.mfc_un.res.last_assert = jiffies - MFC_ASSERT_THRESH - 1; c->_c.mfc_un.res.minvif = MAXMIFS; c->_c.free = ip6mr_cache_free_rcu; refcount_set(&c->_c.mfc_un.res.refcount, 1); return c; } static struct mfc6_cache *ip6mr_cache_alloc_unres(void) { struct mfc6_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC); if (!c) return NULL; skb_queue_head_init(&c->_c.mfc_un.unres.unresolved); 
c->_c.mfc_un.unres.expires = jiffies + 10 * HZ; return c; } /* * A cache entry has gone into a resolved state from queued */ static void ip6mr_cache_resolve(struct net *net, struct mr_table *mrt, struct mfc6_cache *uc, struct mfc6_cache *c) { struct sk_buff *skb; /* * Play the pending entries through our router */ while ((skb = __skb_dequeue(&uc->_c.mfc_un.unres.unresolved))) { if (ipv6_hdr(skb)->version == 0) { struct nlmsghdr *nlh = skb_pull(skb, sizeof(struct ipv6hdr)); if (mr_fill_mroute(mrt, skb, &c->_c, nlmsg_data(nlh)) > 0) { nlh->nlmsg_len = skb_tail_pointer(skb) - (u8 *)nlh; } else { nlh->nlmsg_type = NLMSG_ERROR; nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr)); skb_trim(skb, nlh->nlmsg_len); ((struct nlmsgerr *)nlmsg_data(nlh))->error = -EMSGSIZE; } rtnl_unicast(skb, net, NETLINK_CB(skb).portid); } else { rcu_read_lock(); ip6_mr_forward(net, mrt, skb->dev, skb, c); rcu_read_unlock(); } } } /* * Bounce a cache query up to pim6sd and netlink. * * Called under rcu_read_lock() */ static int ip6mr_cache_report(const struct mr_table *mrt, struct sk_buff *pkt, mifi_t mifi, int assert) { struct sock *mroute6_sk; struct sk_buff *skb; struct mrt6msg *msg; int ret; #ifdef CONFIG_IPV6_PIMSM_V2 if (assert == MRT6MSG_WHOLEPKT || assert == MRT6MSG_WRMIFWHOLE) skb = skb_realloc_headroom(pkt, -skb_network_offset(pkt) +sizeof(*msg)); else #endif skb = alloc_skb(sizeof(struct ipv6hdr) + sizeof(*msg), GFP_ATOMIC); if (!skb) return -ENOBUFS; /* I suppose that internal messages * do not require checksums */ skb->ip_summed = CHECKSUM_UNNECESSARY; #ifdef CONFIG_IPV6_PIMSM_V2 if (assert == MRT6MSG_WHOLEPKT || assert == MRT6MSG_WRMIFWHOLE) { /* Ugly, but we have no choice with this interface. Duplicate old header, fix length etc. 
And all this only to mangle msg->im6_msgtype and to set msg->im6_mbz to "mbz" :-) */ __skb_pull(skb, skb_network_offset(pkt)); skb_push(skb, sizeof(*msg)); skb_reset_transport_header(skb); msg = (struct mrt6msg *)skb_transport_header(skb); msg->im6_mbz = 0; msg->im6_msgtype = assert; if (assert == MRT6MSG_WRMIFWHOLE) msg->im6_mif = mifi; else msg->im6_mif = READ_ONCE(mrt->mroute_reg_vif_num); msg->im6_pad = 0; msg->im6_src = ipv6_hdr(pkt)->saddr; msg->im6_dst = ipv6_hdr(pkt)->daddr; skb->ip_summed = CHECKSUM_UNNECESSARY; } else #endif { /* * Copy the IP header */ skb_put(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); skb_copy_to_linear_data(skb, ipv6_hdr(pkt), sizeof(struct ipv6hdr)); /* * Add our header */ skb_put(skb, sizeof(*msg)); skb_reset_transport_header(skb); msg = (struct mrt6msg *)skb_transport_header(skb); msg->im6_mbz = 0; msg->im6_msgtype = assert; msg->im6_mif = mifi; msg->im6_pad = 0; msg->im6_src = ipv6_hdr(pkt)->saddr; msg->im6_dst = ipv6_hdr(pkt)->daddr; skb_dst_set(skb, dst_clone(skb_dst(pkt))); skb->ip_summed = CHECKSUM_UNNECESSARY; } mroute6_sk = rcu_dereference(mrt->mroute_sk); if (!mroute6_sk) { kfree_skb(skb); return -EINVAL; } mrt6msg_netlink_event(mrt, skb); /* Deliver to user space multicast routing algorithms */ ret = sock_queue_rcv_skb(mroute6_sk, skb); if (ret < 0) { net_warn_ratelimited("mroute6: pending queue full, dropping entries\n"); kfree_skb(skb); } return ret; } /* Queue a packet for resolution. It gets locked cache entry! 
*/ static int ip6mr_cache_unresolved(struct mr_table *mrt, mifi_t mifi, struct sk_buff *skb, struct net_device *dev) { struct mfc6_cache *c; bool found = false; int err; spin_lock_bh(&mfc_unres_lock); list_for_each_entry(c, &mrt->mfc_unres_queue, _c.list) { if (ipv6_addr_equal(&c->mf6c_mcastgrp, &ipv6_hdr(skb)->daddr) && ipv6_addr_equal(&c->mf6c_origin, &ipv6_hdr(skb)->saddr)) { found = true; break; } } if (!found) { /* * Create a new entry if allowable */ c = ip6mr_cache_alloc_unres(); if (!c) { spin_unlock_bh(&mfc_unres_lock); kfree_skb(skb); return -ENOBUFS; } /* Fill in the new cache entry */ c->_c.mfc_parent = -1; c->mf6c_origin = ipv6_hdr(skb)->saddr; c->mf6c_mcastgrp = ipv6_hdr(skb)->daddr; /* * Reflect first query at pim6sd */ err = ip6mr_cache_report(mrt, skb, mifi, MRT6MSG_NOCACHE); if (err < 0) { /* If the report failed throw the cache entry out - Brad Parker */ spin_unlock_bh(&mfc_unres_lock); ip6mr_cache_free(c); kfree_skb(skb); return err; } atomic_inc(&mrt->cache_resolve_queue_len); list_add(&c->_c.list, &mrt->mfc_unres_queue); mr6_netlink_event(mrt, c, RTM_NEWROUTE); ipmr_do_expire_process(mrt); } /* See if we can append the packet */ if (c->_c.mfc_un.unres.unresolved.qlen > 3) { kfree_skb(skb); err = -ENOBUFS; } else { if (dev) { skb->dev = dev; skb->skb_iif = dev->ifindex; } skb_queue_tail(&c->_c.mfc_un.unres.unresolved, skb); err = 0; } spin_unlock_bh(&mfc_unres_lock); return err; } /* * MFC6 cache manipulation by user space */ static int ip6mr_mfc_delete(struct mr_table *mrt, struct mf6cctl *mfc, int parent) { struct mfc6_cache *c; /* The entries are added/deleted only under RTNL */ rcu_read_lock(); c = ip6mr_cache_find_parent(mrt, &mfc->mf6cc_origin.sin6_addr, &mfc->mf6cc_mcastgrp.sin6_addr, parent); rcu_read_unlock(); if (!c) return -ENOENT; rhltable_remove(&mrt->mfc_hash, &c->_c.mnode, ip6mr_rht_params); list_del_rcu(&c->_c.list); call_ip6mr_mfc_entry_notifiers(read_pnet(&mrt->net), FIB_EVENT_ENTRY_DEL, c, mrt->id); mr6_netlink_event(mrt, c, 
RTM_DELROUTE); mr_cache_put(&c->_c); return 0; } static int ip6mr_device_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); struct mr_table *mrt; struct vif_device *v; int ct; if (event != NETDEV_UNREGISTER) return NOTIFY_DONE; ip6mr_for_each_table(mrt, net) { v = &mrt->vif_table[0]; for (ct = 0; ct < mrt->maxvif; ct++, v++) { if (rcu_access_pointer(v->dev) == dev) mif6_delete(mrt, ct, 1, NULL); } } return NOTIFY_DONE; } static unsigned int ip6mr_seq_read(const struct net *net) { return READ_ONCE(net->ipv6.ipmr_seq) + ip6mr_rules_seq_read(net); } static int ip6mr_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { return mr_dump(net, nb, RTNL_FAMILY_IP6MR, ip6mr_rules_dump, ip6mr_mr_table_iter, extack); } static struct notifier_block ip6_mr_notifier = { .notifier_call = ip6mr_device_event }; static const struct fib_notifier_ops ip6mr_notifier_ops_template = { .family = RTNL_FAMILY_IP6MR, .fib_seq_read = ip6mr_seq_read, .fib_dump = ip6mr_dump, .owner = THIS_MODULE, }; static int __net_init ip6mr_notifier_init(struct net *net) { struct fib_notifier_ops *ops; net->ipv6.ipmr_seq = 0; ops = fib_notifier_ops_register(&ip6mr_notifier_ops_template, net); if (IS_ERR(ops)) return PTR_ERR(ops); net->ipv6.ip6mr_notifier_ops = ops; return 0; } static void __net_exit ip6mr_notifier_exit(struct net *net) { fib_notifier_ops_unregister(net->ipv6.ip6mr_notifier_ops); net->ipv6.ip6mr_notifier_ops = NULL; } /* Setup for IP multicast routing */ static int __net_init ip6mr_net_init(struct net *net) { int err; err = ip6mr_notifier_init(net); if (err) return err; err = ip6mr_rules_init(net); if (err < 0) goto ip6mr_rules_fail; #ifdef CONFIG_PROC_FS err = -ENOMEM; if (!proc_create_net("ip6_mr_vif", 0, net->proc_net, &ip6mr_vif_seq_ops, sizeof(struct mr_vif_iter))) goto proc_vif_fail; if (!proc_create_net("ip6_mr_cache", 0, net->proc_net, 
&ipmr_mfc_seq_ops, sizeof(struct mr_mfc_iter))) goto proc_cache_fail; #endif return 0; #ifdef CONFIG_PROC_FS proc_cache_fail: remove_proc_entry("ip6_mr_vif", net->proc_net); proc_vif_fail: rtnl_lock(); ip6mr_rules_exit(net); rtnl_unlock(); #endif ip6mr_rules_fail: ip6mr_notifier_exit(net); return err; } static void __net_exit ip6mr_net_exit(struct net *net) { #ifdef CONFIG_PROC_FS remove_proc_entry("ip6_mr_cache", net->proc_net); remove_proc_entry("ip6_mr_vif", net->proc_net); #endif ip6mr_notifier_exit(net); } static void __net_exit ip6mr_net_exit_batch(struct list_head *net_list) { struct net *net; rtnl_lock(); list_for_each_entry(net, net_list, exit_list) ip6mr_rules_exit(net); rtnl_unlock(); } static struct pernet_operations ip6mr_net_ops = { .init = ip6mr_net_init, .exit = ip6mr_net_exit, .exit_batch = ip6mr_net_exit_batch, }; static const struct rtnl_msg_handler ip6mr_rtnl_msg_handlers[] __initconst_or_module = { {.owner = THIS_MODULE, .protocol = RTNL_FAMILY_IP6MR, .msgtype = RTM_GETROUTE, .doit = ip6mr_rtm_getroute, .dumpit = ip6mr_rtm_dumproute}, }; int __init ip6_mr_init(void) { int err; mrt_cachep = KMEM_CACHE(mfc6_cache, SLAB_HWCACHE_ALIGN); if (!mrt_cachep) return -ENOMEM; err = register_pernet_subsys(&ip6mr_net_ops); if (err) goto reg_pernet_fail; err = register_netdevice_notifier(&ip6_mr_notifier); if (err) goto reg_notif_fail; #ifdef CONFIG_IPV6_PIMSM_V2 if (inet6_add_protocol(&pim6_protocol, IPPROTO_PIM) < 0) { pr_err("%s: can't add PIM protocol\n", __func__); err = -EAGAIN; goto add_proto_fail; } #endif err = rtnl_register_many(ip6mr_rtnl_msg_handlers); if (!err) return 0; #ifdef CONFIG_IPV6_PIMSM_V2 inet6_del_protocol(&pim6_protocol, IPPROTO_PIM); add_proto_fail: unregister_netdevice_notifier(&ip6_mr_notifier); #endif reg_notif_fail: unregister_pernet_subsys(&ip6mr_net_ops); reg_pernet_fail: kmem_cache_destroy(mrt_cachep); return err; } void __init ip6_mr_cleanup(void) { rtnl_unregister_many(ip6mr_rtnl_msg_handlers); #ifdef CONFIG_IPV6_PIMSM_V2 
inet6_del_protocol(&pim6_protocol, IPPROTO_PIM); #endif unregister_netdevice_notifier(&ip6_mr_notifier); unregister_pernet_subsys(&ip6mr_net_ops); kmem_cache_destroy(mrt_cachep); } static int ip6mr_mfc_add(struct net *net, struct mr_table *mrt, struct mf6cctl *mfc, int mrtsock, int parent) { unsigned char ttls[MAXMIFS]; struct mfc6_cache *uc, *c; struct mr_mfc *_uc; bool found; int i, err; if (mfc->mf6cc_parent >= MAXMIFS) return -ENFILE; memset(ttls, 255, MAXMIFS); for (i = 0; i < MAXMIFS; i++) { if (IF_ISSET(i, &mfc->mf6cc_ifset)) ttls[i] = 1; } /* The entries are added/deleted only under RTNL */ rcu_read_lock(); c = ip6mr_cache_find_parent(mrt, &mfc->mf6cc_origin.sin6_addr, &mfc->mf6cc_mcastgrp.sin6_addr, parent); rcu_read_unlock(); if (c) { spin_lock(&mrt_lock); c->_c.mfc_parent = mfc->mf6cc_parent; ip6mr_update_thresholds(mrt, &c->_c, ttls); if (!mrtsock) c->_c.mfc_flags |= MFC_STATIC; spin_unlock(&mrt_lock); call_ip6mr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, c, mrt->id); mr6_netlink_event(mrt, c, RTM_NEWROUTE); return 0; } if (!ipv6_addr_any(&mfc->mf6cc_mcastgrp.sin6_addr) && !ipv6_addr_is_multicast(&mfc->mf6cc_mcastgrp.sin6_addr)) return -EINVAL; c = ip6mr_cache_alloc(); if (!c) return -ENOMEM; c->mf6c_origin = mfc->mf6cc_origin.sin6_addr; c->mf6c_mcastgrp = mfc->mf6cc_mcastgrp.sin6_addr; c->_c.mfc_parent = mfc->mf6cc_parent; ip6mr_update_thresholds(mrt, &c->_c, ttls); if (!mrtsock) c->_c.mfc_flags |= MFC_STATIC; err = rhltable_insert_key(&mrt->mfc_hash, &c->cmparg, &c->_c.mnode, ip6mr_rht_params); if (err) { pr_err("ip6mr: rhtable insert error %d\n", err); ip6mr_cache_free(c); return err; } list_add_tail_rcu(&c->_c.list, &mrt->mfc_cache_list); /* Check to see if we resolved a queued list. If so we * need to send on the frames and tidy up. 
*/ found = false; spin_lock_bh(&mfc_unres_lock); list_for_each_entry(_uc, &mrt->mfc_unres_queue, list) { uc = (struct mfc6_cache *)_uc; if (ipv6_addr_equal(&uc->mf6c_origin, &c->mf6c_origin) && ipv6_addr_equal(&uc->mf6c_mcastgrp, &c->mf6c_mcastgrp)) { list_del(&_uc->list); atomic_dec(&mrt->cache_resolve_queue_len); found = true; break; } } if (list_empty(&mrt->mfc_unres_queue)) timer_delete(&mrt->ipmr_expire_timer); spin_unlock_bh(&mfc_unres_lock); if (found) { ip6mr_cache_resolve(net, mrt, uc, c); ip6mr_cache_free(uc); } call_ip6mr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_ADD, c, mrt->id); mr6_netlink_event(mrt, c, RTM_NEWROUTE); return 0; } /* * Close the multicast socket, and clear the vif tables etc */ static void mroute_clean_tables(struct mr_table *mrt, int flags) { struct mr_mfc *c, *tmp; LIST_HEAD(list); int i; /* Shut down all active vif entries */ if (flags & (MRT6_FLUSH_MIFS | MRT6_FLUSH_MIFS_STATIC)) { for (i = 0; i < mrt->maxvif; i++) { if (((mrt->vif_table[i].flags & VIFF_STATIC) && !(flags & MRT6_FLUSH_MIFS_STATIC)) || (!(mrt->vif_table[i].flags & VIFF_STATIC) && !(flags & MRT6_FLUSH_MIFS))) continue; mif6_delete(mrt, i, 0, &list); } unregister_netdevice_many(&list); } /* Wipe the cache */ if (flags & (MRT6_FLUSH_MFC | MRT6_FLUSH_MFC_STATIC)) { list_for_each_entry_safe(c, tmp, &mrt->mfc_cache_list, list) { if (((c->mfc_flags & MFC_STATIC) && !(flags & MRT6_FLUSH_MFC_STATIC)) || (!(c->mfc_flags & MFC_STATIC) && !(flags & MRT6_FLUSH_MFC))) continue; rhltable_remove(&mrt->mfc_hash, &c->mnode, ip6mr_rht_params); list_del_rcu(&c->list); call_ip6mr_mfc_entry_notifiers(read_pnet(&mrt->net), FIB_EVENT_ENTRY_DEL, (struct mfc6_cache *)c, mrt->id); mr6_netlink_event(mrt, (struct mfc6_cache *)c, RTM_DELROUTE); mr_cache_put(c); } } if (flags & MRT6_FLUSH_MFC) { if (atomic_read(&mrt->cache_resolve_queue_len) != 0) { spin_lock_bh(&mfc_unres_lock); list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) { list_del(&c->list); mr6_netlink_event(mrt, (struct 
mfc6_cache *)c, RTM_DELROUTE); ip6mr_destroy_unres(mrt, (struct mfc6_cache *)c); } spin_unlock_bh(&mfc_unres_lock); } } } static int ip6mr_sk_init(struct mr_table *mrt, struct sock *sk) { int err = 0; struct net *net = sock_net(sk); rtnl_lock(); spin_lock(&mrt_lock); if (rtnl_dereference(mrt->mroute_sk)) { err = -EADDRINUSE; } else { rcu_assign_pointer(mrt->mroute_sk, sk); sock_set_flag(sk, SOCK_RCU_FREE); atomic_inc(&net->ipv6.devconf_all->mc_forwarding); } spin_unlock(&mrt_lock); if (!err) inet6_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_MC_FORWARDING, NETCONFA_IFINDEX_ALL, net->ipv6.devconf_all); rtnl_unlock(); return err; } int ip6mr_sk_done(struct sock *sk) { struct net *net = sock_net(sk); struct ipv6_devconf *devconf; struct mr_table *mrt; int err = -EACCES; if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num != IPPROTO_ICMPV6) return err; devconf = net->ipv6.devconf_all; if (!devconf || !atomic_read(&devconf->mc_forwarding)) return err; rtnl_lock(); ip6mr_for_each_table(mrt, net) { if (sk == rtnl_dereference(mrt->mroute_sk)) { spin_lock(&mrt_lock); RCU_INIT_POINTER(mrt->mroute_sk, NULL); /* Note that mroute_sk had SOCK_RCU_FREE set, * so the RCU grace period before sk freeing * is guaranteed by sk_destruct() */ atomic_dec(&devconf->mc_forwarding); spin_unlock(&mrt_lock); inet6_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_MC_FORWARDING, NETCONFA_IFINDEX_ALL, net->ipv6.devconf_all); mroute_clean_tables(mrt, MRT6_FLUSH_MIFS | MRT6_FLUSH_MFC); err = 0; break; } } rtnl_unlock(); return err; } bool mroute6_is_socket(struct net *net, struct sk_buff *skb) { struct mr_table *mrt; struct flowi6 fl6 = { .flowi6_iif = skb->skb_iif ? : LOOPBACK_IFINDEX, .flowi6_oif = skb->dev->ifindex, .flowi6_mark = skb->mark, }; if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0) return NULL; return rcu_access_pointer(mrt->mroute_sk); } EXPORT_SYMBOL(mroute6_is_socket); /* * Socket options and virtual interface manipulation. 
The whole * virtual interface system is a complete heap, but unfortunately * that's how BSD mrouted happens to think. Maybe one day with a proper * MOSPF/PIM router set up we can clean this up. */ int ip6_mroute_setsockopt(struct sock *sk, int optname, sockptr_t optval, unsigned int optlen) { int ret, parent = 0; struct mif6ctl vif; struct mf6cctl mfc; mifi_t mifi; struct net *net = sock_net(sk); struct mr_table *mrt; if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num != IPPROTO_ICMPV6) return -EOPNOTSUPP; mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT); if (!mrt) return -ENOENT; if (optname != MRT6_INIT) { if (sk != rcu_access_pointer(mrt->mroute_sk) && !ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EACCES; } switch (optname) { case MRT6_INIT: if (optlen < sizeof(int)) return -EINVAL; return ip6mr_sk_init(mrt, sk); case MRT6_DONE: return ip6mr_sk_done(sk); case MRT6_ADD_MIF: if (optlen < sizeof(vif)) return -EINVAL; if (copy_from_sockptr(&vif, optval, sizeof(vif))) return -EFAULT; if (vif.mif6c_mifi >= MAXMIFS) return -ENFILE; rtnl_lock(); ret = mif6_add(net, mrt, &vif, sk == rtnl_dereference(mrt->mroute_sk)); rtnl_unlock(); return ret; case MRT6_DEL_MIF: if (optlen < sizeof(mifi_t)) return -EINVAL; if (copy_from_sockptr(&mifi, optval, sizeof(mifi_t))) return -EFAULT; rtnl_lock(); ret = mif6_delete(mrt, mifi, 0, NULL); rtnl_unlock(); return ret; /* * Manipulate the forwarding caches. These live * in a sort of kernel/user symbiosis. 
*/ case MRT6_ADD_MFC: case MRT6_DEL_MFC: parent = -1; fallthrough; case MRT6_ADD_MFC_PROXY: case MRT6_DEL_MFC_PROXY: if (optlen < sizeof(mfc)) return -EINVAL; if (copy_from_sockptr(&mfc, optval, sizeof(mfc))) return -EFAULT; if (parent == 0) parent = mfc.mf6cc_parent; rtnl_lock(); if (optname == MRT6_DEL_MFC || optname == MRT6_DEL_MFC_PROXY) ret = ip6mr_mfc_delete(mrt, &mfc, parent); else ret = ip6mr_mfc_add(net, mrt, &mfc, sk == rtnl_dereference(mrt->mroute_sk), parent); rtnl_unlock(); return ret; case MRT6_FLUSH: { int flags; if (optlen != sizeof(flags)) return -EINVAL; if (copy_from_sockptr(&flags, optval, sizeof(flags))) return -EFAULT; rtnl_lock(); mroute_clean_tables(mrt, flags); rtnl_unlock(); return 0; } /* * Control PIM assert (to activate pim will activate assert) */ case MRT6_ASSERT: { int v; if (optlen != sizeof(v)) return -EINVAL; if (copy_from_sockptr(&v, optval, sizeof(v))) return -EFAULT; mrt->mroute_do_assert = v; return 0; } #ifdef CONFIG_IPV6_PIMSM_V2 case MRT6_PIM: { bool do_wrmifwhole; int v; if (optlen != sizeof(v)) return -EINVAL; if (copy_from_sockptr(&v, optval, sizeof(v))) return -EFAULT; do_wrmifwhole = (v == MRT6MSG_WRMIFWHOLE); v = !!v; rtnl_lock(); ret = 0; if (v != mrt->mroute_do_pim) { mrt->mroute_do_pim = v; mrt->mroute_do_assert = v; mrt->mroute_do_wrvifwhole = do_wrmifwhole; } rtnl_unlock(); return ret; } #endif #ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES case MRT6_TABLE: { u32 v; if (optlen != sizeof(u32)) return -EINVAL; if (copy_from_sockptr(&v, optval, sizeof(v))) return -EFAULT; /* "pim6reg%u" should not exceed 16 bytes (IFNAMSIZ) */ if (v != RT_TABLE_DEFAULT && v >= 100000000) return -EINVAL; if (sk == rcu_access_pointer(mrt->mroute_sk)) return -EBUSY; rtnl_lock(); ret = 0; mrt = ip6mr_new_table(net, v); if (IS_ERR(mrt)) ret = PTR_ERR(mrt); else raw6_sk(sk)->ip6mr_table = v; rtnl_unlock(); return ret; } #endif /* * Spurious command, or MRT6_VERSION which you cannot * set. 
*/ default: return -ENOPROTOOPT; } } /* * Getsock opt support for the multicast routing system. */ int ip6_mroute_getsockopt(struct sock *sk, int optname, sockptr_t optval, sockptr_t optlen) { int olr; int val; struct net *net = sock_net(sk); struct mr_table *mrt; if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num != IPPROTO_ICMPV6) return -EOPNOTSUPP; mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT); if (!mrt) return -ENOENT; switch (optname) { case MRT6_VERSION: val = 0x0305; break; #ifdef CONFIG_IPV6_PIMSM_V2 case MRT6_PIM: val = mrt->mroute_do_pim; break; #endif case MRT6_ASSERT: val = mrt->mroute_do_assert; break; default: return -ENOPROTOOPT; } if (copy_from_sockptr(&olr, optlen, sizeof(int))) return -EFAULT; olr = min_t(int, olr, sizeof(int)); if (olr < 0) return -EINVAL; if (copy_to_sockptr(optlen, &olr, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, &val, olr)) return -EFAULT; return 0; } /* * The IP multicast ioctl support routines. */ int ip6mr_ioctl(struct sock *sk, int cmd, void *arg) { struct sioc_sg_req6 *sr; struct sioc_mif_req6 *vr; struct vif_device *vif; struct mfc6_cache *c; struct net *net = sock_net(sk); struct mr_table *mrt; mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? 
: RT6_TABLE_DFLT); if (!mrt) return -ENOENT; switch (cmd) { case SIOCGETMIFCNT_IN6: vr = (struct sioc_mif_req6 *)arg; if (vr->mifi >= mrt->maxvif) return -EINVAL; vr->mifi = array_index_nospec(vr->mifi, mrt->maxvif); rcu_read_lock(); vif = &mrt->vif_table[vr->mifi]; if (VIF_EXISTS(mrt, vr->mifi)) { vr->icount = READ_ONCE(vif->pkt_in); vr->ocount = READ_ONCE(vif->pkt_out); vr->ibytes = READ_ONCE(vif->bytes_in); vr->obytes = READ_ONCE(vif->bytes_out); rcu_read_unlock(); return 0; } rcu_read_unlock(); return -EADDRNOTAVAIL; case SIOCGETSGCNT_IN6: sr = (struct sioc_sg_req6 *)arg; rcu_read_lock(); c = ip6mr_cache_find(mrt, &sr->src.sin6_addr, &sr->grp.sin6_addr); if (c) { sr->pktcnt = atomic_long_read(&c->_c.mfc_un.res.pkt); sr->bytecnt = atomic_long_read(&c->_c.mfc_un.res.bytes); sr->wrong_if = atomic_long_read(&c->_c.mfc_un.res.wrong_if); rcu_read_unlock(); return 0; } rcu_read_unlock(); return -EADDRNOTAVAIL; default: return -ENOIOCTLCMD; } } #ifdef CONFIG_COMPAT struct compat_sioc_sg_req6 { struct sockaddr_in6 src; struct sockaddr_in6 grp; compat_ulong_t pktcnt; compat_ulong_t bytecnt; compat_ulong_t wrong_if; }; struct compat_sioc_mif_req6 { mifi_t mifi; compat_ulong_t icount; compat_ulong_t ocount; compat_ulong_t ibytes; compat_ulong_t obytes; }; int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) { struct compat_sioc_sg_req6 sr; struct compat_sioc_mif_req6 vr; struct vif_device *vif; struct mfc6_cache *c; struct net *net = sock_net(sk); struct mr_table *mrt; mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? 
: RT6_TABLE_DFLT); if (!mrt) return -ENOENT; switch (cmd) { case SIOCGETMIFCNT_IN6: if (copy_from_user(&vr, arg, sizeof(vr))) return -EFAULT; if (vr.mifi >= mrt->maxvif) return -EINVAL; vr.mifi = array_index_nospec(vr.mifi, mrt->maxvif); rcu_read_lock(); vif = &mrt->vif_table[vr.mifi]; if (VIF_EXISTS(mrt, vr.mifi)) { vr.icount = READ_ONCE(vif->pkt_in); vr.ocount = READ_ONCE(vif->pkt_out); vr.ibytes = READ_ONCE(vif->bytes_in); vr.obytes = READ_ONCE(vif->bytes_out); rcu_read_unlock(); if (copy_to_user(arg, &vr, sizeof(vr))) return -EFAULT; return 0; } rcu_read_unlock(); return -EADDRNOTAVAIL; case SIOCGETSGCNT_IN6: if (copy_from_user(&sr, arg, sizeof(sr))) return -EFAULT; rcu_read_lock(); c = ip6mr_cache_find(mrt, &sr.src.sin6_addr, &sr.grp.sin6_addr); if (c) { sr.pktcnt = atomic_long_read(&c->_c.mfc_un.res.pkt); sr.bytecnt = atomic_long_read(&c->_c.mfc_un.res.bytes); sr.wrong_if = atomic_long_read(&c->_c.mfc_un.res.wrong_if); rcu_read_unlock(); if (copy_to_user(arg, &sr, sizeof(sr))) return -EFAULT; return 0; } rcu_read_unlock(); return -EADDRNOTAVAIL; default: return -ENOIOCTLCMD; } } #endif static inline int ip6mr_forward2_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUTFORWDATAGRAMS); return dst_output(net, sk, skb); } /* * Processing handlers for ip6mr_forward */ static int ip6mr_prepare_xmit(struct net *net, struct mr_table *mrt, struct sk_buff *skb, int vifi) { struct vif_device *vif = &mrt->vif_table[vifi]; struct net_device *vif_dev; struct ipv6hdr *ipv6h; struct dst_entry *dst; struct flowi6 fl6; vif_dev = vif_dev_read(vif); if (!vif_dev) return -1; #ifdef CONFIG_IPV6_PIMSM_V2 if (vif->flags & MIFF_REGISTER) { WRITE_ONCE(vif->pkt_out, vif->pkt_out + 1); WRITE_ONCE(vif->bytes_out, vif->bytes_out + skb->len); DEV_STATS_ADD(vif_dev, tx_bytes, skb->len); DEV_STATS_INC(vif_dev, tx_packets); ip6mr_cache_report(mrt, skb, vifi, MRT6MSG_WHOLEPKT); return -1; } #endif ipv6h = ipv6_hdr(skb); 
fl6 = (struct flowi6) { .flowi6_oif = vif->link, .daddr = ipv6h->daddr, }; dst = ip6_route_output(net, NULL, &fl6); if (dst->error) { dst_release(dst); return -1; } skb_dst_drop(skb); skb_dst_set(skb, dst); /* * RFC1584 teaches, that DVMRP/PIM router must deliver packets locally * not only before forwarding, but after forwarding on all output * interfaces. It is clear, if mrouter runs a multicasting * program, it should receive packets not depending to what interface * program is joined. * If we will not make it, the program will have to join on all * interfaces. On the other hand, multihoming host (or router, but * not mrouter) cannot join to more than one interface - it will * result in receiving multiple packets. */ skb->dev = vif_dev; WRITE_ONCE(vif->pkt_out, vif->pkt_out + 1); WRITE_ONCE(vif->bytes_out, vif->bytes_out + skb->len); /* We are about to write */ /* XXX: extension headers? */ if (skb_cow(skb, sizeof(*ipv6h) + LL_RESERVED_SPACE(vif_dev))) return -1; ipv6h = ipv6_hdr(skb); ipv6h->hop_limit--; return 0; } static void ip6mr_forward2(struct net *net, struct mr_table *mrt, struct sk_buff *skb, int vifi) { struct net_device *indev = skb->dev; if (ip6mr_prepare_xmit(net, mrt, skb, vifi)) goto out_free; IP6CB(skb)->flags |= IP6SKB_FORWARDED; NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, net, NULL, skb, indev, skb->dev, ip6mr_forward2_finish); return; out_free: kfree_skb(skb); } static void ip6mr_output2(struct net *net, struct mr_table *mrt, struct sk_buff *skb, int vifi) { if (ip6mr_prepare_xmit(net, mrt, skb, vifi)) goto out_free; ip6_output(net, NULL, skb); return; out_free: kfree_skb(skb); } /* Called with rcu_read_lock() */ static int ip6mr_find_vif(struct mr_table *mrt, struct net_device *dev) { int ct; /* Pairs with WRITE_ONCE() in mif6_delete()/mif6_add() */ for (ct = READ_ONCE(mrt->maxvif) - 1; ct >= 0; ct--) { if (rcu_access_pointer(mrt->vif_table[ct].dev) == dev) break; } return ct; } /* Called under rcu_read_lock() */ static void ip6_mr_forward(struct 
net *net, struct mr_table *mrt, struct net_device *dev, struct sk_buff *skb, struct mfc6_cache *c) { int psend = -1; int vif, ct; int true_vifi = ip6mr_find_vif(mrt, dev); vif = c->_c.mfc_parent; atomic_long_inc(&c->_c.mfc_un.res.pkt); atomic_long_add(skb->len, &c->_c.mfc_un.res.bytes); WRITE_ONCE(c->_c.mfc_un.res.lastuse, jiffies); if (ipv6_addr_any(&c->mf6c_origin) && true_vifi >= 0) { struct mfc6_cache *cache_proxy; /* For an (*,G) entry, we only check that the incoming * interface is part of the static tree. */ cache_proxy = mr_mfc_find_any_parent(mrt, vif); if (cache_proxy && cache_proxy->_c.mfc_un.res.ttls[true_vifi] < 255) goto forward; } /* * Wrong interface: drop packet and (maybe) send PIM assert. */ if (rcu_access_pointer(mrt->vif_table[vif].dev) != dev) { atomic_long_inc(&c->_c.mfc_un.res.wrong_if); if (true_vifi >= 0 && mrt->mroute_do_assert && /* pimsm uses asserts, when switching from RPT to SPT, so that we cannot check that packet arrived on an oif. It is bad, but otherwise we would need to move pretty large chunk of pimd to kernel. Ough... --ANK */ (mrt->mroute_do_pim || c->_c.mfc_un.res.ttls[true_vifi] < 255) && time_after(jiffies, c->_c.mfc_un.res.last_assert + MFC_ASSERT_THRESH)) { c->_c.mfc_un.res.last_assert = jiffies; ip6mr_cache_report(mrt, skb, true_vifi, MRT6MSG_WRONGMIF); if (mrt->mroute_do_wrvifwhole) ip6mr_cache_report(mrt, skb, true_vifi, MRT6MSG_WRMIFWHOLE); } goto dont_forward; } forward: WRITE_ONCE(mrt->vif_table[vif].pkt_in, mrt->vif_table[vif].pkt_in + 1); WRITE_ONCE(mrt->vif_table[vif].bytes_in, mrt->vif_table[vif].bytes_in + skb->len); /* * Forward the frame */ if (ipv6_addr_any(&c->mf6c_origin) && ipv6_addr_any(&c->mf6c_mcastgrp)) { if (true_vifi >= 0 && true_vifi != c->_c.mfc_parent && ipv6_hdr(skb)->hop_limit > c->_c.mfc_un.res.ttls[c->_c.mfc_parent]) { /* It's an (*,*) entry and the packet is not coming from * the upstream: forward the packet to the upstream * only. 
*/ psend = c->_c.mfc_parent; goto last_forward; } goto dont_forward; } for (ct = c->_c.mfc_un.res.maxvif - 1; ct >= c->_c.mfc_un.res.minvif; ct--) { /* For (*,G) entry, don't forward to the incoming interface */ if ((!ipv6_addr_any(&c->mf6c_origin) || ct != true_vifi) && ipv6_hdr(skb)->hop_limit > c->_c.mfc_un.res.ttls[ct]) { if (psend != -1) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) ip6mr_forward2(net, mrt, skb2, psend); } psend = ct; } } last_forward: if (psend != -1) { ip6mr_forward2(net, mrt, skb, psend); return; } dont_forward: kfree_skb(skb); } /* Called under rcu_read_lock() */ static void ip6_mr_output_finish(struct net *net, struct mr_table *mrt, struct net_device *dev, struct sk_buff *skb, struct mfc6_cache *c) { int psend = -1; int ct; WARN_ON_ONCE(!rcu_read_lock_held()); atomic_long_inc(&c->_c.mfc_un.res.pkt); atomic_long_add(skb->len, &c->_c.mfc_un.res.bytes); WRITE_ONCE(c->_c.mfc_un.res.lastuse, jiffies); /* Forward the frame */ if (ipv6_addr_any(&c->mf6c_origin) && ipv6_addr_any(&c->mf6c_mcastgrp)) { if (ipv6_hdr(skb)->hop_limit > c->_c.mfc_un.res.ttls[c->_c.mfc_parent]) { /* It's an (*,*) entry and the packet is not coming from * the upstream: forward the packet to the upstream * only. 
*/ psend = c->_c.mfc_parent; goto last_forward; } goto dont_forward; } for (ct = c->_c.mfc_un.res.maxvif - 1; ct >= c->_c.mfc_un.res.minvif; ct--) { if (ipv6_hdr(skb)->hop_limit > c->_c.mfc_un.res.ttls[ct]) { if (psend != -1) { struct sk_buff *skb2; skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) ip6mr_output2(net, mrt, skb2, psend); } psend = ct; } } last_forward: if (psend != -1) { ip6mr_output2(net, mrt, skb, psend); return; } dont_forward: kfree_skb(skb); } /* * Multicast packets for forwarding arrive here */ int ip6_mr_input(struct sk_buff *skb) { struct net_device *dev = skb->dev; struct net *net = dev_net_rcu(dev); struct mfc6_cache *cache; struct mr_table *mrt; struct flowi6 fl6 = { .flowi6_iif = dev->ifindex, .flowi6_mark = skb->mark, }; int err; /* skb->dev passed in is the master dev for vrfs. * Get the proper interface that does have a vif associated with it. */ if (netif_is_l3_master(dev)) { dev = dev_get_by_index_rcu(net, IPCB(skb)->iif); if (!dev) { kfree_skb(skb); return -ENODEV; } } err = ip6mr_fib_lookup(net, &fl6, &mrt); if (err < 0) { kfree_skb(skb); return err; } cache = ip6mr_cache_find(mrt, &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr); if (!cache) { int vif = ip6mr_find_vif(mrt, dev); if (vif >= 0) cache = ip6mr_cache_find_any(mrt, &ipv6_hdr(skb)->daddr, vif); } /* * No usable cache entry */ if (!cache) { int vif; vif = ip6mr_find_vif(mrt, dev); if (vif >= 0) { int err = ip6mr_cache_unresolved(mrt, vif, skb, dev); return err; } kfree_skb(skb); return -ENODEV; } ip6_mr_forward(net, mrt, dev, skb, cache); return 0; } int ip6_mr_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb_dst(skb)->dev; struct flowi6 fl6 = (struct flowi6) { .flowi6_iif = LOOPBACK_IFINDEX, .flowi6_mark = skb->mark, }; struct mfc6_cache *cache; struct mr_table *mrt; int err; int vif; guard(rcu)(); if (IP6CB(skb)->flags & IP6SKB_FORWARDED) goto ip6_output; if (!(IP6CB(skb)->flags & IP6SKB_MCROUTE)) goto ip6_output; err = 
ip6mr_fib_lookup(net, &fl6, &mrt); if (err < 0) { kfree_skb(skb); return err; } cache = ip6mr_cache_find(mrt, &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr); if (!cache) { vif = ip6mr_find_vif(mrt, dev); if (vif >= 0) cache = ip6mr_cache_find_any(mrt, &ipv6_hdr(skb)->daddr, vif); } /* No usable cache entry */ if (!cache) { vif = ip6mr_find_vif(mrt, dev); if (vif >= 0) return ip6mr_cache_unresolved(mrt, vif, skb, dev); goto ip6_output; } /* Wrong interface */ vif = cache->_c.mfc_parent; if (rcu_access_pointer(mrt->vif_table[vif].dev) != dev) goto ip6_output; ip6_mr_output_finish(net, mrt, dev, skb, cache); return 0; ip6_output: return ip6_output(net, sk, skb); } int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm, u32 portid) { int err; struct mr_table *mrt; struct mfc6_cache *cache; struct rt6_info *rt = dst_rt6_info(skb_dst(skb)); rcu_read_lock(); mrt = __ip6mr_get_table(net, RT6_TABLE_DFLT); if (!mrt) { rcu_read_unlock(); return -ENOENT; } cache = ip6mr_cache_find(mrt, &rt->rt6i_src.addr, &rt->rt6i_dst.addr); if (!cache && skb->dev) { int vif = ip6mr_find_vif(mrt, skb->dev); if (vif >= 0) cache = ip6mr_cache_find_any(mrt, &rt->rt6i_dst.addr, vif); } if (!cache) { struct sk_buff *skb2; struct ipv6hdr *iph; struct net_device *dev; int vif; dev = skb->dev; if (!dev || (vif = ip6mr_find_vif(mrt, dev)) < 0) { rcu_read_unlock(); return -ENODEV; } /* really correct? 
*/ skb2 = alloc_skb(sizeof(struct ipv6hdr), GFP_ATOMIC); if (!skb2) { rcu_read_unlock(); return -ENOMEM; } NETLINK_CB(skb2).portid = portid; skb_reset_transport_header(skb2); skb_put(skb2, sizeof(struct ipv6hdr)); skb_reset_network_header(skb2); iph = ipv6_hdr(skb2); iph->version = 0; iph->priority = 0; iph->flow_lbl[0] = 0; iph->flow_lbl[1] = 0; iph->flow_lbl[2] = 0; iph->payload_len = 0; iph->nexthdr = IPPROTO_NONE; iph->hop_limit = 0; iph->saddr = rt->rt6i_src.addr; iph->daddr = rt->rt6i_dst.addr; err = ip6mr_cache_unresolved(mrt, vif, skb2, dev); rcu_read_unlock(); return err; } err = mr_fill_mroute(mrt, skb, &cache->_c, rtm); rcu_read_unlock(); return err; } static int ip6mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, u32 portid, u32 seq, struct mfc6_cache *c, int cmd, int flags) { struct nlmsghdr *nlh; struct rtmsg *rtm; int err; nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rtm), flags); if (!nlh) return -EMSGSIZE; rtm = nlmsg_data(nlh); rtm->rtm_family = RTNL_FAMILY_IP6MR; rtm->rtm_dst_len = 128; rtm->rtm_src_len = 128; rtm->rtm_tos = 0; rtm->rtm_table = mrt->id; if (nla_put_u32(skb, RTA_TABLE, mrt->id)) goto nla_put_failure; rtm->rtm_type = RTN_MULTICAST; rtm->rtm_scope = RT_SCOPE_UNIVERSE; if (c->_c.mfc_flags & MFC_STATIC) rtm->rtm_protocol = RTPROT_STATIC; else rtm->rtm_protocol = RTPROT_MROUTED; rtm->rtm_flags = 0; if (nla_put_in6_addr(skb, RTA_SRC, &c->mf6c_origin) || nla_put_in6_addr(skb, RTA_DST, &c->mf6c_mcastgrp)) goto nla_put_failure; err = mr_fill_mroute(mrt, skb, &c->_c, rtm); /* do not break the dump if cache is unresolved */ if (err < 0 && err != -ENOENT) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int _ip6mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, u32 portid, u32 seq, struct mr_mfc *c, int cmd, int flags) { return ip6mr_fill_mroute(mrt, skb, portid, seq, (struct mfc6_cache *)c, cmd, flags); } static int mr6_msgsize(bool unresolved, int 
maxvif) { size_t len = NLMSG_ALIGN(sizeof(struct rtmsg)) + nla_total_size(4) /* RTA_TABLE */ + nla_total_size(sizeof(struct in6_addr)) /* RTA_SRC */ + nla_total_size(sizeof(struct in6_addr)) /* RTA_DST */ ; if (!unresolved) len = len + nla_total_size(4) /* RTA_IIF */ + nla_total_size(0) /* RTA_MULTIPATH */ + maxvif * NLA_ALIGN(sizeof(struct rtnexthop)) /* RTA_MFC_STATS */ + nla_total_size_64bit(sizeof(struct rta_mfc_stats)) ; return len; } static void mr6_netlink_event(struct mr_table *mrt, struct mfc6_cache *mfc, int cmd) { struct net *net = read_pnet(&mrt->net); struct sk_buff *skb; int err = -ENOBUFS; skb = nlmsg_new(mr6_msgsize(mfc->_c.mfc_parent >= MAXMIFS, mrt->maxvif), GFP_ATOMIC); if (!skb) goto errout; err = ip6mr_fill_mroute(mrt, skb, 0, 0, mfc, cmd, 0); if (err < 0) goto errout; rtnl_notify(skb, net, 0, RTNLGRP_IPV6_MROUTE, NULL, GFP_ATOMIC); return; errout: kfree_skb(skb); rtnl_set_sk_err(net, RTNLGRP_IPV6_MROUTE, err); } static size_t mrt6msg_netlink_msgsize(size_t payloadlen) { size_t len = NLMSG_ALIGN(sizeof(struct rtgenmsg)) + nla_total_size(1) /* IP6MRA_CREPORT_MSGTYPE */ + nla_total_size(4) /* IP6MRA_CREPORT_MIF_ID */ /* IP6MRA_CREPORT_SRC_ADDR */ + nla_total_size(sizeof(struct in6_addr)) /* IP6MRA_CREPORT_DST_ADDR */ + nla_total_size(sizeof(struct in6_addr)) /* IP6MRA_CREPORT_PKT */ + nla_total_size(payloadlen) ; return len; } static void mrt6msg_netlink_event(const struct mr_table *mrt, struct sk_buff *pkt) { struct net *net = read_pnet(&mrt->net); struct nlmsghdr *nlh; struct rtgenmsg *rtgenm; struct mrt6msg *msg; struct sk_buff *skb; struct nlattr *nla; int payloadlen; payloadlen = pkt->len - sizeof(struct mrt6msg); msg = (struct mrt6msg *)skb_transport_header(pkt); skb = nlmsg_new(mrt6msg_netlink_msgsize(payloadlen), GFP_ATOMIC); if (!skb) goto errout; nlh = nlmsg_put(skb, 0, 0, RTM_NEWCACHEREPORT, sizeof(struct rtgenmsg), 0); if (!nlh) goto errout; rtgenm = nlmsg_data(nlh); rtgenm->rtgen_family = RTNL_FAMILY_IP6MR; if (nla_put_u8(skb, 
IP6MRA_CREPORT_MSGTYPE, msg->im6_msgtype) || nla_put_u32(skb, IP6MRA_CREPORT_MIF_ID, msg->im6_mif) || nla_put_in6_addr(skb, IP6MRA_CREPORT_SRC_ADDR, &msg->im6_src) || nla_put_in6_addr(skb, IP6MRA_CREPORT_DST_ADDR, &msg->im6_dst)) goto nla_put_failure; nla = nla_reserve(skb, IP6MRA_CREPORT_PKT, payloadlen); if (!nla || skb_copy_bits(pkt, sizeof(struct mrt6msg), nla_data(nla), payloadlen)) goto nla_put_failure; nlmsg_end(skb, nlh); rtnl_notify(skb, net, 0, RTNLGRP_IPV6_MROUTE_R, NULL, GFP_ATOMIC); return; nla_put_failure: nlmsg_cancel(skb, nlh); errout: kfree_skb(skb); rtnl_set_sk_err(net, RTNLGRP_IPV6_MROUTE_R, -ENOBUFS); } static const struct nla_policy ip6mr_getroute_policy[RTA_MAX + 1] = { [RTA_SRC] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)), [RTA_DST] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)), [RTA_TABLE] = { .type = NLA_U32 }, }; static int ip6mr_rtm_valid_getroute_req(struct sk_buff *skb, const struct nlmsghdr *nlh, struct nlattr **tb, struct netlink_ext_ack *extack) { struct rtmsg *rtm; int err; err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, ip6mr_getroute_policy, extack); if (err) return err; rtm = nlmsg_data(nlh); if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) || (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) || rtm->rtm_tos || rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope || rtm->rtm_type || rtm->rtm_flags) { NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for multicast route get request"); return -EINVAL; } if ((tb[RTA_SRC] && !rtm->rtm_src_len) || (tb[RTA_DST] && !rtm->rtm_dst_len)) { NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6"); return -EINVAL; } return 0; } static int ip6mr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); struct in6_addr src = {}, grp = {}; struct nlattr *tb[RTA_MAX + 1]; struct mfc6_cache *cache; struct mr_table *mrt; struct sk_buff *skb; u32 tableid; int err; err = 
ip6mr_rtm_valid_getroute_req(in_skb, nlh, tb, extack); if (err < 0) return err; if (tb[RTA_SRC]) src = nla_get_in6_addr(tb[RTA_SRC]); if (tb[RTA_DST]) grp = nla_get_in6_addr(tb[RTA_DST]); tableid = nla_get_u32_default(tb[RTA_TABLE], 0); mrt = __ip6mr_get_table(net, tableid ?: RT_TABLE_DEFAULT); if (!mrt) { NL_SET_ERR_MSG_MOD(extack, "MR table does not exist"); return -ENOENT; } /* entries are added/deleted only under RTNL */ rcu_read_lock(); cache = ip6mr_cache_find(mrt, &src, &grp); rcu_read_unlock(); if (!cache) { NL_SET_ERR_MSG_MOD(extack, "MR cache entry not found"); return -ENOENT; } skb = nlmsg_new(mr6_msgsize(false, mrt->maxvif), GFP_KERNEL); if (!skb) return -ENOBUFS; err = ip6mr_fill_mroute(mrt, skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, cache, RTM_NEWROUTE, 0); if (err < 0) { kfree_skb(skb); return err; } return rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); } static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) { const struct nlmsghdr *nlh = cb->nlh; struct fib_dump_filter filter = { .rtnl_held = true, }; int err; if (cb->strict_check) { err = ip_valid_fib_dump_req(sock_net(skb->sk), nlh, &filter, cb); if (err < 0) return err; } if (filter.table_id) { struct mr_table *mrt; mrt = __ip6mr_get_table(sock_net(skb->sk), filter.table_id); if (!mrt) { if (rtnl_msg_family(cb->nlh) != RTNL_FAMILY_IP6MR) return skb->len; NL_SET_ERR_MSG_MOD(cb->extack, "MR table does not exist"); return -ENOENT; } err = mr_table_dump(mrt, skb, cb, _ip6mr_fill_mroute, &mfc_unres_lock, &filter); return skb->len ? : err; } return mr_rtm_dumproute(skb, cb, ip6mr_mr_table_iter, _ip6mr_fill_mroute, &mfc_unres_lock, &filter); } |
| 22 22 22 2 22 5 5 1 4 2 2 2 2 2 28 6 22 6 22 10 5 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 | // SPDX-License-Identifier: GPL-2.0 /* * FUSE inode io modes. * * Copyright (c) 2024 CTERA Networks. */ #include "fuse_i.h" #include <linux/kernel.h> #include <linux/sched.h> #include <linux/file.h> #include <linux/fs.h> /* * Return true if need to wait for new opens in caching mode. */ static inline bool fuse_is_io_cache_wait(struct fuse_inode *fi) { return READ_ONCE(fi->iocachectr) < 0 && !fuse_inode_backing(fi); } /* * Called on cached file open() and on first mmap() of direct_io file. * Takes cached_io inode mode reference to be dropped on file release. * * Blocks new parallel dio writes and waits for the in-progress parallel dio * writes to complete. 
*/ int fuse_file_cached_io_open(struct inode *inode, struct fuse_file *ff) { struct fuse_inode *fi = get_fuse_inode(inode); /* There are no io modes if server does not implement open */ if (!ff->args) return 0; spin_lock(&fi->lock); /* * Setting the bit advises new direct-io writes to use an exclusive * lock - without it the wait below might be forever. */ while (fuse_is_io_cache_wait(fi)) { set_bit(FUSE_I_CACHE_IO_MODE, &fi->state); spin_unlock(&fi->lock); wait_event(fi->direct_io_waitq, !fuse_is_io_cache_wait(fi)); spin_lock(&fi->lock); } /* * Check if inode entered passthrough io mode while waiting for parallel * dio write completion. */ if (fuse_inode_backing(fi)) { clear_bit(FUSE_I_CACHE_IO_MODE, &fi->state); spin_unlock(&fi->lock); return -ETXTBSY; } WARN_ON(ff->iomode == IOM_UNCACHED); if (ff->iomode == IOM_NONE) { ff->iomode = IOM_CACHED; if (fi->iocachectr == 0) set_bit(FUSE_I_CACHE_IO_MODE, &fi->state); fi->iocachectr++; } spin_unlock(&fi->lock); return 0; } static void fuse_file_cached_io_release(struct fuse_file *ff, struct fuse_inode *fi) { spin_lock(&fi->lock); WARN_ON(fi->iocachectr <= 0); WARN_ON(ff->iomode != IOM_CACHED); ff->iomode = IOM_NONE; fi->iocachectr--; if (fi->iocachectr == 0) clear_bit(FUSE_I_CACHE_IO_MODE, &fi->state); spin_unlock(&fi->lock); } /* Start strictly uncached io mode where cache access is not allowed */ int fuse_inode_uncached_io_start(struct fuse_inode *fi, struct fuse_backing *fb) { struct fuse_backing *oldfb; int err = 0; spin_lock(&fi->lock); /* deny conflicting backing files on same fuse inode */ oldfb = fuse_inode_backing(fi); if (fb && oldfb && oldfb != fb) { err = -EBUSY; goto unlock; } if (fi->iocachectr > 0) { err = -ETXTBSY; goto unlock; } fi->iocachectr--; /* fuse inode holds a single refcount of backing file */ if (fb && !oldfb) { oldfb = fuse_inode_backing_set(fi, fb); WARN_ON_ONCE(oldfb != NULL); } else { fuse_backing_put(fb); } unlock: spin_unlock(&fi->lock); return err; } /* Takes uncached_io inode mode 
reference to be dropped on file release */ static int fuse_file_uncached_io_open(struct inode *inode, struct fuse_file *ff, struct fuse_backing *fb) { struct fuse_inode *fi = get_fuse_inode(inode); int err; err = fuse_inode_uncached_io_start(fi, fb); if (err) return err; WARN_ON(ff->iomode != IOM_NONE); ff->iomode = IOM_UNCACHED; return 0; } void fuse_inode_uncached_io_end(struct fuse_inode *fi) { struct fuse_backing *oldfb = NULL; spin_lock(&fi->lock); WARN_ON(fi->iocachectr >= 0); fi->iocachectr++; if (!fi->iocachectr) { wake_up(&fi->direct_io_waitq); oldfb = fuse_inode_backing_set(fi, NULL); } spin_unlock(&fi->lock); if (oldfb) fuse_backing_put(oldfb); } /* Drop uncached_io reference from passthrough open */ static void fuse_file_uncached_io_release(struct fuse_file *ff, struct fuse_inode *fi) { WARN_ON(ff->iomode != IOM_UNCACHED); ff->iomode = IOM_NONE; fuse_inode_uncached_io_end(fi); } /* * Open flags that are allowed in combination with FOPEN_PASSTHROUGH. * A combination of FOPEN_PASSTHROUGH and FOPEN_DIRECT_IO means that read/write * operations go directly to the server, but mmap is done on the backing file. * FOPEN_PASSTHROUGH mode should not co-exist with any users of the fuse inode * page cache, so FOPEN_KEEP_CACHE is a strange and undesired combination. 
*/ #define FOPEN_PASSTHROUGH_MASK \ (FOPEN_PASSTHROUGH | FOPEN_DIRECT_IO | FOPEN_PARALLEL_DIRECT_WRITES | \ FOPEN_NOFLUSH) static int fuse_file_passthrough_open(struct inode *inode, struct file *file) { struct fuse_file *ff = file->private_data; struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_backing *fb; int err; /* Check allowed conditions for file open in passthrough mode */ if (!IS_ENABLED(CONFIG_FUSE_PASSTHROUGH) || !fc->passthrough || (ff->open_flags & ~FOPEN_PASSTHROUGH_MASK)) return -EINVAL; fb = fuse_passthrough_open(file, ff->args->open_outarg.backing_id); if (IS_ERR(fb)) return PTR_ERR(fb); /* First passthrough file open denies caching inode io mode */ err = fuse_file_uncached_io_open(inode, ff, fb); if (!err) return 0; fuse_passthrough_release(ff, fb); fuse_backing_put(fb); return err; } /* Request access to submit new io to inode via open file */ int fuse_file_io_open(struct file *file, struct inode *inode) { struct fuse_file *ff = file->private_data; struct fuse_inode *fi = get_fuse_inode(inode); int err; /* * io modes are not relevant with DAX and with server that does not * implement open. */ if (FUSE_IS_DAX(inode) || !ff->args) return 0; /* * Server is expected to use FOPEN_PASSTHROUGH for all opens of an inode * which is already open for passthrough. */ err = -EINVAL; if (fuse_inode_backing(fi) && !(ff->open_flags & FOPEN_PASSTHROUGH)) goto fail; /* * FOPEN_PARALLEL_DIRECT_WRITES requires FOPEN_DIRECT_IO. */ if (!(ff->open_flags & FOPEN_DIRECT_IO)) ff->open_flags &= ~FOPEN_PARALLEL_DIRECT_WRITES; /* * First passthrough file open denies caching inode io mode. * First caching file open enters caching inode io mode. * * Note that if user opens a file open with O_DIRECT, but server did * not specify FOPEN_DIRECT_IO, a later fcntl() could remove O_DIRECT, * so we put the inode in caching mode to prevent parallel dio. 
*/ if ((ff->open_flags & FOPEN_DIRECT_IO) && !(ff->open_flags & FOPEN_PASSTHROUGH)) return 0; if (ff->open_flags & FOPEN_PASSTHROUGH) err = fuse_file_passthrough_open(inode, file); else err = fuse_file_cached_io_open(inode, ff); if (err) goto fail; return 0; fail: pr_debug("failed to open file in requested io mode (open_flags=0x%x, err=%i).\n", ff->open_flags, err); /* * The file open mode determines the inode io mode. * Using incorrect open mode is a server mistake, which results in * user visible failure of open() with EIO error. */ return -EIO; } /* No more pending io and no new io possible to inode via open/mmapped file */ void fuse_file_io_release(struct fuse_file *ff, struct inode *inode) { struct fuse_inode *fi = get_fuse_inode(inode); /* * Last passthrough file close allows caching inode io mode. * Last caching file close exits caching inode io mode. */ switch (ff->iomode) { case IOM_NONE: /* Nothing to do */ break; case IOM_UNCACHED: fuse_file_uncached_io_release(ff, fi); break; case IOM_CACHED: fuse_file_cached_io_release(ff, fi); break; } } |
| 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 1 2 2 2 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 
514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 | // SPDX-License-Identifier: GPL-2.0-or-later /* * HID driver for Corsair devices * * Supported devices: * - Vengeance K70 Keyboard * - K70 RAPIDFIRE Keyboard * - Vengeance K90 Keyboard * - Scimitar PRO RGB Gaming Mouse * * Copyright (c) 2015 Clement Vuchener * Copyright (c) 2017 Oscar Campos * Copyright (c) 2017 Aaron Bottegal */ /* */ #include <linux/hid.h> #include <linux/module.h> #include <linux/usb.h> #include <linux/leds.h> #include "hid-ids.h" #define CORSAIR_USE_K90_MACRO (1<<0) #define CORSAIR_USE_K90_BACKLIGHT (1<<1) struct k90_led { struct led_classdev cdev; int brightness; struct work_struct work; bool removed; }; struct k90_drvdata { struct k90_led record_led; }; struct corsair_drvdata { unsigned long quirks; struct k90_drvdata *k90; struct k90_led *backlight; }; #define K90_GKEY_COUNT 18 static int corsair_usage_to_gkey(unsigned int usage) { /* G1 (0xd0) to G16 (0xdf) */ if (usage >= 0xd0 && usage <= 0xdf) return usage - 0xd0 + 1; /* G17 (0xe8) to G18 (0xe9) */ if (usage >= 0xe8 && usage <= 0xe9) 
return usage - 0xe8 + 17; return 0; } static unsigned short corsair_gkey_map[K90_GKEY_COUNT] = { BTN_TRIGGER_HAPPY1, BTN_TRIGGER_HAPPY2, BTN_TRIGGER_HAPPY3, BTN_TRIGGER_HAPPY4, BTN_TRIGGER_HAPPY5, BTN_TRIGGER_HAPPY6, BTN_TRIGGER_HAPPY7, BTN_TRIGGER_HAPPY8, BTN_TRIGGER_HAPPY9, BTN_TRIGGER_HAPPY10, BTN_TRIGGER_HAPPY11, BTN_TRIGGER_HAPPY12, BTN_TRIGGER_HAPPY13, BTN_TRIGGER_HAPPY14, BTN_TRIGGER_HAPPY15, BTN_TRIGGER_HAPPY16, BTN_TRIGGER_HAPPY17, BTN_TRIGGER_HAPPY18, }; module_param_array_named(gkey_codes, corsair_gkey_map, ushort, NULL, S_IRUGO); MODULE_PARM_DESC(gkey_codes, "Key codes for the G-keys"); static unsigned short corsair_record_keycodes[2] = { BTN_TRIGGER_HAPPY19, BTN_TRIGGER_HAPPY20 }; module_param_array_named(recordkey_codes, corsair_record_keycodes, ushort, NULL, S_IRUGO); MODULE_PARM_DESC(recordkey_codes, "Key codes for the MR (start and stop record) button"); static unsigned short corsair_profile_keycodes[3] = { BTN_TRIGGER_HAPPY21, BTN_TRIGGER_HAPPY22, BTN_TRIGGER_HAPPY23 }; module_param_array_named(profilekey_codes, corsair_profile_keycodes, ushort, NULL, S_IRUGO); MODULE_PARM_DESC(profilekey_codes, "Key codes for the profile buttons"); #define CORSAIR_USAGE_SPECIAL_MIN 0xf0 #define CORSAIR_USAGE_SPECIAL_MAX 0xff #define CORSAIR_USAGE_MACRO_RECORD_START 0xf6 #define CORSAIR_USAGE_MACRO_RECORD_STOP 0xf7 #define CORSAIR_USAGE_PROFILE 0xf1 #define CORSAIR_USAGE_M1 0xf1 #define CORSAIR_USAGE_M2 0xf2 #define CORSAIR_USAGE_M3 0xf3 #define CORSAIR_USAGE_PROFILE_MAX 0xf3 #define CORSAIR_USAGE_META_OFF 0xf4 #define CORSAIR_USAGE_META_ON 0xf5 #define CORSAIR_USAGE_LIGHT 0xfa #define CORSAIR_USAGE_LIGHT_OFF 0xfa #define CORSAIR_USAGE_LIGHT_DIM 0xfb #define CORSAIR_USAGE_LIGHT_MEDIUM 0xfc #define CORSAIR_USAGE_LIGHT_BRIGHT 0xfd #define CORSAIR_USAGE_LIGHT_MAX 0xfd /* USB control protocol */ #define K90_REQUEST_BRIGHTNESS 49 #define K90_REQUEST_MACRO_MODE 2 #define K90_REQUEST_STATUS 4 #define K90_REQUEST_GET_MODE 5 #define K90_REQUEST_PROFILE 20 #define 
K90_MACRO_MODE_SW 0x0030 #define K90_MACRO_MODE_HW 0x0001 #define K90_MACRO_LED_ON 0x0020 #define K90_MACRO_LED_OFF 0x0040 /* * LED class devices */ #define K90_BACKLIGHT_LED_SUFFIX "::backlight" #define K90_RECORD_LED_SUFFIX "::record" static enum led_brightness k90_backlight_get(struct led_classdev *led_cdev) { int ret; struct k90_led *led = container_of(led_cdev, struct k90_led, cdev); struct device *dev = led->cdev.dev->parent; struct usb_interface *usbif = to_usb_interface(dev->parent); struct usb_device *usbdev = interface_to_usbdev(usbif); int brightness; char *data; data = kmalloc(8, GFP_KERNEL); if (!data) return -ENOMEM; ret = usb_control_msg(usbdev, usb_rcvctrlpipe(usbdev, 0), K90_REQUEST_STATUS, USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE, 0, 0, data, 8, USB_CTRL_SET_TIMEOUT); if (ret < 5) { dev_warn(dev, "Failed to get K90 initial state (error %d).\n", ret); ret = -EIO; goto out; } brightness = data[4]; if (brightness < 0 || brightness > 3) { dev_warn(dev, "Read invalid backlight brightness: %02hhx.\n", data[4]); ret = -EIO; goto out; } ret = brightness; out: kfree(data); return ret; } static enum led_brightness k90_record_led_get(struct led_classdev *led_cdev) { struct k90_led *led = container_of(led_cdev, struct k90_led, cdev); return led->brightness; } static void k90_brightness_set(struct led_classdev *led_cdev, enum led_brightness brightness) { struct k90_led *led = container_of(led_cdev, struct k90_led, cdev); led->brightness = brightness; schedule_work(&led->work); } static void k90_backlight_work(struct work_struct *work) { int ret; struct k90_led *led = container_of(work, struct k90_led, work); struct device *dev; struct usb_interface *usbif; struct usb_device *usbdev; if (led->removed) return; dev = led->cdev.dev->parent; usbif = to_usb_interface(dev->parent); usbdev = interface_to_usbdev(usbif); ret = usb_control_msg(usbdev, usb_sndctrlpipe(usbdev, 0), K90_REQUEST_BRIGHTNESS, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE, 
led->brightness, 0, NULL, 0, USB_CTRL_SET_TIMEOUT); if (ret != 0) dev_warn(dev, "Failed to set backlight brightness (error: %d).\n", ret); } static void k90_record_led_work(struct work_struct *work) { int ret; struct k90_led *led = container_of(work, struct k90_led, work); struct device *dev; struct usb_interface *usbif; struct usb_device *usbdev; int value; if (led->removed) return; dev = led->cdev.dev->parent; usbif = to_usb_interface(dev->parent); usbdev = interface_to_usbdev(usbif); if (led->brightness > 0) value = K90_MACRO_LED_ON; else value = K90_MACRO_LED_OFF; ret = usb_control_msg(usbdev, usb_sndctrlpipe(usbdev, 0), K90_REQUEST_MACRO_MODE, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE, value, 0, NULL, 0, USB_CTRL_SET_TIMEOUT); if (ret != 0) dev_warn(dev, "Failed to set record LED state (error: %d).\n", ret); } /* * Keyboard attributes */ static ssize_t k90_show_macro_mode(struct device *dev, struct device_attribute *attr, char *buf) { int ret; struct usb_interface *usbif = to_usb_interface(dev->parent); struct usb_device *usbdev = interface_to_usbdev(usbif); const char *macro_mode; char *data; data = kmalloc(2, GFP_KERNEL); if (!data) return -ENOMEM; ret = usb_control_msg(usbdev, usb_rcvctrlpipe(usbdev, 0), K90_REQUEST_GET_MODE, USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE, 0, 0, data, 2, USB_CTRL_SET_TIMEOUT); if (ret < 1) { dev_warn(dev, "Failed to get K90 initial mode (error %d).\n", ret); ret = -EIO; goto out; } switch (data[0]) { case K90_MACRO_MODE_HW: macro_mode = "HW"; break; case K90_MACRO_MODE_SW: macro_mode = "SW"; break; default: dev_warn(dev, "K90 in unknown mode: %02hhx.\n", data[0]); ret = -EIO; goto out; } ret = sysfs_emit(buf, "%s\n", macro_mode); out: kfree(data); return ret; } static ssize_t k90_store_macro_mode(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { int ret; struct usb_interface *usbif = to_usb_interface(dev->parent); struct usb_device *usbdev = interface_to_usbdev(usbif); __u16 value; 
if (strncmp(buf, "SW", 2) == 0) value = K90_MACRO_MODE_SW; else if (strncmp(buf, "HW", 2) == 0) value = K90_MACRO_MODE_HW; else return -EINVAL; ret = usb_control_msg(usbdev, usb_sndctrlpipe(usbdev, 0), K90_REQUEST_MACRO_MODE, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE, value, 0, NULL, 0, USB_CTRL_SET_TIMEOUT); if (ret != 0) { dev_warn(dev, "Failed to set macro mode.\n"); return ret; } return count; } static ssize_t k90_show_current_profile(struct device *dev, struct device_attribute *attr, char *buf) { int ret; struct usb_interface *usbif = to_usb_interface(dev->parent); struct usb_device *usbdev = interface_to_usbdev(usbif); int current_profile; char *data; data = kmalloc(8, GFP_KERNEL); if (!data) return -ENOMEM; ret = usb_control_msg(usbdev, usb_rcvctrlpipe(usbdev, 0), K90_REQUEST_STATUS, USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE, 0, 0, data, 8, USB_CTRL_SET_TIMEOUT); if (ret < 8) { dev_warn(dev, "Failed to get K90 initial state (error %d).\n", ret); ret = -EIO; goto out; } current_profile = data[7]; if (current_profile < 1 || current_profile > 3) { dev_warn(dev, "Read invalid current profile: %02hhx.\n", data[7]); ret = -EIO; goto out; } ret = sysfs_emit(buf, "%d\n", current_profile); out: kfree(data); return ret; } static ssize_t k90_store_current_profile(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { int ret; struct usb_interface *usbif = to_usb_interface(dev->parent); struct usb_device *usbdev = interface_to_usbdev(usbif); int profile; if (kstrtoint(buf, 10, &profile)) return -EINVAL; if (profile < 1 || profile > 3) return -EINVAL; ret = usb_control_msg(usbdev, usb_sndctrlpipe(usbdev, 0), K90_REQUEST_PROFILE, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE, profile, 0, NULL, 0, USB_CTRL_SET_TIMEOUT); if (ret != 0) { dev_warn(dev, "Failed to change current profile (error %d).\n", ret); return ret; } return count; } static DEVICE_ATTR(macro_mode, 0644, k90_show_macro_mode, k90_store_macro_mode); static 
DEVICE_ATTR(current_profile, 0644, k90_show_current_profile, k90_store_current_profile); static struct attribute *k90_attrs[] = { &dev_attr_macro_mode.attr, &dev_attr_current_profile.attr, NULL }; static const struct attribute_group k90_attr_group = { .attrs = k90_attrs, }; /* * Driver functions */ static int k90_init_backlight(struct hid_device *dev) { int ret; struct corsair_drvdata *drvdata = hid_get_drvdata(dev); size_t name_sz; char *name; drvdata->backlight = kzalloc(sizeof(struct k90_led), GFP_KERNEL); if (!drvdata->backlight) { ret = -ENOMEM; goto fail_backlight_alloc; } name_sz = strlen(dev_name(&dev->dev)) + sizeof(K90_BACKLIGHT_LED_SUFFIX); name = kzalloc(name_sz, GFP_KERNEL); if (!name) { ret = -ENOMEM; goto fail_name_alloc; } snprintf(name, name_sz, "%s" K90_BACKLIGHT_LED_SUFFIX, dev_name(&dev->dev)); drvdata->backlight->removed = false; drvdata->backlight->cdev.name = name; drvdata->backlight->cdev.max_brightness = 3; drvdata->backlight->cdev.brightness_set = k90_brightness_set; drvdata->backlight->cdev.brightness_get = k90_backlight_get; INIT_WORK(&drvdata->backlight->work, k90_backlight_work); ret = led_classdev_register(&dev->dev, &drvdata->backlight->cdev); if (ret != 0) goto fail_register_cdev; return 0; fail_register_cdev: kfree(drvdata->backlight->cdev.name); fail_name_alloc: kfree(drvdata->backlight); drvdata->backlight = NULL; fail_backlight_alloc: return ret; } static int k90_init_macro_functions(struct hid_device *dev) { int ret; struct corsair_drvdata *drvdata = hid_get_drvdata(dev); struct k90_drvdata *k90; size_t name_sz; char *name; k90 = kzalloc(sizeof(struct k90_drvdata), GFP_KERNEL); if (!k90) { ret = -ENOMEM; goto fail_drvdata; } drvdata->k90 = k90; /* Init LED device for record LED */ name_sz = strlen(dev_name(&dev->dev)) + sizeof(K90_RECORD_LED_SUFFIX); name = kzalloc(name_sz, GFP_KERNEL); if (!name) { ret = -ENOMEM; goto fail_record_led_alloc; } snprintf(name, name_sz, "%s" K90_RECORD_LED_SUFFIX, dev_name(&dev->dev)); 
k90->record_led.removed = false; k90->record_led.cdev.name = name; k90->record_led.cdev.max_brightness = 1; k90->record_led.cdev.brightness_set = k90_brightness_set; k90->record_led.cdev.brightness_get = k90_record_led_get; INIT_WORK(&k90->record_led.work, k90_record_led_work); k90->record_led.brightness = 0; ret = led_classdev_register(&dev->dev, &k90->record_led.cdev); if (ret != 0) goto fail_record_led; /* Init attributes */ ret = sysfs_create_group(&dev->dev.kobj, &k90_attr_group); if (ret != 0) goto fail_sysfs; return 0; fail_sysfs: k90->record_led.removed = true; led_classdev_unregister(&k90->record_led.cdev); cancel_work_sync(&k90->record_led.work); fail_record_led: kfree(k90->record_led.cdev.name); fail_record_led_alloc: kfree(k90); fail_drvdata: drvdata->k90 = NULL; return ret; } static void k90_cleanup_backlight(struct hid_device *dev) { struct corsair_drvdata *drvdata = hid_get_drvdata(dev); if (drvdata->backlight) { drvdata->backlight->removed = true; led_classdev_unregister(&drvdata->backlight->cdev); cancel_work_sync(&drvdata->backlight->work); kfree(drvdata->backlight->cdev.name); kfree(drvdata->backlight); } } static void k90_cleanup_macro_functions(struct hid_device *dev) { struct corsair_drvdata *drvdata = hid_get_drvdata(dev); struct k90_drvdata *k90 = drvdata->k90; if (k90) { sysfs_remove_group(&dev->dev.kobj, &k90_attr_group); k90->record_led.removed = true; led_classdev_unregister(&k90->record_led.cdev); cancel_work_sync(&k90->record_led.work); kfree(k90->record_led.cdev.name); kfree(k90); } } static int corsair_probe(struct hid_device *dev, const struct hid_device_id *id) { int ret; unsigned long quirks = id->driver_data; struct corsair_drvdata *drvdata; struct usb_interface *usbif; if (!hid_is_usb(dev)) return -EINVAL; usbif = to_usb_interface(dev->dev.parent); drvdata = devm_kzalloc(&dev->dev, sizeof(struct corsair_drvdata), GFP_KERNEL); if (drvdata == NULL) return -ENOMEM; drvdata->quirks = quirks; hid_set_drvdata(dev, drvdata); ret = 
hid_parse(dev); if (ret != 0) { hid_err(dev, "parse failed\n"); return ret; } ret = hid_hw_start(dev, HID_CONNECT_DEFAULT); if (ret != 0) { hid_err(dev, "hw start failed\n"); return ret; } /* K90 extras are only set up on interface 0; a failure is logged but does not fail the probe */ if (usbif->cur_altsetting->desc.bInterfaceNumber == 0) { if (quirks & CORSAIR_USE_K90_MACRO) { ret = k90_init_macro_functions(dev); if (ret != 0) hid_warn(dev, "Failed to initialize K90 macro functions.\n"); } if (quirks & CORSAIR_USE_K90_BACKLIGHT) { ret = k90_init_backlight(dev); if (ret != 0) hid_warn(dev, "Failed to initialize K90 backlight.\n"); } } return 0; } /* * Unbind: release the K90 macro state and the backlight LED (each is a no-op * when it was never set up), then stop the HID hardware. */ static void corsair_remove(struct hid_device *dev) { k90_cleanup_macro_functions(dev); k90_cleanup_backlight(dev); hid_hw_stop(dev); } /* * Track the macro recording state: the record start/stop usages set the * cached record LED brightness to 1/0. Does nothing when the K90 macro state * is absent. Always returns 0. */ static int corsair_event(struct hid_device *dev, struct hid_field *field, struct hid_usage *usage, __s32 value) { struct corsair_drvdata *drvdata = hid_get_drvdata(dev); if (!drvdata->k90) return 0; switch (usage->hid & HID_USAGE) { case CORSAIR_USAGE_MACRO_RECORD_START: drvdata->k90->record_led.brightness = 1; break; case CORSAIR_USAGE_MACRO_RECORD_STOP: drvdata->k90->record_led.brightness = 0; break; default: break; } return 0; } /* * Map Corsair-specific usages on the keyboard usage page to key codes: G-keys * via corsair_gkey_map, macro record start/stop via corsair_record_keycodes * and the M1-M3 keys via corsair_profile_keycodes. * * Returns 1 when the usage was mapped here, -1 for any other usage inside the * CORSAIR_USAGE_SPECIAL_MIN..MAX range, and 0 for everything else. */ static int corsair_input_mapping(struct hid_device *dev, struct hid_input *input, struct hid_field *field, struct hid_usage *usage, unsigned long **bit, int *max) { int gkey; if ((usage->hid & HID_USAGE_PAGE) != HID_UP_KEYBOARD) return 0; gkey = corsair_usage_to_gkey(usage->hid & HID_USAGE); if (gkey != 0) { hid_map_usage_clear(input, usage, bit, max, EV_KEY, corsair_gkey_map[gkey - 1]); return 1; } if ((usage->hid & HID_USAGE) >= CORSAIR_USAGE_SPECIAL_MIN && (usage->hid & HID_USAGE) <= CORSAIR_USAGE_SPECIAL_MAX) { switch (usage->hid & HID_USAGE) { case CORSAIR_USAGE_MACRO_RECORD_START: hid_map_usage_clear(input, usage, bit, max, EV_KEY, corsair_record_keycodes[0]); return 1; case CORSAIR_USAGE_MACRO_RECORD_STOP: hid_map_usage_clear(input, usage, bit, max, EV_KEY, corsair_record_keycodes[1]); return 1; case CORSAIR_USAGE_M1: hid_map_usage_clear(input, usage,
bit, max, EV_KEY, corsair_profile_keycodes[0]); return 1; case CORSAIR_USAGE_M2: hid_map_usage_clear(input, usage, bit, max, EV_KEY, corsair_profile_keycodes[1]); return 1; case CORSAIR_USAGE_M3: hid_map_usage_clear(input, usage, bit, max, EV_KEY, corsair_profile_keycodes[2]); return 1; default: return -1; } } return 0; } /* * The report descriptor of some of the Corsair gaming mice is * not parseable as it defines two consecutive Logical Minimum items for * the Usage Page (Consumer) at rdesc bytes 75 and 77: byte 77 is 0x16 * but should obviously be 0x26, a Logical Maximum of 16 bits. This * prevents proper parsing of the report descriptor due to Logical * Minimum being larger than Logical Maximum. * * This driver fixes the report descriptor for: * - USB ID 1b1c:1b34, sold as GLAIVE RGB Gaming mouse * - USB ID 1b1c:1b3e, sold as Scimitar RGB Pro Gaming mouse */ static const __u8 *corsair_mouse_report_fixup(struct hid_device *hdev, __u8 *rdesc, unsigned int *rsize) { struct usb_interface *intf = to_usb_interface(hdev->dev.parent); if (intf->cur_altsetting->desc.bInterfaceNumber == 1) { /* * Corsair GLAIVE RGB and Scimitar RGB Pro report descriptor is * broken and defines two different Logical Minimum for the * Consumer Application.
The byte 77 should be a 0x26 defining * a 16 bits integer for the Logical Maximum but it is a 0x16 * instead (Logical Minimum) */ switch (hdev->product) { case USB_DEVICE_ID_CORSAIR_GLAIVE_RGB: case USB_DEVICE_ID_CORSAIR_SCIMITAR_PRO_RGB: /* Only patch a descriptor that is long enough and carries the exact broken byte pattern */ if (*rsize >= 172 && rdesc[75] == 0x15 && rdesc[77] == 0x16 && rdesc[78] == 0xff && rdesc[79] == 0x0f) { hid_info(hdev, "Fixing up report descriptor\n"); rdesc[77] = 0x26; } break; } } return rdesc; } /* Devices bound by this driver; only the K90 entry enables the macro and backlight quirks */ static const struct hid_device_id corsair_devices[] = { { HID_USB_DEVICE(USB_VENDOR_ID_CORSAIR, USB_DEVICE_ID_CORSAIR_K90), .driver_data = CORSAIR_USE_K90_MACRO | CORSAIR_USE_K90_BACKLIGHT }, { HID_USB_DEVICE(USB_VENDOR_ID_CORSAIR, USB_DEVICE_ID_CORSAIR_GLAIVE_RGB) }, { HID_USB_DEVICE(USB_VENDOR_ID_CORSAIR, USB_DEVICE_ID_CORSAIR_SCIMITAR_PRO_RGB) }, /* * Vengeance K70 and K70 RAPIDFIRE share product IDs. */ { HID_USB_DEVICE(USB_VENDOR_ID_CORSAIR, USB_DEVICE_ID_CORSAIR_K70R) }, {} }; MODULE_DEVICE_TABLE(hid, corsair_devices); static struct hid_driver corsair_driver = { .name = "corsair", .id_table = corsair_devices, .probe = corsair_probe, .event = corsair_event, .remove = corsair_remove, .input_mapping = corsair_input_mapping, .report_fixup = corsair_mouse_report_fixup, }; module_hid_driver(corsair_driver); MODULE_LICENSE("GPL"); /* Original K90 driver author */ MODULE_AUTHOR("Clement Vuchener"); /* Scimitar PRO RGB driver author */ MODULE_AUTHOR("Oscar Campos"); MODULE_DESCRIPTION("HID driver for Corsair devices"); |
| 15 15 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. * All Rights Reserved. */ #ifndef __XFS_LOG_H__ #define __XFS_LOG_H__ struct xfs_cil_ctx; /* * One log vector: an array of lv_niovecs iovecs (lv_iovecp) together with the * buffer (lv_buf) they are formatted into, linked into a chain via lv_list. */ struct xfs_log_vec { struct list_head lv_list; /* CIL lv chain ptrs */ uint32_t lv_order_id; /* chain ordering info */ int lv_niovecs; /* number of iovecs in lv */ struct xfs_log_iovec *lv_iovecp; /* iovec array */ struct xfs_log_item *lv_item; /* owner */ char *lv_buf; /* formatted buffer */ int lv_bytes; /* accounted space in buffer */ int lv_buf_used; /* buffer space used so far */ int lv_alloc_size; /* size of allocated lv */ }; /* Region types for iovec's i_type */ #define XLOG_REG_TYPE_BFORMAT 1 #define XLOG_REG_TYPE_BCHUNK 2 #define XLOG_REG_TYPE_EFI_FORMAT 3 #define XLOG_REG_TYPE_EFD_FORMAT 4 #define XLOG_REG_TYPE_IFORMAT 5 #define XLOG_REG_TYPE_ICORE 6 #define XLOG_REG_TYPE_IEXT 7 #define XLOG_REG_TYPE_IBROOT 8 #define XLOG_REG_TYPE_ILOCAL 9 #define XLOG_REG_TYPE_IATTR_EXT 10 #define XLOG_REG_TYPE_IATTR_BROOT 11 #define XLOG_REG_TYPE_IATTR_LOCAL 12 #define XLOG_REG_TYPE_QFORMAT 13 #define XLOG_REG_TYPE_DQUOT 14 #define XLOG_REG_TYPE_QUOTAOFF 15 #define XLOG_REG_TYPE_LRHEADER 16 #define XLOG_REG_TYPE_UNMOUNT 17 #define XLOG_REG_TYPE_COMMIT 18 #define XLOG_REG_TYPE_TRANSHDR 19 #define XLOG_REG_TYPE_ICREATE 20 #define
XLOG_REG_TYPE_RUI_FORMAT 21 #define XLOG_REG_TYPE_RUD_FORMAT 22 #define XLOG_REG_TYPE_CUI_FORMAT 23 #define XLOG_REG_TYPE_CUD_FORMAT 24 #define XLOG_REG_TYPE_BUI_FORMAT 25 #define XLOG_REG_TYPE_BUD_FORMAT 26 #define XLOG_REG_TYPE_ATTRI_FORMAT 27 #define XLOG_REG_TYPE_ATTRD_FORMAT 28 #define XLOG_REG_TYPE_ATTR_NAME 29 #define XLOG_REG_TYPE_ATTR_VALUE 30 #define XLOG_REG_TYPE_XMI_FORMAT 31 #define XLOG_REG_TYPE_XMD_FORMAT 32 #define XLOG_REG_TYPE_ATTR_NEWNAME 33 #define XLOG_REG_TYPE_ATTR_NEWVALUE 34 #define XLOG_REG_TYPE_MAX 34 #define XFS_LOG_VEC_ORDERED (-1) /* * Calculate the log iovec length for a given user buffer length. Intended to be * used by ->iop_size implementations when sizing buffers of arbitrary * alignments. */ static inline int xlog_calc_iovec_len(int len) { return roundup(len, sizeof(uint32_t)); } void *xlog_prepare_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp, uint type); /* * Finish an iovec that now holds data_len bytes of payload: zero-pad the * payload up to a multiple of sizeof(uint32_t), store the padded length in * the op header, and account op header plus padded payload in both the iovec * (i_len) and the log vector (lv_buf_used, lv_bytes). */ static inline void xlog_finish_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec *vec, int data_len) { struct xlog_op_header *oph = vec->i_addr; int len; /* * Always round up the length to the correct alignment so callers don't * need to know anything about this log vec layout requirement. This * means we have to zero the area the data to be written does not cover. * This is complicated by fact the payload region is offset into the * logvec region by the opheader that tracks the payload. */ len = xlog_calc_iovec_len(data_len); if (len - data_len != 0) { char *buf = vec->i_addr + sizeof(struct xlog_op_header); memset(buf + data_len, 0, len - data_len); } /* * The opheader tracks aligned payload length, whilst the logvec tracks * the overall region length. */ oph->oh_len = cpu_to_be32(len); len += sizeof(struct xlog_op_header); lv->lv_buf_used += len; lv->lv_bytes += len; vec->i_len = len; /* Catch buffer overruns */ ASSERT((void *)lv->lv_buf + lv->lv_bytes <= (void *)lv + lv->lv_alloc_size); } /* * Copy the amount of data requested by the caller into a new log iovec.
*/ static inline void * xlog_copy_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp, uint type, void *data, int len) { void *buf; buf = xlog_prepare_iovec(lv, vecp, type); memcpy(buf, data, len); xlog_finish_iovec(lv, *vecp, len); return buf; } /* * By comparing each component, we don't have to worry about extra * endian issues in treating two 32 bit numbers as one 64 bit number. * * Returns -999 if lsn1 sorts before lsn2 (the cycle is compared first, then * the block), 999 if it sorts after, and 0 if both components are equal. */ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2) { if (CYCLE_LSN(lsn1) != CYCLE_LSN(lsn2)) return (CYCLE_LSN(lsn1)<CYCLE_LSN(lsn2))? -999 : 999; if (BLOCK_LSN(lsn1) != BLOCK_LSN(lsn2)) return (BLOCK_LSN(lsn1)<BLOCK_LSN(lsn2))? -999 : 999; return 0; } #define XFS_LSN_CMP(x,y) _lsn_cmp(x,y) /* * Flags to xfs_log_force() * * XFS_LOG_SYNC: Synchronous force in-core log to disk */ #define XFS_LOG_SYNC 0x1 /* Log manager interfaces */ struct xfs_mount; struct xlog_in_core; struct xlog_ticket; struct xfs_log_item; struct xfs_item_ops; struct xfs_trans; struct xlog; int xfs_log_force(struct xfs_mount *mp, uint flags); int xfs_log_force_seq(struct xfs_mount *mp, xfs_csn_t seq, uint flags, int *log_forced); int xfs_log_mount(struct xfs_mount *mp, struct xfs_buftarg *log_target, xfs_daddr_t start_block, int num_bblocks); int xfs_log_mount_finish(struct xfs_mount *mp); void xfs_log_mount_cancel(struct xfs_mount *); xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); xfs_lsn_t xlog_assign_tail_lsn_locked(struct xfs_mount *mp); void xfs_log_space_wake(struct xfs_mount *mp); int xfs_log_reserve(struct xfs_mount *mp, int length, int count, struct xlog_ticket **ticket, bool permanent); int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic); void xfs_log_unmount(struct xfs_mount *mp); bool xfs_log_writable(struct xfs_mount *mp); struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket); void xfs_log_ticket_put(struct xlog_ticket *ticket); void xlog_cil_process_committed(struct list_head *list); bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
void xfs_log_work_queue(struct xfs_mount *mp); int xfs_log_quiesce(struct xfs_mount *mp); void xfs_log_clean(struct xfs_mount *mp); bool xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t); bool xlog_force_shutdown(struct xlog *log, uint32_t shutdown_flags); #endif /* __XFS_LOG_H__ */ |
| 1 1 1 1 1 1 1 6 1 1 2 2 6 4 2 6 6 6 6 1 1 1 1 4 4 8 8 1 4 1 1 2 4 4 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 
509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 | // SPDX-License-Identifier: GPL-2.0-only /* * Overlayfs NFS export support. * * Amir Goldstein <amir73il@gmail.com> * * Copyright (C) 2017-2018 CTERA Networks. All Rights Reserved. 
*/ #include <linux/fs.h> #include <linux/cred.h> #include <linux/mount.h> #include <linux/namei.h> #include <linux/xattr.h> #include <linux/exportfs.h> #include <linux/ratelimit.h> #include "overlayfs.h" /* * Copy up @dentry unless it already has an upper dentry. A failed copy up is * logged (rate limited) and its error is returned; returns 0 otherwise. */ static int ovl_encode_maybe_copy_up(struct dentry *dentry) { int err; if (ovl_dentry_upper(dentry)) return 0; err = ovl_copy_up(dentry); if (err) { pr_warn_ratelimited("failed to copy up on encode (%pd2, err=%i)\n", dentry, err); } return err; } /* * Before encoding a non-upper directory file handle from real layer N, we need * to check if it will be possible to reconnect an overlay dentry from the real * lower decoded dentry. This is done by following the overlay ancestry up to a * "layer N connected" ancestor and verifying that all parents along the way are * "layer N connectable". If an ancestor that is NOT "layer N connectable" is * found, we need to copy up an ancestor, which is "layer N connectable", thus * making that ancestor "layer N connected". For example: * * layer 1: /a * layer 2: /a/b/c * * The overlay dentry /a is NOT "layer 2 connectable", because if dir /a is * copied up and renamed, upper dir /a will be indexed by lower dir /a from * layer 1. The dir /a from layer 2 will never be indexed, so the algorithm (*) * in ovl_lookup_real_ancestor() will not be able to lookup a connected overlay * dentry from the connected lower dentry /a/b/c. * * To avoid this problem on decode time, we need to copy up an ancestor of * /a/b/c, which is "layer 2 connectable", on encode time. That ancestor is * /a/b. After copy up (and index) of /a/b, it will become "layer 2 connected" * and when the time comes to decode the file handle from lower dentry /a/b/c, * ovl_lookup_real_ancestor() will find the indexed ancestor /a/b and decoding * a connected overlay dentry will be accomplished. * * (*) the algorithm in ovl_lookup_real_ancestor() can be improved to lookup an * entry /a in the lower layers above layer N and find the indexed dir /a from * layer 1.
If that improvement is made, then the check for "layer N connected" * will need to verify there are no redirects in lower layers above N. In the * example above, /a will be "layer 2 connectable". However, if layer 2 dir /a * is a target of a layer 1 redirect, then /a will NOT be "layer 2 connectable": * * layer 1: /A (redirect = /a) * layer 2: /a/b/c */ /* * Return the lowest layer for encoding a connectable file handle: * ovl_numlower() for the overlay root, 0 for an unindexed merge dir, and * otherwise the layer index of the first entry in @dentry's lower stack. */ static int ovl_connectable_layer(struct dentry *dentry) { struct ovl_entry *oe = OVL_E(dentry); /* We can get overlay root from root of any layer */ if (dentry == dentry->d_sb->s_root) return ovl_numlower(oe); /* * If it's an unindexed merge dir, then it's not connectable with any * lower layer */ if (ovl_dentry_upper(dentry) && !ovl_test_flag(OVL_INDEX, d_inode(dentry))) return 0; /* We can get upper/overlay path from indexed/lower dentry */ return ovl_lowerstack(oe)->layer->idx; } /* * @dentry is "connected" if all ancestors up to root or a "connected" ancestor * have the same uppermost lower layer as the origin's layer. We may need to * copy up a "connectable" ancestor to make it "connected". A "connected" dentry * cannot become non "connected", so cache positive result in dentry flags. * * Return the connected origin layer or < 0 on error. */ static int ovl_connect_layer(struct dentry *dentry) { struct dentry *next, *parent = NULL; struct ovl_entry *oe = OVL_E(dentry); int origin_layer; int err = 0; if (WARN_ON(dentry == dentry->d_sb->s_root) || WARN_ON(!ovl_dentry_lower(dentry))) return -EIO; origin_layer = ovl_lowerstack(oe)->layer->idx; if (ovl_dentry_test_flag(OVL_E_CONNECTED, dentry)) return origin_layer; /* Find the topmost origin layer connectable ancestor of @dentry */ next = dget(dentry); for (;;) { parent = dget_parent(next); if (WARN_ON(parent == next)) { err = -EIO; break; } /* * If @parent is not origin layer connectable, then copy up * @next which is origin layer connectable and we are done.
*/ if (ovl_connectable_layer(parent) < origin_layer) { err = ovl_encode_maybe_copy_up(next); break; } /* If @parent is connected or indexed we are done */ if (ovl_dentry_test_flag(OVL_E_CONNECTED, parent) || ovl_test_flag(OVL_INDEX, d_inode(parent))) break; dput(next); next = parent; } /* Drop the references still held from the walk above */ dput(parent); dput(next); /* Cache the positive result in the dentry flags */ if (!err) ovl_dentry_set_flag(OVL_E_CONNECTED, dentry); /* A non-zero err takes precedence over the origin layer */ return err ?: origin_layer; } /* * We only need to encode origin if there is a chance that the same object was * encoded pre copy up and then we need to stay consistent with the same * encoding also after copy up. If non-pure upper is not indexed, then it was * copied up before NFS export was enabled. In that case we don't need to worry * about staying consistent with pre copy up encoding and we encode an upper * file handle. Overlay root dentry is a private case of non-indexed upper. * * The following table summarizes the different file handle encodings used for * different overlay object types: * * Object type | Encoding * -------------------------------- * Pure upper | U * Non-indexed upper | U * Indexed upper | L (*) * Non-upper | L (*) * * U = upper file handle * L = lower file handle * * (*) Decoding a connected overlay dir from real lower dentry is not always * possible when there are redirects in lower layers and non-indexed merge dirs. * To mitigate those case, we may copy up the lower dir ancestor before encode * of a decodable file handle for non-upper dir. * * Return 0 for upper file handle, > 0 for lower file handle or < 0 on error. */ static int ovl_check_encode_origin(struct inode *inode) { struct ovl_fs *ofs = OVL_FS(inode->i_sb); bool decodable = ofs->config.nfs_export; struct dentry *dentry; int err; /* No upper layer?
*/ if (!ovl_upper_mnt(ofs)) return 1; /* Lower file handle for non-upper non-decodable */ if (!ovl_inode_upper(inode) && !decodable) return 1; /* Upper file handle for pure upper */ if (!ovl_inode_lower(inode)) return 0; /* * Root is never indexed, so if there's an upper layer, encode upper for * root. */ if (inode == d_inode(inode->i_sb->s_root)) return 0; /* * Upper decodable file handle for non-indexed upper. */ if (ovl_inode_upper(inode) && decodable && !ovl_test_flag(OVL_INDEX, inode)) return 0; /* * Decoding a merge dir, whose origin's ancestor is under a redirected * lower dir or under a non-indexed upper is not always possible. * ovl_connect_layer() will try to make origin's layer "connected" by * copying up a "connectable" ancestor. */ if (!decodable || !S_ISDIR(inode->i_mode)) return 1; dentry = d_find_any_alias(inode); if (!dentry) return -ENOENT; err = ovl_connect_layer(dentry); dput(dentry); if (err < 0) return err; /* Lower file handle for indexed and non-upper dir/non-dir */ return 1; } /* * Encode an overlay file handle for @inode into @fid. The handle is copied * only when it fits in @buflen bytes. * * Returns the handle length in bytes (which may exceed @buflen) or a * negative error. */ static int ovl_dentry_to_fid(struct ovl_fs *ofs, struct inode *inode, u32 *fid, int buflen) { struct ovl_fh *fh = NULL; int err, enc_lower; int len; /* * Check if we should encode a lower or upper file handle and maybe * copy up an ancestor to make lower file handle connectable. */ err = enc_lower = ovl_check_encode_origin(inode); if (enc_lower < 0) goto fail; /* Encode an upper or lower file handle */ fh = ovl_encode_real_fh(ofs, enc_lower ?
ovl_inode_lower(inode) : ovl_inode_upper(inode), !enc_lower); if (IS_ERR(fh)) return PTR_ERR(fh); len = OVL_FH_LEN(fh); /* Copy only if the handle fits; the needed length is returned either way */ if (len <= buflen) memcpy(fid, fh, len); err = len; out: kfree(fh); return err; fail: pr_warn_ratelimited("failed to encode file handle (ino=%lu, err=%i)\n", inode->i_ino, err); goto out; } /* * ->encode_fh(): encode @inode into @fid. *max_len is given in 4-byte units * and, once a handle was produced, is updated to the handle size in the same * units. * * Returns OVL_FILEID_V1 on success and FILEID_INVALID when @parent is given, * when encoding fails or when the handle does not fit the caller's buffer. */ static int ovl_encode_fh(struct inode *inode, u32 *fid, int *max_len, struct inode *parent) { struct ovl_fs *ofs = OVL_FS(inode->i_sb); int bytes, buflen = *max_len << 2; /* TODO: encode connectable file handles */ if (parent) return FILEID_INVALID; bytes = ovl_dentry_to_fid(ofs, inode, fid, buflen); if (bytes <= 0) return FILEID_INVALID; *max_len = bytes >> 2; if (bytes > buflen) return FILEID_INVALID; return OVL_FILEID_V1; } /* * Find or instantiate an overlay dentry from real dentries and index. */ static struct dentry *ovl_obtain_alias(struct super_block *sb, struct dentry *upper_alias, struct ovl_path *lowerpath, struct dentry *index) { struct dentry *lower = lowerpath ? lowerpath->dentry : NULL; struct dentry *upper = upper_alias ?: index; struct inode *inode = NULL; struct ovl_entry *oe; struct ovl_inode_params oip = { .index = index, }; /* We get overlay directory dentries with ovl_lookup_real() */ if (d_is_dir(upper ?: lower)) return ERR_PTR(-EIO); oe = ovl_alloc_entry(!!lower); if (!oe) return ERR_PTR(-ENOMEM); oip.upperdentry = dget(upper); if (lower) { ovl_lowerstack(oe)->dentry = dget(lower); ovl_lowerstack(oe)->layer = lowerpath->layer; } oip.oe = oe; inode = ovl_get_inode(sb, &oip); if (IS_ERR(inode)) { ovl_free_entry(oe); dput(upper); return ERR_CAST(inode); } if (upper) ovl_set_flag(OVL_UPPERDATA, inode); return d_obtain_alias(inode); } /* * Get the real dentry of @dentry that is on layer @idx: the upper dentry for * @idx == 0, otherwise the lower stack entry on that layer, or NULL if the * lower stack has no entry on layer @idx. */ static struct dentry *ovl_dentry_real_at(struct dentry *dentry, int idx) { struct ovl_entry *oe = OVL_E(dentry); struct ovl_path *lowerstack = ovl_lowerstack(oe); int i; if (!idx) return ovl_dentry_upper(dentry); for (i = 0; i < ovl_numlower(oe);
i++) { if (lowerstack[i].layer->idx == idx) return lowerstack[i].dentry; } return NULL; } /* * Lookup a child overlay dentry to get a connected overlay dentry whose real * dentry is @real. If @real is on upper layer, we lookup a child overlay * dentry with the same name as the real dentry. Otherwise, we need to consult * index for lookup. * * Returns the child overlay dentry or an ERR_PTR: -ECHILD if the parent of * @real is not the real dentry of @connected on @layer, -ENOENT if the lookup * finds no positive dentry, -ESTALE if the child found is not backed by @real * on @layer, or the error returned by the lookup itself. */ static struct dentry *ovl_lookup_real_one(struct dentry *connected, struct dentry *real, const struct ovl_layer *layer) { struct inode *dir = d_inode(connected); struct dentry *this, *parent = NULL; struct name_snapshot name; int err; /* * Lookup child overlay dentry by real name. The dir mutex protects us * from racing with overlay rename. If the overlay dentry that is above * real has already been moved to a parent that is not under the * connected overlay dir, we return -ECHILD and restart the lookup of * connected real path from the top. */ inode_lock_nested(dir, I_MUTEX_PARENT); err = -ECHILD; parent = dget_parent(real); if (ovl_dentry_real_at(connected, layer->idx) != parent) goto fail; /* * We also need to take a snapshot of real dentry name to protect us * from racing with underlying layer rename. In this case, we don't * care about returning ESTALE, only from dereferencing a free name * pointer because we hold no lock on the real dentry. */ take_dentry_name_snapshot(&name, real); /* * No idmap handling here: it's an internal lookup.
*/ this = lookup_noperm(&name.name, connected); release_dentry_name_snapshot(&name); err = PTR_ERR(this); if (IS_ERR(this)) { goto fail; } else if (!this || !this->d_inode) { dput(this); err = -ENOENT; goto fail; } else if (ovl_dentry_real_at(this, layer->idx) != real) { dput(this); err = -ESTALE; goto fail; } out: dput(parent); inode_unlock(dir); return this; fail: pr_warn_ratelimited("failed to lookup one by real (%pd2, layer=%d, connected=%pd2, err=%i)\n", real, layer->idx, connected, err); this = ERR_PTR(err); goto out; } static struct dentry *ovl_lookup_real(struct super_block *sb, struct dentry *real, const struct ovl_layer *layer); /* * Lookup an indexed or hashed overlay dentry by real inode. */ static struct dentry *ovl_lookup_real_inode(struct super_block *sb, struct dentry *real, const struct ovl_layer *layer) { struct ovl_fs *ofs = OVL_FS(sb); struct dentry *index = NULL; struct dentry *this = NULL; struct inode *inode; /* * Decoding upper dir from index is expensive, so first try to lookup * overlay dentry in inode/dcache. */ inode = ovl_lookup_inode(sb, real, !layer->idx); if (IS_ERR(inode)) return ERR_CAST(inode); if (inode) { this = d_find_any_alias(inode); iput(inode); } /* * For decoded lower dir file handle, lookup index by origin to check * if lower dir was copied up and/or removed. */ if (!this && layer->idx && ovl_indexdir(sb) && !WARN_ON(!d_is_dir(real))) { index = ovl_lookup_index(ofs, NULL, real, false); if (IS_ERR(index)) return index; } /* Get connected upper overlay dir from index */ if (index) { struct dentry *upper = ovl_index_upper(ofs, index, true); dput(index); if (IS_ERR_OR_NULL(upper)) return upper; /* * ovl_lookup_real() in lower layer may call recursively once to * ovl_lookup_real() in upper layer. The first level call walks * back lower parents to the topmost indexed parent. The second * recursive call walks back from indexed upper to the topmost * connected/hashed upper parent (or up to root).
*/ this = ovl_lookup_real(sb, upper, &ofs->layers[0]); dput(upper); } if (IS_ERR_OR_NULL(this)) return this; if (ovl_dentry_real_at(this, layer->idx) != real) { dput(this); this = ERR_PTR(-EIO); } return this; } /* * Lookup an indexed or hashed overlay dentry, whose real dentry is an * ancestor of @real. */ static struct dentry *ovl_lookup_real_ancestor(struct super_block *sb, struct dentry *real, const struct ovl_layer *layer) { struct dentry *next, *parent = NULL; struct dentry *ancestor = ERR_PTR(-EIO); if (real == layer->mnt->mnt_root) return dget(sb->s_root); /* Find the topmost indexed or hashed ancestor */ next = dget(real); for (;;) { parent = dget_parent(next); /* * Lookup a matching overlay dentry in inode/dentry * cache or in index by real inode. */ ancestor = ovl_lookup_real_inode(sb, next, layer); if (ancestor) break; if (parent == layer->mnt->mnt_root) { ancestor = dget(sb->s_root); break; } /* * If @real has been moved out of the layer root directory, * we will eventually hit the real fs root. This cannot happen * by legit overlay rename, so we return error in that case. */ if (parent == next) { ancestor = ERR_PTR(-EXDEV); break; } dput(next); next = parent; } dput(parent); dput(next); return ancestor; } /* * Lookup a connected overlay dentry whose real dentry is @real. * If @real is on upper layer, we lookup a child overlay dentry with the same * path the real dentry. Otherwise, we need to consult index for lookup.
*/ static struct dentry *ovl_lookup_real(struct super_block *sb, struct dentry *real, const struct ovl_layer *layer) { struct dentry *connected; int err = 0; connected = ovl_lookup_real_ancestor(sb, real, layer); if (IS_ERR(connected)) return connected; while (!err) { struct dentry *next, *this; struct dentry *parent = NULL; struct dentry *real_connected = ovl_dentry_real_at(connected, layer->idx); if (real_connected == real) break; /* Find the topmost dentry not yet connected */ next = dget(real); for (;;) { parent = dget_parent(next); if (parent == real_connected) break; /* * If real has been moved out of 'real_connected', * we will not find 'real_connected' and hit the layer * root. In that case, we need to restart connecting. * This game can go on forever in the worst case. We * may want to consider taking s_vfs_rename_mutex if * this happens more than once. */ if (parent == layer->mnt->mnt_root) { dput(connected); connected = dget(sb->s_root); break; } /* * If real file has been moved out of the layer root * directory, we will eventually hit the real fs root. * This cannot happen by legit overlay rename, so we * return error in that case. */ if (parent == next) { err = -EXDEV; break; } dput(next); next = parent; } if (!err) { this = ovl_lookup_real_one(connected, next, layer); if (IS_ERR(this)) err = PTR_ERR(this); /* * Lookup of child in overlay can fail when racing with * overlay rename of child away from 'connected' parent. * In this case, we need to restart the lookup from the * top, because we cannot trust that 'real_connected' is * still an ancestor of 'real'. There is a good chance * that the renamed overlay ancestor is now in cache, so * ovl_lookup_real_ancestor() will find it and we can * continue to connect exactly from where lookup failed.
*/ if (err == -ECHILD) { this = ovl_lookup_real_ancestor(sb, real, layer); err = PTR_ERR_OR_ZERO(this); } if (!err) { dput(connected); connected = this; } } dput(parent); dput(next); } if (err) goto fail; return connected; fail: pr_warn_ratelimited("failed to lookup by real (%pd2, layer=%d, connected=%pd2, err=%i)\n", real, layer->idx, connected, err); dput(connected); return ERR_PTR(err); } /* * Get an overlay dentry from upper/lower real dentries and index. */ static struct dentry *ovl_get_dentry(struct super_block *sb, struct dentry *upper, struct ovl_path *lowerpath, struct dentry *index) { struct ovl_fs *ofs = OVL_FS(sb); const struct ovl_layer *layer = upper ? &ofs->layers[0] : lowerpath->layer; struct dentry *real = upper ?: (index ?: lowerpath->dentry); /* * Obtain a disconnected overlay dentry from a non-dir real dentry * and index. */ if (!d_is_dir(real)) return ovl_obtain_alias(sb, upper, lowerpath, index); /* Removed empty directory? A disconnected or unhashed real dir yields -ENOENT */ if ((real->d_flags & DCACHE_DISCONNECTED) || d_unhashed(real)) return ERR_PTR(-ENOENT); /* * If real dentry is connected and hashed, get a connected overlay * dentry whose real dentry is @real.
*/ return ovl_lookup_real(sb, real, layer); } /* * Decode an upper file handle: fails with -EACCES when there is no upper * mount, otherwise decodes the real upper dentry and gets the overlay dentry * for it. A NULL or error result from decoding the real dentry is returned * as is. */ static struct dentry *ovl_upper_fh_to_d(struct super_block *sb, struct ovl_fh *fh) { struct ovl_fs *ofs = OVL_FS(sb); struct dentry *dentry; struct dentry *upper; if (!ovl_upper_mnt(ofs)) return ERR_PTR(-EACCES); upper = ovl_decode_real_fh(ofs, fh, ovl_upper_mnt(ofs), true); if (IS_ERR_OR_NULL(upper)) return upper; dentry = ovl_get_dentry(sb, upper, NULL, NULL); dput(upper); return dentry; } /* * Decode a lower (origin) file handle. The lookup is attempted in stages: * the overlay inode cache by origin, then the index entry for the handle * (a directory index leads to a connected upper dir), and finally an overlay * dentry built from the origin dentry and the index. */ static struct dentry *ovl_lower_fh_to_d(struct super_block *sb, struct ovl_fh *fh) { struct ovl_fs *ofs = OVL_FS(sb); struct ovl_path origin = { }; struct ovl_path *stack = &origin; struct dentry *dentry = NULL; struct dentry *index = NULL; struct inode *inode; int err; /* First lookup overlay inode in inode cache by origin fh */ err = ovl_check_origin_fh(ofs, fh, false, NULL, &stack); if (err) return ERR_PTR(err); if (!d_is_dir(origin.dentry) || !(origin.dentry->d_flags & DCACHE_DISCONNECTED)) { inode = ovl_lookup_inode(sb, origin.dentry, false); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_err; if (inode) { dentry = d_find_any_alias(inode); iput(inode); if (dentry) goto out; } } /* Then lookup indexed upper/whiteout by origin fh */ if (ovl_indexdir(sb)) { index = ovl_get_index_fh(ofs, fh); err = PTR_ERR(index); if (IS_ERR(index)) { index = NULL; goto out_err; } } /* Then try to get a connected upper dir by index */ if (index && d_is_dir(index)) { struct dentry *upper = ovl_index_upper(ofs, index, true); err = PTR_ERR(upper); if (IS_ERR_OR_NULL(upper)) goto out_err; dentry = ovl_get_dentry(sb, upper, NULL, NULL); dput(upper); goto out; } /* Find origin.dentry again with ovl_acceptable() layer check */ if (d_is_dir(origin.dentry)) { dput(origin.dentry); origin.dentry = NULL; err = ovl_check_origin_fh(ofs, fh, true, NULL, &stack); if (err) goto out_err; } if (index) { err = ovl_verify_origin(ofs, index, origin.dentry, false); if (err) goto out_err; } /* Get a connected non-upper dir or disconnected non-dir */ dentry =
ovl_get_dentry(sb, NULL, &origin, index); out: dput(origin.dentry); dput(index); return dentry; out_err: dentry = ERR_PTR(err); goto out; } static struct ovl_fh *ovl_fid_to_fh(struct fid *fid, int buflen, int fh_type) { struct ovl_fh *fh; /* If on-wire inner fid is aligned - nothing to do */ if (fh_type == OVL_FILEID_V1) return (struct ovl_fh *)fid; if (fh_type != OVL_FILEID_V0) return ERR_PTR(-EINVAL); if (buflen <= OVL_FH_WIRE_OFFSET) return ERR_PTR(-EINVAL); fh = kzalloc(buflen, GFP_KERNEL); if (!fh) return ERR_PTR(-ENOMEM); /* Copy unaligned inner fh into aligned buffer */ memcpy(fh->buf, fid, buflen - OVL_FH_WIRE_OFFSET); return fh; } static struct dentry *ovl_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { struct dentry *dentry = NULL; struct ovl_fh *fh = NULL; int len = fh_len << 2; unsigned int flags = 0; int err; fh = ovl_fid_to_fh(fid, len, fh_type); err = PTR_ERR(fh); if (IS_ERR(fh)) goto out_err; err = ovl_check_fh_len(fh, len); if (err) goto out_err; flags = fh->fb.flags; dentry = (flags & OVL_FH_FLAG_PATH_UPPER) ? ovl_upper_fh_to_d(sb, fh) : ovl_lower_fh_to_d(sb, fh); err = PTR_ERR(dentry); if (IS_ERR(dentry) && err != -ESTALE) goto out_err; out: /* We may have needed to re-align OVL_FILEID_V0 */ if (!IS_ERR_OR_NULL(fh) && fh != (void *)fid) kfree(fh); return dentry; out_err: pr_warn_ratelimited("failed to decode file handle (len=%d, type=%d, flags=%x, err=%i)\n", fh_len, fh_type, flags, err); dentry = ERR_PTR(err); goto out; } static struct dentry *ovl_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { pr_warn_ratelimited("connectable file handles not supported; use 'no_subtree_check' exportfs option.\n"); return ERR_PTR(-EACCES); } static int ovl_get_name(struct dentry *parent, char *name, struct dentry *child) { /* * ovl_fh_to_dentry() returns connected dir overlay dentries and * ovl_fh_to_parent() is not implemented, so we should not get here. 
*/ WARN_ON_ONCE(1); return -EIO; } static struct dentry *ovl_get_parent(struct dentry *dentry) { /* * ovl_fh_to_dentry() returns connected dir overlay dentries, so we * should not get here. */ WARN_ON_ONCE(1); return ERR_PTR(-EIO); } const struct export_operations ovl_export_operations = { .encode_fh = ovl_encode_fh, .fh_to_dentry = ovl_fh_to_dentry, .fh_to_parent = ovl_fh_to_parent, .get_name = ovl_get_name, .get_parent = ovl_get_parent, }; /* encode_fh() encodes non-decodable file handles with nfs_export=off */ const struct export_operations ovl_export_fid_operations = { .encode_fh = ovl_encode_fh, }; |
| 6 6 1 1 2 1 2 5 1 1 1 1 1 5 6 6 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 
518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 | // SPDX-License-Identifier: GPL-2.0-only /* * Optimized MPEG FS - inode and super operations. * Copyright (C) 2006 Bob Copeland <me@bobcopeland.com> */ #include <linux/module.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/fs.h> #include <linux/vfs.h> #include <linux/cred.h> #include <linux/buffer_head.h> #include <linux/vmalloc.h> #include <linux/writeback.h> #include <linux/seq_file.h> #include <linux/crc-itu-t.h> #include <linux/fs_struct.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include "omfs.h" MODULE_AUTHOR("Bob Copeland <me@bobcopeland.com>"); MODULE_DESCRIPTION("OMFS (ReplayTV/Karma) Filesystem for Linux"); MODULE_LICENSE("GPL"); struct buffer_head *omfs_bread(struct super_block *sb, sector_t block) { struct omfs_sb_info *sbi = OMFS_SB(sb); if (block >= sbi->s_num_blocks) return NULL; return sb_bread(sb, clus_to_blk(sbi, block)); } struct inode *omfs_new_inode(struct inode *dir, umode_t mode) { struct inode *inode; u64 new_block; int err; int len; struct omfs_sb_info *sbi = OMFS_SB(dir->i_sb); inode = new_inode(dir->i_sb); if (!inode) return ERR_PTR(-ENOMEM); err = omfs_allocate_range(dir->i_sb, sbi->s_mirrors, sbi->s_mirrors, &new_block, &len); if (err) goto fail; inode->i_ino = new_block; inode_init_owner(&nop_mnt_idmap, inode, NULL, mode); inode->i_mapping->a_ops = &omfs_aops; simple_inode_init_ts(inode); switch (mode & S_IFMT) { case S_IFDIR: inode->i_op = 
&omfs_dir_inops; inode->i_fop = &omfs_dir_operations; inode->i_size = sbi->s_sys_blocksize; inc_nlink(inode); break; case S_IFREG: inode->i_op = &omfs_file_inops; inode->i_fop = &omfs_file_operations; inode->i_size = 0; break; } insert_inode_hash(inode); mark_inode_dirty(inode); return inode; fail: make_bad_inode(inode); iput(inode); return ERR_PTR(err); } /* * Update the header checksums for a dirty inode based on its contents. * Caller is expected to hold the buffer head underlying oi and mark it * dirty. */ static void omfs_update_checksums(struct omfs_inode *oi) { int xor, i, ofs = 0, count; u16 crc = 0; unsigned char *ptr = (unsigned char *) oi; count = be32_to_cpu(oi->i_head.h_body_size); ofs = sizeof(struct omfs_header); crc = crc_itu_t(crc, ptr + ofs, count); oi->i_head.h_crc = cpu_to_be16(crc); xor = ptr[0]; for (i = 1; i < OMFS_XOR_COUNT; i++) xor ^= ptr[i]; oi->i_head.h_check_xor = xor; } static int __omfs_write_inode(struct inode *inode, int wait) { struct omfs_inode *oi; struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb); struct buffer_head *bh, *bh2; u64 ctime; int i; int ret = -EIO; int sync_failed = 0; /* get current inode since we may have written sibling ptrs etc. 
*/ bh = omfs_bread(inode->i_sb, inode->i_ino); if (!bh) goto out; oi = (struct omfs_inode *) bh->b_data; oi->i_head.h_self = cpu_to_be64(inode->i_ino); if (S_ISDIR(inode->i_mode)) oi->i_type = OMFS_DIR; else if (S_ISREG(inode->i_mode)) oi->i_type = OMFS_FILE; else { printk(KERN_WARNING "omfs: unknown file type: %d\n", inode->i_mode); goto out_brelse; } oi->i_head.h_body_size = cpu_to_be32(sbi->s_sys_blocksize - sizeof(struct omfs_header)); oi->i_head.h_version = 1; oi->i_head.h_type = OMFS_INODE_NORMAL; oi->i_head.h_magic = OMFS_IMAGIC; oi->i_size = cpu_to_be64(inode->i_size); ctime = inode_get_ctime_sec(inode) * 1000LL + ((inode_get_ctime_nsec(inode) + 999)/1000); oi->i_ctime = cpu_to_be64(ctime); omfs_update_checksums(oi); mark_buffer_dirty(bh); if (wait) { sync_dirty_buffer(bh); if (buffer_req(bh) && !buffer_uptodate(bh)) sync_failed = 1; } /* if mirroring writes, copy to next fsblock */ for (i = 1; i < sbi->s_mirrors; i++) { bh2 = omfs_bread(inode->i_sb, inode->i_ino + i); if (!bh2) goto out_brelse; memcpy(bh2->b_data, bh->b_data, bh->b_size); mark_buffer_dirty(bh2); if (wait) { sync_dirty_buffer(bh2); if (buffer_req(bh2) && !buffer_uptodate(bh2)) sync_failed = 1; } brelse(bh2); } ret = (sync_failed) ? -EIO : 0; out_brelse: brelse(bh); out: return ret; } static int omfs_write_inode(struct inode *inode, struct writeback_control *wbc) { return __omfs_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); } int omfs_sync_inode(struct inode *inode) { return __omfs_write_inode(inode, 1); } /* * called when an entry is deleted, need to clear the bits in the * bitmaps. 
*/ static void omfs_evict_inode(struct inode *inode) { truncate_inode_pages_final(&inode->i_data); clear_inode(inode); if (inode->i_nlink) return; if (S_ISREG(inode->i_mode)) { inode->i_size = 0; omfs_shrink_inode(inode); } omfs_clear_range(inode->i_sb, inode->i_ino, 2); } struct inode *omfs_iget(struct super_block *sb, ino_t ino) { struct omfs_sb_info *sbi = OMFS_SB(sb); struct omfs_inode *oi; struct buffer_head *bh; u64 ctime; unsigned long nsecs; struct inode *inode; inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); if (!(inode_state_read_once(inode) & I_NEW)) return inode; bh = omfs_bread(inode->i_sb, ino); if (!bh) goto iget_failed; oi = (struct omfs_inode *)bh->b_data; /* check self */ if (ino != be64_to_cpu(oi->i_head.h_self)) goto fail_bh; inode->i_uid = sbi->s_uid; inode->i_gid = sbi->s_gid; ctime = be64_to_cpu(oi->i_ctime); nsecs = do_div(ctime, 1000) * 1000L; inode_set_atime(inode, ctime, nsecs); inode_set_mtime(inode, ctime, nsecs); inode_set_ctime(inode, ctime, nsecs); inode->i_mapping->a_ops = &omfs_aops; switch (oi->i_type) { case OMFS_DIR: inode->i_mode = S_IFDIR | (S_IRWXUGO & ~sbi->s_dmask); inode->i_op = &omfs_dir_inops; inode->i_fop = &omfs_dir_operations; inode->i_size = sbi->s_sys_blocksize; inc_nlink(inode); break; case OMFS_FILE: inode->i_mode = S_IFREG | (S_IRWXUGO & ~sbi->s_fmask); inode->i_fop = &omfs_file_operations; inode->i_size = be64_to_cpu(oi->i_size); break; } brelse(bh); unlock_new_inode(inode); return inode; fail_bh: brelse(bh); iget_failed: iget_failed(inode); return ERR_PTR(-EIO); } static void omfs_put_super(struct super_block *sb) { struct omfs_sb_info *sbi = OMFS_SB(sb); kfree(sbi->s_imap); kfree(sbi); sb->s_fs_info = NULL; } static int omfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *s = dentry->d_sb; struct omfs_sb_info *sbi = OMFS_SB(s); u64 id = huge_encode_dev(s->s_bdev->bd_dev); buf->f_type = OMFS_MAGIC; buf->f_bsize = sbi->s_blocksize; buf->f_blocks = sbi->s_num_blocks; 
buf->f_files = sbi->s_num_blocks; buf->f_namelen = OMFS_NAMELEN; buf->f_fsid = u64_to_fsid(id); buf->f_bfree = buf->f_bavail = buf->f_ffree = omfs_count_free(s); return 0; } /* * Display the mount options in /proc/mounts. */ static int omfs_show_options(struct seq_file *m, struct dentry *root) { struct omfs_sb_info *sbi = OMFS_SB(root->d_sb); umode_t cur_umask = current_umask(); if (!uid_eq(sbi->s_uid, current_uid())) seq_printf(m, ",uid=%u", from_kuid_munged(&init_user_ns, sbi->s_uid)); if (!gid_eq(sbi->s_gid, current_gid())) seq_printf(m, ",gid=%u", from_kgid_munged(&init_user_ns, sbi->s_gid)); if (sbi->s_dmask == sbi->s_fmask) { if (sbi->s_fmask != cur_umask) seq_printf(m, ",umask=%o", sbi->s_fmask); } else { if (sbi->s_dmask != cur_umask) seq_printf(m, ",dmask=%o", sbi->s_dmask); if (sbi->s_fmask != cur_umask) seq_printf(m, ",fmask=%o", sbi->s_fmask); } return 0; } static const struct super_operations omfs_sops = { .write_inode = omfs_write_inode, .evict_inode = omfs_evict_inode, .put_super = omfs_put_super, .statfs = omfs_statfs, .show_options = omfs_show_options, }; /* * For Rio Karma, there is an on-disk free bitmap whose location is * stored in the root block. For ReplayTV, there is no such free bitmap * so we have to walk the tree. Both inodes and file data are allocated * from the same map. This array can be big (300k) so we allocate * in units of the blocksize. 
*/ static int omfs_get_imap(struct super_block *sb) { unsigned int bitmap_size, array_size; int count; struct omfs_sb_info *sbi = OMFS_SB(sb); struct buffer_head *bh; unsigned long **ptr; sector_t block; bitmap_size = DIV_ROUND_UP(sbi->s_num_blocks, 8); array_size = DIV_ROUND_UP(bitmap_size, sb->s_blocksize); if (sbi->s_bitmap_ino == ~0ULL) goto out; sbi->s_imap_size = array_size; sbi->s_imap = kcalloc(array_size, sizeof(unsigned long *), GFP_KERNEL); if (!sbi->s_imap) goto nomem; block = clus_to_blk(sbi, sbi->s_bitmap_ino); if (block >= sbi->s_num_blocks) goto nomem; ptr = sbi->s_imap; for (count = bitmap_size; count > 0; count -= sb->s_blocksize) { bh = sb_bread(sb, block++); if (!bh) goto nomem_free; *ptr = kmemdup(bh->b_data, sb->s_blocksize, GFP_KERNEL); if (!*ptr) { brelse(bh); goto nomem_free; } if (count < sb->s_blocksize) memset((void *)*ptr + count, 0xff, sb->s_blocksize - count); brelse(bh); ptr++; } out: return 0; nomem_free: for (count = 0; count < array_size; count++) kfree(sbi->s_imap[count]); kfree(sbi->s_imap); nomem: sbi->s_imap = NULL; sbi->s_imap_size = 0; return -ENOMEM; } struct omfs_mount_options { kuid_t s_uid; kgid_t s_gid; int s_dmask; int s_fmask; }; enum { Opt_uid, Opt_gid, Opt_umask, Opt_dmask, Opt_fmask, }; static const struct fs_parameter_spec omfs_param_spec[] = { fsparam_uid ("uid", Opt_uid), fsparam_gid ("gid", Opt_gid), fsparam_u32oct ("umask", Opt_umask), fsparam_u32oct ("dmask", Opt_dmask), fsparam_u32oct ("fmask", Opt_fmask), {} }; static int omfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct omfs_mount_options *opts = fc->fs_private; int token; struct fs_parse_result result; /* All options are ignored on remount */ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) return 0; token = fs_parse(fc, omfs_param_spec, param, &result); if (token < 0) return token; switch (token) { case Opt_uid: opts->s_uid = result.uid; break; case Opt_gid: opts->s_gid = result.gid; break; case Opt_umask: opts->s_fmask = 
opts->s_dmask = result.uint_32; break; case Opt_dmask: opts->s_dmask = result.uint_32; break; case Opt_fmask: opts->s_fmask = result.uint_32; break; default: return -EINVAL; } return 0; } static void omfs_set_options(struct omfs_sb_info *sbi, struct omfs_mount_options *opts) { sbi->s_uid = opts->s_uid; sbi->s_gid = opts->s_gid; sbi->s_dmask = opts->s_dmask; sbi->s_fmask = opts->s_fmask; } static int omfs_fill_super(struct super_block *sb, struct fs_context *fc) { struct buffer_head *bh, *bh2; struct omfs_super_block *omfs_sb; struct omfs_root_block *omfs_rb; struct omfs_sb_info *sbi; struct inode *root; struct omfs_mount_options *parsed_opts = fc->fs_private; int ret = -EINVAL; int silent = fc->sb_flags & SB_SILENT; sbi = kzalloc(sizeof(struct omfs_sb_info), GFP_KERNEL); if (!sbi) return -ENOMEM; sb->s_fs_info = sbi; omfs_set_options(sbi, parsed_opts); sb->s_maxbytes = 0xffffffff; sb->s_time_gran = NSEC_PER_MSEC; sb->s_time_min = 0; sb->s_time_max = U64_MAX / MSEC_PER_SEC; sb_set_blocksize(sb, 0x200); bh = sb_bread(sb, 0); if (!bh) goto end; omfs_sb = (struct omfs_super_block *)bh->b_data; if (omfs_sb->s_magic != cpu_to_be32(OMFS_MAGIC)) { if (!silent) printk(KERN_ERR "omfs: Invalid superblock (%x)\n", omfs_sb->s_magic); goto out_brelse_bh; } sb->s_magic = OMFS_MAGIC; sbi->s_num_blocks = be64_to_cpu(omfs_sb->s_num_blocks); sbi->s_blocksize = be32_to_cpu(omfs_sb->s_blocksize); sbi->s_mirrors = be32_to_cpu(omfs_sb->s_mirrors); sbi->s_root_ino = be64_to_cpu(omfs_sb->s_root_block); sbi->s_sys_blocksize = be32_to_cpu(omfs_sb->s_sys_blocksize); mutex_init(&sbi->s_bitmap_lock); if (sbi->s_num_blocks > OMFS_MAX_BLOCKS) { printk(KERN_ERR "omfs: sysblock number (%llx) is out of range\n", (unsigned long long)sbi->s_num_blocks); goto out_brelse_bh; } if (sbi->s_sys_blocksize > PAGE_SIZE) { printk(KERN_ERR "omfs: sysblock size (%d) is out of range\n", sbi->s_sys_blocksize); goto out_brelse_bh; } if (sbi->s_blocksize < sbi->s_sys_blocksize || sbi->s_blocksize > 
OMFS_MAX_BLOCK_SIZE) { printk(KERN_ERR "omfs: block size (%d) is out of range\n", sbi->s_blocksize); goto out_brelse_bh; } /* * Use sys_blocksize as the fs block since it is smaller than a * page while the fs blocksize can be larger. */ sb_set_blocksize(sb, sbi->s_sys_blocksize); /* * ...and the difference goes into a shift. sys_blocksize is always * a power of two factor of blocksize. */ sbi->s_block_shift = get_bitmask_order(sbi->s_blocksize) - get_bitmask_order(sbi->s_sys_blocksize); bh2 = omfs_bread(sb, be64_to_cpu(omfs_sb->s_root_block)); if (!bh2) goto out_brelse_bh; omfs_rb = (struct omfs_root_block *)bh2->b_data; sbi->s_bitmap_ino = be64_to_cpu(omfs_rb->r_bitmap); sbi->s_clustersize = be32_to_cpu(omfs_rb->r_clustersize); if (sbi->s_num_blocks != be64_to_cpu(omfs_rb->r_num_blocks)) { printk(KERN_ERR "omfs: block count discrepancy between " "super and root blocks (%llx, %llx)\n", (unsigned long long)sbi->s_num_blocks, (unsigned long long)be64_to_cpu(omfs_rb->r_num_blocks)); goto out_brelse_bh2; } if (sbi->s_bitmap_ino != ~0ULL && sbi->s_bitmap_ino > sbi->s_num_blocks) { printk(KERN_ERR "omfs: free space bitmap location is corrupt " "(%llx, total blocks %llx)\n", (unsigned long long) sbi->s_bitmap_ino, (unsigned long long) sbi->s_num_blocks); goto out_brelse_bh2; } if (sbi->s_clustersize < 1 || sbi->s_clustersize > OMFS_MAX_CLUSTER_SIZE) { printk(KERN_ERR "omfs: cluster size out of range (%d)", sbi->s_clustersize); goto out_brelse_bh2; } ret = omfs_get_imap(sb); if (ret) goto out_brelse_bh2; sb->s_op = &omfs_sops; root = omfs_iget(sb, be64_to_cpu(omfs_rb->r_root_dir)); if (IS_ERR(root)) { ret = PTR_ERR(root); goto out_brelse_bh2; } sb->s_root = d_make_root(root); if (!sb->s_root) { ret = -ENOMEM; goto out_brelse_bh2; } printk(KERN_DEBUG "omfs: Mounted volume %s\n", omfs_rb->r_name); ret = 0; out_brelse_bh2: brelse(bh2); out_brelse_bh: brelse(bh); end: if (ret) kfree(sbi); return ret; } static int omfs_get_tree(struct fs_context *fc) { return get_tree_bdev(fc, 
omfs_fill_super); } static void omfs_free_fc(struct fs_context *fc); static const struct fs_context_operations omfs_context_ops = { .parse_param = omfs_parse_param, .get_tree = omfs_get_tree, .free = omfs_free_fc, }; static int omfs_init_fs_context(struct fs_context *fc) { struct omfs_mount_options *opts; opts = kzalloc(sizeof(*opts), GFP_KERNEL); if (!opts) return -ENOMEM; /* Set mount options defaults */ opts->s_uid = current_uid(); opts->s_gid = current_gid(); opts->s_dmask = opts->s_fmask = current_umask(); fc->fs_private = opts; fc->ops = &omfs_context_ops; return 0; } static void omfs_free_fc(struct fs_context *fc) { kfree(fc->fs_private); } static struct file_system_type omfs_fs_type = { .owner = THIS_MODULE, .name = "omfs", .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, .init_fs_context = omfs_init_fs_context, .parameters = omfs_param_spec, }; MODULE_ALIAS_FS("omfs"); static int __init init_omfs_fs(void) { return register_filesystem(&omfs_fs_type); } static void __exit exit_omfs_fs(void) { unregister_filesystem(&omfs_fs_type); } module_init(init_omfs_fs); module_exit(exit_omfs_fs); |
| 5 207 207 207 215 214 208 215 7 198 12 207 208 206 7 214 215 7 1 203 12 5 4 1 211 207 207 207 207 44 13 216 1 215 8 203 1 2 1 4 4 207 207 207 207 207 216 215 207 9 215 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 
485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 
985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 
1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 
1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 
2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 
2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 
2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 
3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 
3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 
4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 
4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 
4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 | // SPDX-License-Identifier: GPL-2.0 /* * * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved. * */ #include <linux/blkdev.h> #include <linux/fs.h> #include <linux/random.h> #include <linux/slab.h> #include "debug.h" #include "ntfs.h" #include "ntfs_fs.h" /* * LOG FILE structs */ // clang-format off #define MaxLogFileSize 0x100000000ull #define DefaultLogPageSize 4096 #define MinLogRecordPages 0x30 struct RESTART_HDR { struct NTFS_RECORD_HEADER rhdr; // 'RSTR' __le32 sys_page_size; // 0x10: Page size of the system which initialized the log. __le32 page_size; // 0x14: Log page size used for this log file. 
__le16 ra_off;		// 0x18: Offset of the RESTART_AREA from the start of this header.
	__le16 minor_ver;	// 0x1A:
	__le16 major_ver;	// 0x1C:
	__le16 fixups[];
};

/* Client index meaning "no client" (list terminator), CPU and on-disk forms. */
#define LFS_NO_CLIENT 0xffff
#define LFS_NO_CLIENT_LE cpu_to_le16(0xffff)

/* One log client; records are linked by index through prev_client/next_client. */
struct CLIENT_REC {
	__le64 oldest_lsn;	// 0x00:
	__le64 restart_lsn;	// 0x08:
	__le16 prev_client;	// 0x10: Index of the previous client, LFS_NO_CLIENT_LE if none.
	__le16 next_client;	// 0x12: Index of the next client, LFS_NO_CLIENT_LE if none.
	__le16 seq_num;		// 0x14:
	u8 align[6];		// 0x16:
	__le32 name_bytes;	// 0x1C: In bytes.
	__le16 name[32];	// 0x20: Name of client.
};

static_assert(sizeof(struct CLIENT_REC) == 0x60);

/* Two copies of these will exist at the beginning of the log file */
struct RESTART_AREA {
	__le64 current_lsn;	// 0x00: Current logical end of log file.
	__le16 log_clients;	// 0x08: Maximum number of clients.
	__le16 client_idx[2];	// 0x0A: Free/use index into the client record arrays ([0] free list, [1] in-use list).
	__le16 flags;		// 0x0E: See RESTART_SINGLE_PAGE_IO.
	__le32 seq_num_bits;	// 0x10: The number of bits in sequence number.
	__le16 ra_len;		// 0x14: Length of this restart area.
	__le16 client_off;	// 0x16: Offset of the client array from the start of this area.
	__le64 l_size;		// 0x18: Usable log file size.
	__le32 last_lsn_data_len; // 0x20:
	__le16 rec_hdr_len;	// 0x24: Log record header length.
	__le16 data_off;	// 0x26: Log page data offset.
	__le32 open_log_count;	// 0x28:
	__le32 align[5];	// 0x2C:
	struct CLIENT_REC clients[]; // 0x40:
};

/* Client log record header; followed by lcns_follow entries of page_lcns[]. */
struct LOG_REC_HDR {
	__le16 redo_op;		// 0x00: NTFS_LOG_OPERATION
	__le16 undo_op;		// 0x02: NTFS_LOG_OPERATION
	__le16 redo_off;	// 0x04: Offset to Redo record.
	__le16 redo_len;	// 0x06: Redo length.
	__le16 undo_off;	// 0x08: Offset to Undo record.
	__le16 undo_len;	// 0x0A: Undo length.
	__le16 target_attr;	// 0x0C:
	__le16 lcns_follow;	// 0x0E: Number of entries in page_lcns[].
	__le16 record_off;	// 0x10:
	__le16 attr_off;	// 0x12:
	__le16 cluster_off;	// 0x14:
	__le16 reserved;	// 0x16:
	__le64 target_vcn;	// 0x18:
	__le64 page_lcns[];	// 0x20:
};

static_assert(sizeof(struct LOG_REC_HDR) == 0x20);

/* Value stored in the first dword of a restart table entry that is in use. */
#define RESTART_ENTRY_ALLOCATED 0xFFFFFFFF
#define RESTART_ENTRY_ALLOCATED_LE cpu_to_le32(0xFFFFFFFF)

/*
 * Header of a restart table. The entries follow the header directly;
 * free entries are chained through their first dword by byte offset
 * from the start of the table, 0 terminating the chain.
 */
struct RESTART_TABLE {
	__le16 size;		// 0x00: In bytes (size of one entry)
	__le16 used;		// 0x02: Entries (slots present in the table)
	__le16 total;		// 0x04: Entries (currently allocated)
	__le16 res[3];		// 0x06:
	__le32 free_goal;	// 0x0C: Entries freed below this offset go to the head of the free list.
	__le32 first_free;	// 0x10: Offset of the first free entry, 0 if none.
	__le32 last_free;	// 0x14: Offset of the last free entry, 0 if none.
};

static_assert(sizeof(struct RESTART_TABLE) == 0x18);

struct ATTR_NAME_ENTRY {
	__le16 off; // Offset in the Open attribute Table.
	__le16 name_bytes;
	__le16 name[];
};

/*
 * Open attribute table entry.
 * The offsets below assume 'enum ATTR_TYPE' occupies 4 bytes, which is
 * what the existing 0x08/0x10 annotations imply.
 */
struct OPEN_ATTR_ENRTY {
	__le32 next;		// 0x00: RESTART_ENTRY_ALLOCATED if allocated
	__le32 bytes_per_index;	// 0x04:
	enum ATTR_TYPE type;	// 0x08:
	u8 is_dirty_pages;	// 0x0C:
	u8 is_attr_name;	// 0x0D: Faked field to manage 'ptr'
	u8 name_len;		// 0x0E: Faked field to manage 'ptr'
	u8 res;			// 0x0F:
	struct MFT_REF ref;	// 0x10: File Reference of file containing attribute
	__le64 open_record_lsn;	// 0x18:
	void *ptr;		// 0x20:
};

/* 32 bit version of 'struct OPEN_ATTR_ENRTY' */
struct OPEN_ATTR_ENRTY_32 {
	__le32 next;		// 0x00: RESTART_ENTRY_ALLOCATED if allocated
	__le32 ptr;		// 0x04:
	struct MFT_REF ref;	// 0x08:
	__le64 open_record_lsn;	// 0x10:
	u8 is_dirty_pages;	// 0x18:
	u8 is_attr_name;	// 0x19:
	u8 res1[2];
	enum ATTR_TYPE type;	// 0x1C:
	u8 name_len;		// 0x20: In wchar
	u8 res2[3];
	__le32 AttributeName;	// 0x24:
	__le32 bytes_per_index;	// 0x28:
};

#define SIZEOF_OPENATTRIBUTEENTRY0 0x2c
// static_assert( 0x2C == sizeof(struct OPEN_ATTR_ENRTY_32) );
/* 'struct OPEN_ATTR_ENRTY' must stay smaller than SIZEOF_OPENATTRIBUTEENTRY0. */
static_assert(sizeof(struct OPEN_ATTR_ENRTY) < SIZEOF_OPENATTRIBUTEENTRY0);

/*
 * One entry exists in the Dirty Pages Table for each page which is dirty at
 * the time the Restart Area is written.
*/
struct DIR_PAGE_ENTRY {
	__le32 next;		// 0x00: RESTART_ENTRY_ALLOCATED if allocated
	__le32 target_attr;	// 0x04: Index into the Open attribute Table
	__le32 transfer_len;	// 0x08:
	__le32 lcns_follow;	// 0x0C:
	__le64 vcn;		// 0x10: Vcn of dirty page
	__le64 oldest_lsn;	// 0x18:
	__le64 page_lcns[];	// 0x20:
};

static_assert(sizeof(struct DIR_PAGE_ENTRY) == 0x20);

/* 32 bit version of 'struct DIR_PAGE_ENTRY' */
struct DIR_PAGE_ENTRY_32 {
	__le32 next;		// 0x00: RESTART_ENTRY_ALLOCATED if allocated
	__le32 target_attr;	// 0x04: Index into the Open attribute Table
	__le32 transfer_len;	// 0x08:
	__le32 lcns_follow;	// 0x0C:
	__le32 reserved;	// 0x10:
	__le32 vcn_low;		// 0x14: Vcn of dirty page
	__le32 vcn_hi;		// 0x18: Vcn of dirty page
	__le32 oldest_lsn_low;	// 0x1C:
	__le32 oldest_lsn_hi;	// 0x20:
	__le32 page_lcns_low;	// 0x24:
	__le32 page_lcns_hi;	// 0x28:
};

static_assert(offsetof(struct DIR_PAGE_ENTRY_32, vcn_low) == 0x14);
static_assert(sizeof(struct DIR_PAGE_ENTRY_32) == 0x2c);

/* Values of TRANSACTION_ENTRY::transact_state. */
enum transact_state {
	TransactionUninitialized = 0,
	TransactionActive,
	TransactionPrepared,
	TransactionCommitted
};

struct TRANSACTION_ENTRY {
	__le32 next;		// 0x00: RESTART_ENTRY_ALLOCATED if allocated
	u8 transact_state;	// 0x04:
	u8 reserved[3];		// 0x05:
	__le64 first_lsn;	// 0x08:
	__le64 prev_lsn;	// 0x10:
	__le64 undo_next_lsn;	// 0x18:
	__le32 undo_records;	// 0x20: Number of undo log records pending abort
	__le32 undo_len;	// 0x24: Total undo size
};

static_assert(sizeof(struct TRANSACTION_ENTRY) == 0x28);

/* Locations (lsn) and lengths of the four tables dumped with a restart record. */
struct NTFS_RESTART {
	__le32 major_ver;		// 0x00:
	__le32 minor_ver;		// 0x04:
	__le64 check_point_start;	// 0x08:
	__le64 open_attr_table_lsn;	// 0x10:
	__le64 attr_names_lsn;		// 0x18:
	__le64 dirty_pages_table_lsn;	// 0x20:
	__le64 transact_table_lsn;	// 0x28:
	__le32 open_attr_len;		// 0x30: In bytes
	__le32 attr_names_len;		// 0x34: In bytes
	__le32 dirty_pages_len;		// 0x38: In bytes
	__le32 transact_table_len;	// 0x3C: In bytes
};

static_assert(sizeof(struct NTFS_RESTART) == 0x40);

struct NEW_ATTRIBUTE_SIZES {
	__le64 alloc_size;
	__le64 valid_size;
	__le64 data_size;
	__le64 total_size;
};

struct BITMAP_RANGE {
	__le32 bitmap_off;
	__le32 bits;
};

struct LCN_RANGE {
	__le64 lcn;
	__le64 len;
};

/* The following type defines the different log record types. */
#define LfsClientRecord cpu_to_le32(1)
#define LfsClientRestart cpu_to_le32(2)

/* This is used to uniquely identify a client for a particular log file. */
struct CLIENT_ID {
	__le16 seq_num;
	__le16 client_idx;
};

/* This is the header that begins every Log Record in the log file. */
struct LFS_RECORD_HDR {
	__le64 this_lsn;		// 0x00:
	__le64 client_prev_lsn;		// 0x08:
	__le64 client_undo_next_lsn;	// 0x10:
	__le32 client_data_len;		// 0x18:
	struct CLIENT_ID client;	// 0x1C: Owner of this log record.
	__le32 record_type;		// 0x20: LfsClientRecord or LfsClientRestart.
	__le32 transact_id;		// 0x24:
	__le16 flags;			// 0x28: LOG_RECORD_MULTI_PAGE
	u8 align[6];			// 0x2A:
};

#define LOG_RECORD_MULTI_PAGE cpu_to_le16(1)

static_assert(sizeof(struct LFS_RECORD_HDR) == 0x30);

struct LFS_RECORD {
	__le16 next_record_off;	// 0x00: Offset of the free space in the page,
	u8 align[6];		// 0x02:
	__le64 last_end_lsn;	// 0x08: lsn for the last log record which ends on the page,
};

static_assert(sizeof(struct LFS_RECORD) == 0x10);

struct RECORD_PAGE_HDR {
	struct NTFS_RECORD_HEADER rhdr;	// 'RCRD'
	__le32 rflags;			// 0x10: See LOG_PAGE_LOG_RECORD_END
	__le16 page_count;		// 0x14:
	__le16 page_pos;		// 0x16:
	struct LFS_RECORD record_hdr;	// 0x18:
	__le16 fixups[10];		// 0x28:
	__le32 file_off;		// 0x3c: Used when major version >= 2
};

// clang-format on

// Page contains the end of a log record.
#define LOG_PAGE_LOG_RECORD_END cpu_to_le32(0x00000001)

/* Return true if LOG_PAGE_LOG_RECORD_END is set in the page's rflags. */
static inline bool is_log_record_end(const struct RECORD_PAGE_HDR *hdr)
{
	return hdr->rflags & LOG_PAGE_LOG_RECORD_END;
}

static_assert(offsetof(struct RECORD_PAGE_HDR, file_off) == 0x3c);

/*
 * END of NTFS LOG structures
 */

/* Define some tuning parameters to keep the restart tables a reasonable size.
*/
#define INITIAL_NUMBER_TRANSACTIONS 5

/*
 * Operation codes carried in LOG_REC_HDR::redo_op and LOG_REC_HDR::undo_op.
 * The numeric values are read from the log file, so they must not change.
 */
enum NTFS_LOG_OPERATION {
	Noop = 0x00,
	CompensationLogRecord = 0x01,
	InitializeFileRecordSegment = 0x02,
	DeallocateFileRecordSegment = 0x03,
	WriteEndOfFileRecordSegment = 0x04,
	CreateAttribute = 0x05,
	DeleteAttribute = 0x06,
	UpdateResidentValue = 0x07,
	UpdateNonresidentValue = 0x08,
	UpdateMappingPairs = 0x09,
	DeleteDirtyClusters = 0x0A,
	SetNewAttributeSizes = 0x0B,
	AddIndexEntryRoot = 0x0C,
	DeleteIndexEntryRoot = 0x0D,
	AddIndexEntryAllocation = 0x0E,
	DeleteIndexEntryAllocation = 0x0F,
	WriteEndOfIndexBuffer = 0x10,
	SetIndexEntryVcnRoot = 0x11,
	SetIndexEntryVcnAllocation = 0x12,
	UpdateFileNameRoot = 0x13,
	UpdateFileNameAllocation = 0x14,
	SetBitsInNonresidentBitMap = 0x15,
	ClearBitsInNonresidentBitMap = 0x16,
	HotFix = 0x17,
	EndTopLevelAction = 0x18,
	PrepareTransaction = 0x19,
	CommitTransaction = 0x1A,
	ForgetTransaction = 0x1B,
	OpenNonresidentAttribute = 0x1C,
	OpenAttributeTableDump = 0x1D,
	AttributeNamesDump = 0x1E,
	DirtyPageTableDump = 0x1F,
	TransactionTableDump = 0x20,
	UpdateRecordDataRoot = 0x21,
	UpdateRecordDataAllocation = 0x22,
	UpdateRelativeDataInIndex =
		0x23, // NtOfsRestartUpdateRelativeDataInIndex
	UpdateRelativeDataInIndex2 = 0x24,
	ZeroEndOfFileRecord = 0x25,
};

/*
 * Bitmap of log operations which require a target attribute.
 * Bit (op & 7) of byte (op >> 3) is set when the corresponding
 * restart operation requires a target attribute.
*/ static const u8 AttributeRequired[] = { 0xFC, 0xFB, 0xFF, 0x10, 0x06, }; static inline bool is_target_required(u16 op) { bool ret = op <= UpdateRecordDataAllocation && (AttributeRequired[op >> 3] >> (op & 7) & 1); return ret; } static inline bool can_skip_action(enum NTFS_LOG_OPERATION op) { switch (op) { case Noop: case DeleteDirtyClusters: case HotFix: case EndTopLevelAction: case PrepareTransaction: case CommitTransaction: case ForgetTransaction: case CompensationLogRecord: case OpenNonresidentAttribute: case OpenAttributeTableDump: case AttributeNamesDump: case DirtyPageTableDump: case TransactionTableDump: return true; default: return false; } } enum { lcb_ctx_undo_next, lcb_ctx_prev, lcb_ctx_next }; /* Bytes per restart table. */ static inline u32 bytes_per_rt(const struct RESTART_TABLE *rt) { return le16_to_cpu(rt->used) * le16_to_cpu(rt->size) + sizeof(struct RESTART_TABLE); } /* Log record length. */ static inline u32 lrh_length(const struct LOG_REC_HDR *lr) { u16 t16 = le16_to_cpu(lr->lcns_follow); return struct_size(lr, page_lcns, max_t(u16, 1, t16)); } struct lcb { struct LFS_RECORD_HDR *lrh; // Log record header of the current lsn. struct LOG_REC_HDR *log_rec; u32 ctx_mode; // lcb_ctx_undo_next/lcb_ctx_prev/lcb_ctx_next struct CLIENT_ID client; bool alloc; // If true the we should deallocate 'log_rec'. }; static void lcb_put(struct lcb *lcb) { if (lcb->alloc) kfree(lcb->log_rec); kfree(lcb->lrh); kfree(lcb); } /* Find the oldest lsn from active clients. */ static inline void oldest_client_lsn(const struct CLIENT_REC *ca, __le16 next_client, u64 *oldest_lsn) { while (next_client != LFS_NO_CLIENT_LE) { const struct CLIENT_REC *cr = ca + le16_to_cpu(next_client); u64 lsn = le64_to_cpu(cr->oldest_lsn); /* Ignore this block if it's oldest lsn is 0. 
*/ if (lsn && lsn < *oldest_lsn) *oldest_lsn = lsn; next_client = cr->next_client; } } static inline bool is_rst_page_hdr_valid(u32 file_off, const struct RESTART_HDR *rhdr) { u32 sys_page = le32_to_cpu(rhdr->sys_page_size); u32 page_size = le32_to_cpu(rhdr->page_size); u32 end_usa; u16 ro; if (sys_page < SECTOR_SIZE || page_size < SECTOR_SIZE || sys_page & (sys_page - 1) || page_size & (page_size - 1)) { return false; } /* Check that if the file offset isn't 0, it is the system page size. */ if (file_off && file_off != sys_page) return false; /* Check support version 1.1+. */ if (le16_to_cpu(rhdr->major_ver) <= 1 && !rhdr->minor_ver) return false; if (le16_to_cpu(rhdr->major_ver) > 2) return false; ro = le16_to_cpu(rhdr->ra_off); if (!IS_ALIGNED(ro, 8) || ro > sys_page) return false; end_usa = ((sys_page >> SECTOR_SHIFT) + 1) * sizeof(short); end_usa += le16_to_cpu(rhdr->rhdr.fix_off); if (ro < end_usa) return false; return true; } static inline bool is_rst_area_valid(const struct RESTART_HDR *rhdr) { const struct RESTART_AREA *ra; u16 cl, fl, ul; u32 off, l_size, seq_bits; u16 ro = le16_to_cpu(rhdr->ra_off); u32 sys_page = le32_to_cpu(rhdr->sys_page_size); if (ro + offsetof(struct RESTART_AREA, l_size) > SECTOR_SIZE - sizeof(short)) return false; ra = Add2Ptr(rhdr, ro); cl = le16_to_cpu(ra->log_clients); if (cl > 1) return false; off = le16_to_cpu(ra->client_off); if (!IS_ALIGNED(off, 8) || ro + off > SECTOR_SIZE - sizeof(short)) return false; off += cl * sizeof(struct CLIENT_REC); if (off > sys_page) return false; /* * Check the restart length field and whether the entire * restart area is contained that length. */ if (le16_to_cpu(rhdr->ra_off) + le16_to_cpu(ra->ra_len) > sys_page || off > le16_to_cpu(ra->ra_len)) { return false; } /* * As a final check make sure that the use list and the free list * are either empty or point to a valid client. 
*/ fl = le16_to_cpu(ra->client_idx[0]); ul = le16_to_cpu(ra->client_idx[1]); if ((fl != LFS_NO_CLIENT && fl >= cl) || (ul != LFS_NO_CLIENT && ul >= cl)) return false; /* Make sure the sequence number bits match the log file size. */ l_size = le64_to_cpu(ra->l_size); seq_bits = sizeof(u64) * 8 + 3; while (l_size) { l_size >>= 1; seq_bits -= 1; } if (seq_bits != le32_to_cpu(ra->seq_num_bits)) return false; /* The log page data offset and record header length must be quad-aligned. */ if (!IS_ALIGNED(le16_to_cpu(ra->data_off), 8) || !IS_ALIGNED(le16_to_cpu(ra->rec_hdr_len), 8)) return false; return true; } static inline bool is_client_area_valid(const struct RESTART_HDR *rhdr, bool usa_error) { u16 ro = le16_to_cpu(rhdr->ra_off); const struct RESTART_AREA *ra = Add2Ptr(rhdr, ro); u16 ra_len = le16_to_cpu(ra->ra_len); const struct CLIENT_REC *ca; u32 i; if (usa_error && ra_len + ro > SECTOR_SIZE - sizeof(short)) return false; /* Find the start of the client array. */ ca = Add2Ptr(ra, le16_to_cpu(ra->client_off)); /* * Start with the free list. * Check that all the clients are valid and that there isn't a cycle. * Do the in-use list on the second pass. */ for (i = 0; i < 2; i++) { u16 client_idx = le16_to_cpu(ra->client_idx[i]); bool first_client = true; u16 clients = le16_to_cpu(ra->log_clients); while (client_idx != LFS_NO_CLIENT) { const struct CLIENT_REC *cr; if (!clients || client_idx >= le16_to_cpu(ra->log_clients)) return false; clients -= 1; cr = ca + client_idx; client_idx = le16_to_cpu(cr->next_client); if (first_client) { first_client = false; if (cr->prev_client != LFS_NO_CLIENT_LE) return false; } } } return true; } /* * remove_client * * Remove a client record from a client record list an restart area. 
*/ static inline void remove_client(struct CLIENT_REC *ca, const struct CLIENT_REC *cr, __le16 *head) { if (cr->prev_client == LFS_NO_CLIENT_LE) *head = cr->next_client; else ca[le16_to_cpu(cr->prev_client)].next_client = cr->next_client; if (cr->next_client != LFS_NO_CLIENT_LE) ca[le16_to_cpu(cr->next_client)].prev_client = cr->prev_client; } /* * add_client - Add a client record to the start of a list. */ static inline void add_client(struct CLIENT_REC *ca, u16 index, __le16 *head) { struct CLIENT_REC *cr = ca + index; cr->prev_client = LFS_NO_CLIENT_LE; cr->next_client = *head; if (*head != LFS_NO_CLIENT_LE) ca[le16_to_cpu(*head)].prev_client = cpu_to_le16(index); *head = cpu_to_le16(index); } /* * Enumerate restart table. * * @t - table to enumerate. * @c - current enumerated element. * * enumeration starts with @c == NULL * returns next element or NULL */ static inline void *enum_rstbl(struct RESTART_TABLE *t, void *c) { __le32 *e; u32 bprt; u16 rsize; if (!t) return NULL; rsize = le16_to_cpu(t->size); if (!c) { /* start enumeration. */ if (!t->total) return NULL; e = Add2Ptr(t, sizeof(struct RESTART_TABLE)); } else { e = Add2Ptr(c, rsize); } /* Loop until we hit the first one allocated, or the end of the list. */ for (bprt = bytes_per_rt(t); PtrOffset(t, e) < bprt; e = Add2Ptr(e, rsize)) { if (*e == RESTART_ENTRY_ALLOCATED_LE) return e; } return NULL; } /* * find_dp - Search for a @vcn in Dirty Page Table. */ static inline struct DIR_PAGE_ENTRY *find_dp(struct RESTART_TABLE *dptbl, u32 target_attr, u64 vcn) { __le32 ta = cpu_to_le32(target_attr); struct DIR_PAGE_ENTRY *dp = NULL; while ((dp = enum_rstbl(dptbl, dp))) { u64 dp_vcn = le64_to_cpu(dp->vcn); if (dp->target_attr == ta && vcn >= dp_vcn && vcn < dp_vcn + le32_to_cpu(dp->lcns_follow)) { return dp; } } return NULL; } static inline u32 norm_file_page(u32 page_size, u32 *l_size, bool use_default) { if (use_default) page_size = DefaultLogPageSize; /* Round the file size down to a system page boundary. 
*/ *l_size &= ~(page_size - 1); /* File should contain at least 2 restart pages and MinLogRecordPages pages. */ if (*l_size < (MinLogRecordPages + 2) * page_size) return 0; return page_size; } static bool check_log_rec(const struct LOG_REC_HDR *lr, u32 bytes, u32 tr, u32 bytes_per_attr_entry) { u16 t16; if (bytes < sizeof(struct LOG_REC_HDR)) return false; if (!tr) return false; if ((tr - sizeof(struct RESTART_TABLE)) % sizeof(struct TRANSACTION_ENTRY)) return false; if (le16_to_cpu(lr->redo_off) & 7) return false; if (le16_to_cpu(lr->undo_off) & 7) return false; if (lr->target_attr) goto check_lcns; if (is_target_required(le16_to_cpu(lr->redo_op))) return false; if (is_target_required(le16_to_cpu(lr->undo_op))) return false; check_lcns: if (!lr->lcns_follow) goto check_length; t16 = le16_to_cpu(lr->target_attr); if ((t16 - sizeof(struct RESTART_TABLE)) % bytes_per_attr_entry) return false; check_length: if (bytes < lrh_length(lr)) return false; return true; } static bool check_rstbl(const struct RESTART_TABLE *rt, size_t bytes) { u32 ts; u32 i, off; u16 rsize = le16_to_cpu(rt->size); u16 ne = le16_to_cpu(rt->used); u32 ff = le32_to_cpu(rt->first_free); u32 lf = le32_to_cpu(rt->last_free); ts = rsize * ne + sizeof(struct RESTART_TABLE); if (!rsize || rsize > bytes || rsize + sizeof(struct RESTART_TABLE) > bytes || bytes < ts || le16_to_cpu(rt->total) > ne || ff > ts - sizeof(__le32) || lf > ts - sizeof(__le32) || (ff && ff < sizeof(struct RESTART_TABLE)) || (lf && lf < sizeof(struct RESTART_TABLE))) { return false; } /* * Verify each entry is either allocated or points * to a valid offset the table. 
*/ for (i = 0; i < ne; i++) { off = le32_to_cpu(*(__le32 *)Add2Ptr( rt, i * rsize + sizeof(struct RESTART_TABLE))); if (off != RESTART_ENTRY_ALLOCATED && off && (off < sizeof(struct RESTART_TABLE) || ((off - sizeof(struct RESTART_TABLE)) % rsize))) { return false; } } /* * Walk through the list headed by the first entry to make * sure none of the entries are currently being used. */ for (off = ff; off;) { if (off == RESTART_ENTRY_ALLOCATED) return false; off = le32_to_cpu(*(__le32 *)Add2Ptr(rt, off)); if (off > ts - sizeof(__le32)) return false; } return true; } /* * free_rsttbl_idx - Free a previously allocated index a Restart Table. */ static inline void free_rsttbl_idx(struct RESTART_TABLE *rt, u32 off) { __le32 *e; u32 lf = le32_to_cpu(rt->last_free); __le32 off_le = cpu_to_le32(off); e = Add2Ptr(rt, off); if (off < le32_to_cpu(rt->free_goal)) { *e = rt->first_free; rt->first_free = off_le; if (!lf) rt->last_free = off_le; } else { if (lf) *(__le32 *)Add2Ptr(rt, lf) = off_le; else rt->first_free = off_le; rt->last_free = off_le; *e = 0; } le16_sub_cpu(&rt->total, 1); } static inline struct RESTART_TABLE *init_rsttbl(u16 esize, u16 used) { __le32 *e, *last_free; u32 off; u32 bytes = esize * used + sizeof(struct RESTART_TABLE); u32 lf = sizeof(struct RESTART_TABLE) + (used - 1) * esize; struct RESTART_TABLE *t = kzalloc(bytes, GFP_NOFS); if (!t) return NULL; t->size = cpu_to_le16(esize); t->used = cpu_to_le16(used); t->free_goal = cpu_to_le32(~0u); t->first_free = cpu_to_le32(sizeof(struct RESTART_TABLE)); t->last_free = cpu_to_le32(lf); e = (__le32 *)(t + 1); last_free = Add2Ptr(t, lf); for (off = sizeof(struct RESTART_TABLE) + esize; e < last_free; e = Add2Ptr(e, esize), off += esize) { *e = cpu_to_le32(off); } return t; } static inline struct RESTART_TABLE *extend_rsttbl(struct RESTART_TABLE *tbl, u32 add, u32 free_goal) { u16 esize = le16_to_cpu(tbl->size); __le32 osize = cpu_to_le32(bytes_per_rt(tbl)); u32 used = le16_to_cpu(tbl->used); struct RESTART_TABLE 
*rt; rt = init_rsttbl(esize, used + add); if (!rt) return NULL; memcpy(rt + 1, tbl + 1, esize * used); rt->free_goal = free_goal == ~0u ? cpu_to_le32(~0u) : cpu_to_le32(sizeof(struct RESTART_TABLE) + free_goal * esize); if (tbl->first_free) { rt->first_free = tbl->first_free; *(__le32 *)Add2Ptr(rt, le32_to_cpu(tbl->last_free)) = osize; } else { rt->first_free = osize; } rt->total = tbl->total; kfree(tbl); return rt; } /* * alloc_rsttbl_idx * * Allocate an index from within a previously initialized Restart Table. */ static inline void *alloc_rsttbl_idx(struct RESTART_TABLE **tbl) { u32 off; __le32 *e; struct RESTART_TABLE *t = *tbl; if (!t->first_free) { *tbl = t = extend_rsttbl(t, 16, ~0u); if (!t) return NULL; } off = le32_to_cpu(t->first_free); /* Dequeue this entry and zero it. */ e = Add2Ptr(t, off); t->first_free = *e; memset(e, 0, le16_to_cpu(t->size)); *e = RESTART_ENTRY_ALLOCATED_LE; /* If list is going empty, then we fix the last_free as well. */ if (!t->first_free) t->last_free = 0; le16_add_cpu(&t->total, 1); return Add2Ptr(t, off); } /* * alloc_rsttbl_from_idx * * Allocate a specific index from within a previously initialized Restart Table. */ static inline void *alloc_rsttbl_from_idx(struct RESTART_TABLE **tbl, u32 vbo) { u32 off; __le32 *e; struct RESTART_TABLE *rt = *tbl; u32 bytes = bytes_per_rt(rt); u16 esize = le16_to_cpu(rt->size); /* If the entry is not the table, we will have to extend the table. */ if (vbo >= bytes) { /* * Extend the size by computing the number of entries between * the existing size and the desired index and adding 1 to that. */ u32 bytes2idx = vbo - bytes; /* * There should always be an integral number of entries * being added. Now extend the table. */ *tbl = rt = extend_rsttbl(rt, bytes2idx / esize + 1, bytes); if (!rt) return NULL; } /* See if the entry is already allocated, and just return if it is. 
*/ e = Add2Ptr(rt, vbo); if (*e == RESTART_ENTRY_ALLOCATED_LE) return e; /* * Walk through the table, looking for the entry we're * interested and the previous entry. */ off = le32_to_cpu(rt->first_free); e = Add2Ptr(rt, off); if (off == vbo) { /* this is a match */ rt->first_free = *e; goto skip_looking; } /* * Need to walk through the list looking for the predecessor * of our entry. */ for (;;) { /* Remember the entry just found */ u32 last_off = off; __le32 *last_e = e; /* Should never run of entries. */ /* Lookup up the next entry the list. */ off = le32_to_cpu(*last_e); e = Add2Ptr(rt, off); /* If this is our match we are done. */ if (off == vbo) { *last_e = *e; /* * If this was the last entry, we update that * table as well. */ if (le32_to_cpu(rt->last_free) == off) rt->last_free = cpu_to_le32(last_off); break; } } skip_looking: /* If the list is now empty, we fix the last_free as well. */ if (!rt->first_free) rt->last_free = 0; /* Zero this entry. */ memset(e, 0, esize); *e = RESTART_ENTRY_ALLOCATED_LE; le16_add_cpu(&rt->total, 1); return e; } struct restart_info { u64 last_lsn; struct RESTART_HDR *r_page; u32 vbo; bool chkdsk_was_run; bool valid_page; bool initialized; bool restart; }; #define RESTART_SINGLE_PAGE_IO cpu_to_le16(0x0001) #define NTFSLOG_WRAPPED 0x00000001 #define NTFSLOG_MULTIPLE_PAGE_IO 0x00000002 #define NTFSLOG_NO_LAST_LSN 0x00000004 #define NTFSLOG_REUSE_TAIL 0x00000010 #define NTFSLOG_NO_OLDEST_LSN 0x00000020 /* Helper struct to work with NTFS $LogFile. 
*/ struct ntfs_log { struct ntfs_inode *ni; u32 l_size; u32 orig_file_size; u32 sys_page_size; u32 sys_page_mask; u32 page_size; u32 page_mask; // page_size - 1 u8 page_bits; struct RECORD_PAGE_HDR *one_page_buf; struct RESTART_TABLE *open_attr_tbl; u32 transaction_id; u32 clst_per_page; u32 first_page; u32 next_page; u32 ra_off; u32 data_off; u32 restart_size; u32 data_size; u16 record_header_len; u64 seq_num; u32 seq_num_bits; u32 file_data_bits; u32 seq_num_mask; /* (1 << file_data_bits) - 1 */ struct RESTART_AREA *ra; /* In-memory image of the next restart area. */ u32 ra_size; /* The usable size of the restart area. */ /* * If true, then the in-memory restart area is to be written * to the first position on the disk. */ bool init_ra; bool set_dirty; /* True if we need to set dirty flag. */ u64 oldest_lsn; u32 oldest_lsn_off; u64 last_lsn; u32 total_avail; u32 total_avail_pages; u32 total_undo_commit; u32 max_current_avail; u32 current_avail; u32 reserved; short major_ver; short minor_ver; u32 l_flags; /* See NTFSLOG_XXX */ u32 current_openlog_count; /* On-disk value for open_log_count. */ struct CLIENT_ID client_id; u32 client_undo_commit; struct restart_info rst_info, rst_info2; }; static inline u32 lsn_to_vbo(struct ntfs_log *log, const u64 lsn) { u32 vbo = (lsn << log->seq_num_bits) >> (log->seq_num_bits - 3); return vbo; } /* Compute the offset in the log file of the next log page. */ static inline u32 next_page_off(struct ntfs_log *log, u32 off) { off = (off & ~log->sys_page_mask) + log->page_size; return off >= log->l_size ? 
log->first_page : off; } static inline u32 lsn_to_page_off(struct ntfs_log *log, u64 lsn) { return (((u32)lsn) << 3) & log->page_mask; } static inline u64 vbo_to_lsn(struct ntfs_log *log, u32 off, u64 Seq) { return (off >> 3) + (Seq << log->file_data_bits); } static inline bool is_lsn_in_file(struct ntfs_log *log, u64 lsn) { return lsn >= log->oldest_lsn && lsn <= le64_to_cpu(log->ra->current_lsn); } static inline u32 hdr_file_off(struct ntfs_log *log, struct RECORD_PAGE_HDR *hdr) { if (log->major_ver < 2) return le64_to_cpu(hdr->rhdr.lsn); return le32_to_cpu(hdr->file_off); } static inline u64 base_lsn(struct ntfs_log *log, const struct RECORD_PAGE_HDR *hdr, u64 lsn) { u64 h_lsn = le64_to_cpu(hdr->rhdr.lsn); u64 ret = (((h_lsn >> log->file_data_bits) + (lsn < (lsn_to_vbo(log, h_lsn) & ~log->page_mask) ? 1 : 0)) << log->file_data_bits) + ((((is_log_record_end(hdr) && h_lsn <= le64_to_cpu(hdr->record_hdr.last_end_lsn)) ? le16_to_cpu(hdr->record_hdr.next_record_off) : log->page_size) + lsn) >> 3); return ret; } static inline bool verify_client_lsn(struct ntfs_log *log, const struct CLIENT_REC *client, u64 lsn) { return lsn >= le64_to_cpu(client->oldest_lsn) && lsn <= le64_to_cpu(log->ra->current_lsn) && lsn; } static int read_log_page(struct ntfs_log *log, u32 vbo, struct RECORD_PAGE_HDR **buffer, bool *usa_error) { int err = 0; u32 page_idx = vbo >> log->page_bits; u32 page_off = vbo & log->page_mask; u32 bytes = log->page_size - page_off; void *to_free = NULL; u32 page_vbo = page_idx << log->page_bits; struct RECORD_PAGE_HDR *page_buf; struct ntfs_inode *ni = log->ni; bool bBAAD; if (vbo >= log->l_size) return -EINVAL; if (!*buffer) { to_free = kmalloc(log->page_size, GFP_NOFS); if (!to_free) return -ENOMEM; *buffer = to_free; } page_buf = page_off ? 
log->one_page_buf : *buffer; err = ntfs_read_run_nb(ni->mi.sbi, &ni->file.run, page_vbo, page_buf, log->page_size, NULL); if (err) goto out; if (page_buf->rhdr.sign != NTFS_FFFF_SIGNATURE) ntfs_fix_post_read(&page_buf->rhdr, PAGE_SIZE, false); if (page_buf != *buffer) memcpy(*buffer, Add2Ptr(page_buf, page_off), bytes); bBAAD = page_buf->rhdr.sign == NTFS_BAAD_SIGNATURE; if (usa_error) *usa_error = bBAAD; /* Check that the update sequence array for this page is valid */ /* If we don't allow errors, raise an error status */ else if (bBAAD) err = -EINVAL; out: if (err && to_free) { kfree(to_free); *buffer = NULL; } return err; } /* * log_read_rst * * It walks through 512 blocks of the file looking for a valid * restart page header. It will stop the first time we find a * valid page header. */ static int log_read_rst(struct ntfs_log *log, bool first, struct restart_info *info) { u32 skip; u64 vbo; struct RESTART_HDR *r_page = NULL; /* Determine which restart area we are looking for. */ if (first) { vbo = 0; skip = 512; } else { vbo = 512; skip = 0; } /* Loop continuously until we succeed. */ for (; vbo < log->l_size; vbo = 2 * vbo + skip, skip = 0) { bool usa_error; bool brst, bchk; struct RESTART_AREA *ra; /* Read a page header at the current offset. */ if (read_log_page(log, vbo, (struct RECORD_PAGE_HDR **)&r_page, &usa_error)) { /* Ignore any errors. */ continue; } /* Exit if the signature is a log record page. */ if (r_page->rhdr.sign == NTFS_RCRD_SIGNATURE) { info->initialized = true; break; } brst = r_page->rhdr.sign == NTFS_RSTR_SIGNATURE; bchk = r_page->rhdr.sign == NTFS_CHKD_SIGNATURE; if (!bchk && !brst) { if (r_page->rhdr.sign != NTFS_FFFF_SIGNATURE) { /* * Remember if the signature does not * indicate uninitialized file. */ info->initialized = true; } continue; } ra = NULL; info->valid_page = false; info->initialized = true; info->vbo = vbo; /* Let's check the restart area if this is a valid page. 
*/ if (!is_rst_page_hdr_valid(vbo, r_page)) goto check_result; ra = Add2Ptr(r_page, le16_to_cpu(r_page->ra_off)); if (!is_rst_area_valid(r_page)) goto check_result; /* * We have a valid restart page header and restart area. * If chkdsk was run or we have no clients then we have * no more checking to do. */ if (bchk || ra->client_idx[1] == LFS_NO_CLIENT_LE) { info->valid_page = true; goto check_result; } if (is_client_area_valid(r_page, usa_error)) { info->valid_page = true; ra = Add2Ptr(r_page, le16_to_cpu(r_page->ra_off)); } check_result: /* * If chkdsk was run then update the caller's * values and return. */ if (r_page->rhdr.sign == NTFS_CHKD_SIGNATURE) { info->chkdsk_was_run = true; info->last_lsn = le64_to_cpu(r_page->rhdr.lsn); info->restart = true; info->r_page = r_page; return 0; } /* * If we have a valid page then copy the values * we need from it. */ if (info->valid_page) { info->last_lsn = le64_to_cpu(ra->current_lsn); info->restart = true; info->r_page = r_page; return 0; } } kfree(r_page); return 0; } /* * Ilog_init_pg_hdr - Init @log from restart page header. */ static void log_init_pg_hdr(struct ntfs_log *log, u16 major_ver, u16 minor_ver) { log->sys_page_size = log->page_size; log->sys_page_mask = log->page_mask; log->clst_per_page = log->page_size >> log->ni->mi.sbi->cluster_bits; if (!log->clst_per_page) log->clst_per_page = 1; log->first_page = major_ver >= 2 ? 0x22 * log->page_size : 4 * log->page_size; log->major_ver = major_ver; log->minor_ver = minor_ver; } /* * log_create - Init @log in cases when we don't have a restart area to use. */ static void log_create(struct ntfs_log *log, const u64 last_lsn, u32 open_log_count, bool wrapped, bool use_multi_page) { /* All file offsets must be quadword aligned. 
*/ log->file_data_bits = blksize_bits(log->l_size) - 3; log->seq_num_mask = (8 << log->file_data_bits) - 1; log->seq_num_bits = sizeof(u64) * 8 - log->file_data_bits; log->seq_num = (last_lsn >> log->file_data_bits) + 2; log->next_page = log->first_page; log->oldest_lsn = log->seq_num << log->file_data_bits; log->oldest_lsn_off = 0; log->last_lsn = log->oldest_lsn; log->l_flags |= NTFSLOG_NO_LAST_LSN | NTFSLOG_NO_OLDEST_LSN; /* Set the correct flags for the I/O and indicate if we have wrapped. */ if (wrapped) log->l_flags |= NTFSLOG_WRAPPED; if (use_multi_page) log->l_flags |= NTFSLOG_MULTIPLE_PAGE_IO; /* Compute the log page values. */ log->data_off = ALIGN( offsetof(struct RECORD_PAGE_HDR, fixups) + sizeof(short) * ((log->page_size >> SECTOR_SHIFT) + 1), 8); log->data_size = log->page_size - log->data_off; log->record_header_len = sizeof(struct LFS_RECORD_HDR); /* Remember the different page sizes for reservation. */ log->reserved = log->data_size - log->record_header_len; /* Compute the restart page values. */ log->ra_off = ALIGN( offsetof(struct RESTART_HDR, fixups) + sizeof(short) * ((log->sys_page_size >> SECTOR_SHIFT) + 1), 8); log->restart_size = log->sys_page_size - log->ra_off; log->ra_size = struct_size(log->ra, clients, 1); log->current_openlog_count = open_log_count; /* * The total available log file space is the number of * log file pages times the space available on each page. */ log->total_avail_pages = log->l_size - log->first_page; log->total_avail = log->total_avail_pages >> log->page_bits; /* * We assume that we can't use the end of the page less than * the file record size. * Then we won't need to reserve more than the caller asks for. */ log->max_current_avail = log->total_avail * log->reserved; log->total_avail = log->total_avail * log->data_size; log->current_avail = log->max_current_avail; } /* * log_create_ra - Fill a restart area from the values stored in @log. 
*/ static struct RESTART_AREA *log_create_ra(struct ntfs_log *log) { struct CLIENT_REC *cr; struct RESTART_AREA *ra = kzalloc(log->restart_size, GFP_NOFS); if (!ra) return NULL; ra->current_lsn = cpu_to_le64(log->last_lsn); ra->log_clients = cpu_to_le16(1); ra->client_idx[1] = LFS_NO_CLIENT_LE; if (log->l_flags & NTFSLOG_MULTIPLE_PAGE_IO) ra->flags = RESTART_SINGLE_PAGE_IO; ra->seq_num_bits = cpu_to_le32(log->seq_num_bits); ra->ra_len = cpu_to_le16(log->ra_size); ra->client_off = cpu_to_le16(offsetof(struct RESTART_AREA, clients)); ra->l_size = cpu_to_le64(log->l_size); ra->rec_hdr_len = cpu_to_le16(log->record_header_len); ra->data_off = cpu_to_le16(log->data_off); ra->open_log_count = cpu_to_le32(log->current_openlog_count + 1); cr = ra->clients; cr->prev_client = LFS_NO_CLIENT_LE; cr->next_client = LFS_NO_CLIENT_LE; return ra; } static u32 final_log_off(struct ntfs_log *log, u64 lsn, u32 data_len) { u32 base_vbo = lsn << 3; u32 final_log_off = (base_vbo & log->seq_num_mask) & ~log->page_mask; u32 page_off = base_vbo & log->page_mask; u32 tail = log->page_size - page_off; page_off -= 1; /* Add the length of the header. */ data_len += log->record_header_len; /* * If this lsn is contained this log page we are done. * Otherwise we need to walk through several log pages. */ if (data_len > tail) { data_len -= tail; tail = log->data_size; page_off = log->data_off - 1; for (;;) { final_log_off = next_page_off(log, final_log_off); /* * We are done if the remaining bytes * fit on this page. */ if (data_len <= tail) break; data_len -= tail; } } /* * We add the remaining bytes to our starting position on this page * and then add that value to the file offset of this log page. 
*/ return final_log_off + data_len + page_off; } static int next_log_lsn(struct ntfs_log *log, const struct LFS_RECORD_HDR *rh, u64 *lsn) { int err; u64 this_lsn = le64_to_cpu(rh->this_lsn); u32 vbo = lsn_to_vbo(log, this_lsn); u32 end = final_log_off(log, this_lsn, le32_to_cpu(rh->client_data_len)); u32 hdr_off = end & ~log->sys_page_mask; u64 seq = this_lsn >> log->file_data_bits; struct RECORD_PAGE_HDR *page = NULL; /* Remember if we wrapped. */ if (end <= vbo) seq += 1; /* Log page header for this page. */ err = read_log_page(log, hdr_off, &page, NULL); if (err) return err; /* * If the lsn we were given was not the last lsn on this page, * then the starting offset for the next lsn is on a quad word * boundary following the last file offset for the current lsn. * Otherwise the file offset is the start of the data on the next page. */ if (this_lsn == le64_to_cpu(page->rhdr.lsn)) { /* If we wrapped, we need to increment the sequence number. */ hdr_off = next_page_off(log, hdr_off); if (hdr_off == log->first_page) seq += 1; vbo = hdr_off + log->data_off; } else { vbo = ALIGN(end, 8); } /* Compute the lsn based on the file offset and the sequence count. */ *lsn = vbo_to_lsn(log, vbo, seq); /* * If this lsn is within the legal range for the file, we return true. * Otherwise false indicates that there are no more lsn's. */ if (!is_lsn_in_file(log, *lsn)) *lsn = 0; kfree(page); return 0; } /* * current_log_avail - Calculate the number of bytes available for log records. */ static u32 current_log_avail(struct ntfs_log *log) { u32 oldest_off, next_free_off, free_bytes; if (log->l_flags & NTFSLOG_NO_LAST_LSN) { /* The entire file is available. */ return log->max_current_avail; } /* * If there is a last lsn the restart area then we know that we will * have to compute the free range. * If there is no oldest lsn then start at the first page of the file. */ oldest_off = (log->l_flags & NTFSLOG_NO_OLDEST_LSN) ? 
log->first_page : (log->oldest_lsn_off & ~log->sys_page_mask); /* * We will use the next log page offset to compute the next free page. * If we are going to reuse this page go to the next page. * If we are at the first page then use the end of the file. */ next_free_off = (log->l_flags & NTFSLOG_REUSE_TAIL) ? log->next_page + log->page_size : log->next_page == log->first_page ? log->l_size : log->next_page; /* If the two offsets are the same then there is no available space. */ if (oldest_off == next_free_off) return 0; /* * If the free offset follows the oldest offset then subtract * this range from the total available pages. */ free_bytes = oldest_off < next_free_off ? log->total_avail_pages - (next_free_off - oldest_off) : oldest_off - next_free_off; free_bytes >>= log->page_bits; return free_bytes * log->reserved; } static bool check_subseq_log_page(struct ntfs_log *log, const struct RECORD_PAGE_HDR *rp, u32 vbo, u64 seq) { u64 lsn_seq; const struct NTFS_RECORD_HEADER *rhdr = &rp->rhdr; u64 lsn = le64_to_cpu(rhdr->lsn); if (rhdr->sign == NTFS_FFFF_SIGNATURE || !rhdr->sign) return false; /* * If the last lsn on the page occurs was written after the page * that caused the original error then we have a fatal error. */ lsn_seq = lsn >> log->file_data_bits; /* * If the sequence number for the lsn the page is equal or greater * than lsn we expect, then this is a subsequent write. */ return lsn_seq >= seq || (lsn_seq == seq - 1 && log->first_page == vbo && vbo != (lsn_to_vbo(log, lsn) & ~log->page_mask)); } /* * last_log_lsn * * Walks through the log pages for a file, searching for the * last log page written to the file. 
*/ static int last_log_lsn(struct ntfs_log *log) { int err; bool usa_error = false; bool replace_page = false; bool reuse_page = log->l_flags & NTFSLOG_REUSE_TAIL; bool wrapped_file, wrapped; u32 page_cnt = 1, page_pos = 1; u32 page_off = 0, page_off1 = 0, saved_off = 0; u32 final_off, second_off, final_off_prev = 0, second_off_prev = 0; u32 first_file_off = 0, second_file_off = 0; u32 part_io_count = 0; u32 tails = 0; u32 this_off, curpage_off, nextpage_off, remain_pages; u64 expected_seq, seq_base = 0, lsn_base = 0; u64 best_lsn, best_lsn1, best_lsn2; u64 lsn_cur, lsn1, lsn2; u64 last_ok_lsn = reuse_page ? log->last_lsn : 0; u16 cur_pos, best_page_pos; struct RECORD_PAGE_HDR *page = NULL; struct RECORD_PAGE_HDR *tst_page = NULL; struct RECORD_PAGE_HDR *first_tail = NULL; struct RECORD_PAGE_HDR *second_tail = NULL; struct RECORD_PAGE_HDR *tail_page = NULL; struct RECORD_PAGE_HDR *second_tail_prev = NULL; struct RECORD_PAGE_HDR *first_tail_prev = NULL; struct RECORD_PAGE_HDR *page_bufs = NULL; struct RECORD_PAGE_HDR *best_page; if (log->major_ver >= 2) { final_off = 0x02 * log->page_size; second_off = 0x12 * log->page_size; // 0x10 == 0x12 - 0x2 page_bufs = kmalloc(log->page_size * 0x10, GFP_NOFS); if (!page_bufs) return -ENOMEM; } else { second_off = log->first_page - log->page_size; final_off = second_off - log->page_size; } next_tail: /* Read second tail page (at pos 3/0x12000). */ if (read_log_page(log, second_off, &second_tail, &usa_error) || usa_error || second_tail->rhdr.sign != NTFS_RCRD_SIGNATURE) { kfree(second_tail); second_tail = NULL; second_file_off = 0; lsn2 = 0; } else { second_file_off = hdr_file_off(log, second_tail); lsn2 = le64_to_cpu(second_tail->record_hdr.last_end_lsn); } /* Read first tail page (at pos 2/0x2000). 
*/ if (read_log_page(log, final_off, &first_tail, &usa_error) || usa_error || first_tail->rhdr.sign != NTFS_RCRD_SIGNATURE) { kfree(first_tail); first_tail = NULL; first_file_off = 0; lsn1 = 0; } else { first_file_off = hdr_file_off(log, first_tail); lsn1 = le64_to_cpu(first_tail->record_hdr.last_end_lsn); } if (log->major_ver < 2) { int best_page; first_tail_prev = first_tail; final_off_prev = first_file_off; second_tail_prev = second_tail; second_off_prev = second_file_off; tails = 1; if (!first_tail && !second_tail) goto tail_read; if (first_tail && second_tail) best_page = lsn1 < lsn2 ? 1 : 0; else if (first_tail) best_page = 0; else best_page = 1; page_off = best_page ? second_file_off : first_file_off; seq_base = (best_page ? lsn2 : lsn1) >> log->file_data_bits; goto tail_read; } best_lsn1 = first_tail ? base_lsn(log, first_tail, first_file_off) : 0; best_lsn2 = second_tail ? base_lsn(log, second_tail, second_file_off) : 0; if (first_tail && second_tail) { if (best_lsn1 > best_lsn2) { best_lsn = best_lsn1; best_page = first_tail; this_off = first_file_off; } else { best_lsn = best_lsn2; best_page = second_tail; this_off = second_file_off; } } else if (first_tail) { best_lsn = best_lsn1; best_page = first_tail; this_off = first_file_off; } else if (second_tail) { best_lsn = best_lsn2; best_page = second_tail; this_off = second_file_off; } else { goto tail_read; } best_page_pos = le16_to_cpu(best_page->page_pos); if (!tails) { if (best_page_pos == page_pos) { seq_base = best_lsn >> log->file_data_bits; saved_off = page_off = le32_to_cpu(best_page->file_off); lsn_base = best_lsn; memmove(page_bufs, best_page, log->page_size); page_cnt = le16_to_cpu(best_page->page_count); if (page_cnt > 1) page_pos += 1; tails = 1; } } else if (seq_base == (best_lsn >> log->file_data_bits) && saved_off + log->page_size == this_off && lsn_base < best_lsn && (page_pos != page_cnt || best_page_pos == page_pos || best_page_pos == 1) && (page_pos >= page_cnt || best_page_pos == 
page_pos)) { u16 bppc = le16_to_cpu(best_page->page_count); saved_off += log->page_size; lsn_base = best_lsn; memmove(Add2Ptr(page_bufs, tails * log->page_size), best_page, log->page_size); tails += 1; if (best_page_pos != bppc) { page_cnt = bppc; page_pos = best_page_pos; if (page_cnt > 1) page_pos += 1; } else { page_pos = page_cnt = 1; } } else { kfree(first_tail); kfree(second_tail); goto tail_read; } kfree(first_tail_prev); first_tail_prev = first_tail; final_off_prev = first_file_off; first_tail = NULL; kfree(second_tail_prev); second_tail_prev = second_tail; second_off_prev = second_file_off; second_tail = NULL; final_off += log->page_size; second_off += log->page_size; if (tails < 0x10) goto next_tail; tail_read: first_tail = first_tail_prev; final_off = final_off_prev; second_tail = second_tail_prev; second_off = second_off_prev; page_cnt = page_pos = 1; curpage_off = seq_base == log->seq_num ? min(log->next_page, page_off) : log->next_page; wrapped_file = curpage_off == log->first_page && !(log->l_flags & (NTFSLOG_NO_LAST_LSN | NTFSLOG_REUSE_TAIL)); expected_seq = wrapped_file ? (log->seq_num + 1) : log->seq_num; nextpage_off = curpage_off; next_page: tail_page = NULL; /* Read the next log page. */ err = read_log_page(log, curpage_off, &page, &usa_error); /* Compute the next log page offset the file. 
*/ nextpage_off = next_page_off(log, curpage_off); wrapped = nextpage_off == log->first_page; if (tails > 1) { struct RECORD_PAGE_HDR *cur_page = Add2Ptr(page_bufs, curpage_off - page_off); if (curpage_off == saved_off) { tail_page = cur_page; goto use_tail_page; } if (page_off > curpage_off || curpage_off >= saved_off) goto use_tail_page; if (page_off1) goto use_cur_page; if (!err && !usa_error && page->rhdr.sign == NTFS_RCRD_SIGNATURE && cur_page->rhdr.lsn == page->rhdr.lsn && cur_page->record_hdr.next_record_off == page->record_hdr.next_record_off && ((page_pos == page_cnt && le16_to_cpu(page->page_pos) == 1) || (page_pos != page_cnt && le16_to_cpu(page->page_pos) == page_pos + 1 && le16_to_cpu(page->page_count) == page_cnt))) { cur_page = NULL; goto use_tail_page; } page_off1 = page_off; use_cur_page: lsn_cur = le64_to_cpu(cur_page->rhdr.lsn); if (last_ok_lsn != le64_to_cpu(cur_page->record_hdr.last_end_lsn) && ((lsn_cur >> log->file_data_bits) + ((curpage_off < (lsn_to_vbo(log, lsn_cur) & ~log->page_mask)) ? 1 : 0)) != expected_seq) { goto check_tail; } if (!is_log_record_end(cur_page)) { tail_page = NULL; last_ok_lsn = lsn_cur; goto next_page_1; } log->seq_num = expected_seq; log->l_flags &= ~NTFSLOG_NO_LAST_LSN; log->last_lsn = le64_to_cpu(cur_page->record_hdr.last_end_lsn); log->ra->current_lsn = cur_page->record_hdr.last_end_lsn; if (log->record_header_len <= log->page_size - le16_to_cpu(cur_page->record_hdr.next_record_off)) { log->l_flags |= NTFSLOG_REUSE_TAIL; log->next_page = curpage_off; } else { log->l_flags &= ~NTFSLOG_REUSE_TAIL; log->next_page = nextpage_off; } if (wrapped_file) log->l_flags |= NTFSLOG_WRAPPED; last_ok_lsn = le64_to_cpu(cur_page->record_hdr.last_end_lsn); goto next_page_1; } /* * If we are at the expected first page of a transfer check to see * if either tail copy is at this offset. * If this page is the last page of a transfer, check if we wrote * a subsequent tail copy. 
*/ if (page_cnt == page_pos || page_cnt == page_pos + 1) { /* * Check if the offset matches either the first or second * tail copy. It is possible it will match both. */ if (curpage_off == final_off) tail_page = first_tail; /* * If we already matched on the first page then * check the ending lsn's. */ if (curpage_off == second_off) { if (!tail_page || (second_tail && le64_to_cpu(second_tail->record_hdr.last_end_lsn) > le64_to_cpu(first_tail->record_hdr .last_end_lsn))) { tail_page = second_tail; } } } use_tail_page: if (tail_page) { /* We have a candidate for a tail copy. */ lsn_cur = le64_to_cpu(tail_page->record_hdr.last_end_lsn); if (last_ok_lsn < lsn_cur) { /* * If the sequence number is not expected, * then don't use the tail copy. */ if (expected_seq != (lsn_cur >> log->file_data_bits)) tail_page = NULL; } else if (last_ok_lsn > lsn_cur) { /* * If the last lsn is greater than the one on * this page then forget this tail. */ tail_page = NULL; } } /* *If we have an error on the current page, * we will break of this loop. */ if (err || usa_error) goto check_tail; /* * Done if the last lsn on this page doesn't match the previous known * last lsn or the sequence number is not expected. */ lsn_cur = le64_to_cpu(page->rhdr.lsn); if (last_ok_lsn != lsn_cur && expected_seq != (lsn_cur >> log->file_data_bits)) { goto check_tail; } /* * Check that the page position and page count values are correct. * If this is the first page of a transfer the position must be 1 * and the count will be unknown. */ if (page_cnt == page_pos) { if (page->page_pos != cpu_to_le16(1) && (!reuse_page || page->page_pos != page->page_count)) { /* * If the current page is the first page we are * looking at and we are reusing this page then * it can be either the first or last page of a * transfer. Otherwise it can only be the first. 
*/ goto check_tail; } } else if (le16_to_cpu(page->page_count) != page_cnt || le16_to_cpu(page->page_pos) != page_pos + 1) { /* * The page position better be 1 more than the last page * position and the page count better match. */ goto check_tail; } /* * We have a valid page the file and may have a valid page * the tail copy area. * If the tail page was written after the page the file then * break of the loop. */ if (tail_page && le64_to_cpu(tail_page->record_hdr.last_end_lsn) > lsn_cur) { /* Remember if we will replace the page. */ replace_page = true; goto check_tail; } tail_page = NULL; if (is_log_record_end(page)) { /* * Since we have read this page we know the sequence number * is the same as our expected value. */ log->seq_num = expected_seq; log->last_lsn = le64_to_cpu(page->record_hdr.last_end_lsn); log->ra->current_lsn = page->record_hdr.last_end_lsn; log->l_flags &= ~NTFSLOG_NO_LAST_LSN; /* * If there is room on this page for another header then * remember we want to reuse the page. */ if (log->record_header_len <= log->page_size - le16_to_cpu(page->record_hdr.next_record_off)) { log->l_flags |= NTFSLOG_REUSE_TAIL; log->next_page = curpage_off; } else { log->l_flags &= ~NTFSLOG_REUSE_TAIL; log->next_page = nextpage_off; } /* Remember if we wrapped the log file. */ if (wrapped_file) log->l_flags |= NTFSLOG_WRAPPED; } /* * Remember the last page count and position. * Also remember the last known lsn. 
*/ page_cnt = le16_to_cpu(page->page_count); page_pos = le16_to_cpu(page->page_pos); last_ok_lsn = le64_to_cpu(page->rhdr.lsn); next_page_1: if (wrapped) { expected_seq += 1; wrapped_file = 1; } curpage_off = nextpage_off; kfree(page); page = NULL; reuse_page = 0; goto next_page; check_tail: if (tail_page) { log->seq_num = expected_seq; log->last_lsn = le64_to_cpu(tail_page->record_hdr.last_end_lsn); log->ra->current_lsn = tail_page->record_hdr.last_end_lsn; log->l_flags &= ~NTFSLOG_NO_LAST_LSN; if (log->page_size - le16_to_cpu( tail_page->record_hdr.next_record_off) >= log->record_header_len) { log->l_flags |= NTFSLOG_REUSE_TAIL; log->next_page = curpage_off; } else { log->l_flags &= ~NTFSLOG_REUSE_TAIL; log->next_page = nextpage_off; } if (wrapped) log->l_flags |= NTFSLOG_WRAPPED; } /* Remember that the partial IO will start at the next page. */ second_off = nextpage_off; /* * If the next page is the first page of the file then update * the sequence number for log records which begon the next page. */ if (wrapped) expected_seq += 1; /* * If we have a tail copy or are performing single page I/O we can * immediately look at the next page. */ if (replace_page || (log->ra->flags & RESTART_SINGLE_PAGE_IO)) { page_cnt = 2; page_pos = 1; goto check_valid; } if (page_pos != page_cnt) goto check_valid; /* * If the next page causes us to wrap to the beginning of the log * file then we know which page to check next. */ if (wrapped) { page_cnt = 2; page_pos = 1; goto check_valid; } cur_pos = 2; next_test_page: kfree(tst_page); tst_page = NULL; /* Walk through the file, reading log pages. */ err = read_log_page(log, nextpage_off, &tst_page, &usa_error); /* * If we get a USA error then assume that we correctly found * the end of the original transfer. */ if (usa_error) goto file_is_valid; /* * If we were able to read the page, we examine it to see if it * is the same or different Io block. 
*/ if (err) goto next_test_page_1; if (le16_to_cpu(tst_page->page_pos) == cur_pos && check_subseq_log_page(log, tst_page, nextpage_off, expected_seq)) { page_cnt = le16_to_cpu(tst_page->page_count) + 1; page_pos = le16_to_cpu(tst_page->page_pos); goto check_valid; } else { goto file_is_valid; } next_test_page_1: nextpage_off = next_page_off(log, curpage_off); wrapped = nextpage_off == log->first_page; if (wrapped) { expected_seq += 1; page_cnt = 2; page_pos = 1; } cur_pos += 1; part_io_count += 1; if (!wrapped) goto next_test_page; check_valid: /* Skip over the remaining pages this transfer. */ remain_pages = page_cnt - page_pos - 1; part_io_count += remain_pages; while (remain_pages--) { nextpage_off = next_page_off(log, curpage_off); wrapped = nextpage_off == log->first_page; if (wrapped) expected_seq += 1; } /* Call our routine to check this log page. */ kfree(tst_page); tst_page = NULL; err = read_log_page(log, nextpage_off, &tst_page, &usa_error); if (!err && !usa_error && check_subseq_log_page(log, tst_page, nextpage_off, expected_seq)) { err = -EINVAL; goto out; } file_is_valid: /* We have a valid file. */ if (page_off1 || tail_page) { struct RECORD_PAGE_HDR *tmp_page; if (sb_rdonly(log->ni->mi.sbi->sb)) { err = -EROFS; goto out; } if (page_off1) { tmp_page = Add2Ptr(page_bufs, page_off1 - page_off); tails -= (page_off1 - page_off) / log->page_size; if (!tail_page) tails -= 1; } else { tmp_page = tail_page; tails = 1; } while (tails--) { u64 off = hdr_file_off(log, tmp_page); if (!page) { page = kmalloc(log->page_size, GFP_NOFS); if (!page) { err = -ENOMEM; goto out; } } /* * Correct page and copy the data from this page * into it and flush it to disk. */ memcpy(page, tmp_page, log->page_size); /* Fill last flushed lsn value flush the page. 
*/ if (log->major_ver < 2) page->rhdr.lsn = page->record_hdr.last_end_lsn; else page->file_off = 0; page->page_pos = page->page_count = cpu_to_le16(1); ntfs_fix_pre_write(&page->rhdr, log->page_size); err = ntfs_sb_write_run(log->ni->mi.sbi, &log->ni->file.run, off, page, log->page_size, 0); if (err) goto out; if (part_io_count && second_off == off) { second_off += log->page_size; part_io_count -= 1; } tmp_page = Add2Ptr(tmp_page, log->page_size); } } if (part_io_count) { if (sb_rdonly(log->ni->mi.sbi->sb)) { err = -EROFS; goto out; } } out: kfree(second_tail); kfree(first_tail); kfree(page); kfree(tst_page); kfree(page_bufs); return err; } /* * read_log_rec_buf - Copy a log record from the file to a buffer. * * The log record may span several log pages and may even wrap the file. */ static int read_log_rec_buf(struct ntfs_log *log, const struct LFS_RECORD_HDR *rh, void *buffer) { int err; struct RECORD_PAGE_HDR *ph = NULL; u64 lsn = le64_to_cpu(rh->this_lsn); u32 vbo = lsn_to_vbo(log, lsn) & ~log->page_mask; u32 off = lsn_to_page_off(log, lsn) + log->record_header_len; u32 data_len = le32_to_cpu(rh->client_data_len); /* * While there are more bytes to transfer, * we continue to attempt to perform the read. */ for (;;) { bool usa_error; u32 tail = log->page_size - off; if (tail >= data_len) tail = data_len; data_len -= tail; err = read_log_page(log, vbo, &ph, &usa_error); if (err) goto out; /* * The last lsn on this page better be greater or equal * to the lsn we are copying. */ if (lsn > le64_to_cpu(ph->rhdr.lsn)) { err = -EINVAL; goto out; } memcpy(buffer, Add2Ptr(ph, off), tail); /* If there are no more bytes to transfer, we exit the loop. 
*/ if (!data_len) { if (!is_log_record_end(ph) || lsn > le64_to_cpu(ph->record_hdr.last_end_lsn)) { err = -EINVAL; goto out; } break; } if (ph->rhdr.lsn == ph->record_hdr.last_end_lsn || lsn > le64_to_cpu(ph->rhdr.lsn)) { err = -EINVAL; goto out; } vbo = next_page_off(log, vbo); off = log->data_off; /* * Adjust our pointer the user's buffer to transfer * the next block to. */ buffer = Add2Ptr(buffer, tail); } out: kfree(ph); return err; } static int read_rst_area(struct ntfs_log *log, struct NTFS_RESTART **rst_, u64 *lsn) { int err; struct LFS_RECORD_HDR *rh = NULL; const struct CLIENT_REC *cr = Add2Ptr(log->ra, le16_to_cpu(log->ra->client_off)); u64 lsnr, lsnc = le64_to_cpu(cr->restart_lsn); u32 len; struct NTFS_RESTART *rst; *lsn = 0; *rst_ = NULL; /* If the client doesn't have a restart area, go ahead and exit now. */ if (!lsnc) return 0; err = read_log_page(log, lsn_to_vbo(log, lsnc), (struct RECORD_PAGE_HDR **)&rh, NULL); if (err) return err; rst = NULL; lsnr = le64_to_cpu(rh->this_lsn); if (lsnc != lsnr) { /* If the lsn values don't match, then the disk is corrupt. */ err = -EINVAL; goto out; } *lsn = lsnr; len = le32_to_cpu(rh->client_data_len); if (!len) { err = 0; goto out; } if (len < sizeof(struct NTFS_RESTART)) { err = -EINVAL; goto out; } rst = kmalloc(len, GFP_NOFS); if (!rst) { err = -ENOMEM; goto out; } /* Copy the data into the 'rst' buffer. */ err = read_log_rec_buf(log, rh, rst); if (err) goto out; *rst_ = rst; rst = NULL; out: kfree(rh); kfree(rst); return err; } static int find_log_rec(struct ntfs_log *log, u64 lsn, struct lcb *lcb) { int err; struct LFS_RECORD_HDR *rh = lcb->lrh; u32 rec_len, len; /* Read the record header for this lsn. */ if (!rh) { err = read_log_page(log, lsn_to_vbo(log, lsn), (struct RECORD_PAGE_HDR **)&rh, NULL); lcb->lrh = rh; if (err) return err; } /* * If the lsn the log record doesn't match the desired * lsn then the disk is corrupt. 
*/ if (lsn != le64_to_cpu(rh->this_lsn)) return -EINVAL; len = le32_to_cpu(rh->client_data_len); /* * Check that the length field isn't greater than the total * available space the log file. */ rec_len = len + log->record_header_len; if (rec_len >= log->total_avail) return -EINVAL; /* * If the entire log record is on this log page, * put a pointer to the log record the context block. */ if (rh->flags & LOG_RECORD_MULTI_PAGE) { void *lr = kmalloc(len, GFP_NOFS); if (!lr) return -ENOMEM; lcb->log_rec = lr; lcb->alloc = true; /* Copy the data into the buffer returned. */ err = read_log_rec_buf(log, rh, lr); if (err) return err; } else { /* If beyond the end of the current page -> an error. */ u32 page_off = lsn_to_page_off(log, lsn); if (page_off + len + log->record_header_len > log->page_size) return -EINVAL; lcb->log_rec = Add2Ptr(rh, sizeof(struct LFS_RECORD_HDR)); lcb->alloc = false; } return 0; } /* * read_log_rec_lcb - Init the query operation. */ static int read_log_rec_lcb(struct ntfs_log *log, u64 lsn, u32 ctx_mode, struct lcb **lcb_) { int err; const struct CLIENT_REC *cr; struct lcb *lcb; switch (ctx_mode) { case lcb_ctx_undo_next: case lcb_ctx_prev: case lcb_ctx_next: break; default: return -EINVAL; } /* Check that the given lsn is the legal range for this client. */ cr = Add2Ptr(log->ra, le16_to_cpu(log->ra->client_off)); if (!verify_client_lsn(log, cr, lsn)) return -EINVAL; lcb = kzalloc(sizeof(struct lcb), GFP_NOFS); if (!lcb) return -ENOMEM; lcb->client = log->client_id; lcb->ctx_mode = ctx_mode; /* Find the log record indicated by the given lsn. */ err = find_log_rec(log, lsn, lcb); if (err) goto out; *lcb_ = lcb; return 0; out: lcb_put(lcb); *lcb_ = NULL; return err; } /* * find_client_next_lsn * * Attempt to find the next lsn to return to a client based on the context mode. 
*/ static int find_client_next_lsn(struct ntfs_log *log, struct lcb *lcb, u64 *lsn) { int err; u64 next_lsn; struct LFS_RECORD_HDR *hdr; hdr = lcb->lrh; *lsn = 0; if (lcb_ctx_next != lcb->ctx_mode) goto check_undo_next; /* Loop as long as another lsn can be found. */ for (;;) { u64 current_lsn; err = next_log_lsn(log, hdr, ¤t_lsn); if (err) goto out; if (!current_lsn) break; if (hdr != lcb->lrh) kfree(hdr); hdr = NULL; err = read_log_page(log, lsn_to_vbo(log, current_lsn), (struct RECORD_PAGE_HDR **)&hdr, NULL); if (err) goto out; if (memcmp(&hdr->client, &lcb->client, sizeof(struct CLIENT_ID))) { /*err = -EINVAL; */ } else if (LfsClientRecord == hdr->record_type) { kfree(lcb->lrh); lcb->lrh = hdr; *lsn = current_lsn; return 0; } } out: if (hdr != lcb->lrh) kfree(hdr); return err; check_undo_next: if (lcb_ctx_undo_next == lcb->ctx_mode) next_lsn = le64_to_cpu(hdr->client_undo_next_lsn); else if (lcb_ctx_prev == lcb->ctx_mode) next_lsn = le64_to_cpu(hdr->client_prev_lsn); else return 0; if (!next_lsn) return 0; if (!verify_client_lsn( log, Add2Ptr(log->ra, le16_to_cpu(log->ra->client_off)), next_lsn)) return 0; hdr = NULL; err = read_log_page(log, lsn_to_vbo(log, next_lsn), (struct RECORD_PAGE_HDR **)&hdr, NULL); if (err) return err; kfree(lcb->lrh); lcb->lrh = hdr; *lsn = next_lsn; return 0; } static int read_next_log_rec(struct ntfs_log *log, struct lcb *lcb, u64 *lsn) { int err; err = find_client_next_lsn(log, lcb, lsn); if (err) return err; if (!*lsn) return 0; if (lcb->alloc) kfree(lcb->log_rec); lcb->log_rec = NULL; lcb->alloc = false; kfree(lcb->lrh); lcb->lrh = NULL; return find_log_rec(log, *lsn, lcb); } bool check_index_header(const struct INDEX_HDR *hdr, size_t bytes) { __le16 mask; u32 min_de, de_off, used, total; const struct NTFS_DE *e; if (hdr_has_subnode(hdr)) { min_de = sizeof(struct NTFS_DE) + sizeof(u64); mask = NTFS_IE_HAS_SUBNODES; } else { min_de = sizeof(struct NTFS_DE); mask = 0; } de_off = le32_to_cpu(hdr->de_off); used = 
le32_to_cpu(hdr->used); total = le32_to_cpu(hdr->total); if (de_off > bytes - min_de || used > bytes || total > bytes || de_off + min_de > used || used > total) { return false; } e = Add2Ptr(hdr, de_off); for (;;) { u16 esize = le16_to_cpu(e->size); struct NTFS_DE *next = Add2Ptr(e, esize); if (esize < min_de || PtrOffset(hdr, next) > used || (e->flags & NTFS_IE_HAS_SUBNODES) != mask) { return false; } if (de_is_last(e)) break; e = next; } return true; } static inline bool check_index_buffer(const struct INDEX_BUFFER *ib, u32 bytes) { u16 fo; const struct NTFS_RECORD_HEADER *r = &ib->rhdr; if (r->sign != NTFS_INDX_SIGNATURE) return false; fo = (SECTOR_SIZE - ((bytes >> SECTOR_SHIFT) + 1) * sizeof(short)); if (le16_to_cpu(r->fix_off) > fo) return false; if ((le16_to_cpu(r->fix_num) - 1) * SECTOR_SIZE != bytes) return false; return check_index_header(&ib->ihdr, bytes - offsetof(struct INDEX_BUFFER, ihdr)); } static inline bool check_index_root(const struct ATTRIB *attr, struct ntfs_sb_info *sbi) { bool ret; const struct INDEX_ROOT *root = resident_data(attr); u8 index_bits = le32_to_cpu(root->index_block_size) >= sbi->cluster_size ? 
sbi->cluster_bits : SECTOR_SHIFT;
	u8 block_clst = root->index_block_clst;

	/*
	 * Reject the root if the resident data is too short, the indexed type
	 * or collation rule is unexpected, the block size does not equal
	 * block_clst << index_bits, or block_clst is not a power of two
	 * in the range 1..0x80.
	 */
	if (le32_to_cpu(attr->res.data_size) < sizeof(struct INDEX_ROOT) ||
	    (root->type != ATTR_NAME && root->type != ATTR_ZERO) ||
	    (root->type == ATTR_NAME &&
	     root->rule != NTFS_COLLATION_TYPE_FILENAME) ||
	    (le32_to_cpu(root->index_block_size) !=
	     (block_clst << index_bits)) ||
	    (block_clst != 1 && block_clst != 2 && block_clst != 4 &&
	     block_clst != 8 && block_clst != 0x10 && block_clst != 0x20 &&
	     block_clst != 0x40 && block_clst != 0x80)) {
		return false;
	}

	/* The index header may use whatever follows it in the resident data. */
	ret = check_index_header(&root->ihdr,
				 le32_to_cpu(attr->res.data_size) -
					 offsetof(struct INDEX_ROOT, ihdr));
	return ret;
}

/*
 * check_attr - Sanity check one attribute of an MFT record.
 * @rec:  Record that contains @attr.
 * @attr: Attribute to check.
 * @sbi:  Supplies the record size and is passed on to the helpers.
 *
 * Return: true if the attribute header, its resident/non-resident fields and
 * the type specific payload look consistent.
 */
static inline bool check_attr(const struct MFT_REC *rec,
			      const struct ATTRIB *attr,
			      struct ntfs_sb_info *sbi)
{
	u32 asize = le32_to_cpu(attr->size);
	u32 rsize = 0;
	u64 dsize, svcn, evcn;
	u16 run_off;

	/* Check the fixed part of the attribute record header. */
	if (asize >= sbi->record_size ||
	    asize + PtrOffset(rec, attr) >= sbi->record_size ||
	    (attr->name_len &&
	     le16_to_cpu(attr->name_off) + attr->name_len * sizeof(short) >
		     asize)) {
		return false;
	}

	/* Check the attribute fields.
*/ switch (attr->non_res) { case 0: rsize = le32_to_cpu(attr->res.data_size); if (rsize >= asize || le16_to_cpu(attr->res.data_off) + rsize > asize) { return false; } break; case 1: dsize = le64_to_cpu(attr->nres.data_size); svcn = le64_to_cpu(attr->nres.svcn); evcn = le64_to_cpu(attr->nres.evcn); run_off = le16_to_cpu(attr->nres.run_off); if (svcn > evcn + 1 || run_off >= asize || le64_to_cpu(attr->nres.valid_size) > dsize || dsize > le64_to_cpu(attr->nres.alloc_size)) { return false; } if (run_off > asize) return false; if (run_unpack(NULL, sbi, 0, svcn, evcn, svcn, Add2Ptr(attr, run_off), asize - run_off) < 0) { return false; } return true; default: return false; } switch (attr->type) { case ATTR_NAME: if (fname_full_size(Add2Ptr( attr, le16_to_cpu(attr->res.data_off))) > asize) { return false; } break; case ATTR_ROOT: return check_index_root(attr, sbi); case ATTR_STD: if (rsize < sizeof(struct ATTR_STD_INFO5) && rsize != sizeof(struct ATTR_STD_INFO)) { return false; } break; case ATTR_LIST: case ATTR_ID: case ATTR_SECURE: case ATTR_LABEL: case ATTR_VOL_INFO: case ATTR_DATA: case ATTR_ALLOC: case ATTR_BITMAP: case ATTR_REPARSE: case ATTR_EA_INFO: case ATTR_EA: case ATTR_PROPERTYSET: case ATTR_LOGGED_UTILITY_STREAM: break; default: return false; } return true; } static inline bool check_file_record(const struct MFT_REC *rec, const struct MFT_REC *rec2, struct ntfs_sb_info *sbi) { const struct ATTRIB *attr; u16 fo = le16_to_cpu(rec->rhdr.fix_off); u16 fn = le16_to_cpu(rec->rhdr.fix_num); u16 ao = le16_to_cpu(rec->attr_off); u32 rs = sbi->record_size; /* Check the file record header for consistency. */ if (rec->rhdr.sign != NTFS_FILE_SIGNATURE || fo > (SECTOR_SIZE - ((rs >> SECTOR_SHIFT) + 1) * sizeof(short)) || (fn - 1) * SECTOR_SIZE != rs || ao < MFTRECORD_FIXUP_OFFSET_1 || ao > sbi->record_size - SIZEOF_RESIDENT || !is_rec_inuse(rec) || le32_to_cpu(rec->total) != rs) { return false; } /* Loop to check all of the attributes. 
*/ for (attr = Add2Ptr(rec, ao); attr->type != ATTR_END; attr = Add2Ptr(attr, le32_to_cpu(attr->size))) { if (check_attr(rec, attr, sbi)) continue; return false; } return true; } static inline int check_lsn(const struct NTFS_RECORD_HEADER *hdr, const u64 *rlsn) { u64 lsn; if (!rlsn) return true; lsn = le64_to_cpu(hdr->lsn); if (hdr->sign == NTFS_HOLE_SIGNATURE) return false; if (*rlsn > lsn) return true; return false; } static inline bool check_if_attr(const struct MFT_REC *rec, const struct LOG_REC_HDR *lrh) { u16 ro = le16_to_cpu(lrh->record_off); u16 o = le16_to_cpu(rec->attr_off); const struct ATTRIB *attr = Add2Ptr(rec, o); while (o < ro) { u32 asize; if (attr->type == ATTR_END) break; asize = le32_to_cpu(attr->size); if (!asize) break; o += asize; attr = Add2Ptr(attr, asize); } return o == ro; } static inline bool check_if_index_root(const struct MFT_REC *rec, const struct LOG_REC_HDR *lrh) { u16 ro = le16_to_cpu(lrh->record_off); u16 o = le16_to_cpu(rec->attr_off); const struct ATTRIB *attr = Add2Ptr(rec, o); while (o < ro) { u32 asize; if (attr->type == ATTR_END) break; asize = le32_to_cpu(attr->size); if (!asize) break; o += asize; attr = Add2Ptr(attr, asize); } return o == ro && attr->type == ATTR_ROOT; } static inline bool check_if_root_index(const struct ATTRIB *attr, const struct INDEX_HDR *hdr, const struct LOG_REC_HDR *lrh) { u16 ao = le16_to_cpu(lrh->attr_off); u32 de_off = le32_to_cpu(hdr->de_off); u32 o = PtrOffset(attr, hdr) + de_off; const struct NTFS_DE *e = Add2Ptr(hdr, de_off); u32 asize = le32_to_cpu(attr->size); while (o < ao) { u16 esize; if (o >= asize) break; esize = le16_to_cpu(e->size); if (!esize) break; o += esize; e = Add2Ptr(e, esize); } return o == ao; } static inline bool check_if_alloc_index(const struct INDEX_HDR *hdr, u32 attr_off) { u32 de_off = le32_to_cpu(hdr->de_off); u32 o = offsetof(struct INDEX_BUFFER, ihdr) + de_off; const struct NTFS_DE *e = Add2Ptr(hdr, de_off); u32 used = le32_to_cpu(hdr->used); while (o < attr_off) 
{ u16 esize; if (de_off >= used) break; esize = le16_to_cpu(e->size); if (!esize) break; o += esize; de_off += esize; e = Add2Ptr(e, esize); } return o == attr_off; } static inline void change_attr_size(struct MFT_REC *rec, struct ATTRIB *attr, u32 nsize) { u32 asize = le32_to_cpu(attr->size); int dsize = nsize - asize; u8 *next = Add2Ptr(attr, asize); u32 used = le32_to_cpu(rec->used); memmove(Add2Ptr(attr, nsize), next, used - PtrOffset(rec, next)); rec->used = cpu_to_le32(used + dsize); attr->size = cpu_to_le32(nsize); } struct OpenAttr { struct ATTRIB *attr; struct runs_tree *run1; struct runs_tree run0; struct ntfs_inode *ni; // CLST rno; }; /* * cmp_type_and_name * * Return: 0 if 'attr' has the same type and name. */ static inline int cmp_type_and_name(const struct ATTRIB *a1, const struct ATTRIB *a2) { return a1->type != a2->type || a1->name_len != a2->name_len || (a1->name_len && memcmp(attr_name(a1), attr_name(a2), a1->name_len * sizeof(short))); } static struct OpenAttr *find_loaded_attr(struct ntfs_log *log, const struct ATTRIB *attr, CLST rno) { struct OPEN_ATTR_ENRTY *oe = NULL; while ((oe = enum_rstbl(log->open_attr_tbl, oe))) { struct OpenAttr *op_attr; if (ino_get(&oe->ref) != rno) continue; op_attr = (struct OpenAttr *)oe->ptr; if (!cmp_type_and_name(op_attr->attr, attr)) return op_attr; } return NULL; } static struct ATTRIB *attr_create_nonres_log(struct ntfs_sb_info *sbi, enum ATTR_TYPE type, u64 size, const u16 *name, size_t name_len, __le16 flags) { struct ATTRIB *attr; u32 name_size = ALIGN(name_len * sizeof(short), 8); bool is_ext = flags & (ATTR_FLAG_COMPRESSED | ATTR_FLAG_SPARSED); u32 asize = name_size + (is_ext ? 
SIZEOF_NONRESIDENT_EX : SIZEOF_NONRESIDENT);

	attr = kzalloc(asize, GFP_NOFS);
	if (!attr)
		return NULL;

	attr->type = type;
	attr->size = cpu_to_le32(asize);
	attr->flags = flags;
	attr->non_res = 1;
	attr->name_len = name_len;

	/* svcn stays 0 (zeroed allocation); evcn is the last cluster of 'size'. */
	attr->nres.evcn = cpu_to_le64((u64)bytes_to_cluster(sbi, size) - 1);
	attr->nres.alloc_size = cpu_to_le64(ntfs_up_cluster(sbi, size));
	attr->nres.data_size = cpu_to_le64(size);
	attr->nres.valid_size = attr->nres.data_size;
	if (is_ext) {
		/* Compressed/sparse attributes use the extended header. */
		attr->name_off = SIZEOF_NONRESIDENT_EX_LE;
		if (is_attr_compressed(attr))
			attr->nres.c_unit = NTFS_LZNT_CUNIT;

		/* The run list would start right after the aligned name. */
		attr->nres.run_off =
			cpu_to_le16(SIZEOF_NONRESIDENT_EX + name_size);
		memcpy(Add2Ptr(attr, SIZEOF_NONRESIDENT_EX), name,
		       name_len * sizeof(short));
	} else {
		attr->name_off = SIZEOF_NONRESIDENT_LE;
		attr->nres.run_off =
			cpu_to_le16(SIZEOF_NONRESIDENT + name_size);
		memcpy(Add2Ptr(attr, SIZEOF_NONRESIDENT), name,
		       name_len * sizeof(short));
	}

	return attr;
}

/*
 * do_action - Common routine for the Redo and Undo Passes.
 * @rlsn: If it is NULL then undo.
*/
/*
 * Further notes on do_action():
 *
 * @oe:      Open attribute entry; oe->ptr is used as the struct OpenAttr.
 * @lrh:     Log record header; supplies record_off, attr_off, cluster_off,
 *           target_vcn and lcns_follow.
 * @op:      Operation code selecting the action.
 * @data:    Payload of the action, @dlen bytes long.
 * @rec_len: Length used to bound pointers into the log record starting
 *           at @lrh.
 *
 * The first switch loads the target (an MFT record, or a byte range of a
 * non-resident attribute read into 'buffer_le'); the second switch applies
 * the operation. When @rlsn is given, targets that fail check_lsn() are
 * left alone and applied targets get *@rlsn stored as their lsn.
 *
 * Return: 0 or a negative errno. Inconsistent log data is not reported as an
 * error: log->set_dirty is set instead and the action is skipped.
 */
static int do_action(struct ntfs_log *log, struct OPEN_ATTR_ENRTY *oe,
		     const struct LOG_REC_HDR *lrh, u32 op, void *data,
		     u32 dlen, u32 rec_len, const u64 *rlsn)
{
	int err = 0;
	struct ntfs_sb_info *sbi = log->ni->mi.sbi;
	struct inode *inode = NULL, *inode_parent;
	struct mft_inode *mi = NULL, *mi2_child = NULL;
	CLST rno = 0, rno_base = 0;
	struct INDEX_BUFFER *ib = NULL;
	struct MFT_REC *rec = NULL;
	struct ATTRIB *attr = NULL, *attr2;
	struct INDEX_HDR *hdr;
	struct INDEX_ROOT *root;
	struct NTFS_DE *e, *e1, *e2;
	struct NEW_ATTRIBUTE_SIZES *new_sz;
	struct ATTR_FILE_NAME *fname;
	struct OpenAttr *oa, *oa2;
	u32 nsize, t32, asize, used, esize, off, bits;
	u16 id, id2;
	u32 record_size = sbi->record_size;
	u64 t64;
	u16 roff = le16_to_cpu(lrh->record_off);
	u16 aoff = le16_to_cpu(lrh->attr_off);
	u64 lco = 0;
	u64 cbo = (u64)le16_to_cpu(lrh->cluster_off) << SECTOR_SHIFT;
	u64 tvo = le64_to_cpu(lrh->target_vcn) << sbi->cluster_bits;
	u64 vbo = cbo + tvo;
	void *buffer_le = NULL;
	u32 bytes = 0;
	bool a_dirty = false;
	u16 data_off;

	oa = oe->ptr;

	/* Big switch to prepare. */
	switch (op) {
	/* ============================================================
	 * Process MFT records, as described by the current log record.
	 * ============================================================
	 */
	case InitializeFileRecordSegment:
	case DeallocateFileRecordSegment:
	case WriteEndOfFileRecordSegment:
	case CreateAttribute:
	case DeleteAttribute:
	case UpdateResidentValue:
	case UpdateMappingPairs:
	case SetNewAttributeSizes:
	case AddIndexEntryRoot:
	case DeleteIndexEntryRoot:
	case SetIndexEntryVcnRoot:
	case UpdateFileNameRoot:
	case UpdateRecordDataRoot:
	case ZeroEndOfFileRecord:
		rno = vbo >> sbi->record_bits;
		inode = ilookup(sbi->sb, rno);
		if (inode) {
			mi = &ntfs_i(inode)->mi;
		} else {
			/* Read from disk. */
			err = mi_get(sbi, rno, &mi);
			if (err && op == InitializeFileRecordSegment) {
				/* Nothing usable on disk: start from a blank record. */
				mi = kzalloc(sizeof(struct mft_inode),
					     GFP_NOFS);
				if (!mi)
					return -ENOMEM;
				err = mi_format_new(mi, sbi, rno, 0, false);
			}
			if (err)
				return err;
		}
		rec = mi->mrec;

		if (op == DeallocateFileRecordSegment)
			goto skip_load_parent;

		if (rec->rhdr.sign == NTFS_BAAD_SIGNATURE)
			goto dirty_vol;
		if (!check_lsn(&rec->rhdr, rlsn))
			goto out;
		if (!check_file_record(rec, NULL, sbi))
			goto dirty_vol;
		attr = Add2Ptr(rec, roff);

		if (is_rec_base(rec) || InitializeFileRecordSegment == op) {
			rno_base = rno;
			goto skip_load_parent;
		}

		/*
		 * Extension record: try to work on the copy owned by the
		 * base inode, so both views of the record stay the same.
		 */
		rno_base = ino_get(&rec->parent_ref);
		inode_parent = ntfs_iget5(sbi->sb, &rec->parent_ref, NULL);
		if (IS_ERR(inode_parent))
			goto skip_load_parent;

		if (is_bad_inode(inode_parent)) {
			iput(inode_parent);
			goto skip_load_parent;
		}

		if (ni_load_mi_ex(ntfs_i(inode_parent), rno, &mi2_child)) {
			iput(inode_parent);
		} else {
			if (mi2_child->mrec != mi->mrec)
				memcpy(mi2_child->mrec, mi->mrec,
				       sbi->record_size);

			/* Release the standalone copy, continue with the child. */
			if (inode)
				iput(inode);
			else
				mi_put(mi);

			inode = inode_parent;
			mi = mi2_child;
			rec = mi2_child->mrec;
			attr = Add2Ptr(rec, roff);
		}

skip_load_parent:
		inode_parent = NULL;
		break;

	/*
	 * Process attributes, as described by the current log record.
	 */
	case UpdateNonresidentValue:
	case AddIndexEntryAllocation:
	case DeleteIndexEntryAllocation:
	case WriteEndOfIndexBuffer:
	case SetIndexEntryVcnAllocation:
	case UpdateFileNameAllocation:
	case SetBitsInNonresidentBitMap:
	case ClearBitsInNonresidentBitMap:
	case UpdateRecordDataAllocation:
		attr = oa->attr;
		bytes = UpdateNonresidentValue == op ? dlen : 0;
		lco = (u64)le16_to_cpu(lrh->lcns_follow) << sbi->cluster_bits;

		if (attr->type == ATTR_ALLOC) {
			t32 = le32_to_cpu(oe->bytes_per_index);
			if (bytes < t32)
				bytes = t32;
		}

		if (!bytes)
			bytes = lco - cbo;

		bytes += roff;
		if (attr->type == ATTR_ALLOC)
			bytes = (bytes + 511) & ~511; // align

		buffer_le = kmalloc(bytes, GFP_NOFS);
		if (!buffer_le)
			return -ENOMEM;

		err = ntfs_read_run_nb(sbi, oa->run1, vbo, buffer_le, bytes,
				       NULL);
		if (err)
			goto out;

		if (attr->type == ATTR_ALLOC && *(int *)buffer_le)
			ntfs_fix_post_read(buffer_le, bytes, false);
		break;

	default:
		WARN_ON(1);
	}

	/* Big switch to do operation. */
	switch (op) {
	case InitializeFileRecordSegment:
		if (roff + dlen > record_size)
			goto dirty_vol;

		memcpy(Add2Ptr(rec, roff), data, dlen);
		mi->dirty = true;
		break;

	case DeallocateFileRecordSegment:
		clear_rec_inuse(rec);
		le16_add_cpu(&rec->seq, 1);
		mi->dirty = true;
		break;

	case WriteEndOfFileRecordSegment:
		attr2 = (struct ATTRIB *)data;
		if (!check_if_attr(rec, lrh) || roff + dlen > record_size)
			goto dirty_vol;

		memmove(attr, attr2, dlen);
		rec->used = cpu_to_le32(ALIGN(roff + dlen, 8));

		mi->dirty = true;
		break;

	case CreateAttribute:
		attr2 = (struct ATTRIB *)data;
		asize = le32_to_cpu(attr2->size);
		used = le32_to_cpu(rec->used);

		if (!check_if_attr(rec, lrh) || dlen < SIZEOF_RESIDENT ||
		    !IS_ALIGNED(asize, 8) ||
		    Add2Ptr(attr2, asize) > Add2Ptr(lrh, rec_len) ||
		    dlen > record_size - used) {
			goto dirty_vol;
		}

		/* Open a gap of 'asize' bytes at 'attr' and copy the new one in. */
		memmove(Add2Ptr(attr, asize), attr, used - roff);
		memcpy(attr, attr2, asize);

		rec->used = cpu_to_le32(used + asize);
		id = le16_to_cpu(rec->next_attr_id);
		id2 = le16_to_cpu(attr2->id);
		if (id <= id2)
			rec->next_attr_id = cpu_to_le16(id2 + 1);
		if (is_attr_indexed(attr))
			le16_add_cpu(&rec->hard_links, 1);

		/* Refresh the cached copy of an already opened attribute. */
		oa2 = find_loaded_attr(log, attr, rno_base);
		if (oa2) {
			void *p2 = kmemdup(attr, le32_to_cpu(attr->size),
					   GFP_NOFS);
			if (p2) {
				// run_close(oa2->run1);
				kfree(oa2->attr);
				oa2->attr = p2;
			}
		}

		mi->dirty = true;
		break;

	case DeleteAttribute:
		asize = le32_to_cpu(attr->size);
		used = le32_to_cpu(rec->used);

		if (!check_if_attr(rec, lrh))
			goto dirty_vol;

		rec->used = cpu_to_le32(used - asize);
		if (is_attr_indexed(attr))
			le16_add_cpu(&rec->hard_links, -1);

		memmove(attr, Add2Ptr(attr, asize), used - asize - roff);

		mi->dirty = true;
		break;

	case UpdateResidentValue:
		nsize = aoff + dlen;

		if (!check_if_attr(rec, lrh))
			goto dirty_vol;

		asize = le32_to_cpu(attr->size);
		used = le32_to_cpu(rec->used);

		/* Same redo/undo length: overwrite in place, no resize. */
		if (lrh->redo_len == lrh->undo_len) {
			if (nsize > asize)
				goto dirty_vol;
			goto move_data;
		}

		if (nsize > asize && nsize - asize > record_size - used)
			goto dirty_vol;

		nsize = ALIGN(nsize, 8);
		data_off = le16_to_cpu(attr->res.data_off);

		if (nsize < asize) {
			/* Shrinking: copy first, the tail move would clobber it. */
			memmove(Add2Ptr(attr, aoff), data, dlen);
			data = NULL; // To skip below memmove().
		}

		memmove(Add2Ptr(attr, nsize), Add2Ptr(attr, asize),
			used - le16_to_cpu(lrh->record_off) - asize);

		rec->used = cpu_to_le32(used + nsize - asize);
		attr->size = cpu_to_le32(nsize);
		attr->res.data_size = cpu_to_le32(aoff + dlen - data_off);

move_data:
		if (data)
			memmove(Add2Ptr(attr, aoff), data, dlen);

		oa2 = find_loaded_attr(log, attr, rno_base);
		if (oa2) {
			void *p2 = kmemdup(attr, le32_to_cpu(attr->size),
					   GFP_NOFS);
			if (p2) {
				// run_close(&oa2->run0);
				oa2->run1 = &oa2->run0;
				kfree(oa2->attr);
				oa2->attr = p2;
			}
		}

		mi->dirty = true;
		break;

	case UpdateMappingPairs:
		nsize = aoff + dlen;
		asize = le32_to_cpu(attr->size);
		used = le32_to_cpu(rec->used);

		if (!check_if_attr(rec, lrh) || !attr->non_res ||
		    aoff < le16_to_cpu(attr->nres.run_off) || aoff > asize ||
		    (nsize > asize && nsize - asize > record_size - used)) {
			goto dirty_vol;
		}

		nsize = ALIGN(nsize, 8);

		memmove(Add2Ptr(attr, nsize), Add2Ptr(attr, asize),
			used - le16_to_cpu(lrh->record_off) - asize);
		rec->used = cpu_to_le32(used + nsize - asize);
		attr->size = cpu_to_le32(nsize);
		memmove(Add2Ptr(attr, aoff), data, dlen);

		/* Recompute evcn from the updated run list. */
		if (run_get_highest_vcn(le64_to_cpu(attr->nres.svcn),
					attr_run(attr), &t64)) {
			goto dirty_vol;
		}

		attr->nres.evcn = cpu_to_le64(t64);
		oa2 = find_loaded_attr(log, attr, rno_base);
		if (oa2 && oa2->attr->non_res)
			oa2->attr->nres.evcn = attr->nres.evcn;

		mi->dirty = true;
		break;

	case SetNewAttributeSizes:
		new_sz = data;
		if (!check_if_attr(rec, lrh) || !attr->non_res)
			goto dirty_vol;

		attr->nres.alloc_size = new_sz->alloc_size;
		attr->nres.data_size = new_sz->data_size;
		attr->nres.valid_size = new_sz->valid_size;

		/* total_size is taken only when the payload is long enough. */
		if (dlen >= sizeof(struct NEW_ATTRIBUTE_SIZES))
			attr->nres.total_size = new_sz->total_size;

		oa2 = find_loaded_attr(log, attr, rno_base);
		if (oa2) {
			void *p2 = kmemdup(attr, le32_to_cpu(attr->size),
					   GFP_NOFS);
			if (p2) {
				kfree(oa2->attr);
				oa2->attr = p2;
			}
		}
		mi->dirty = true;
		break;

	case AddIndexEntryRoot:
		e = (struct NTFS_DE *)data;
		esize = le16_to_cpu(e->size);
		root = resident_data(attr);
		hdr = &root->ihdr;
		used = le32_to_cpu(hdr->used);

		if (!check_if_index_root(rec, lrh) ||
		    !check_if_root_index(attr, hdr, lrh) ||
		    Add2Ptr(data, esize) > Add2Ptr(lrh, rec_len) ||
		    esize > le32_to_cpu(rec->total) - le32_to_cpu(rec->used)) {
			goto dirty_vol;
		}

		e1 = Add2Ptr(attr, le16_to_cpu(lrh->attr_off));

		/* Grow the attribute first, then open the gap for the entry. */
		change_attr_size(rec, attr, le32_to_cpu(attr->size) + esize);

		memmove(Add2Ptr(e1, esize), e1,
			PtrOffset(e1, Add2Ptr(hdr, used)));
		memmove(e1, e, esize);

		le32_add_cpu(&attr->res.data_size, esize);
		hdr->used = cpu_to_le32(used + esize);
		le32_add_cpu(&hdr->total, esize);

		mi->dirty = true;
		break;

	case DeleteIndexEntryRoot:
		root = resident_data(attr);
		hdr = &root->ihdr;
		used = le32_to_cpu(hdr->used);

		if (!check_if_index_root(rec, lrh) ||
		    !check_if_root_index(attr, hdr, lrh)) {
			goto dirty_vol;
		}

		e1 = Add2Ptr(attr, le16_to_cpu(lrh->attr_off));
		esize = le16_to_cpu(e1->size);
		e2 = Add2Ptr(e1, esize);

		/* Close the gap, then shrink the attribute. */
		memmove(e1, e2, PtrOffset(e2, Add2Ptr(hdr, used)));

		le32_sub_cpu(&attr->res.data_size, esize);
		hdr->used = cpu_to_le32(used - esize);
		le32_sub_cpu(&hdr->total, esize);

		change_attr_size(rec, attr, le32_to_cpu(attr->size) - esize);

		mi->dirty = true;
		break;

	case SetIndexEntryVcnRoot:
		root = resident_data(attr);
		hdr = &root->ihdr;

		if (!check_if_index_root(rec, lrh) ||
		    !check_if_root_index(attr, hdr, lrh)) {
			goto dirty_vol;
		}

		e = Add2Ptr(attr, le16_to_cpu(lrh->attr_off));

		de_set_vbn_le(e, *(__le64 *)data);
		mi->dirty = true;
		break;

	case UpdateFileNameRoot:
		root = resident_data(attr);
		hdr = &root->ihdr;

		if (!check_if_index_root(rec, lrh) ||
		    !check_if_root_index(attr, hdr, lrh)) {
			goto dirty_vol;
		}

		e = Add2Ptr(attr, le16_to_cpu(lrh->attr_off));
		fname = (struct ATTR_FILE_NAME *)(e + 1);
		memmove(&fname->dup, data, sizeof(fname->dup)); //

		/*
		 * NOTE(review): restored as live code, matching every sibling
		 * case; confirm it was not meant to stay commented out.
		 */
		mi->dirty = true;
		break;

	case UpdateRecordDataRoot:
		root = resident_data(attr);
		hdr = &root->ihdr;

		if (!check_if_index_root(rec, lrh) ||
		    !check_if_root_index(attr, hdr, lrh)) {
			goto dirty_vol;
		}

		e = Add2Ptr(attr, le16_to_cpu(lrh->attr_off));

		memmove(Add2Ptr(e, le16_to_cpu(e->view.data_off)), data, dlen);

		mi->dirty = true;
		break;

	case ZeroEndOfFileRecord:
		if (roff + dlen > record_size)
			goto dirty_vol;

		memset(attr, 0, dlen);
		mi->dirty = true;
		break;

	case UpdateNonresidentValue:
		if (lco < cbo + roff + dlen)
			goto dirty_vol;

		memcpy(Add2Ptr(buffer_le, roff), data, dlen);

		a_dirty = true;
		if (attr->type == ATTR_ALLOC)
			ntfs_fix_pre_write(buffer_le, bytes);
		break;

	case AddIndexEntryAllocation:
		ib = Add2Ptr(buffer_le, roff);
		hdr = &ib->ihdr;
		e = data;
		esize = le16_to_cpu(e->size);
		e1 = Add2Ptr(ib, aoff);

		if (is_baad(&ib->rhdr))
			goto dirty_vol;
		if (!check_lsn(&ib->rhdr, rlsn))
			goto out;

		used = le32_to_cpu(hdr->used);

		if (!check_index_buffer(ib, bytes) ||
		    !check_if_alloc_index(hdr, aoff) ||
		    Add2Ptr(e, esize) > Add2Ptr(lrh, rec_len) ||
		    used + esize > le32_to_cpu(hdr->total)) {
			goto dirty_vol;
		}

		memmove(Add2Ptr(e1, esize), e1,
			PtrOffset(e1, Add2Ptr(hdr, used)));
		memcpy(e1, e, esize);

		hdr->used = cpu_to_le32(used + esize);

		a_dirty = true;

		ntfs_fix_pre_write(&ib->rhdr, bytes);
		break;

	case DeleteIndexEntryAllocation:
		ib = Add2Ptr(buffer_le, roff);
		hdr = &ib->ihdr;
		e = Add2Ptr(ib, aoff);
		esize = le16_to_cpu(e->size);

		if (is_baad(&ib->rhdr))
			goto dirty_vol;
		if (!check_lsn(&ib->rhdr, rlsn))
			goto out;

		if (!check_index_buffer(ib, bytes) ||
		    !check_if_alloc_index(hdr, aoff)) {
			goto dirty_vol;
		}

		e1 = Add2Ptr(e, esize);
		nsize = esize;
		used = le32_to_cpu(hdr->used);

		memmove(e, e1, PtrOffset(e1, Add2Ptr(hdr, used)));

		hdr->used = cpu_to_le32(used - nsize);

		a_dirty = true;

		ntfs_fix_pre_write(&ib->rhdr, bytes);
		break;

	case WriteEndOfIndexBuffer:
		ib = Add2Ptr(buffer_le, roff);
		hdr = &ib->ihdr;
		e = Add2Ptr(ib, aoff);

		if (is_baad(&ib->rhdr))
			goto dirty_vol;
		if (!check_lsn(&ib->rhdr, rlsn))
			goto out;
		if (!check_index_buffer(ib, bytes) ||
		    !check_if_alloc_index(hdr, aoff) ||
		    aoff + dlen > offsetof(struct INDEX_BUFFER, ihdr) +
					  le32_to_cpu(hdr->total)) {
			goto dirty_vol;
		}

		hdr->used = cpu_to_le32(dlen + PtrOffset(hdr, e));
		memmove(e, data, dlen);

		a_dirty = true;
		ntfs_fix_pre_write(&ib->rhdr, bytes);
		break;

	case SetIndexEntryVcnAllocation:
		ib = Add2Ptr(buffer_le, roff);
		hdr = &ib->ihdr;
		e = Add2Ptr(ib, aoff);

		if (is_baad(&ib->rhdr))
			goto dirty_vol;

		if (!check_lsn(&ib->rhdr, rlsn))
			goto out;
		if (!check_index_buffer(ib, bytes) ||
		    !check_if_alloc_index(hdr, aoff)) {
			goto dirty_vol;
		}

		de_set_vbn_le(e, *(__le64 *)data);

		a_dirty = true;
		ntfs_fix_pre_write(&ib->rhdr, bytes);
		break;

	case UpdateFileNameAllocation:
		ib = Add2Ptr(buffer_le, roff);
		hdr = &ib->ihdr;
		e = Add2Ptr(ib, aoff);

		if (is_baad(&ib->rhdr))
			goto dirty_vol;

		if (!check_lsn(&ib->rhdr, rlsn))
			goto out;
		if (!check_index_buffer(ib, bytes) ||
		    !check_if_alloc_index(hdr, aoff)) {
			goto dirty_vol;
		}

		fname = (struct ATTR_FILE_NAME *)(e + 1);
		memmove(&fname->dup, data, sizeof(fname->dup));

		a_dirty = true;
		ntfs_fix_pre_write(&ib->rhdr, bytes);
		break;

	case SetBitsInNonresidentBitMap:
		off = le32_to_cpu(((struct BITMAP_RANGE *)data)->bitmap_off);
		bits = le32_to_cpu(((struct BITMAP_RANGE *)data)->bits);

		if (cbo + (off + 7) / 8 > lco ||
		    cbo + ((off + bits + 7) / 8) > lco) {
			goto dirty_vol;
		}

		ntfs_bitmap_set_le(Add2Ptr(buffer_le, roff), off, bits);
		a_dirty = true;
		break;

	case ClearBitsInNonresidentBitMap:
		off = le32_to_cpu(((struct BITMAP_RANGE *)data)->bitmap_off);
		bits = le32_to_cpu(((struct BITMAP_RANGE *)data)->bits);

		if (cbo + (off + 7) / 8 > lco ||
		    cbo + ((off + bits + 7) / 8) > lco) {
			goto dirty_vol;
		}

		ntfs_bitmap_clear_le(Add2Ptr(buffer_le, roff), off, bits);
		a_dirty = true;
		break;

	case UpdateRecordDataAllocation:
		ib = Add2Ptr(buffer_le, roff);
		hdr = &ib->ihdr;
		e = Add2Ptr(ib, aoff);

		if (is_baad(&ib->rhdr))
			goto dirty_vol;

		if (!check_lsn(&ib->rhdr, rlsn))
			goto out;
		if (!check_index_buffer(ib, bytes) ||
		    !check_if_alloc_index(hdr, aoff)) {
			goto dirty_vol;
		}

		memmove(Add2Ptr(e, le16_to_cpu(e->view.data_off)), data, dlen);

		a_dirty = true;
		ntfs_fix_pre_write(&ib->rhdr, bytes);
		break;

	default:
		WARN_ON(1);
	}

	/* Redo pass: stamp the applied lsn into whatever target was loaded. */
	if (rlsn) {
		__le64 lsn_le = cpu_to_le64(*rlsn);

		if (rec)
			rec->rhdr.lsn = lsn_le;
		if (ib)
			ib->rhdr.lsn = lsn_le;
	}

	if (mi && mi->dirty) {
		err = mi_write(mi, 0);
		if (err)
			goto out;
	}

	if (a_dirty) {
		attr = oa->attr;
		err = ntfs_sb_write_run(sbi, oa->run1, vbo, buffer_le, bytes,
					0);
		if (err)
			goto out;
	}

out:
	/* 'mi' is owned by 'inode' when set, and by the parent if mi2_child. */
	if (inode)
		iput(inode);
	else if (mi != mi2_child)
		mi_put(mi);

	kfree(buffer_le);

	return err;

dirty_vol:
	log->set_dirty = true;
	goto out;
}

/*
 * log_replay - Replays log and empties it.
 *
 * This function is called during mount operation.
 * It replays log and empties it.
 * Initialized is set false if logfile contains '-1'.
*/ int log_replay(struct ntfs_inode *ni, bool *initialized) { int err; struct ntfs_sb_info *sbi = ni->mi.sbi; struct ntfs_log *log; u64 rec_lsn, checkpt_lsn = 0, rlsn = 0; struct ATTR_NAME_ENTRY *attr_names = NULL; u32 attr_names_bytes = 0; u32 oatbl_bytes = 0; struct RESTART_TABLE *dptbl = NULL; struct RESTART_TABLE *trtbl = NULL; const struct RESTART_TABLE *rt; struct RESTART_TABLE *oatbl = NULL; struct inode *inode; struct OpenAttr *oa; struct ntfs_inode *ni_oe; struct ATTRIB *attr = NULL; u64 size, vcn, undo_next_lsn; CLST rno, lcn, lcn0, len0, clen; void *data; struct NTFS_RESTART *rst = NULL; struct lcb *lcb = NULL; struct OPEN_ATTR_ENRTY *oe; struct ATTR_NAME_ENTRY *ane; struct TRANSACTION_ENTRY *tr; struct DIR_PAGE_ENTRY *dp; u32 i, bytes_per_attr_entry; u32 vbo, tail, off, dlen; u32 saved_len, rec_len, transact_id; bool use_second_page; struct RESTART_AREA *ra2, *ra = NULL; struct CLIENT_REC *ca, *cr; __le16 client; struct RESTART_HDR *rh; const struct LFS_RECORD_HDR *frh; const struct LOG_REC_HDR *lrh; bool is_mapped; bool is_ro = sb_rdonly(sbi->sb); u64 t64; u16 t16; u32 t32; log = kzalloc(sizeof(struct ntfs_log), GFP_NOFS); if (!log) return -ENOMEM; log->ni = ni; log->l_size = log->orig_file_size = ni->vfs_inode.i_size; /* Get the size of page. NOTE: To replay we can use default page. */ #if PAGE_SIZE >= DefaultLogPageSize && PAGE_SIZE <= DefaultLogPageSize * 2 log->page_size = norm_file_page(PAGE_SIZE, &log->l_size, true); #else log->page_size = norm_file_page(PAGE_SIZE, &log->l_size, false); #endif if (!log->page_size) { err = -EINVAL; goto out; } log->one_page_buf = kmalloc(log->page_size, GFP_NOFS); if (!log->one_page_buf) { err = -ENOMEM; goto out; } log->page_mask = log->page_size - 1; log->page_bits = blksize_bits(log->page_size); /* Look for a restart area on the disk. 
*/ err = log_read_rst(log, true, &log->rst_info); if (err) goto out; /* remember 'initialized' */ *initialized = log->rst_info.initialized; if (!log->rst_info.restart) { if (log->rst_info.initialized) { /* No restart area but the file is not initialized. */ err = -EINVAL; goto out; } log_init_pg_hdr(log, 1, 1); log_create(log, 0, get_random_u32(), false, false); ra = log_create_ra(log); if (!ra) { err = -ENOMEM; goto out; } log->ra = ra; log->init_ra = true; goto process_log; } /* * If the restart offset above wasn't zero then we won't * look for a second restart. */ if (log->rst_info.vbo) goto check_restart_area; err = log_read_rst(log, false, &log->rst_info2); if (err) goto out; /* Determine which restart area to use. */ if (!log->rst_info2.restart || log->rst_info2.last_lsn <= log->rst_info.last_lsn) goto use_first_page; use_second_page = true; if (log->rst_info.chkdsk_was_run && log->page_size != log->rst_info.vbo) { struct RECORD_PAGE_HDR *sp = NULL; bool usa_error; if (!read_log_page(log, log->page_size, &sp, &usa_error) && sp->rhdr.sign == NTFS_CHKD_SIGNATURE) { use_second_page = false; } kfree(sp); } if (use_second_page) { kfree(log->rst_info.r_page); memcpy(&log->rst_info, &log->rst_info2, sizeof(struct restart_info)); log->rst_info2.r_page = NULL; } use_first_page: kfree(log->rst_info2.r_page); check_restart_area: /* * If the restart area is at offset 0, we want * to write the second restart area first. */ log->init_ra = !!log->rst_info.vbo; /* If we have a valid page then grab a pointer to the restart area. */ ra2 = log->rst_info.valid_page ? Add2Ptr(log->rst_info.r_page, le16_to_cpu(log->rst_info.r_page->ra_off)) : NULL; if (log->rst_info.chkdsk_was_run || (ra2 && ra2->client_idx[1] == LFS_NO_CLIENT_LE)) { bool wrapped = false; bool use_multi_page = false; u32 open_log_count; /* Do some checks based on whether we have a valid log page. */ open_log_count = log->rst_info.valid_page ? 
le32_to_cpu(ra2->open_log_count) : get_random_u32(); log_init_pg_hdr(log, 1, 1); log_create(log, log->rst_info.last_lsn, open_log_count, wrapped, use_multi_page); ra = log_create_ra(log); if (!ra) { err = -ENOMEM; goto out; } log->ra = ra; /* Put the restart areas and initialize * the log file as required. */ goto process_log; } if (!ra2) { err = -EINVAL; goto out; } /* * If the log page or the system page sizes have changed, we can't * use the log file. We must use the system page size instead of the * default size if there is not a clean shutdown. */ t32 = le32_to_cpu(log->rst_info.r_page->sys_page_size); if (log->page_size != t32) { log->l_size = log->orig_file_size; log->page_size = norm_file_page(t32, &log->l_size, t32 == DefaultLogPageSize); } if (log->page_size != t32 || log->page_size != le32_to_cpu(log->rst_info.r_page->page_size)) { err = -EINVAL; goto out; } log->page_mask = log->page_size - 1; log->page_bits = blksize_bits(log->page_size); /* If the file size has shrunk then we won't mount it. */ if (log->l_size < le64_to_cpu(ra2->l_size)) { err = -EINVAL; goto out; } log_init_pg_hdr(log, le16_to_cpu(log->rst_info.r_page->major_ver), le16_to_cpu(log->rst_info.r_page->minor_ver)); log->l_size = le64_to_cpu(ra2->l_size); log->seq_num_bits = le32_to_cpu(ra2->seq_num_bits); log->file_data_bits = sizeof(u64) * 8 - log->seq_num_bits; log->seq_num_mask = (8 << log->file_data_bits) - 1; log->last_lsn = le64_to_cpu(ra2->current_lsn); log->seq_num = log->last_lsn >> log->file_data_bits; log->ra_off = le16_to_cpu(log->rst_info.r_page->ra_off); log->restart_size = log->sys_page_size - log->ra_off; log->record_header_len = le16_to_cpu(ra2->rec_hdr_len); log->ra_size = le16_to_cpu(ra2->ra_len); log->data_off = le16_to_cpu(ra2->data_off); log->data_size = log->page_size - log->data_off; log->reserved = log->data_size - log->record_header_len; vbo = lsn_to_vbo(log, log->last_lsn); if (vbo < log->first_page) { /* This is a pseudo lsn. 
*/ log->l_flags |= NTFSLOG_NO_LAST_LSN; log->next_page = log->first_page; goto find_oldest; } /* Find the end of this log record. */ off = final_log_off(log, log->last_lsn, le32_to_cpu(ra2->last_lsn_data_len)); /* If we wrapped the file then increment the sequence number. */ if (off <= vbo) { log->seq_num += 1; log->l_flags |= NTFSLOG_WRAPPED; } /* Now compute the next log page to use. */ vbo &= ~log->sys_page_mask; tail = log->page_size - (off & log->page_mask) - 1; /* *If we can fit another log record on the page, * move back a page the log file. */ if (tail >= log->record_header_len) { log->l_flags |= NTFSLOG_REUSE_TAIL; log->next_page = vbo; } else { log->next_page = next_page_off(log, vbo); } find_oldest: /* * Find the oldest client lsn. Use the last * flushed lsn as a starting point. */ log->oldest_lsn = log->last_lsn; oldest_client_lsn(Add2Ptr(ra2, le16_to_cpu(ra2->client_off)), ra2->client_idx[1], &log->oldest_lsn); log->oldest_lsn_off = lsn_to_vbo(log, log->oldest_lsn); if (log->oldest_lsn_off < log->first_page) log->l_flags |= NTFSLOG_NO_OLDEST_LSN; if (!(ra2->flags & RESTART_SINGLE_PAGE_IO)) log->l_flags |= NTFSLOG_WRAPPED | NTFSLOG_MULTIPLE_PAGE_IO; log->current_openlog_count = le32_to_cpu(ra2->open_log_count); log->total_avail_pages = log->l_size - log->first_page; log->total_avail = log->total_avail_pages >> log->page_bits; log->max_current_avail = log->total_avail * log->reserved; log->total_avail = log->total_avail * log->data_size; log->current_avail = current_log_avail(log); ra = kzalloc(log->restart_size, GFP_NOFS); if (!ra) { err = -ENOMEM; goto out; } log->ra = ra; t16 = le16_to_cpu(ra2->client_off); if (t16 == offsetof(struct RESTART_AREA, clients)) { memcpy(ra, ra2, log->ra_size); } else { memcpy(ra, ra2, offsetof(struct RESTART_AREA, clients)); memcpy(ra->clients, Add2Ptr(ra2, t16), le16_to_cpu(ra2->ra_len) - t16); log->current_openlog_count = get_random_u32(); ra->open_log_count = cpu_to_le32(log->current_openlog_count); log->ra_size = 
offsetof(struct RESTART_AREA, clients) + sizeof(struct CLIENT_REC); ra->client_off = cpu_to_le16(offsetof(struct RESTART_AREA, clients)); ra->ra_len = cpu_to_le16(log->ra_size); } le32_add_cpu(&ra->open_log_count, 1); /* Now we need to walk through looking for the last lsn. */ err = last_log_lsn(log); if (err) goto out; log->current_avail = current_log_avail(log); /* Remember which restart area to write first. */ log->init_ra = log->rst_info.vbo; process_log: /* 1.0, 1.1, 2.0 log->major_ver/minor_ver - short values. */ switch ((log->major_ver << 16) + log->minor_ver) { case 0x10000: case 0x10001: case 0x20000: break; default: ntfs_warn(sbi->sb, "\x24LogFile version %d.%d is not supported", log->major_ver, log->minor_ver); err = -EOPNOTSUPP; log->set_dirty = true; goto out; } /* One client "NTFS" per logfile. */ ca = Add2Ptr(ra, le16_to_cpu(ra->client_off)); for (client = ra->client_idx[1];; client = cr->next_client) { if (client == LFS_NO_CLIENT_LE) { /* Insert "NTFS" client LogFile. */ client = ra->client_idx[0]; if (client == LFS_NO_CLIENT_LE) { err = -EINVAL; goto out; } t16 = le16_to_cpu(client); cr = ca + t16; remove_client(ca, cr, &ra->client_idx[0]); cr->restart_lsn = 0; cr->oldest_lsn = cpu_to_le64(log->oldest_lsn); cr->name_bytes = cpu_to_le32(8); cr->name[0] = cpu_to_le16('N'); cr->name[1] = cpu_to_le16('T'); cr->name[2] = cpu_to_le16('F'); cr->name[3] = cpu_to_le16('S'); add_client(ca, t16, &ra->client_idx[1]); break; } cr = ca + le16_to_cpu(client); if (cpu_to_le32(8) == cr->name_bytes && cpu_to_le16('N') == cr->name[0] && cpu_to_le16('T') == cr->name[1] && cpu_to_le16('F') == cr->name[2] && cpu_to_le16('S') == cr->name[3]) break; } /* Update the client handle with the client block information. */ log->client_id.seq_num = cr->seq_num; log->client_id.client_idx = client; err = read_rst_area(log, &rst, &checkpt_lsn); if (err) goto out; if (!rst) goto out; bytes_per_attr_entry = !rst->major_ver ? 
0x2C : 0x28; if (rst->check_point_start) checkpt_lsn = le64_to_cpu(rst->check_point_start); /* Allocate and Read the Transaction Table. */ if (!rst->transact_table_len) goto check_dirty_page_table; /* reduce tab pressure. */ t64 = le64_to_cpu(rst->transact_table_lsn); err = read_log_rec_lcb(log, t64, lcb_ctx_prev, &lcb); if (err) goto out; lrh = lcb->log_rec; frh = lcb->lrh; rec_len = le32_to_cpu(frh->client_data_len); if (!check_log_rec(lrh, rec_len, le32_to_cpu(frh->transact_id), bytes_per_attr_entry)) { err = -EINVAL; goto out; } t16 = le16_to_cpu(lrh->redo_off); rt = Add2Ptr(lrh, t16); t32 = rec_len - t16; /* Now check that this is a valid restart table. */ if (!check_rstbl(rt, t32)) { err = -EINVAL; goto out; } trtbl = kmemdup(rt, t32, GFP_NOFS); if (!trtbl) { err = -ENOMEM; goto out; } lcb_put(lcb); lcb = NULL; check_dirty_page_table: /* The next record back should be the Dirty Pages Table. */ if (!rst->dirty_pages_len) goto check_attribute_names; /* reduce tab pressure. */ t64 = le64_to_cpu(rst->dirty_pages_table_lsn); err = read_log_rec_lcb(log, t64, lcb_ctx_prev, &lcb); if (err) goto out; lrh = lcb->log_rec; frh = lcb->lrh; rec_len = le32_to_cpu(frh->client_data_len); if (!check_log_rec(lrh, rec_len, le32_to_cpu(frh->transact_id), bytes_per_attr_entry)) { err = -EINVAL; goto out; } t16 = le16_to_cpu(lrh->redo_off); rt = Add2Ptr(lrh, t16); t32 = rec_len - t16; /* Now check that this is a valid restart table. */ if (!check_rstbl(rt, t32)) { err = -EINVAL; goto out; } dptbl = kmemdup(rt, t32, GFP_NOFS); if (!dptbl) { err = -ENOMEM; goto out; } /* Convert Ra version '0' into version '1'. */ if (rst->major_ver) goto end_conv_1; /* reduce tab pressure. */ dp = NULL; while ((dp = enum_rstbl(dptbl, dp))) { struct DIR_PAGE_ENTRY_32 *dp0 = (struct DIR_PAGE_ENTRY_32 *)dp; // NOTE: Danger. Check for of boundary. 
memmove(&dp->vcn, &dp0->vcn_low, 2 * sizeof(u64) + le32_to_cpu(dp->lcns_follow) * sizeof(u64)); } end_conv_1: lcb_put(lcb); lcb = NULL; /* * Go through the table and remove the duplicates, * remembering the oldest lsn values. */ if (sbi->cluster_size <= log->page_size) goto trace_dp_table; /* reduce tab pressure. */ dp = NULL; while ((dp = enum_rstbl(dptbl, dp))) { struct DIR_PAGE_ENTRY *next = dp; while ((next = enum_rstbl(dptbl, next))) { if (next->target_attr == dp->target_attr && next->vcn == dp->vcn) { if (le64_to_cpu(next->oldest_lsn) < le64_to_cpu(dp->oldest_lsn)) { dp->oldest_lsn = next->oldest_lsn; } free_rsttbl_idx(dptbl, PtrOffset(dptbl, next)); } } } trace_dp_table: check_attribute_names: /* The next record should be the Attribute Names. */ if (!rst->attr_names_len) goto check_attr_table; /* reduce tab pressure. */ t64 = le64_to_cpu(rst->attr_names_lsn); err = read_log_rec_lcb(log, t64, lcb_ctx_prev, &lcb); if (err) goto out; lrh = lcb->log_rec; frh = lcb->lrh; rec_len = le32_to_cpu(frh->client_data_len); if (!check_log_rec(lrh, rec_len, le32_to_cpu(frh->transact_id), bytes_per_attr_entry)) { err = -EINVAL; goto out; } t32 = lrh_length(lrh); attr_names_bytes = rec_len - t32; attr_names = kmemdup(Add2Ptr(lrh, t32), attr_names_bytes, GFP_NOFS); if (!attr_names) { err = -ENOMEM; goto out; } lcb_put(lcb); lcb = NULL; check_attr_table: /* The next record should be the attribute Table. */ if (!rst->open_attr_len) goto check_attribute_names2; /* reduce tab pressure. 
*/ t64 = le64_to_cpu(rst->open_attr_table_lsn); err = read_log_rec_lcb(log, t64, lcb_ctx_prev, &lcb); if (err) goto out; lrh = lcb->log_rec; frh = lcb->lrh; rec_len = le32_to_cpu(frh->client_data_len); if (!check_log_rec(lrh, rec_len, le32_to_cpu(frh->transact_id), bytes_per_attr_entry)) { err = -EINVAL; goto out; } t16 = le16_to_cpu(lrh->redo_off); rt = Add2Ptr(lrh, t16); oatbl_bytes = rec_len - t16; if (!check_rstbl(rt, oatbl_bytes)) { err = -EINVAL; goto out; } oatbl = kmemdup(rt, oatbl_bytes, GFP_NOFS); if (!oatbl) { err = -ENOMEM; goto out; } log->open_attr_tbl = oatbl; /* Clear all of the Attr pointers. */ oe = NULL; while ((oe = enum_rstbl(oatbl, oe))) { if (!rst->major_ver) { struct OPEN_ATTR_ENRTY_32 oe0; /* Really 'oe' points to OPEN_ATTR_ENRTY_32. */ memcpy(&oe0, oe, SIZEOF_OPENATTRIBUTEENTRY0); oe->bytes_per_index = oe0.bytes_per_index; oe->type = oe0.type; oe->is_dirty_pages = oe0.is_dirty_pages; oe->name_len = 0; oe->ref = oe0.ref; oe->open_record_lsn = oe0.open_record_lsn; } oe->is_attr_name = 0; oe->ptr = NULL; } lcb_put(lcb); lcb = NULL; check_attribute_names2: if (attr_names && oatbl) { off = 0; for (;;) { /* Check we can use attribute name entry 'ane'. */ static_assert(sizeof(*ane) == 4); if (off + sizeof(*ane) > attr_names_bytes) { /* just ignore the rest. */ break; } ane = Add2Ptr(attr_names, off); t16 = le16_to_cpu(ane->off); if (!t16) { /* this is the only valid exit. */ break; } /* Check we can use open attribute entry 'oe'. */ if (t16 + sizeof(*oe) > oatbl_bytes) { /* just ignore the rest. */ break; } /* TODO: Clear table on exit! */ oe = Add2Ptr(oatbl, t16); t16 = le16_to_cpu(ane->name_bytes); off += t16 + sizeof(*ane); if (off > attr_names_bytes) { /* just ignore the rest. */ break; } oe->name_len = t16 / sizeof(short); oe->ptr = ane->name; oe->is_attr_name = 2; } } /* * If the checkpt_lsn is zero, then this is a freshly * formatted disk and we have no work to do. 
*/ if (!checkpt_lsn) { err = 0; goto out; } if (!oatbl) { oatbl = init_rsttbl(bytes_per_attr_entry, 8); if (!oatbl) { err = -ENOMEM; goto out; } } log->open_attr_tbl = oatbl; /* Start the analysis pass from the Checkpoint lsn. */ rec_lsn = checkpt_lsn; /* Read the first lsn. */ err = read_log_rec_lcb(log, checkpt_lsn, lcb_ctx_next, &lcb); if (err) goto out; /* Loop to read all subsequent records to the end of the log file. */ next_log_record_analyze: err = read_next_log_rec(log, lcb, &rec_lsn); if (err) goto out; if (!rec_lsn) goto end_log_records_enumerate; frh = lcb->lrh; transact_id = le32_to_cpu(frh->transact_id); rec_len = le32_to_cpu(frh->client_data_len); lrh = lcb->log_rec; if (!check_log_rec(lrh, rec_len, transact_id, bytes_per_attr_entry)) { err = -EINVAL; goto out; } /* * The first lsn after the previous lsn remembered * the checkpoint is the first candidate for the rlsn. */ if (!rlsn) rlsn = rec_lsn; if (LfsClientRecord != frh->record_type) goto next_log_record_analyze; /* * Now update the Transaction Table for this transaction. If there * is no entry present or it is unallocated we allocate the entry. */ if (!trtbl) { trtbl = init_rsttbl(sizeof(struct TRANSACTION_ENTRY), INITIAL_NUMBER_TRANSACTIONS); if (!trtbl) { err = -ENOMEM; goto out; } } tr = Add2Ptr(trtbl, transact_id); if (transact_id >= bytes_per_rt(trtbl) || tr->next != RESTART_ENTRY_ALLOCATED_LE) { tr = alloc_rsttbl_from_idx(&trtbl, transact_id); if (!tr) { err = -ENOMEM; goto out; } tr->transact_state = TransactionActive; tr->first_lsn = cpu_to_le64(rec_lsn); } tr->prev_lsn = tr->undo_next_lsn = cpu_to_le64(rec_lsn); /* * If this is a compensation log record, then change * the undo_next_lsn to be the undo_next_lsn of this record. */ if (lrh->undo_op == cpu_to_le16(CompensationLogRecord)) tr->undo_next_lsn = frh->client_undo_next_lsn; /* Dispatch to handle log record depending on type. 
*/ switch (le16_to_cpu(lrh->redo_op)) { case InitializeFileRecordSegment: case DeallocateFileRecordSegment: case WriteEndOfFileRecordSegment: case CreateAttribute: case DeleteAttribute: case UpdateResidentValue: case UpdateNonresidentValue: case UpdateMappingPairs: case SetNewAttributeSizes: case AddIndexEntryRoot: case DeleteIndexEntryRoot: case AddIndexEntryAllocation: case DeleteIndexEntryAllocation: case WriteEndOfIndexBuffer: case SetIndexEntryVcnRoot: case SetIndexEntryVcnAllocation: case UpdateFileNameRoot: case UpdateFileNameAllocation: case SetBitsInNonresidentBitMap: case ClearBitsInNonresidentBitMap: case UpdateRecordDataRoot: case UpdateRecordDataAllocation: case ZeroEndOfFileRecord: t16 = le16_to_cpu(lrh->target_attr); t64 = le64_to_cpu(lrh->target_vcn); dp = find_dp(dptbl, t16, t64); if (dp) goto copy_lcns; /* * Calculate the number of clusters per page the system * which wrote the checkpoint, possibly creating the table. */ if (dptbl) { t32 = (le16_to_cpu(dptbl->size) - sizeof(struct DIR_PAGE_ENTRY)) / sizeof(u64); } else { t32 = log->clst_per_page; kfree(dptbl); dptbl = init_rsttbl(struct_size(dp, page_lcns, t32), 32); if (!dptbl) { err = -ENOMEM; goto out; } } dp = alloc_rsttbl_idx(&dptbl); if (!dp) { err = -ENOMEM; goto out; } dp->target_attr = cpu_to_le32(t16); dp->transfer_len = cpu_to_le32(t32 << sbi->cluster_bits); dp->lcns_follow = cpu_to_le32(t32); dp->vcn = cpu_to_le64(t64 & ~((u64)t32 - 1)); dp->oldest_lsn = cpu_to_le64(rec_lsn); copy_lcns: /* * Copy the Lcns from the log record into the Dirty Page Entry. * TODO: For different page size support, must somehow make * whole routine a loop, case Lcns do not fit below. 
*/ t16 = le16_to_cpu(lrh->lcns_follow); for (i = 0; i < t16; i++) { size_t j = (size_t)(le64_to_cpu(lrh->target_vcn) - le64_to_cpu(dp->vcn)); dp->page_lcns[j + i] = lrh->page_lcns[i]; } goto next_log_record_analyze; case DeleteDirtyClusters: { u32 range_count = le16_to_cpu(lrh->redo_len) / sizeof(struct LCN_RANGE); const struct LCN_RANGE *r = Add2Ptr(lrh, le16_to_cpu(lrh->redo_off)); /* Loop through all of the Lcn ranges this log record. */ for (i = 0; i < range_count; i++, r++) { u64 lcn0 = le64_to_cpu(r->lcn); u64 lcn_e = lcn0 + le64_to_cpu(r->len) - 1; dp = NULL; while ((dp = enum_rstbl(dptbl, dp))) { u32 j; t32 = le32_to_cpu(dp->lcns_follow); for (j = 0; j < t32; j++) { t64 = le64_to_cpu(dp->page_lcns[j]); if (t64 >= lcn0 && t64 <= lcn_e) dp->page_lcns[j] = 0; } } } goto next_log_record_analyze; } case OpenNonresidentAttribute: t16 = le16_to_cpu(lrh->target_attr); if (t16 >= bytes_per_rt(oatbl)) { /* * Compute how big the table needs to be. * Add 10 extra entries for some cushion. */ u32 new_e = t16 / le16_to_cpu(oatbl->size); new_e += 10 - le16_to_cpu(oatbl->used); oatbl = extend_rsttbl(oatbl, new_e, ~0u); log->open_attr_tbl = oatbl; if (!oatbl) { err = -ENOMEM; goto out; } } /* Point to the entry being opened. */ oe = alloc_rsttbl_from_idx(&oatbl, t16); log->open_attr_tbl = oatbl; if (!oe) { err = -ENOMEM; goto out; } /* Initialize this entry from the log record. */ t16 = le16_to_cpu(lrh->redo_off); if (!rst->major_ver) { /* Convert version '0' into version '1'. 
*/ struct OPEN_ATTR_ENRTY_32 *oe0 = Add2Ptr(lrh, t16); oe->bytes_per_index = oe0->bytes_per_index; oe->type = oe0->type; oe->is_dirty_pages = oe0->is_dirty_pages; oe->name_len = 0; //oe0.name_len; oe->ref = oe0->ref; oe->open_record_lsn = oe0->open_record_lsn; } else { memcpy(oe, Add2Ptr(lrh, t16), bytes_per_attr_entry); } t16 = le16_to_cpu(lrh->undo_len); if (t16) { oe->ptr = kmalloc(t16, GFP_NOFS); if (!oe->ptr) { err = -ENOMEM; goto out; } oe->name_len = t16 / sizeof(short); memcpy(oe->ptr, Add2Ptr(lrh, le16_to_cpu(lrh->undo_off)), t16); oe->is_attr_name = 1; } else { oe->ptr = NULL; oe->is_attr_name = 0; } goto next_log_record_analyze; case HotFix: t16 = le16_to_cpu(lrh->target_attr); t64 = le64_to_cpu(lrh->target_vcn); dp = find_dp(dptbl, t16, t64); if (dp) { size_t j = le64_to_cpu(lrh->target_vcn) - le64_to_cpu(dp->vcn); if (dp->page_lcns[j]) dp->page_lcns[j] = lrh->page_lcns[0]; } goto next_log_record_analyze; case EndTopLevelAction: tr = Add2Ptr(trtbl, transact_id); tr->prev_lsn = cpu_to_le64(rec_lsn); tr->undo_next_lsn = frh->client_undo_next_lsn; goto next_log_record_analyze; case PrepareTransaction: tr = Add2Ptr(trtbl, transact_id); tr->transact_state = TransactionPrepared; goto next_log_record_analyze; case CommitTransaction: tr = Add2Ptr(trtbl, transact_id); tr->transact_state = TransactionCommitted; goto next_log_record_analyze; case ForgetTransaction: free_rsttbl_idx(trtbl, transact_id); goto next_log_record_analyze; case Noop: case OpenAttributeTableDump: case AttributeNamesDump: case DirtyPageTableDump: case TransactionTableDump: /* The following cases require no action the Analysis Pass. */ goto next_log_record_analyze; default: /* * All codes will be explicitly handled. * If we see a code we do not expect, then we are trouble. */ goto next_log_record_analyze; } end_log_records_enumerate: lcb_put(lcb); lcb = NULL; /* * Scan the Dirty Page Table and Transaction Table for * the lowest lsn, and return it as the Redo lsn. 
*/ dp = NULL; while ((dp = enum_rstbl(dptbl, dp))) { t64 = le64_to_cpu(dp->oldest_lsn); if (t64 && t64 < rlsn) rlsn = t64; } tr = NULL; while ((tr = enum_rstbl(trtbl, tr))) { t64 = le64_to_cpu(tr->first_lsn); if (t64 && t64 < rlsn) rlsn = t64; } /* * Only proceed if the Dirty Page Table or Transaction * table are not empty. */ if ((!dptbl || !dptbl->total) && (!trtbl || !trtbl->total)) goto end_replay; sbi->flags |= NTFS_FLAGS_NEED_REPLAY; if (is_ro) goto out; /* Reopen all of the attributes with dirty pages. */ oe = NULL; next_open_attribute: oe = enum_rstbl(oatbl, oe); if (!oe) { err = 0; dp = NULL; goto next_dirty_page; } oa = kzalloc(sizeof(struct OpenAttr), GFP_NOFS); if (!oa) { err = -ENOMEM; goto out; } inode = ntfs_iget5(sbi->sb, &oe->ref, NULL); if (IS_ERR(inode)) goto fake_attr; if (is_bad_inode(inode)) { iput(inode); fake_attr: if (oa->ni) { iput(&oa->ni->vfs_inode); oa->ni = NULL; } attr = attr_create_nonres_log(sbi, oe->type, 0, oe->ptr, oe->name_len, 0); if (!attr) { kfree(oa); err = -ENOMEM; goto out; } oa->attr = attr; oa->run1 = &oa->run0; goto final_oe; } ni_oe = ntfs_i(inode); oa->ni = ni_oe; attr = ni_find_attr(ni_oe, NULL, NULL, oe->type, oe->ptr, oe->name_len, NULL, NULL); if (!attr) goto fake_attr; t32 = le32_to_cpu(attr->size); oa->attr = kmemdup(attr, t32, GFP_NOFS); if (!oa->attr) goto fake_attr; if (!S_ISDIR(inode->i_mode)) { if (attr->type == ATTR_DATA && !attr->name_len) { oa->run1 = &ni_oe->file.run; goto final_oe; } } else { if (attr->type == ATTR_ALLOC && attr->name_len == ARRAY_SIZE(I30_NAME) && !memcmp(attr_name(attr), I30_NAME, sizeof(I30_NAME))) { oa->run1 = &ni_oe->dir.alloc_run; goto final_oe; } } if (attr->non_res) { u16 roff = le16_to_cpu(attr->nres.run_off); CLST svcn = le64_to_cpu(attr->nres.svcn); if (roff > t32) { kfree(oa->attr); oa->attr = NULL; goto fake_attr; } err = run_unpack(&oa->run0, sbi, inode->i_ino, svcn, le64_to_cpu(attr->nres.evcn), svcn, Add2Ptr(attr, roff), t32 - roff); if (err < 0) { kfree(oa->attr); 
oa->attr = NULL; goto fake_attr; } err = 0; } oa->run1 = &oa->run0; attr = oa->attr; final_oe: if (oe->is_attr_name == 1) kfree(oe->ptr); oe->is_attr_name = 0; oe->ptr = oa; oe->name_len = attr->name_len; goto next_open_attribute; /* * Now loop through the dirty page table to extract all of the Vcn/Lcn. * Mapping that we have, and insert it into the appropriate run. */ next_dirty_page: dp = enum_rstbl(dptbl, dp); if (!dp) goto do_redo_1; oe = Add2Ptr(oatbl, le32_to_cpu(dp->target_attr)); if (oe->next != RESTART_ENTRY_ALLOCATED_LE) goto next_dirty_page; oa = oe->ptr; if (!oa) goto next_dirty_page; i = -1; next_dirty_page_vcn: i += 1; if (i >= le32_to_cpu(dp->lcns_follow)) goto next_dirty_page; vcn = le64_to_cpu(dp->vcn) + i; size = (vcn + 1) << sbi->cluster_bits; if (!dp->page_lcns[i]) goto next_dirty_page_vcn; rno = ino_get(&oe->ref); if (rno <= MFT_REC_MIRR && size < (MFT_REC_VOL + 1) * sbi->record_size && oe->type == ATTR_DATA) { goto next_dirty_page_vcn; } lcn = le64_to_cpu(dp->page_lcns[i]); if ((!run_lookup_entry(oa->run1, vcn, &lcn0, &len0, NULL) || lcn0 != lcn) && !run_add_entry(oa->run1, vcn, lcn, 1, false)) { err = -ENOMEM; goto out; } attr = oa->attr; if (size > le64_to_cpu(attr->nres.alloc_size)) { attr->nres.valid_size = attr->nres.data_size = attr->nres.alloc_size = cpu_to_le64(size); } goto next_dirty_page_vcn; do_redo_1: /* * Perform the Redo Pass, to restore all of the dirty pages to the same * contents that they had immediately before the crash. If the dirty * page table is empty, then we can skip the entire Redo Pass. */ if (!dptbl || !dptbl->total) goto do_undo_action; rec_lsn = rlsn; /* * Read the record at the Redo lsn, before falling * into common code to handle each record. */ err = read_log_rec_lcb(log, rlsn, lcb_ctx_next, &lcb); if (err) goto out; /* * Now loop to read all of our log records forwards, until * we hit the end of the file, cleaning up at the end. 
*/ do_action_next: frh = lcb->lrh; if (LfsClientRecord != frh->record_type) goto read_next_log_do_action; transact_id = le32_to_cpu(frh->transact_id); rec_len = le32_to_cpu(frh->client_data_len); lrh = lcb->log_rec; if (!check_log_rec(lrh, rec_len, transact_id, bytes_per_attr_entry)) { err = -EINVAL; goto out; } /* Ignore log records that do not update pages. */ if (lrh->lcns_follow) goto find_dirty_page; goto read_next_log_do_action; find_dirty_page: t16 = le16_to_cpu(lrh->target_attr); t64 = le64_to_cpu(lrh->target_vcn); dp = find_dp(dptbl, t16, t64); if (!dp) goto read_next_log_do_action; if (rec_lsn < le64_to_cpu(dp->oldest_lsn)) goto read_next_log_do_action; t16 = le16_to_cpu(lrh->target_attr); if (t16 >= bytes_per_rt(oatbl)) { err = -EINVAL; goto out; } oe = Add2Ptr(oatbl, t16); if (oe->next != RESTART_ENTRY_ALLOCATED_LE) { err = -EINVAL; goto out; } oa = oe->ptr; if (!oa) { err = -EINVAL; goto out; } attr = oa->attr; vcn = le64_to_cpu(lrh->target_vcn); if (!run_lookup_entry(oa->run1, vcn, &lcn, NULL, NULL) || lcn == SPARSE_LCN) { goto read_next_log_do_action; } /* Point to the Redo data and get its length. */ data = Add2Ptr(lrh, le16_to_cpu(lrh->redo_off)); dlen = le16_to_cpu(lrh->redo_len); /* Shorten length by any Lcns which were deleted. */ saved_len = dlen; for (i = le16_to_cpu(lrh->lcns_follow); i; i--) { size_t j; u32 alen, voff; voff = le16_to_cpu(lrh->record_off) + le16_to_cpu(lrh->attr_off); voff += le16_to_cpu(lrh->cluster_off) << SECTOR_SHIFT; /* If the Vcn question is allocated, we can just get out. */ j = le64_to_cpu(lrh->target_vcn) - le64_to_cpu(dp->vcn); if (dp->page_lcns[j + i - 1]) break; if (!saved_len) saved_len = 1; /* * Calculate the allocated space left relative to the * log record Vcn, after removing this unallocated Vcn. */ alen = (i - 1) << sbi->cluster_bits; /* * If the update described this log record goes beyond * the allocated space, then we will have to reduce the length. 
*/ if (voff >= alen) dlen = 0; else if (voff + dlen > alen) dlen = alen - voff; } /* * If the resulting dlen from above is now zero, * we can skip this log record. */ if (!dlen && saved_len) goto read_next_log_do_action; t16 = le16_to_cpu(lrh->redo_op); if (can_skip_action(t16)) goto read_next_log_do_action; /* Apply the Redo operation a common routine. */ err = do_action(log, oe, lrh, t16, data, dlen, rec_len, &rec_lsn); if (err) goto out; /* Keep reading and looping back until end of file. */ read_next_log_do_action: err = read_next_log_rec(log, lcb, &rec_lsn); if (!err && rec_lsn) goto do_action_next; lcb_put(lcb); lcb = NULL; do_undo_action: /* Scan Transaction Table. */ tr = NULL; transaction_table_next: tr = enum_rstbl(trtbl, tr); if (!tr) goto undo_action_done; if (TransactionActive != tr->transact_state || !tr->undo_next_lsn) { free_rsttbl_idx(trtbl, PtrOffset(trtbl, tr)); goto transaction_table_next; } log->transaction_id = PtrOffset(trtbl, tr); undo_next_lsn = le64_to_cpu(tr->undo_next_lsn); /* * We only have to do anything if the transaction has * something its undo_next_lsn field. */ if (!undo_next_lsn) goto commit_undo; /* Read the first record to be undone by this transaction. */ err = read_log_rec_lcb(log, undo_next_lsn, lcb_ctx_undo_next, &lcb); if (err) goto out; /* * Now loop to read all of our log records forwards, * until we hit the end of the file, cleaning up at the end. 
*/ undo_action_next: lrh = lcb->log_rec; frh = lcb->lrh; transact_id = le32_to_cpu(frh->transact_id); rec_len = le32_to_cpu(frh->client_data_len); if (!check_log_rec(lrh, rec_len, transact_id, bytes_per_attr_entry)) { err = -EINVAL; goto out; } if (lrh->undo_op == cpu_to_le16(Noop)) goto read_next_log_undo_action; oe = Add2Ptr(oatbl, le16_to_cpu(lrh->target_attr)); oa = oe->ptr; t16 = le16_to_cpu(lrh->lcns_follow); if (!t16) goto add_allocated_vcns; is_mapped = run_lookup_entry(oa->run1, le64_to_cpu(lrh->target_vcn), &lcn, &clen, NULL); /* * If the mapping isn't already the table or the mapping * corresponds to a hole the mapping, we need to make sure * there is no partial page already memory. */ if (is_mapped && lcn != SPARSE_LCN && clen >= t16) goto add_allocated_vcns; vcn = le64_to_cpu(lrh->target_vcn); vcn &= ~(u64)(log->clst_per_page - 1); add_allocated_vcns: for (i = 0, vcn = le64_to_cpu(lrh->target_vcn), size = (vcn + 1) << sbi->cluster_bits; i < t16; i++, vcn += 1, size += sbi->cluster_size) { attr = oa->attr; if (!attr->non_res) { if (size > le32_to_cpu(attr->res.data_size)) attr->res.data_size = cpu_to_le32(size); } else { if (size > le64_to_cpu(attr->nres.data_size)) attr->nres.valid_size = attr->nres.data_size = attr->nres.alloc_size = cpu_to_le64(size); } } t16 = le16_to_cpu(lrh->undo_op); if (can_skip_action(t16)) goto read_next_log_undo_action; /* Point to the Redo data and get its length. */ data = Add2Ptr(lrh, le16_to_cpu(lrh->undo_off)); dlen = le16_to_cpu(lrh->undo_len); /* It is time to apply the undo action. */ err = do_action(log, oe, lrh, t16, data, dlen, rec_len, NULL); read_next_log_undo_action: /* * Keep reading and looping back until we have read the * last record for this transaction. 
*/ err = read_next_log_rec(log, lcb, &rec_lsn); if (err) goto out; if (rec_lsn) goto undo_action_next; lcb_put(lcb); lcb = NULL; commit_undo: free_rsttbl_idx(trtbl, log->transaction_id); log->transaction_id = 0; goto transaction_table_next; undo_action_done: ntfs_update_mftmirr(sbi, 0); sbi->flags &= ~NTFS_FLAGS_NEED_REPLAY; end_replay: err = 0; if (is_ro) goto out; rh = kzalloc(log->page_size, GFP_NOFS); if (!rh) { err = -ENOMEM; goto out; } rh->rhdr.sign = NTFS_RSTR_SIGNATURE; rh->rhdr.fix_off = cpu_to_le16(offsetof(struct RESTART_HDR, fixups)); t16 = (log->page_size >> SECTOR_SHIFT) + 1; rh->rhdr.fix_num = cpu_to_le16(t16); rh->sys_page_size = cpu_to_le32(log->page_size); rh->page_size = cpu_to_le32(log->page_size); t16 = ALIGN(offsetof(struct RESTART_HDR, fixups) + sizeof(short) * t16, 8); rh->ra_off = cpu_to_le16(t16); rh->minor_ver = cpu_to_le16(1); // 0x1A: rh->major_ver = cpu_to_le16(1); // 0x1C: ra2 = Add2Ptr(rh, t16); memcpy(ra2, ra, sizeof(struct RESTART_AREA)); ra2->client_idx[0] = 0; ra2->client_idx[1] = LFS_NO_CLIENT_LE; ra2->flags = cpu_to_le16(2); le32_add_cpu(&ra2->open_log_count, 1); ntfs_fix_pre_write(&rh->rhdr, log->page_size); err = ntfs_sb_write_run(sbi, &ni->file.run, 0, rh, log->page_size, 0); if (!err) err = ntfs_sb_write_run(sbi, &log->ni->file.run, log->page_size, rh, log->page_size, 0); kfree(rh); if (err) goto out; out: kfree(rst); if (lcb) lcb_put(lcb); /* * Scan the Open Attribute Table to close all of * the open attributes. 
*/ oe = NULL; while ((oe = enum_rstbl(oatbl, oe))) { rno = ino_get(&oe->ref); if (oe->is_attr_name == 1) { kfree(oe->ptr); oe->ptr = NULL; continue; } if (oe->is_attr_name) continue; oa = oe->ptr; if (!oa) continue; run_close(&oa->run0); kfree(oa->attr); if (oa->ni) iput(&oa->ni->vfs_inode); kfree(oa); } kfree(trtbl); kfree(oatbl); kfree(dptbl); kfree(attr_names); kfree(log->rst_info.r_page); kfree(ra); kfree(log->one_page_buf); if (err) sbi->flags |= NTFS_FLAGS_NEED_REPLAY; if (err == -EROFS) err = 0; else if (log->set_dirty) ntfs_set_state(sbi, NTFS_DIRTY_ERROR); kfree(log); return err; } |
| 8 7 6 7 7 24 20 17 3 16 4 13 6 13 6 20 12 8 12 8 20 20 2 1 2 2 2 1 2 2 5 5 1 9 1 1 1 1 1 1 2 2 3 3 3 3 5 1 1 2 3 3 2 2 47 14 1 30 2 5 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 
493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 
993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2014 Fraunhofer ITWM * * Written by: * Phoebe Buckheister <phoebe.buckheister@itwm.fraunhofer.de> */ #include <linux/err.h> #include <linux/bug.h> #include <linux/completion.h> #include <linux/ieee802154.h> #include <linux/rculist.h> #include <crypto/aead.h> #include <crypto/skcipher.h> #include "ieee802154_i.h" #include "llsec.h" static void llsec_key_put(struct mac802154_llsec_key *key); static bool llsec_key_id_equal(const struct ieee802154_llsec_key_id *a, const struct ieee802154_llsec_key_id *b); static void llsec_dev_free(struct mac802154_llsec_device *dev); void mac802154_llsec_init(struct mac802154_llsec *sec) { memset(sec, 0, sizeof(*sec)); memset(&sec->params.default_key_source, 0xFF, IEEE802154_ADDR_LEN); INIT_LIST_HEAD(&sec->table.security_levels); INIT_LIST_HEAD(&sec->table.devices); INIT_LIST_HEAD(&sec->table.keys); hash_init(sec->devices_short); hash_init(sec->devices_hw); rwlock_init(&sec->lock); } void mac802154_llsec_destroy(struct mac802154_llsec *sec) { struct ieee802154_llsec_seclevel *sl, *sn; struct ieee802154_llsec_device *dev, *dn; struct ieee802154_llsec_key_entry *key, *kn; list_for_each_entry_safe(sl, sn, &sec->table.security_levels, list) { struct mac802154_llsec_seclevel *msl; msl = container_of(sl, struct mac802154_llsec_seclevel, level); list_del(&sl->list); kfree_sensitive(msl); } list_for_each_entry_safe(dev, dn, &sec->table.devices, list) { struct mac802154_llsec_device *mdev; mdev = container_of(dev, struct mac802154_llsec_device, dev); list_del(&dev->list); llsec_dev_free(mdev); } list_for_each_entry_safe(key, kn, &sec->table.keys, 
list) { struct mac802154_llsec_key *mkey; mkey = container_of(key->key, struct mac802154_llsec_key, key); list_del(&key->list); llsec_key_put(mkey); kfree_sensitive(key); } } int mac802154_llsec_get_params(struct mac802154_llsec *sec, struct ieee802154_llsec_params *params) { read_lock_bh(&sec->lock); *params = sec->params; read_unlock_bh(&sec->lock); return 0; } int mac802154_llsec_set_params(struct mac802154_llsec *sec, const struct ieee802154_llsec_params *params, int changed) { write_lock_bh(&sec->lock); if (changed & IEEE802154_LLSEC_PARAM_ENABLED) sec->params.enabled = params->enabled; if (changed & IEEE802154_LLSEC_PARAM_FRAME_COUNTER) sec->params.frame_counter = params->frame_counter; if (changed & IEEE802154_LLSEC_PARAM_OUT_LEVEL) sec->params.out_level = params->out_level; if (changed & IEEE802154_LLSEC_PARAM_OUT_KEY) sec->params.out_key = params->out_key; if (changed & IEEE802154_LLSEC_PARAM_KEY_SOURCE) sec->params.default_key_source = params->default_key_source; if (changed & IEEE802154_LLSEC_PARAM_PAN_ID) sec->params.pan_id = params->pan_id; if (changed & IEEE802154_LLSEC_PARAM_HWADDR) sec->params.hwaddr = params->hwaddr; if (changed & IEEE802154_LLSEC_PARAM_COORD_HWADDR) sec->params.coord_hwaddr = params->coord_hwaddr; if (changed & IEEE802154_LLSEC_PARAM_COORD_SHORTADDR) sec->params.coord_shortaddr = params->coord_shortaddr; write_unlock_bh(&sec->lock); return 0; } static struct mac802154_llsec_key* llsec_key_alloc(const struct ieee802154_llsec_key *template) { const int authsizes[3] = { 4, 8, 16 }; struct mac802154_llsec_key *key; int i; key = kzalloc(sizeof(*key), GFP_KERNEL); if (!key) return NULL; kref_init(&key->ref); key->key = *template; BUILD_BUG_ON(ARRAY_SIZE(authsizes) != ARRAY_SIZE(key->tfm)); for (i = 0; i < ARRAY_SIZE(key->tfm); i++) { key->tfm[i] = crypto_alloc_aead("ccm(aes)", 0, CRYPTO_ALG_ASYNC); if (IS_ERR(key->tfm[i])) goto err_tfm; if (crypto_aead_setkey(key->tfm[i], template->key, IEEE802154_LLSEC_KEY_SIZE)) goto err_tfm; if 
(crypto_aead_setauthsize(key->tfm[i], authsizes[i])) goto err_tfm; } key->tfm0 = crypto_alloc_sync_skcipher("ctr(aes)", 0, 0); if (IS_ERR(key->tfm0)) goto err_tfm; if (crypto_sync_skcipher_setkey(key->tfm0, template->key, IEEE802154_LLSEC_KEY_SIZE)) goto err_tfm0; return key; err_tfm0: crypto_free_sync_skcipher(key->tfm0); err_tfm: for (i = 0; i < ARRAY_SIZE(key->tfm); i++) if (!IS_ERR_OR_NULL(key->tfm[i])) crypto_free_aead(key->tfm[i]); kfree_sensitive(key); return NULL; } static void llsec_key_release(struct kref *ref) { struct mac802154_llsec_key *key; int i; key = container_of(ref, struct mac802154_llsec_key, ref); for (i = 0; i < ARRAY_SIZE(key->tfm); i++) crypto_free_aead(key->tfm[i]); crypto_free_sync_skcipher(key->tfm0); kfree_sensitive(key); } static struct mac802154_llsec_key* llsec_key_get(struct mac802154_llsec_key *key) { kref_get(&key->ref); return key; } static void llsec_key_put(struct mac802154_llsec_key *key) { kref_put(&key->ref, llsec_key_release); } static bool llsec_key_id_equal(const struct ieee802154_llsec_key_id *a, const struct ieee802154_llsec_key_id *b) { if (a->mode != b->mode) return false; if (a->mode == IEEE802154_SCF_KEY_IMPLICIT) return ieee802154_addr_equal(&a->device_addr, &b->device_addr); if (a->id != b->id) return false; switch (a->mode) { case IEEE802154_SCF_KEY_INDEX: return true; case IEEE802154_SCF_KEY_SHORT_INDEX: return a->short_source == b->short_source; case IEEE802154_SCF_KEY_HW_INDEX: return a->extended_source == b->extended_source; } return false; } int mac802154_llsec_key_add(struct mac802154_llsec *sec, const struct ieee802154_llsec_key_id *id, const struct ieee802154_llsec_key *key) { struct mac802154_llsec_key *mkey = NULL; struct ieee802154_llsec_key_entry *pos, *new; if (!(key->frame_types & (1 << IEEE802154_FC_TYPE_MAC_CMD)) && key->cmd_frame_ids) return -EINVAL; list_for_each_entry(pos, &sec->table.keys, list) { if (llsec_key_id_equal(&pos->id, id)) return -EEXIST; if (memcmp(pos->key->key, key->key, 
IEEE802154_LLSEC_KEY_SIZE)) continue; mkey = container_of(pos->key, struct mac802154_llsec_key, key); /* Don't allow multiple instances of the same AES key to have * different allowed frame types/command frame ids, as this is * not possible in the 802.15.4 PIB. */ if (pos->key->frame_types != key->frame_types || pos->key->cmd_frame_ids != key->cmd_frame_ids) return -EEXIST; break; } new = kzalloc(sizeof(*new), GFP_KERNEL); if (!new) return -ENOMEM; if (!mkey) mkey = llsec_key_alloc(key); else mkey = llsec_key_get(mkey); if (!mkey) goto fail; new->id = *id; new->key = &mkey->key; list_add_rcu(&new->list, &sec->table.keys); return 0; fail: kfree_sensitive(new); return -ENOMEM; } static void mac802154_llsec_key_del_rcu(struct rcu_head *rcu) { struct ieee802154_llsec_key_entry *pos; struct mac802154_llsec_key *mkey; pos = container_of(rcu, struct ieee802154_llsec_key_entry, rcu); mkey = container_of(pos->key, struct mac802154_llsec_key, key); llsec_key_put(mkey); kfree_sensitive(pos); } int mac802154_llsec_key_del(struct mac802154_llsec *sec, const struct ieee802154_llsec_key_id *key) { struct ieee802154_llsec_key_entry *pos; list_for_each_entry(pos, &sec->table.keys, list) { if (llsec_key_id_equal(&pos->id, key)) { list_del_rcu(&pos->list); call_rcu(&pos->rcu, mac802154_llsec_key_del_rcu); return 0; } } return -ENOENT; } static bool llsec_dev_use_shortaddr(__le16 short_addr) { return short_addr != cpu_to_le16(IEEE802154_ADDR_UNDEF) && short_addr != cpu_to_le16(0xffff); } static u32 llsec_dev_hash_short(__le16 short_addr, __le16 pan_id) { return ((__force u16)short_addr) << 16 | (__force u16)pan_id; } static u64 llsec_dev_hash_long(__le64 hwaddr) { return (__force u64)hwaddr; } static struct mac802154_llsec_device* llsec_dev_find_short(struct mac802154_llsec *sec, __le16 short_addr, __le16 pan_id) { struct mac802154_llsec_device *dev; u32 key = llsec_dev_hash_short(short_addr, pan_id); hash_for_each_possible_rcu(sec->devices_short, dev, bucket_s, key) { if 
(dev->dev.short_addr == short_addr && dev->dev.pan_id == pan_id) return dev; } return NULL; } static struct mac802154_llsec_device* llsec_dev_find_long(struct mac802154_llsec *sec, __le64 hwaddr) { struct mac802154_llsec_device *dev; u64 key = llsec_dev_hash_long(hwaddr); hash_for_each_possible_rcu(sec->devices_hw, dev, bucket_hw, key) { if (dev->dev.hwaddr == hwaddr) return dev; } return NULL; } static void llsec_dev_free(struct mac802154_llsec_device *dev) { struct ieee802154_llsec_device_key *pos, *pn; struct mac802154_llsec_device_key *devkey; list_for_each_entry_safe(pos, pn, &dev->dev.keys, list) { devkey = container_of(pos, struct mac802154_llsec_device_key, devkey); list_del(&pos->list); kfree_sensitive(devkey); } kfree_sensitive(dev); } int mac802154_llsec_dev_add(struct mac802154_llsec *sec, const struct ieee802154_llsec_device *dev) { struct mac802154_llsec_device *entry; u32 skey = llsec_dev_hash_short(dev->short_addr, dev->pan_id); u64 hwkey = llsec_dev_hash_long(dev->hwaddr); BUILD_BUG_ON(sizeof(hwkey) != IEEE802154_ADDR_LEN); if ((llsec_dev_use_shortaddr(dev->short_addr) && llsec_dev_find_short(sec, dev->short_addr, dev->pan_id)) || llsec_dev_find_long(sec, dev->hwaddr)) return -EEXIST; entry = kmalloc(sizeof(*entry), GFP_KERNEL); if (!entry) return -ENOMEM; entry->dev = *dev; spin_lock_init(&entry->lock); INIT_LIST_HEAD(&entry->dev.keys); if (llsec_dev_use_shortaddr(dev->short_addr)) hash_add_rcu(sec->devices_short, &entry->bucket_s, skey); else INIT_HLIST_NODE(&entry->bucket_s); hash_add_rcu(sec->devices_hw, &entry->bucket_hw, hwkey); list_add_tail_rcu(&entry->dev.list, &sec->table.devices); return 0; } static void llsec_dev_free_rcu(struct rcu_head *rcu) { llsec_dev_free(container_of(rcu, struct mac802154_llsec_device, rcu)); } int mac802154_llsec_dev_del(struct mac802154_llsec *sec, __le64 device_addr) { struct mac802154_llsec_device *pos; pos = llsec_dev_find_long(sec, device_addr); if (!pos) return -ENOENT; hash_del_rcu(&pos->bucket_s); 
hash_del_rcu(&pos->bucket_hw); list_del_rcu(&pos->dev.list); call_rcu(&pos->rcu, llsec_dev_free_rcu); return 0; } static struct mac802154_llsec_device_key* llsec_devkey_find(struct mac802154_llsec_device *dev, const struct ieee802154_llsec_key_id *key) { struct ieee802154_llsec_device_key *devkey; list_for_each_entry_rcu(devkey, &dev->dev.keys, list) { if (!llsec_key_id_equal(key, &devkey->key_id)) continue; return container_of(devkey, struct mac802154_llsec_device_key, devkey); } return NULL; } int mac802154_llsec_devkey_add(struct mac802154_llsec *sec, __le64 dev_addr, const struct ieee802154_llsec_device_key *key) { struct mac802154_llsec_device *dev; struct mac802154_llsec_device_key *devkey; dev = llsec_dev_find_long(sec, dev_addr); if (!dev) return -ENOENT; if (llsec_devkey_find(dev, &key->key_id)) return -EEXIST; devkey = kmalloc(sizeof(*devkey), GFP_KERNEL); if (!devkey) return -ENOMEM; devkey->devkey = *key; list_add_tail_rcu(&devkey->devkey.list, &dev->dev.keys); return 0; } int mac802154_llsec_devkey_del(struct mac802154_llsec *sec, __le64 dev_addr, const struct ieee802154_llsec_device_key *key) { struct mac802154_llsec_device *dev; struct mac802154_llsec_device_key *devkey; dev = llsec_dev_find_long(sec, dev_addr); if (!dev) return -ENOENT; devkey = llsec_devkey_find(dev, &key->key_id); if (!devkey) return -ENOENT; list_del_rcu(&devkey->devkey.list); kfree_rcu(devkey, rcu); return 0; } static struct mac802154_llsec_seclevel* llsec_find_seclevel(const struct mac802154_llsec *sec, const struct ieee802154_llsec_seclevel *sl) { struct ieee802154_llsec_seclevel *pos; list_for_each_entry(pos, &sec->table.security_levels, list) { if (pos->frame_type != sl->frame_type || (pos->frame_type == IEEE802154_FC_TYPE_MAC_CMD && pos->cmd_frame_id != sl->cmd_frame_id) || pos->device_override != sl->device_override || pos->sec_levels != sl->sec_levels) continue; return container_of(pos, struct mac802154_llsec_seclevel, level); } return NULL; } int 
mac802154_llsec_seclevel_add(struct mac802154_llsec *sec, const struct ieee802154_llsec_seclevel *sl) { struct mac802154_llsec_seclevel *entry; if (llsec_find_seclevel(sec, sl)) return -EEXIST; entry = kmalloc(sizeof(*entry), GFP_KERNEL); if (!entry) return -ENOMEM; entry->level = *sl; list_add_tail_rcu(&entry->level.list, &sec->table.security_levels); return 0; } int mac802154_llsec_seclevel_del(struct mac802154_llsec *sec, const struct ieee802154_llsec_seclevel *sl) { struct mac802154_llsec_seclevel *pos; pos = llsec_find_seclevel(sec, sl); if (!pos) return -ENOENT; list_del_rcu(&pos->level.list); kfree_rcu(pos, rcu); return 0; } static int llsec_recover_addr(struct mac802154_llsec *sec, struct ieee802154_addr *addr) { __le16 caddr = sec->params.coord_shortaddr; addr->pan_id = sec->params.pan_id; if (caddr == cpu_to_le16(IEEE802154_ADDR_BROADCAST)) { return -EINVAL; } else if (caddr == cpu_to_le16(IEEE802154_ADDR_UNDEF)) { addr->extended_addr = sec->params.coord_hwaddr; addr->mode = IEEE802154_ADDR_LONG; } else { addr->short_addr = sec->params.coord_shortaddr; addr->mode = IEEE802154_ADDR_SHORT; } return 0; } static struct mac802154_llsec_key* llsec_lookup_key(struct mac802154_llsec *sec, const struct ieee802154_hdr *hdr, const struct ieee802154_addr *addr, struct ieee802154_llsec_key_id *key_id) { struct ieee802154_addr devaddr = *addr; u8 key_id_mode = hdr->sec.key_id_mode; struct ieee802154_llsec_key_entry *key_entry; struct mac802154_llsec_key *key; if (key_id_mode == IEEE802154_SCF_KEY_IMPLICIT && devaddr.mode == IEEE802154_ADDR_NONE) { if (hdr->fc.type == IEEE802154_FC_TYPE_BEACON) { devaddr.extended_addr = sec->params.coord_hwaddr; devaddr.mode = IEEE802154_ADDR_LONG; } else if (llsec_recover_addr(sec, &devaddr) < 0) { return NULL; } } list_for_each_entry_rcu(key_entry, &sec->table.keys, list) { const struct ieee802154_llsec_key_id *id = &key_entry->id; if (!(key_entry->key->frame_types & BIT(hdr->fc.type))) continue; if (id->mode != key_id_mode) continue; 
if (key_id_mode == IEEE802154_SCF_KEY_IMPLICIT) { if (ieee802154_addr_equal(&devaddr, &id->device_addr)) goto found; } else { if (id->id != hdr->sec.key_id) continue; if ((key_id_mode == IEEE802154_SCF_KEY_INDEX) || (key_id_mode == IEEE802154_SCF_KEY_SHORT_INDEX && id->short_source == hdr->sec.short_src) || (key_id_mode == IEEE802154_SCF_KEY_HW_INDEX && id->extended_source == hdr->sec.extended_src)) goto found; } } return NULL; found: key = container_of(key_entry->key, struct mac802154_llsec_key, key); if (key_id) *key_id = key_entry->id; return llsec_key_get(key); } static void llsec_geniv(u8 iv[16], __le64 addr, const struct ieee802154_sechdr *sec) { __be64 addr_bytes = (__force __be64) swab64((__force u64) addr); __be32 frame_counter = (__force __be32) swab32((__force u32) sec->frame_counter); iv[0] = 1; /* L' = L - 1 = 1 */ memcpy(iv + 1, &addr_bytes, sizeof(addr_bytes)); memcpy(iv + 9, &frame_counter, sizeof(frame_counter)); iv[13] = sec->level; iv[14] = 0; iv[15] = 1; } static int llsec_do_encrypt_unauth(struct sk_buff *skb, const struct mac802154_llsec *sec, const struct ieee802154_hdr *hdr, struct mac802154_llsec_key *key) { u8 iv[16]; struct scatterlist src; SYNC_SKCIPHER_REQUEST_ON_STACK(req, key->tfm0); int err, datalen; unsigned char *data; llsec_geniv(iv, sec->params.hwaddr, &hdr->sec); /* Compute data payload offset and data length */ data = skb_mac_header(skb) + skb->mac_len; datalen = skb_tail_pointer(skb) - data; sg_init_one(&src, data, datalen); skcipher_request_set_sync_tfm(req, key->tfm0); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, &src, &src, datalen, iv); err = crypto_skcipher_encrypt(req); skcipher_request_zero(req); return err; } static struct crypto_aead* llsec_tfm_by_len(struct mac802154_llsec_key *key, int authlen) { int i; for (i = 0; i < ARRAY_SIZE(key->tfm); i++) if (crypto_aead_authsize(key->tfm[i]) == authlen) return key->tfm[i]; BUG(); } static int llsec_do_encrypt_auth(struct sk_buff *skb, 
const struct mac802154_llsec *sec, const struct ieee802154_hdr *hdr, struct mac802154_llsec_key *key) { u8 iv[16]; unsigned char *data; int authlen, assoclen, datalen, rc; struct scatterlist sg; struct aead_request *req; authlen = ieee802154_sechdr_authtag_len(&hdr->sec); llsec_geniv(iv, sec->params.hwaddr, &hdr->sec); req = aead_request_alloc(llsec_tfm_by_len(key, authlen), GFP_ATOMIC); if (!req) return -ENOMEM; assoclen = skb->mac_len; data = skb_mac_header(skb) + skb->mac_len; datalen = skb_tail_pointer(skb) - data; skb_put(skb, authlen); sg_init_one(&sg, skb_mac_header(skb), assoclen + datalen + authlen); if (!(hdr->sec.level & IEEE802154_SCF_SECLEVEL_ENC)) { assoclen += datalen; datalen = 0; } aead_request_set_callback(req, 0, NULL, NULL); aead_request_set_crypt(req, &sg, &sg, datalen, iv); aead_request_set_ad(req, assoclen); rc = crypto_aead_encrypt(req); kfree_sensitive(req); return rc; } static int llsec_do_encrypt(struct sk_buff *skb, const struct mac802154_llsec *sec, const struct ieee802154_hdr *hdr, struct mac802154_llsec_key *key) { if (hdr->sec.level == IEEE802154_SCF_SECLEVEL_ENC) return llsec_do_encrypt_unauth(skb, sec, hdr, key); else return llsec_do_encrypt_auth(skb, sec, hdr, key); } int mac802154_llsec_encrypt(struct mac802154_llsec *sec, struct sk_buff *skb) { struct ieee802154_hdr hdr; int rc, authlen, hlen; struct mac802154_llsec_key *key; u32 frame_ctr; hlen = ieee802154_hdr_pull(skb, &hdr); /* TODO: control frames security support */ if (hlen < 0 || (hdr.fc.type != IEEE802154_FC_TYPE_DATA && hdr.fc.type != IEEE802154_FC_TYPE_BEACON)) return -EINVAL; if (!hdr.fc.security_enabled || (hdr.sec.level == IEEE802154_SCF_SECLEVEL_NONE)) { skb_push(skb, hlen); return 0; } authlen = ieee802154_sechdr_authtag_len(&hdr.sec); if (skb->len + hlen + authlen + IEEE802154_MFR_SIZE > IEEE802154_MTU) return -EMSGSIZE; rcu_read_lock(); read_lock_bh(&sec->lock); if (!sec->params.enabled) { rc = -EINVAL; goto fail_read; } key = llsec_lookup_key(sec, &hdr, 
&hdr.dest, NULL); if (!key) { rc = -ENOKEY; goto fail_read; } read_unlock_bh(&sec->lock); write_lock_bh(&sec->lock); frame_ctr = be32_to_cpu(sec->params.frame_counter); hdr.sec.frame_counter = cpu_to_le32(frame_ctr); if (frame_ctr == 0xFFFFFFFF) { write_unlock_bh(&sec->lock); llsec_key_put(key); rc = -EOVERFLOW; goto fail; } sec->params.frame_counter = cpu_to_be32(frame_ctr + 1); write_unlock_bh(&sec->lock); rcu_read_unlock(); skb->mac_len = ieee802154_hdr_push(skb, &hdr); skb_reset_mac_header(skb); rc = llsec_do_encrypt(skb, sec, &hdr, key); llsec_key_put(key); return rc; fail_read: read_unlock_bh(&sec->lock); fail: rcu_read_unlock(); return rc; } static struct mac802154_llsec_device* llsec_lookup_dev(struct mac802154_llsec *sec, const struct ieee802154_addr *addr) { struct ieee802154_addr devaddr = *addr; struct mac802154_llsec_device *dev = NULL; if (devaddr.mode == IEEE802154_ADDR_NONE && llsec_recover_addr(sec, &devaddr) < 0) return NULL; if (devaddr.mode == IEEE802154_ADDR_SHORT) { u32 key = llsec_dev_hash_short(devaddr.short_addr, devaddr.pan_id); hash_for_each_possible_rcu(sec->devices_short, dev, bucket_s, key) { if (dev->dev.pan_id == devaddr.pan_id && dev->dev.short_addr == devaddr.short_addr) return dev; } } else { u64 key = llsec_dev_hash_long(devaddr.extended_addr); hash_for_each_possible_rcu(sec->devices_hw, dev, bucket_hw, key) { if (dev->dev.hwaddr == devaddr.extended_addr) return dev; } } return NULL; } static int llsec_lookup_seclevel(const struct mac802154_llsec *sec, u8 frame_type, u8 cmd_frame_id, struct ieee802154_llsec_seclevel *rlevel) { struct ieee802154_llsec_seclevel *level; list_for_each_entry_rcu(level, &sec->table.security_levels, list) { if (level->frame_type == frame_type && (frame_type != IEEE802154_FC_TYPE_MAC_CMD || level->cmd_frame_id == cmd_frame_id)) { *rlevel = *level; return 0; } } return -EINVAL; } static int llsec_do_decrypt_unauth(struct sk_buff *skb, const struct mac802154_llsec *sec, const struct ieee802154_hdr *hdr, 
struct mac802154_llsec_key *key, __le64 dev_addr) { u8 iv[16]; unsigned char *data; int datalen; struct scatterlist src; SYNC_SKCIPHER_REQUEST_ON_STACK(req, key->tfm0); int err; llsec_geniv(iv, dev_addr, &hdr->sec); data = skb_mac_header(skb) + skb->mac_len; datalen = skb_tail_pointer(skb) - data; sg_init_one(&src, data, datalen); skcipher_request_set_sync_tfm(req, key->tfm0); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, &src, &src, datalen, iv); err = crypto_skcipher_decrypt(req); skcipher_request_zero(req); return err; } static int llsec_do_decrypt_auth(struct sk_buff *skb, const struct mac802154_llsec *sec, const struct ieee802154_hdr *hdr, struct mac802154_llsec_key *key, __le64 dev_addr) { u8 iv[16]; unsigned char *data; int authlen, datalen, assoclen, rc; struct scatterlist sg; struct aead_request *req; authlen = ieee802154_sechdr_authtag_len(&hdr->sec); llsec_geniv(iv, dev_addr, &hdr->sec); req = aead_request_alloc(llsec_tfm_by_len(key, authlen), GFP_ATOMIC); if (!req) return -ENOMEM; assoclen = skb->mac_len; data = skb_mac_header(skb) + skb->mac_len; datalen = skb_tail_pointer(skb) - data; sg_init_one(&sg, skb_mac_header(skb), assoclen + datalen); if (!(hdr->sec.level & IEEE802154_SCF_SECLEVEL_ENC)) { assoclen += datalen - authlen; datalen = authlen; } aead_request_set_callback(req, 0, NULL, NULL); aead_request_set_crypt(req, &sg, &sg, datalen, iv); aead_request_set_ad(req, assoclen); rc = crypto_aead_decrypt(req); kfree_sensitive(req); skb_trim(skb, skb->len - authlen); return rc; } static int llsec_do_decrypt(struct sk_buff *skb, const struct mac802154_llsec *sec, const struct ieee802154_hdr *hdr, struct mac802154_llsec_key *key, __le64 dev_addr) { if (hdr->sec.level == IEEE802154_SCF_SECLEVEL_ENC) return llsec_do_decrypt_unauth(skb, sec, hdr, key, dev_addr); else return llsec_do_decrypt_auth(skb, sec, hdr, key, dev_addr); } static int llsec_update_devkey_record(struct mac802154_llsec_device *dev, const struct 
ieee802154_llsec_key_id *in_key) { struct mac802154_llsec_device_key *devkey; devkey = llsec_devkey_find(dev, in_key); if (!devkey) { struct mac802154_llsec_device_key *next; next = kzalloc(sizeof(*devkey), GFP_ATOMIC); if (!next) return -ENOMEM; next->devkey.key_id = *in_key; spin_lock_bh(&dev->lock); devkey = llsec_devkey_find(dev, in_key); if (!devkey) list_add_rcu(&next->devkey.list, &dev->dev.keys); else kfree_sensitive(next); spin_unlock_bh(&dev->lock); } return 0; } static int llsec_update_devkey_info(struct mac802154_llsec_device *dev, const struct ieee802154_llsec_key_id *in_key, u32 frame_counter) { struct mac802154_llsec_device_key *devkey = NULL; if (dev->dev.key_mode == IEEE802154_LLSEC_DEVKEY_RESTRICT) { devkey = llsec_devkey_find(dev, in_key); if (!devkey) return -ENOENT; } if (dev->dev.key_mode == IEEE802154_LLSEC_DEVKEY_RECORD) { int rc = llsec_update_devkey_record(dev, in_key); if (rc < 0) return rc; } spin_lock_bh(&dev->lock); if ((!devkey && frame_counter < dev->dev.frame_counter) || (devkey && frame_counter < devkey->devkey.frame_counter)) { spin_unlock_bh(&dev->lock); return -EINVAL; } if (devkey) devkey->devkey.frame_counter = frame_counter + 1; else dev->dev.frame_counter = frame_counter + 1; spin_unlock_bh(&dev->lock); return 0; } int mac802154_llsec_decrypt(struct mac802154_llsec *sec, struct sk_buff *skb) { struct ieee802154_hdr hdr; struct mac802154_llsec_key *key; struct ieee802154_llsec_key_id key_id; struct mac802154_llsec_device *dev; struct ieee802154_llsec_seclevel seclevel; int err; __le64 dev_addr; u32 frame_ctr; if (ieee802154_hdr_peek(skb, &hdr) < 0) return -EINVAL; if (!hdr.fc.security_enabled) return 0; if (hdr.fc.version == 0) return -EINVAL; read_lock_bh(&sec->lock); if (!sec->params.enabled) { read_unlock_bh(&sec->lock); return -EINVAL; } read_unlock_bh(&sec->lock); rcu_read_lock(); key = llsec_lookup_key(sec, &hdr, &hdr.source, &key_id); if (!key) { err = -ENOKEY; goto fail; } dev = llsec_lookup_dev(sec, &hdr.source); if 
(!dev) { err = -EINVAL; goto fail_dev; } if (llsec_lookup_seclevel(sec, hdr.fc.type, 0, &seclevel) < 0) { err = -EINVAL; goto fail_dev; } if (!(seclevel.sec_levels & BIT(hdr.sec.level)) && (hdr.sec.level == 0 && seclevel.device_override && !dev->dev.seclevel_exempt)) { err = -EINVAL; goto fail_dev; } frame_ctr = le32_to_cpu(hdr.sec.frame_counter); if (frame_ctr == 0xffffffff) { err = -EOVERFLOW; goto fail_dev; } err = llsec_update_devkey_info(dev, &key_id, frame_ctr); if (err) goto fail_dev; dev_addr = dev->dev.hwaddr; rcu_read_unlock(); err = llsec_do_decrypt(skb, sec, &hdr, key, dev_addr); llsec_key_put(key); return err; fail_dev: llsec_key_put(key); fail: rcu_read_unlock(); return err; } |
| 13 13 13 13 13 13 10 3 13 12 1 2 11 13 1 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 
// SPDX-License-Identifier: GPL-2.0-only
/*
 * LED Class Core
 *
 * Copyright (C) 2005 John Lenz <lenz@cs.wisc.edu>
 * Copyright (C) 2005-2007 Richard Purdie <rpurdie@openedhand.com>
 */

#include <linux/ctype.h>
#include <linux/device.h>
#include <linux/err.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/leds.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/property.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/timer.h>
#include <uapi/linux/uleds.h>
#include <linux/of.h>
#include "leds.h"

/* Lookup table used by led_get(); protected by leds_lookup_lock. */
static DEFINE_MUTEX(leds_lookup_lock);
static LIST_HEAD(leds_lookup_list);

static struct workqueue_struct *leds_wq;

/*
 * sysfs "brightness" read: refresh the cached brightness via
 * led_update_brightness() and print it, all under led_access.
 */
static ssize_t brightness_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct led_classdev *led_cdev = dev_get_drvdata(dev);
	unsigned int brightness;

	mutex_lock(&led_cdev->led_access);
	led_update_brightness(led_cdev);
	brightness = led_cdev->brightness;
	mutex_unlock(&led_cdev->led_access);

	return sysfs_emit(buf, "%u\n", brightness);
}

/*
 * sysfs "brightness" write: parse a decimal value and apply it.
 * Returns -EBUSY while sysfs access is disabled, the kstrtoul() error on
 * malformed input, otherwise the number of bytes consumed.
 */
static ssize_t brightness_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t size)
{
	struct led_classdev *led_cdev = dev_get_drvdata(dev);
	unsigned long state;
	ssize_t ret;

	mutex_lock(&led_cdev->led_access);

	if (led_sysfs_is_disabled(led_cdev)) {
		ret = -EBUSY;
		goto unlock;
	}

	ret = kstrtoul(buf, 10, &state);
	if (ret)
		goto unlock;

	/* Writing LED_OFF also detaches the current trigger. */
	if (state == LED_OFF)
		led_trigger_remove(led_cdev);
	led_set_brightness(led_cdev, state);

	ret = size;
unlock:
	mutex_unlock(&led_cdev->led_access);
	return ret;
}
static DEVICE_ATTR_RW(brightness);

/* sysfs "max_brightness" read: print max_brightness under led_access. */
static ssize_t max_brightness_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct led_classdev *led_cdev = dev_get_drvdata(dev);
	unsigned int max_brightness;

	mutex_lock(&led_cdev->led_access);
	max_brightness = led_cdev->max_brightness;
	mutex_unlock(&led_cdev->led_access);

	return sysfs_emit(buf, "%u\n", max_brightness);
}
static DEVICE_ATTR_RO(max_brightness);

#ifdef CONFIG_LEDS_TRIGGERS
/* Binary "trigger" attribute, only present when triggers are built in. */
static const BIN_ATTR(trigger, 0644, led_trigger_read, led_trigger_write, 0);
static const struct bin_attribute *const led_trigger_bin_attrs[] = {
	&bin_attr_trigger,
	NULL,
};
static const struct attribute_group led_trigger_group = {
	.bin_attrs = led_trigger_bin_attrs,
};
#endif

static struct attribute *led_class_attrs[] = {
	&dev_attr_brightness.attr,
	&dev_attr_max_brightness.attr,
	NULL,
};

static const struct attribute_group led_group = {
	.attrs = led_class_attrs,
};

/* Attribute groups installed on every device of the "leds" class. */
static const struct attribute_group *led_groups[] = {
	&led_group,
#ifdef CONFIG_LEDS_TRIGGERS
	&led_trigger_group,
#endif
	NULL,
};

#ifdef CONFIG_LEDS_BRIGHTNESS_HW_CHANGED
/*
 * sysfs "brightness_hw_changed" read.  Returns -ENODATA while the stored
 * value is still -1, otherwise prints the last recorded brightness.
 */
static ssize_t brightness_hw_changed_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct led_classdev *led_cdev = dev_get_drvdata(dev);

	if (led_cdev->brightness_hw_changed == -1)
		return -ENODATA;

	return sysfs_emit(buf, "%u\n", led_cdev->brightness_hw_changed);
}

static DEVICE_ATTR_RO(brightness_hw_changed);

/*
 * Create the "brightness_hw_changed" file and cache its kernfs node for
 * later notifications.  The file is removed again if the node cannot be
 * obtained.  Returns 0 on success or a negative errno.
 */
static int led_add_brightness_hw_changed(struct led_classdev *led_cdev)
{
	struct device *dev = led_cdev->dev;
	int ret;

	ret = device_create_file(dev, &dev_attr_brightness_hw_changed);
	if (ret) {
		dev_err(dev, "Error creating brightness_hw_changed\n");
		return ret;
	}

	led_cdev->brightness_hw_changed_kn =
		sysfs_get_dirent(dev->kobj.sd, "brightness_hw_changed");
	if (!led_cdev->brightness_hw_changed_kn) {
		dev_err(dev, "Error getting brightness_hw_changed kn\n");
		device_remove_file(dev, &dev_attr_brightness_hw_changed);
		return -ENXIO;
	}

	return 0;
}

/* Release the cached kernfs node and remove the sysfs file. */
static void led_remove_brightness_hw_changed(struct led_classdev *led_cdev)
{
	sysfs_put(led_cdev->brightness_hw_changed_kn);
	device_remove_file(led_cdev->dev, &dev_attr_brightness_hw_changed);
}

/*
 * Record @brightness as the latest hardware-initiated change and notify
 * waiters on the sysfs file.  Warns and does nothing if the file was
 * never set up.
 */
void led_classdev_notify_brightness_hw_changed(struct led_classdev *led_cdev, unsigned int brightness)
{
	if (WARN_ON(!led_cdev->brightness_hw_changed_kn))
		return;

	led_cdev->brightness_hw_changed = brightness;
	sysfs_notify_dirent(led_cdev->brightness_hw_changed_kn);
}
EXPORT_SYMBOL_GPL(led_classdev_notify_brightness_hw_changed);
#else
/* Stubs used when CONFIG_LEDS_BRIGHTNESS_HW_CHANGED is disabled. */
static int led_add_brightness_hw_changed(struct led_classdev *led_cdev)
{
	return 0;
}
static void led_remove_brightness_hw_changed(struct led_classdev *led_cdev)
{
}
#endif

/**
 * led_classdev_suspend - suspend an led_classdev.
 * @led_cdev: the led_classdev to suspend.
 *
 * Sets LED_SUSPENDED, sets the brightness to 0 and waits for any pending
 * brightness work to finish.
 */
void led_classdev_suspend(struct led_classdev *led_cdev)
{
	led_cdev->flags |= LED_SUSPENDED;
	led_set_brightness_nopm(led_cdev, 0);
	flush_work(&led_cdev->set_brightness_work);
}
EXPORT_SYMBOL_GPL(led_classdev_suspend);

/**
 * led_classdev_resume - resume an led_classdev.
 * @led_cdev: the led_classdev to resume.
 *
 * Re-applies the cached brightness, calls the optional flash_resume hook
 * and clears LED_SUSPENDED.
 */
void led_classdev_resume(struct led_classdev *led_cdev)
{
	led_set_brightness_nopm(led_cdev, led_cdev->brightness);

	if (led_cdev->flash_resume)
		led_cdev->flash_resume(led_cdev);

	led_cdev->flags &= ~LED_SUSPENDED;
}
EXPORT_SYMBOL_GPL(led_classdev_resume);

#ifdef CONFIG_PM_SLEEP
/* Class PM hook: suspend only LEDs that opted in via LED_CORE_SUSPENDRESUME. */
static int led_suspend(struct device *dev)
{
	struct led_classdev *led_cdev = dev_get_drvdata(dev);

	if (led_cdev->flags & LED_CORE_SUSPENDRESUME)
		led_classdev_suspend(led_cdev);

	return 0;
}

/* Class PM hook: resume only LEDs that opted in via LED_CORE_SUSPENDRESUME. */
static int led_resume(struct device *dev)
{
	struct led_classdev *led_cdev = dev_get_drvdata(dev);

	if (led_cdev->flags & LED_CORE_SUSPENDRESUME)
		led_classdev_resume(led_cdev);

	return 0;
}
#endif

static SIMPLE_DEV_PM_OPS(leds_class_dev_pm_ops, led_suspend, led_resume);

/*
 * Turn a looked-up class device into a usable led_classdev by pinning the
 * module of the parent device's driver.
 *
 * Returns ERR_PTR(-EPROBE_DEFER) if @led_dev is NULL, ERR_PTR(-ENODEV)
 * (after dropping the device reference) if the module reference cannot
 * be taken, otherwise the led_classdev.
 */
static struct led_classdev *led_module_get(struct device *led_dev)
{
	struct led_classdev *led_cdev;

	if (!led_dev)
		return ERR_PTR(-EPROBE_DEFER);

	led_cdev = dev_get_drvdata(led_dev);

	if (!try_module_get(led_cdev->dev->parent->driver->owner)) {
		put_device(led_cdev->dev);
		return ERR_PTR(-ENODEV);
	}

	return led_cdev;
}

static const struct class leds_class = {
	.name = "leds",
	.dev_groups = led_groups,
	.pm = &leds_class_dev_pm_ops,
};

/**
 * of_led_get() - request a LED device via the LED framework
 * @np: device node to get the LED device from
 * @index: the index of the LED
 * @name: the name of the LED used to map it to its function, if present
 *
 * Returns the LED device parsed from the phandle specified in the "leds"
 * property of a device tree node or a negative error-code on failure.
 */
static struct led_classdev *of_led_get(struct device_node *np, int index,
				       const char *name)
{
	struct device *led_dev;
	struct device_node *led_node;

	/*
	 * For named LEDs, first look up the name in the "led-names" property.
	 * If it cannot be found, then of_parse_phandle() will propagate the error.
	 */
	if (name)
		index = of_property_match_string(np, "led-names", name);
	led_node = of_parse_phandle(np, "leds", index);
	if (!led_node)
		return ERR_PTR(-ENOENT);

	led_dev = class_find_device_by_of_node(&leds_class, led_node);
	of_node_put(led_node);

	return led_module_get(led_dev);
}

/**
 * led_put() - release a LED device
 * @led_cdev: LED device
 *
 * Drops the module reference and the device reference of @led_cdev.
 */
void led_put(struct led_classdev *led_cdev)
{
	module_put(led_cdev->dev->parent->driver->owner);
	put_device(led_cdev->dev);
}
EXPORT_SYMBOL_GPL(led_put);

/* devres release callback: put the LED stored in the resource. */
static void devm_led_release(struct device *dev, void *res)
{
	struct led_classdev **p = res;

	led_put(*p);
}

/*
 * Tie @led to the lifetime of @dev via devres.  On allocation failure the
 * LED is released and ERR_PTR(-ENOMEM) is returned.
 */
static struct led_classdev *__devm_led_get(struct device *dev, struct led_classdev *led)
{
	struct led_classdev **dr;

	dr = devres_alloc(devm_led_release, sizeof(struct led_classdev *), GFP_KERNEL);
	if (!dr) {
		led_put(led);
		return ERR_PTR(-ENOMEM);
	}

	*dr = led;
	devres_add(dev, dr);

	return led;
}

/**
 * devm_of_led_get - Resource-managed request of a LED device
 * @dev:	LED consumer
 * @index:	index of the LED to obtain in the consumer
 *
 * The device node of the device is parsed to find the requested LED device.
 * The LED device returned from this function is automatically released
 * on driver detach.
 *
 * @return a pointer to a LED device or ERR_PTR(errno) on failure.
 */
struct led_classdev *__must_check devm_of_led_get(struct device *dev,
						  int index)
{
	struct led_classdev *led;

	if (!dev)
		return ERR_PTR(-EINVAL);

	led = of_led_get(dev->of_node, index, NULL);
	if (IS_ERR(led))
		return led;

	return __devm_led_get(dev, led);
}
EXPORT_SYMBOL_GPL(devm_of_led_get);

/**
 * led_get() - request a LED device via the LED framework
 * @dev: device for which to get the LED device
 * @con_id: name of the LED from the device's point of view
 *
 * The device tree is tried first; only if that yields -ENOENT is the
 * lookup table searched for an entry matching the device name and
 * @con_id.
 *
 * @return a pointer to a LED device or ERR_PTR(errno) on failure.
 */
struct led_classdev *led_get(struct device *dev, char *con_id)
{
	struct led_lookup_data *lookup;
	struct led_classdev *led_cdev;
	const char *provider = NULL;
	struct device *led_dev;

	led_cdev = of_led_get(dev->of_node, -1, con_id);
	if (!IS_ERR(led_cdev) || PTR_ERR(led_cdev) != -ENOENT)
		return led_cdev;

	mutex_lock(&leds_lookup_lock);
	list_for_each_entry(lookup, &leds_lookup_list, list) {
		if (!strcmp(lookup->dev_id, dev_name(dev)) &&
		    !strcmp(lookup->con_id, con_id)) {
			/* Copy the name so it can be used after unlocking. */
			provider = kstrdup_const(lookup->provider, GFP_KERNEL);
			break;
		}
	}
	mutex_unlock(&leds_lookup_lock);

	if (!provider)
		return ERR_PTR(-ENOENT);

	led_dev = class_find_device_by_name(&leds_class, provider);
	kfree_const(provider);

	return led_module_get(led_dev);
}
EXPORT_SYMBOL_GPL(led_get);

/**
 * devm_led_get() - request a LED device via the LED framework
 * @dev: device for which to get the LED device
 * @con_id: name of the LED from the device's point of view
 *
 * The LED device returned from this function is automatically released
 * on driver detach.
 *
 * @return a pointer to a LED device or ERR_PTR(errno) on failure.
 */
struct led_classdev *devm_led_get(struct device *dev, char *con_id)
{
	struct led_classdev *led;

	led = led_get(dev, con_id);
	if (IS_ERR(led))
		return led;

	return __devm_led_get(dev, led);
}
EXPORT_SYMBOL_GPL(devm_led_get);

/**
 * led_add_lookup() - Add a LED lookup table entry
 * @led_lookup: the lookup table entry to add
 *
 * Add a LED lookup table entry. On systems without devicetree the lookup table
 * is used by led_get() to find LEDs.
 */
void led_add_lookup(struct led_lookup_data *led_lookup)
{
	mutex_lock(&leds_lookup_lock);
	list_add_tail(&led_lookup->list, &leds_lookup_list);
	mutex_unlock(&leds_lookup_lock);
}
EXPORT_SYMBOL_GPL(led_add_lookup);

/**
 * led_remove_lookup() - Remove a LED lookup table entry
 * @led_lookup: the lookup table entry to remove
 */
void led_remove_lookup(struct led_lookup_data *led_lookup)
{
	mutex_lock(&leds_lookup_lock);
	list_del(&led_lookup->list);
	mutex_unlock(&leds_lookup_lock);
}
EXPORT_SYMBOL_GPL(led_remove_lookup);

/**
 * devm_of_led_get_optional - Resource-managed request of an optional LED device
 * @dev:	LED consumer
 * @index:	index of the LED to obtain in the consumer
 *
 * The device node of the device is parsed to find the requested LED device.
 * The LED device returned from this function is automatically released
 * on driver detach.
 *
 * @return a pointer to a LED device, ERR_PTR(errno) on failure and NULL if the
 * led was not found.
 */
struct led_classdev *__must_check devm_of_led_get_optional(struct device *dev,
							int index)
{
	struct led_classdev *led;

	led = devm_of_led_get(dev, index);
	if (IS_ERR(led) && PTR_ERR(led) == -ENOENT)
		return NULL;

	return led;
}
EXPORT_SYMBOL_GPL(devm_of_led_get_optional);

/*
 * Derive a class-unique device name from @init_name into @name (@len
 * bytes).  Starting with @init_name itself, "<init_name>_<n>" is tried
 * for n = 1, 2, ... until no device with that name exists.
 *
 * Returns the suffix number used (0 if @init_name was free), or -ENOMEM
 * if a candidate name did not fit into @len bytes.
 */
static int led_classdev_next_name(const char *init_name, char *name,
				  size_t len)
{
	unsigned int i = 0;
	int ret = 0;
	struct device *dev;

	strscpy(name, init_name, len);

	while ((ret < len) &&
	       (dev = class_find_device_by_name(&leds_class, name))) {
		/* The lookup took a reference; only existence matters. */
		put_device(dev);
		ret = snprintf(name, len, "%s_%u", init_name, ++i);
	}

	if (ret >= len)
		return -ENOMEM;

	return i;
}

/**
 * led_classdev_register_ext - register a new object of led_classdev class
 *			       with init data.
 *
 * @parent: parent of LED device
 * @led_cdev: the led_classdev structure for this device.
* @init_data: LED class device initialization data */ int led_classdev_register_ext(struct device *parent, struct led_classdev *led_cdev, struct led_init_data *init_data) { char composed_name[LED_MAX_NAME_SIZE]; char final_name[LED_MAX_NAME_SIZE]; const char *proposed_name = composed_name; int ret; if (init_data) { if (init_data->devname_mandatory && !init_data->devicename) { dev_err(parent, "Mandatory device name is missing"); return -EINVAL; } ret = led_compose_name(parent, init_data, composed_name); if (ret < 0) return ret; if (init_data->fwnode) { fwnode_property_read_string(init_data->fwnode, "linux,default-trigger", &led_cdev->default_trigger); if (fwnode_property_present(init_data->fwnode, "retain-state-shutdown")) led_cdev->flags |= LED_RETAIN_AT_SHUTDOWN; fwnode_property_read_u32(init_data->fwnode, "max-brightness", &led_cdev->max_brightness); if (fwnode_property_present(init_data->fwnode, "color")) fwnode_property_read_u32(init_data->fwnode, "color", &led_cdev->color); } } else { proposed_name = led_cdev->name; } ret = led_classdev_next_name(proposed_name, final_name, sizeof(final_name)); if (ret < 0) return ret; else if (ret && led_cdev->flags & LED_REJECT_NAME_CONFLICT) return -EEXIST; else if (ret) dev_warn(parent, "Led %s renamed to %s due to name collision\n", proposed_name, final_name); if (led_cdev->color >= LED_COLOR_ID_MAX) dev_warn(parent, "LED %s color identifier out of range\n", final_name); mutex_init(&led_cdev->led_access); mutex_lock(&led_cdev->led_access); led_cdev->dev = device_create_with_groups(&leds_class, parent, 0, led_cdev, led_cdev->groups, "%s", final_name); if (IS_ERR(led_cdev->dev)) { mutex_unlock(&led_cdev->led_access); return PTR_ERR(led_cdev->dev); } if (init_data && init_data->fwnode) device_set_node(led_cdev->dev, init_data->fwnode); if (led_cdev->flags & LED_BRIGHT_HW_CHANGED) { ret = led_add_brightness_hw_changed(led_cdev); if (ret) { device_unregister(led_cdev->dev); led_cdev->dev = NULL; 
mutex_unlock(&led_cdev->led_access); return ret; } } led_cdev->work_flags = 0; #ifdef CONFIG_LEDS_TRIGGERS init_rwsem(&led_cdev->trigger_lock); #endif #ifdef CONFIG_LEDS_BRIGHTNESS_HW_CHANGED led_cdev->brightness_hw_changed = -1; #endif /* add to the list of leds */ down_write(&leds_list_lock); list_add_tail(&led_cdev->node, &leds_list); up_write(&leds_list_lock); if (!led_cdev->max_brightness) led_cdev->max_brightness = LED_FULL; led_update_brightness(led_cdev); led_cdev->wq = leds_wq; led_init_core(led_cdev); #ifdef CONFIG_LEDS_TRIGGERS led_trigger_set_default(led_cdev); #endif mutex_unlock(&led_cdev->led_access); dev_dbg(parent, "Registered led device: %s\n", led_cdev->name); return 0; } EXPORT_SYMBOL_GPL(led_classdev_register_ext); /** * led_classdev_unregister - unregisters a object of led_properties class. * @led_cdev: the led device to unregister * * Unregisters a previously registered via led_classdev_register object. */ void led_classdev_unregister(struct led_classdev *led_cdev) { if (IS_ERR_OR_NULL(led_cdev->dev)) return; #ifdef CONFIG_LEDS_TRIGGERS down_write(&led_cdev->trigger_lock); if (led_cdev->trigger) led_trigger_set(led_cdev, NULL); up_write(&led_cdev->trigger_lock); #endif led_cdev->flags |= LED_UNREGISTERING; /* Stop blinking */ led_stop_software_blink(led_cdev); if (!(led_cdev->flags & LED_RETAIN_AT_SHUTDOWN)) led_set_brightness(led_cdev, LED_OFF); flush_work(&led_cdev->set_brightness_work); if (led_cdev->flags & LED_BRIGHT_HW_CHANGED) led_remove_brightness_hw_changed(led_cdev); device_unregister(led_cdev->dev); down_write(&leds_list_lock); list_del(&led_cdev->node); up_write(&leds_list_lock); mutex_destroy(&led_cdev->led_access); } EXPORT_SYMBOL_GPL(led_classdev_unregister); static void devm_led_classdev_release(struct device *dev, void *res) { led_classdev_unregister(*(struct led_classdev **)res); } /** * devm_led_classdev_register_ext - resource managed led_classdev_register_ext() * * @parent: parent of LED device * @led_cdev: the 
led_classdev structure for this device. * @init_data: LED class device initialization data */ int devm_led_classdev_register_ext(struct device *parent, struct led_classdev *led_cdev, struct led_init_data *init_data) { struct led_classdev **dr; int rc; dr = devres_alloc(devm_led_classdev_release, sizeof(*dr), GFP_KERNEL); if (!dr) return -ENOMEM; rc = led_classdev_register_ext(parent, led_cdev, init_data); if (rc) { devres_free(dr); return rc; } *dr = led_cdev; devres_add(parent, dr); return 0; } EXPORT_SYMBOL_GPL(devm_led_classdev_register_ext); static int devm_led_classdev_match(struct device *dev, void *res, void *data) { struct led_classdev **p = res; if (WARN_ON(!p || !*p)) return 0; return *p == data; } /** * devm_led_classdev_unregister() - resource managed led_classdev_unregister() * @dev: The device to unregister. * @led_cdev: the led_classdev structure for this device. */ void devm_led_classdev_unregister(struct device *dev, struct led_classdev *led_cdev) { WARN_ON(devres_release(dev, devm_led_classdev_release, devm_led_classdev_match, led_cdev)); } EXPORT_SYMBOL_GPL(devm_led_classdev_unregister); static int __init leds_init(void) { leds_wq = alloc_ordered_workqueue("leds", 0); if (!leds_wq) { pr_err("Failed to create LEDs ordered workqueue\n"); return -ENOMEM; } return class_register(&leds_class); } static void __exit leds_exit(void) { class_unregister(&leds_class); destroy_workqueue(leds_wq); } subsys_initcall(leds_init); module_exit(leds_exit); MODULE_AUTHOR("John Lenz, Richard Purdie"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("LED Class Interface"); |
| 16 2 14 14 14 1 14 14 14 14 14 14 1 13 1 1 2 2 1 1 4 1 2 1 3 2 1 7 1 1 2 1 1 1 3 1 1 1 1 3 3 1 2 1 1 2 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 
500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 | // SPDX-License-Identifier: GPL-2.0-only /* * vivid-sdr-cap.c - software defined radio support functions. * * Copyright 2014 Cisco Systems, Inc. and/or its affiliates. All rights reserved. */ #include <linux/errno.h> #include <linux/kernel.h> #include <linux/delay.h> #include <linux/kthread.h> #include <linux/freezer.h> #include <linux/math64.h> #include <linux/videodev2.h> #include <linux/v4l2-dv-timings.h> #include <media/v4l2-common.h> #include <media/v4l2-event.h> #include <media/v4l2-dv-timings.h> #include <linux/fixp-arith.h> #include <linux/jiffies.h> #include "vivid-core.h" #include "vivid-ctrls.h" #include "vivid-sdr-cap.h" /* stream formats */ struct vivid_format { u32 pixelformat; u32 buffersize; }; /* format descriptions for capture and preview */ static const struct vivid_format formats[] = { { .pixelformat = V4L2_SDR_FMT_CU8, .buffersize = SDR_CAP_SAMPLES_PER_BUF * 2, }, { .pixelformat = V4L2_SDR_FMT_CS8, .buffersize = SDR_CAP_SAMPLES_PER_BUF * 2, }, }; static const struct v4l2_frequency_band bands_adc[] = { { .tuner = 0, .type = V4L2_TUNER_ADC, .index = 0, .capability = V4L2_TUNER_CAP_1HZ | V4L2_TUNER_CAP_FREQ_BANDS, .rangelow = 300000, .rangehigh = 300000, }, { .tuner = 0, .type = V4L2_TUNER_ADC, .index = 1, .capability = V4L2_TUNER_CAP_1HZ | V4L2_TUNER_CAP_FREQ_BANDS, .rangelow = 900001, .rangehigh = 2800000, }, { .tuner = 0, .type = V4L2_TUNER_ADC, .index = 2, .capability = V4L2_TUNER_CAP_1HZ | V4L2_TUNER_CAP_FREQ_BANDS, .rangelow = 3200000, .rangehigh = 3200000, }, }; /* ADC band midpoints */ #define BAND_ADC_0 ((bands_adc[0].rangehigh + bands_adc[1].rangelow) / 2) #define BAND_ADC_1 ((bands_adc[1].rangehigh + 
bands_adc[2].rangelow) / 2) static const struct v4l2_frequency_band bands_fm[] = { { .tuner = 1, .type = V4L2_TUNER_RF, .index = 0, .capability = V4L2_TUNER_CAP_1HZ | V4L2_TUNER_CAP_FREQ_BANDS, .rangelow = 50000000, .rangehigh = 2000000000, }, }; static void vivid_thread_sdr_cap_tick(struct vivid_dev *dev) { struct vivid_buffer *sdr_cap_buf = NULL; dprintk(dev, 1, "SDR Capture Thread Tick\n"); /* Drop a certain percentage of buffers. */ if (dev->perc_dropped_buffers && get_random_u32_below(100) < dev->perc_dropped_buffers) return; spin_lock(&dev->slock); if (!list_empty(&dev->sdr_cap_active)) { sdr_cap_buf = list_entry(dev->sdr_cap_active.next, struct vivid_buffer, list); list_del(&sdr_cap_buf->list); } spin_unlock(&dev->slock); if (sdr_cap_buf) { sdr_cap_buf->vb.sequence = dev->sdr_cap_with_seq_wrap_count; v4l2_ctrl_request_setup(sdr_cap_buf->vb.vb2_buf.req_obj.req, &dev->ctrl_hdl_sdr_cap); v4l2_ctrl_request_complete(sdr_cap_buf->vb.vb2_buf.req_obj.req, &dev->ctrl_hdl_sdr_cap); vivid_sdr_cap_process(dev, sdr_cap_buf); sdr_cap_buf->vb.vb2_buf.timestamp = ktime_get_ns() + dev->time_wrap_offset; vb2_buffer_done(&sdr_cap_buf->vb.vb2_buf, dev->dqbuf_error ? 
VB2_BUF_STATE_ERROR : VB2_BUF_STATE_DONE); dev->dqbuf_error = false; } } static int vivid_thread_sdr_cap(void *data) { struct vivid_dev *dev = data; u64 samples_since_start; u64 buffers_since_start; u64 next_jiffies_since_start; unsigned long jiffies_since_start; unsigned long cur_jiffies; unsigned wait_jiffies; dprintk(dev, 1, "SDR Capture Thread Start\n"); set_freezable(); /* Resets frame counters */ dev->sdr_cap_seq_offset = 0; dev->sdr_cap_seq_count = 0; dev->jiffies_sdr_cap = jiffies; dev->sdr_cap_seq_resync = false; if (dev->time_wrap) dev->time_wrap_offset = dev->time_wrap - ktime_get_ns(); else dev->time_wrap_offset = 0; for (;;) { try_to_freeze(); if (kthread_should_stop()) break; if (!mutex_trylock(&dev->mutex)) { schedule(); continue; } cur_jiffies = jiffies; if (dev->sdr_cap_seq_resync) { dev->jiffies_sdr_cap = cur_jiffies; dev->sdr_cap_seq_offset = dev->sdr_cap_seq_count + 1; dev->sdr_cap_seq_count = 0; dev->sdr_cap_seq_resync = false; } /* Calculate the number of jiffies since we started streaming */ jiffies_since_start = cur_jiffies - dev->jiffies_sdr_cap; /* Get the number of buffers streamed since the start */ buffers_since_start = (u64)jiffies_since_start * dev->sdr_adc_freq + (HZ * SDR_CAP_SAMPLES_PER_BUF) / 2; do_div(buffers_since_start, HZ * SDR_CAP_SAMPLES_PER_BUF); /* * After more than 0xf0000000 (rounded down to a multiple of * 'jiffies-per-day' to ease jiffies_to_msecs calculation) * jiffies have passed since we started streaming reset the * counters and keep track of the sequence offset. 
*/ if (jiffies_since_start > JIFFIES_RESYNC) { dev->jiffies_sdr_cap = cur_jiffies; dev->sdr_cap_seq_offset = buffers_since_start; buffers_since_start = 0; } dev->sdr_cap_seq_count = buffers_since_start + dev->sdr_cap_seq_offset; dev->sdr_cap_with_seq_wrap_count = dev->sdr_cap_seq_count - dev->sdr_cap_seq_start; vivid_thread_sdr_cap_tick(dev); mutex_unlock(&dev->mutex); /* * Calculate the number of samples streamed since we started, * not including the current buffer. */ samples_since_start = buffers_since_start * SDR_CAP_SAMPLES_PER_BUF; /* And the number of jiffies since we started */ jiffies_since_start = jiffies - dev->jiffies_sdr_cap; /* Increase by the number of samples in one buffer */ samples_since_start += SDR_CAP_SAMPLES_PER_BUF; /* * Calculate when that next buffer is supposed to start * in jiffies since we started streaming. */ next_jiffies_since_start = samples_since_start * HZ + dev->sdr_adc_freq / 2; do_div(next_jiffies_since_start, dev->sdr_adc_freq); /* If it is in the past, then just schedule asap */ if (next_jiffies_since_start < jiffies_since_start) next_jiffies_since_start = jiffies_since_start; wait_jiffies = next_jiffies_since_start - jiffies_since_start; if (!time_is_after_jiffies(cur_jiffies + wait_jiffies)) continue; wait_queue_head_t wait; init_waitqueue_head(&wait); wait_event_interruptible_timeout(wait, kthread_should_stop(), cur_jiffies + wait_jiffies - jiffies); } dprintk(dev, 1, "SDR Capture Thread End\n"); return 0; } static int sdr_cap_queue_setup(struct vb2_queue *vq, unsigned *nbuffers, unsigned *nplanes, unsigned sizes[], struct device *alloc_devs[]) { /* 2 = max 16-bit sample returned */ u32 size = SDR_CAP_SAMPLES_PER_BUF * 2; if (*nplanes) return sizes[0] < size ? 
-EINVAL : 0; *nplanes = 1; sizes[0] = size; return 0; } static int sdr_cap_buf_prepare(struct vb2_buffer *vb) { struct vivid_dev *dev = vb2_get_drv_priv(vb->vb2_queue); unsigned size = SDR_CAP_SAMPLES_PER_BUF * 2; dprintk(dev, 1, "%s\n", __func__); if (dev->buf_prepare_error) { /* * Error injection: test what happens if buf_prepare() returns * an error. */ dev->buf_prepare_error = false; return -EINVAL; } if (vb2_plane_size(vb, 0) < size) { dprintk(dev, 1, "%s data will not fit into plane (%lu < %u)\n", __func__, vb2_plane_size(vb, 0), size); return -EINVAL; } vb2_set_plane_payload(vb, 0, size); return 0; } static void sdr_cap_buf_queue(struct vb2_buffer *vb) { struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb); struct vivid_dev *dev = vb2_get_drv_priv(vb->vb2_queue); struct vivid_buffer *buf = container_of(vbuf, struct vivid_buffer, vb); dprintk(dev, 1, "%s\n", __func__); spin_lock(&dev->slock); list_add_tail(&buf->list, &dev->sdr_cap_active); spin_unlock(&dev->slock); } static int sdr_cap_start_streaming(struct vb2_queue *vq, unsigned count) { struct vivid_dev *dev = vb2_get_drv_priv(vq); int err = 0; dprintk(dev, 1, "%s\n", __func__); dev->sdr_cap_seq_start = dev->seq_wrap * 128; if (dev->start_streaming_error) { dev->start_streaming_error = false; err = -EINVAL; } else if (dev->kthread_sdr_cap == NULL) { dev->kthread_sdr_cap = kthread_run(vivid_thread_sdr_cap, dev, "%s-sdr-cap", dev->v4l2_dev.name); if (IS_ERR(dev->kthread_sdr_cap)) { v4l2_err(&dev->v4l2_dev, "kernel_thread() failed\n"); err = PTR_ERR(dev->kthread_sdr_cap); dev->kthread_sdr_cap = NULL; } } if (err) { struct vivid_buffer *buf, *tmp; list_for_each_entry_safe(buf, tmp, &dev->sdr_cap_active, list) { list_del(&buf->list); vb2_buffer_done(&buf->vb.vb2_buf, VB2_BUF_STATE_QUEUED); } } return err; } /* abort streaming and wait for last buffer */ static void sdr_cap_stop_streaming(struct vb2_queue *vq) { struct vivid_dev *dev = vb2_get_drv_priv(vq); if (dev->kthread_sdr_cap == NULL) return; while 
(!list_empty(&dev->sdr_cap_active)) { struct vivid_buffer *buf; buf = list_entry(dev->sdr_cap_active.next, struct vivid_buffer, list); list_del(&buf->list); v4l2_ctrl_request_complete(buf->vb.vb2_buf.req_obj.req, &dev->ctrl_hdl_sdr_cap); vb2_buffer_done(&buf->vb.vb2_buf, VB2_BUF_STATE_ERROR); } /* shutdown control thread */ kthread_stop(dev->kthread_sdr_cap); dev->kthread_sdr_cap = NULL; } static void sdr_cap_buf_request_complete(struct vb2_buffer *vb) { struct vivid_dev *dev = vb2_get_drv_priv(vb->vb2_queue); v4l2_ctrl_request_complete(vb->req_obj.req, &dev->ctrl_hdl_sdr_cap); } const struct vb2_ops vivid_sdr_cap_qops = { .queue_setup = sdr_cap_queue_setup, .buf_prepare = sdr_cap_buf_prepare, .buf_queue = sdr_cap_buf_queue, .start_streaming = sdr_cap_start_streaming, .stop_streaming = sdr_cap_stop_streaming, .buf_request_complete = sdr_cap_buf_request_complete, }; int vivid_sdr_enum_freq_bands(struct file *file, void *priv, struct v4l2_frequency_band *band) { switch (band->tuner) { case 0: if (band->index >= ARRAY_SIZE(bands_adc)) return -EINVAL; *band = bands_adc[band->index]; return 0; case 1: if (band->index >= ARRAY_SIZE(bands_fm)) return -EINVAL; *band = bands_fm[band->index]; return 0; default: return -EINVAL; } } int vivid_sdr_g_frequency(struct file *file, void *priv, struct v4l2_frequency *vf) { struct vivid_dev *dev = video_drvdata(file); switch (vf->tuner) { case 0: vf->frequency = dev->sdr_adc_freq; vf->type = V4L2_TUNER_ADC; return 0; case 1: vf->frequency = dev->sdr_fm_freq; vf->type = V4L2_TUNER_RF; return 0; default: return -EINVAL; } } int vivid_sdr_s_frequency(struct file *file, void *priv, const struct v4l2_frequency *vf) { struct vivid_dev *dev = video_drvdata(file); unsigned freq = vf->frequency; unsigned band; switch (vf->tuner) { case 0: if (vf->type != V4L2_TUNER_ADC) return -EINVAL; if (freq < BAND_ADC_0) band = 0; else if (freq < BAND_ADC_1) band = 1; else band = 2; freq = clamp_t(unsigned, freq, bands_adc[band].rangelow, 
bands_adc[band].rangehigh); if (vb2_is_streaming(&dev->vb_sdr_cap_q) && freq != dev->sdr_adc_freq) { /* resync the thread's timings */ dev->sdr_cap_seq_resync = true; } dev->sdr_adc_freq = freq; return 0; case 1: if (vf->type != V4L2_TUNER_RF) return -EINVAL; dev->sdr_fm_freq = clamp_t(unsigned, freq, bands_fm[0].rangelow, bands_fm[0].rangehigh); return 0; default: return -EINVAL; } } int vivid_sdr_g_tuner(struct file *file, void *priv, struct v4l2_tuner *vt) { switch (vt->index) { case 0: strscpy(vt->name, "ADC", sizeof(vt->name)); vt->type = V4L2_TUNER_ADC; vt->capability = V4L2_TUNER_CAP_1HZ | V4L2_TUNER_CAP_FREQ_BANDS; vt->rangelow = bands_adc[0].rangelow; vt->rangehigh = bands_adc[2].rangehigh; return 0; case 1: strscpy(vt->name, "RF", sizeof(vt->name)); vt->type = V4L2_TUNER_RF; vt->capability = V4L2_TUNER_CAP_1HZ | V4L2_TUNER_CAP_FREQ_BANDS; vt->rangelow = bands_fm[0].rangelow; vt->rangehigh = bands_fm[0].rangehigh; return 0; default: return -EINVAL; } } int vivid_sdr_s_tuner(struct file *file, void *priv, const struct v4l2_tuner *vt) { if (vt->index > 1) return -EINVAL; return 0; } int vidioc_enum_fmt_sdr_cap(struct file *file, void *priv, struct v4l2_fmtdesc *f) { if (f->index >= ARRAY_SIZE(formats)) return -EINVAL; f->pixelformat = formats[f->index].pixelformat; return 0; } int vidioc_g_fmt_sdr_cap(struct file *file, void *priv, struct v4l2_format *f) { struct vivid_dev *dev = video_drvdata(file); f->fmt.sdr.pixelformat = dev->sdr_pixelformat; f->fmt.sdr.buffersize = dev->sdr_buffersize; return 0; } int vidioc_s_fmt_sdr_cap(struct file *file, void *priv, struct v4l2_format *f) { struct vivid_dev *dev = video_drvdata(file); struct vb2_queue *q = &dev->vb_sdr_cap_q; int i; if (vb2_is_busy(q)) return -EBUSY; for (i = 0; i < ARRAY_SIZE(formats); i++) { if (formats[i].pixelformat == f->fmt.sdr.pixelformat) { dev->sdr_pixelformat = formats[i].pixelformat; dev->sdr_buffersize = formats[i].buffersize; f->fmt.sdr.buffersize = formats[i].buffersize; return 0; } } 
dev->sdr_pixelformat = formats[0].pixelformat; dev->sdr_buffersize = formats[0].buffersize; f->fmt.sdr.pixelformat = formats[0].pixelformat; f->fmt.sdr.buffersize = formats[0].buffersize; return 0; } int vidioc_try_fmt_sdr_cap(struct file *file, void *priv, struct v4l2_format *f) { int i; for (i = 0; i < ARRAY_SIZE(formats); i++) { if (formats[i].pixelformat == f->fmt.sdr.pixelformat) { f->fmt.sdr.buffersize = formats[i].buffersize; return 0; } } f->fmt.sdr.pixelformat = formats[0].pixelformat; f->fmt.sdr.buffersize = formats[0].buffersize; return 0; } #define FIXP_N (15) #define FIXP_FRAC (1 << FIXP_N) #define FIXP_2PI ((int)(2 * 3.141592653589 * FIXP_FRAC)) #define M_100000PI (3.14159 * 100000) void vivid_sdr_cap_process(struct vivid_dev *dev, struct vivid_buffer *buf) { u8 *vbuf = vb2_plane_vaddr(&buf->vb.vb2_buf, 0); unsigned long i; unsigned long plane_size = vb2_plane_size(&buf->vb.vb2_buf, 0); s64 s64tmp; s32 src_phase_step; s32 mod_phase_step; s32 fixp_i; s32 fixp_q; /* calculate phase step */ #define BEEP_FREQ 1000 /* 1kHz beep */ src_phase_step = DIV_ROUND_CLOSEST(FIXP_2PI * BEEP_FREQ, dev->sdr_adc_freq); for (i = 0; i < plane_size; i += 2) { mod_phase_step = fixp_cos32_rad(dev->sdr_fixp_src_phase, FIXP_2PI) >> (31 - FIXP_N); dev->sdr_fixp_src_phase += src_phase_step; s64tmp = (s64) mod_phase_step * dev->sdr_fm_deviation; dev->sdr_fixp_mod_phase += div_s64(s64tmp, M_100000PI); /* * Transfer phase angle to [0, 2xPI] in order to avoid variable * overflow and make it suitable for cosine implementation * used, which does not support negative angles. 
*/ dev->sdr_fixp_src_phase %= FIXP_2PI; dev->sdr_fixp_mod_phase %= FIXP_2PI; if (dev->sdr_fixp_mod_phase < 0) dev->sdr_fixp_mod_phase += FIXP_2PI; fixp_i = fixp_cos32_rad(dev->sdr_fixp_mod_phase, FIXP_2PI); fixp_q = fixp_sin32_rad(dev->sdr_fixp_mod_phase, FIXP_2PI); /* Normalize fraction values represented with 32 bit precision * to fixed point representation with FIXP_N bits */ fixp_i >>= (31 - FIXP_N); fixp_q >>= (31 - FIXP_N); switch (dev->sdr_pixelformat) { case V4L2_SDR_FMT_CU8: /* convert 'fixp float' to u8 [0, +255] */ /* u8 = X * 127.5 + 127.5; X is float [-1.0, +1.0] */ fixp_i = fixp_i * 1275 + FIXP_FRAC * 1275; fixp_q = fixp_q * 1275 + FIXP_FRAC * 1275; *vbuf++ = DIV_ROUND_CLOSEST(fixp_i, FIXP_FRAC * 10); *vbuf++ = DIV_ROUND_CLOSEST(fixp_q, FIXP_FRAC * 10); break; case V4L2_SDR_FMT_CS8: /* convert 'fixp float' to s8 [-128, +127] */ /* s8 = X * 127.5 - 0.5; X is float [-1.0, +1.0] */ fixp_i = fixp_i * 1275 - FIXP_FRAC * 5; fixp_q = fixp_q * 1275 - FIXP_FRAC * 5; *vbuf++ = DIV_ROUND_CLOSEST(fixp_i, FIXP_FRAC * 10); *vbuf++ = DIV_ROUND_CLOSEST(fixp_q, FIXP_FRAC * 10); break; default: break; } } } |
| 161 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 |