Total coverage: 94435 (6%)of 1803668
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 /* SPDX-License-Identifier: GPL-2.0 */ /* * befs.h * * Copyright (C) 2001-2002 Will Dyson <will_dyson@pobox.com> * Copyright (C) 1999 Makoto Kato (m_kato@ga2.so-net.ne.jp) */ #ifndef _LINUX_BEFS_H #define _LINUX_BEFS_H #include "befs_fs_types.h" /* used in debug.c */ #define BEFS_VERSION "0.9.3" typedef u64 befs_blocknr_t; /* * BeFS in memory structures */ struct befs_mount_options { kgid_t gid; kuid_t uid; int use_gid; int use_uid; int debug; char *iocharset; }; struct befs_sb_info { u32 magic1; u32 block_size; u32 block_shift; int byte_order; befs_off_t num_blocks; befs_off_t used_blocks; u32 inode_size; u32 magic2; /* Allocation group information */ u32 blocks_per_ag; u32 ag_shift; u32 num_ags; /* State of the superblock */ u32 flags; /* Journal log entry */ befs_block_run log_blocks; befs_off_t log_start; befs_off_t log_end; befs_inode_addr root_dir; befs_inode_addr indices; u32 magic3; struct befs_mount_options mount_opts; struct nls_table *nls; }; struct befs_inode_info { u32 i_flags; u32 i_type; befs_inode_addr i_inode_num; befs_inode_addr i_parent; befs_inode_addr i_attribute; union { befs_data_stream ds; char symlink[BEFS_SYMLINK_LEN]; } i_data; struct inode vfs_inode; }; enum befs_err { BEFS_OK, BEFS_ERR, BEFS_BAD_INODE, BEFS_BT_END, BEFS_BT_EMPTY, BEFS_BT_MATCH, BEFS_BT_OVERFLOW, BEFS_BT_NOT_FOUND }; /****************************/ /* debug.c */ __printf(2, 3) void befs_error(const struct super_block *sb, const char *fmt, ...); __printf(2, 3) void befs_warning(const struct super_block *sb, const char *fmt, ...); __printf(2, 3) void befs_debug(const struct super_block *sb, const char *fmt, ...); void befs_dump_super_block(const struct super_block *sb, befs_super_block *); void befs_dump_inode(const struct super_block *sb, befs_inode *); void befs_dump_index_entry(const struct super_block *sb, befs_disk_btree_super *); void befs_dump_index_node(const struct super_block *sb, befs_btree_nodehead *); /****************************/ /* Gets a pointer to the private portion of the super_block * structure from the public part */ static inline struct befs_sb_info * BEFS_SB(const struct super_block *super) { return (struct befs_sb_info *) super->s_fs_info; } static inline struct befs_inode_info * BEFS_I(const struct inode *inode) { return container_of(inode, struct befs_inode_info, vfs_inode); } static inline befs_blocknr_t iaddr2blockno(struct super_block *sb, const befs_inode_addr *iaddr) { return ((iaddr->allocation_group << BEFS_SB(sb)->ag_shift) + iaddr->start); } static inline befs_inode_addr blockno2iaddr(struct super_block *sb, befs_blocknr_t blockno) { befs_inode_addr iaddr; iaddr.allocation_group = blockno >> BEFS_SB(sb)->ag_shift; iaddr.start = blockno - (iaddr.allocation_group << BEFS_SB(sb)->ag_shift); iaddr.len = 1; return iaddr; } static inline unsigned int befs_iaddrs_per_block(struct super_block *sb) { return BEFS_SB(sb)->block_size / sizeof(befs_disk_inode_addr); } #include "endian.h" #endif /* _LINUX_BEFS_H */
975 974 194 196 196 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright(c) 2017 Intel Corporation. All rights reserved. */ #include <linux/pagemap.h> #include <linux/module.h> #include <linux/mount.h> #include <linux/pseudo_fs.h> #include <linux/magic.h> #include <linux/cdev.h> #include <linux/slab.h> #include <linux/uio.h> #include <linux/dax.h> #include <linux/fs.h> #include <linux/cacheinfo.h> #include "dax-private.h" #include "bus.h" /** * struct dax_device - anchor object for dax services * @inode: core vfs * @cdev: optional character interface for "device dax" * @private: dax driver private data * @flags: state and boolean properties * @ops: operations for this device * @holder_data: holder of a dax_device: could be filesystem or mapped device * @holder_ops: operations for the inner holder */ struct dax_device { struct inode inode; struct cdev cdev; void *private; unsigned long flags; const struct dax_operations *ops; void *holder_data; const struct dax_holder_operations *holder_ops; }; static dev_t dax_devt; DEFINE_STATIC_SRCU(dax_srcu); static struct vfsmount *dax_mnt; static DEFINE_IDA(dax_minor_ida); static struct kmem_cache *dax_cache __read_mostly; static struct super_block *dax_superblock __read_mostly; int dax_read_lock(void) { return srcu_read_lock(&dax_srcu); } EXPORT_SYMBOL_GPL(dax_read_lock); void dax_read_unlock(int id) { srcu_read_unlock(&dax_srcu, id); } EXPORT_SYMBOL_GPL(dax_read_unlock); #if defined(CONFIG_BLOCK) && defined(CONFIG_FS_DAX) #include <linux/blkdev.h> static DEFINE_XARRAY(dax_hosts); int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk) { return xa_insert(&dax_hosts, (unsigned long)disk, dax_dev, GFP_KERNEL); } EXPORT_SYMBOL_GPL(dax_add_host); void dax_remove_host(struct gendisk *disk) { xa_erase(&dax_hosts, (unsigned long)disk); } EXPORT_SYMBOL_GPL(dax_remove_host); /** * fs_dax_get_by_bdev() - temporary lookup mechanism for filesystem-dax * @bdev: block device to find a dax_device for * @start_off: returns the byte offset into the dax_device that @bdev starts * @holder: filesystem or mapped device inside the dax_device * @ops: operations for the inner holder */ struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off, void *holder, const struct dax_holder_operations *ops) { struct dax_device *dax_dev; u64 part_size; int id; if (!blk_queue_dax(bdev->bd_disk->queue)) return NULL; *start_off = get_start_sect(bdev) * SECTOR_SIZE; part_size = bdev_nr_sectors(bdev) * SECTOR_SIZE; if (*start_off % PAGE_SIZE || part_size % PAGE_SIZE) { pr_info("%pg: error: unaligned partition for dax\n", bdev); return NULL; } id = dax_read_lock(); dax_dev = xa_load(&dax_hosts, (unsigned long)bdev->bd_disk); if (!dax_dev || !dax_alive(dax_dev) || !igrab(&dax_dev->inode)) dax_dev = NULL; else if (holder) { if (!cmpxchg(&dax_dev->holder_data, NULL, holder)) dax_dev->holder_ops = ops; else dax_dev = NULL; } dax_read_unlock(id); return dax_dev; } EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev); #endif /* CONFIG_BLOCK && CONFIG_FS_DAX */ #if IS_ENABLED(CONFIG_FS_DAX) void fs_put_dax(struct dax_device *dax_dev, void *holder) { if (dax_dev && holder && cmpxchg(&dax_dev->holder_data, holder, NULL) == holder) dax_dev->holder_ops = NULL; put_dax(dax_dev); } EXPORT_SYMBOL_GPL(fs_put_dax); /** * fs_dax_get() - get ownership of a devdax via holder/holder_ops * * fs-dax file systems call this function to prepare to use a devdax device for * fsdax. This is like fs_dax_get_by_bdev(), but the caller already has struct * dev_dax (and there is no bdev). The holder makes this exclusive. * * @dax_dev: dev to be prepared for fs-dax usage * @holder: filesystem or mapped device inside the dax_device * @hops: operations for the inner holder * * Returns: 0 on success, <0 on failure */ int fs_dax_get(struct dax_device *dax_dev, void *holder, const struct dax_holder_operations *hops) { struct dev_dax *dev_dax; struct dax_device_driver *dax_drv; int id; id = dax_read_lock(); if (!dax_dev || !dax_alive(dax_dev) || !igrab(&dax_dev->inode)) { dax_read_unlock(id); return -ENODEV; } dax_read_unlock(id); /* Verify the device is bound to fsdev_dax driver */ dev_dax = dax_get_private(dax_dev); if (!dev_dax) { iput(&dax_dev->inode); return -ENODEV; } device_lock(&dev_dax->dev); if (!dev_dax->dev.driver) { device_unlock(&dev_dax->dev); iput(&dax_dev->inode); return -ENODEV; } dax_drv = to_dax_drv(dev_dax->dev.driver); if (dax_drv->type != DAXDRV_FSDEV_TYPE) { device_unlock(&dev_dax->dev); iput(&dax_dev->inode); return -EOPNOTSUPP; } device_unlock(&dev_dax->dev); if (cmpxchg(&dax_dev->holder_data, NULL, holder)) { iput(&dax_dev->inode); return -EBUSY; } dax_dev->holder_ops = hops; return 0; } EXPORT_SYMBOL_GPL(fs_dax_get); #endif /* CONFIG_FS_DAX */ enum dax_device_flags { /* !alive + rcu grace period == no new operations / mappings */ DAXDEV_ALIVE, /* gate whether dax_flush() calls the low level flush routine */ DAXDEV_WRITE_CACHE, /* flag to check if device supports synchronous flush */ DAXDEV_SYNC, /* do not leave the caches dirty after writes */ DAXDEV_NOCACHE, /* handle CPU fetch exceptions during reads */ DAXDEV_NOMC, }; /** * dax_direct_access() - translate a device pgoff to an absolute pfn * @dax_dev: a dax_device instance representing the logical memory range * @pgoff: offset in pages from the start of the device to translate * @nr_pages: number of consecutive pages caller can handle relative to @pfn * @mode: indicator on normal access or recovery write * @kaddr: output parameter that returns a virtual address mapping of pfn * @pfn: output parameter that returns an absolute pfn translation of @pgoff * * Return: negative errno if an error occurs, otherwise the number of * pages accessible at the device relative @pgoff. */ long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, enum dax_access_mode mode, void **kaddr, unsigned long *pfn) { long avail; if (!dax_dev) return -EOPNOTSUPP; if (!dax_alive(dax_dev)) return -ENXIO; if (!dax_dev->ops) return -EOPNOTSUPP; if (nr_pages < 0) return -EINVAL; avail = dax_dev->ops->direct_access(dax_dev, pgoff, nr_pages, mode, kaddr, pfn); if (!avail) return -ERANGE; return min(avail, nr_pages); } EXPORT_SYMBOL_GPL(dax_direct_access); size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i) { if (!dax_alive(dax_dev)) return 0; /* * The userspace address for the memory copy has already been validated * via access_ok() in vfs_write, so use the 'no check' version to bypass * the HARDENED_USERCOPY overhead. */ if (test_bit(DAXDEV_NOCACHE, &dax_dev->flags)) return _copy_from_iter_flushcache(addr, bytes, i); return _copy_from_iter(addr, bytes, i); } size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i) { if (!dax_alive(dax_dev)) return 0; /* * The userspace address for the memory copy has already been validated * via access_ok() in vfs_red, so use the 'no check' version to bypass * the HARDENED_USERCOPY overhead. */ if (test_bit(DAXDEV_NOMC, &dax_dev->flags)) return _copy_mc_to_iter(addr, bytes, i); return _copy_to_iter(addr, bytes, i); } int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, size_t nr_pages) { int ret; if (!dax_alive(dax_dev)) return -ENXIO; if (!dax_dev->ops) return -EOPNOTSUPP; /* * There are no callers that want to zero more than one page as of now. * Once users are there, this check can be removed after the * device mapper code has been updated to split ranges across targets. */ if (nr_pages != 1) return -EIO; ret = dax_dev->ops->zero_page_range(dax_dev, pgoff, nr_pages); return dax_mem2blk_err(ret); } EXPORT_SYMBOL_GPL(dax_zero_page_range); size_t dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *iter) { if (!dax_dev->ops || !dax_dev->ops->recovery_write) return 0; return dax_dev->ops->recovery_write(dax_dev, pgoff, addr, bytes, iter); } EXPORT_SYMBOL_GPL(dax_recovery_write); int dax_holder_notify_failure(struct dax_device *dax_dev, u64 off, u64 len, int mf_flags) { int rc, id; id = dax_read_lock(); if (!dax_alive(dax_dev)) { rc = -ENXIO; goto out; } if (!dax_dev->holder_ops) { rc = -EOPNOTSUPP; goto out; } rc = dax_dev->holder_ops->notify_failure(dax_dev, off, len, mf_flags); out: dax_read_unlock(id); return rc; } EXPORT_SYMBOL_GPL(dax_holder_notify_failure); #ifdef CONFIG_ARCH_HAS_PMEM_API void arch_wb_cache_pmem(void *addr, size_t size); void dax_flush(struct dax_device *dax_dev, void *addr, size_t size) { if (unlikely(!dax_write_cache_enabled(dax_dev))) return; arch_wb_cache_pmem(addr, size); } #else void dax_flush(struct dax_device *dax_dev, void *addr, size_t size) { } #endif EXPORT_SYMBOL_GPL(dax_flush); void dax_write_cache(struct dax_device *dax_dev, bool wc) { if (wc) set_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags); else clear_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags); } EXPORT_SYMBOL_GPL(dax_write_cache); bool dax_write_cache_enabled(struct dax_device *dax_dev) { return test_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags); } EXPORT_SYMBOL_GPL(dax_write_cache_enabled); bool dax_synchronous(struct dax_device *dax_dev) { return test_bit(DAXDEV_SYNC, &dax_dev->flags); } EXPORT_SYMBOL_GPL(dax_synchronous); void set_dax_synchronous(struct dax_device *dax_dev) { set_bit(DAXDEV_SYNC, &dax_dev->flags); } EXPORT_SYMBOL_GPL(set_dax_synchronous); void set_dax_nocache(struct dax_device *dax_dev) { set_bit(DAXDEV_NOCACHE, &dax_dev->flags); } EXPORT_SYMBOL_GPL(set_dax_nocache); void set_dax_nomc(struct dax_device *dax_dev) { set_bit(DAXDEV_NOMC, &dax_dev->flags); } EXPORT_SYMBOL_GPL(set_dax_nomc); /** * dax_set_ops - set the dax_operations for a dax_device * @dax_dev: the dax_device to configure * @ops: the operations to set (may be NULL to clear) * * This allows drivers to set the dax_operations after the dax_device * has been allocated. This is needed when the device is created before * the driver that needs specific ops is bound (e.g., fsdev_dax binding * to a dev_dax created by hmem). * * When setting non-NULL ops, fails if ops are already set (returns -EBUSY). * When clearing ops (NULL), always succeeds. * * Return: 0 on success, -EBUSY if ops already set */ int dax_set_ops(struct dax_device *dax_dev, const struct dax_operations *ops) { if (ops) { /* Setting ops: fail if already set */ if (cmpxchg(&dax_dev->ops, NULL, ops) != NULL) return -EBUSY; } else { /* Clearing ops: always allowed */ dax_dev->ops = NULL; } return 0; } EXPORT_SYMBOL_GPL(dax_set_ops); bool dax_alive(struct dax_device *dax_dev) { lockdep_assert_held(&dax_srcu); return test_bit(DAXDEV_ALIVE, &dax_dev->flags); } EXPORT_SYMBOL_GPL(dax_alive); /* * Note, rcu is not protecting the liveness of dax_dev, rcu is ensuring * that any fault handlers or operations that might have seen * dax_alive(), have completed. Any operations that start after * synchronize_srcu() has run will abort upon seeing !dax_alive(). * * Note, because alloc_dax() returns an ERR_PTR() on error, callers * typically store its result into a local variable in order to check * the result. Therefore, care must be taken to populate the struct * device dax_dev field make sure the dax_dev is not leaked. */ void kill_dax(struct dax_device *dax_dev) { if (!dax_dev) return; if (dax_dev->holder_data != NULL) dax_holder_notify_failure(dax_dev, 0, U64_MAX, MF_MEM_PRE_REMOVE); clear_bit(DAXDEV_ALIVE, &dax_dev->flags); synchronize_srcu(&dax_srcu); /* clear holder data */ dax_dev->holder_ops = NULL; dax_dev->holder_data = NULL; } EXPORT_SYMBOL_GPL(kill_dax); void run_dax(struct dax_device *dax_dev) { set_bit(DAXDEV_ALIVE, &dax_dev->flags); } EXPORT_SYMBOL_GPL(run_dax); static struct inode *dax_alloc_inode(struct super_block *sb) { struct dax_device *dax_dev; struct inode *inode; dax_dev = alloc_inode_sb(sb, dax_cache, GFP_KERNEL); if (!dax_dev) return NULL; inode = &dax_dev->inode; inode->i_rdev = 0; return inode; } static struct dax_device *to_dax_dev(struct inode *inode) { return container_of(inode, struct dax_device, inode); } static void dax_free_inode(struct inode *inode) { struct dax_device *dax_dev = to_dax_dev(inode); if (inode->i_rdev) ida_free(&dax_minor_ida, iminor(inode)); kmem_cache_free(dax_cache, dax_dev); } static void dax_destroy_inode(struct inode *inode) { struct dax_device *dax_dev = to_dax_dev(inode); WARN_ONCE(test_bit(DAXDEV_ALIVE, &dax_dev->flags), "kill_dax() must be called before final iput()\n"); } static const struct super_operations dax_sops = { .statfs = simple_statfs, .alloc_inode = dax_alloc_inode, .destroy_inode = dax_destroy_inode, .free_inode = dax_free_inode, .drop_inode = inode_just_drop, }; static int dax_init_fs_context(struct fs_context *fc) { struct pseudo_fs_context *ctx = init_pseudo(fc, DAXFS_MAGIC); if (!ctx) return -ENOMEM; ctx->ops = &dax_sops; return 0; } static struct file_system_type dax_fs_type = { .name = "dax", .init_fs_context = dax_init_fs_context, .kill_sb = kill_anon_super, }; static int dax_test(struct inode *inode, void *data) { dev_t devt = *(dev_t *) data; return inode->i_rdev == devt; } static int dax_set(struct inode *inode, void *data) { dev_t devt = *(dev_t *) data; inode->i_rdev = devt; return 0; } struct dax_device *dax_dev_get(dev_t devt) { struct dax_device *dax_dev; struct inode *inode; inode = iget5_locked(dax_superblock, hash_32(devt + DAXFS_MAGIC, 31), dax_test, dax_set, &devt); if (!inode) return NULL; dax_dev = to_dax_dev(inode); if (inode_state_read_once(inode) & I_NEW) { set_bit(DAXDEV_ALIVE, &dax_dev->flags); inode->i_cdev = &dax_dev->cdev; inode->i_mode = S_IFCHR; inode->i_flags = S_DAX; mapping_set_gfp_mask(&inode->i_data, GFP_USER); unlock_new_inode(inode); } return dax_dev; } EXPORT_SYMBOL_GPL(dax_dev_get); struct dax_device *alloc_dax(void *private, const struct dax_operations *ops) { struct dax_device *dax_dev; dev_t devt; int minor; /* * Unavailable on architectures with virtually aliased data caches, * except for device-dax (NULL operations pointer), which does * not use aliased mappings from the kernel. */ if (ops && cpu_dcache_is_aliasing()) return ERR_PTR(-EOPNOTSUPP); if (WARN_ON_ONCE(ops && !ops->zero_page_range)) return ERR_PTR(-EINVAL); minor = ida_alloc_max(&dax_minor_ida, MINORMASK, GFP_KERNEL); if (minor < 0) return ERR_PTR(-ENOMEM); devt = MKDEV(MAJOR(dax_devt), minor); dax_dev = dax_dev_get(devt); if (!dax_dev) goto err_dev; dax_dev->ops = ops; dax_dev->private = private; return dax_dev; err_dev: ida_free(&dax_minor_ida, minor); return ERR_PTR(-ENOMEM); } EXPORT_SYMBOL_GPL(alloc_dax); void put_dax(struct dax_device *dax_dev) { if (!dax_dev) return; iput(&dax_dev->inode); } EXPORT_SYMBOL_GPL(put_dax); /** * dax_holder() - obtain the holder of a dax device * @dax_dev: a dax_device instance * * Return: the holder's data which represents the holder if registered, * otherwize NULL. */ void *dax_holder(struct dax_device *dax_dev) { return dax_dev->holder_data; } EXPORT_SYMBOL_GPL(dax_holder); /** * inode_dax: convert a public inode into its dax_dev * @inode: An inode with i_cdev pointing to a dax_dev * * Note this is not equivalent to to_dax_dev() which is for private * internal use where we know the inode filesystem type == dax_fs_type. */ struct dax_device *inode_dax(struct inode *inode) { struct cdev *cdev = inode->i_cdev; return container_of(cdev, struct dax_device, cdev); } EXPORT_SYMBOL_GPL(inode_dax); struct inode *dax_inode(struct dax_device *dax_dev) { return &dax_dev->inode; } EXPORT_SYMBOL_GPL(dax_inode); void *dax_get_private(struct dax_device *dax_dev) { if (!test_bit(DAXDEV_ALIVE, &dax_dev->flags)) return NULL; return dax_dev->private; } EXPORT_SYMBOL_GPL(dax_get_private); static void init_once(void *_dax_dev) { struct dax_device *dax_dev = _dax_dev; struct inode *inode = &dax_dev->inode; memset(dax_dev, 0, sizeof(*dax_dev)); inode_init_once(inode); } static int dax_fs_init(void) { int rc; dax_cache = kmem_cache_create("dax_cache", sizeof(struct dax_device), 0, SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT, init_once); if (!dax_cache) return -ENOMEM; dax_mnt = kern_mount(&dax_fs_type); if (IS_ERR(dax_mnt)) { rc = PTR_ERR(dax_mnt); goto err_mount; } dax_superblock = dax_mnt->mnt_sb; return 0; err_mount: kmem_cache_destroy(dax_cache); return rc; } static void dax_fs_exit(void) { kern_unmount(dax_mnt); rcu_barrier(); kmem_cache_destroy(dax_cache); } static int __init dax_core_init(void) { int rc; rc = dax_fs_init(); if (rc) return rc; rc = alloc_chrdev_region(&dax_devt, 0, MINORMASK+1, "dax"); if (rc) goto err_chrdev; rc = dax_bus_init(); if (rc) goto err_bus; return 0; err_bus: unregister_chrdev_region(dax_devt, MINORMASK+1); err_chrdev: dax_fs_exit(); return 0; } static void __exit dax_core_exit(void) { dax_bus_exit(); unregister_chrdev_region(dax_devt, MINORMASK+1); ida_destroy(&dax_minor_ida); dax_fs_exit(); } MODULE_AUTHOR("Intel Corporation"); MODULE_DESCRIPTION("DAX: direct access to differentiated memory"); MODULE_LICENSE("GPL v2"); subsys_initcall(dax_core_init); module_exit(dax_core_exit);
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 // SPDX-License-Identifier: GPL-2.0-only /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * The Internet Protocol (IP) output module. * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Donald Becker, <becker@super.org> * Alan Cox, <Alan.Cox@linux.org> * Richard Underwood * Stefan Becker, <stefanb@yello.ping.de> * Jorge Cwik, <jorge@laser.satlink.net> * Arnt Gulbrandsen, <agulbra@nvg.unit.no> * Hirokazu Takahashi, <taka@valinux.co.jp> * * See ip_input.c for original log * * Fixes: * Alan Cox : Missing nonblock feature in ip_build_xmit. * Mike Kilburn : htons() missing in ip_build_xmit. * Bradford Johnson: Fix faulty handling of some frames when * no route is found. * Alexander Demenshin: Missing sk/skb free in ip_queue_xmit * (in case if packet not accepted by * output firewall rules) * Mike McLagan : Routing by source * Alexey Kuznetsov: use new route cache * Andi Kleen: Fix broken PMTU recovery and remove * some redundant tests. * Vitaly E. Lavrov : Transparent proxy revived after year coma. * Andi Kleen : Replace ip_reply with ip_send_reply. * Andi Kleen : Split fast and slow ip_build_xmit path * for decreased register pressure on x86 * and more readability. * Marc Boucher : When call_out_firewall returns FW_QUEUE, * silently drop skb instead of failing with -EPERM. * Detlev Wengorz : Copy protocol for fragments. * Hirokazu Takahashi: HW checksumming for outgoing UDP * datagrams. * Hirokazu Takahashi: sendfile() on UDP works now. */ #include <linux/uaccess.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/highmem.h> #include <linux/slab.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/proc_fs.h> #include <linux/stat.h> #include <linux/init.h> #include <net/flow.h> #include <net/snmp.h> #include <net/ip.h> #include <net/protocol.h> #include <net/route.h> #include <net/xfrm.h> #include <linux/skbuff.h> #include <net/sock.h> #include <net/arp.h> #include <net/icmp.h> #include <net/checksum.h> #include <net/gso.h> #include <net/inetpeer.h> #include <net/lwtunnel.h> #include <net/inet_dscp.h> #include <linux/bpf-cgroup.h> #include <linux/igmp.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_bridge.h> #include <linux/netlink.h> #include <linux/tcp.h> #include <net/psp.h> static int ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, unsigned int mtu, int (*output)(struct net *, struct sock *, struct sk_buff *)); /* Generate a checksum for an outgoing IP datagram. */ void ip_send_check(struct iphdr *iph) { iph->check = 0; iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); } EXPORT_SYMBOL(ip_send_check); int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) { struct iphdr *iph = ip_hdr(skb); IP_INC_STATS(net, IPSTATS_MIB_OUTREQUESTS); iph_set_totlen(iph, skb->len); ip_send_check(iph); /* if egress device is enslaved to an L3 master device pass the * skb to its handler for processing */ skb = l3mdev_ip_out(sk, skb); if (unlikely(!skb)) return 0; skb->protocol = htons(ETH_P_IP); return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk, skb, NULL, skb_dst_dev(skb), dst_output); } int ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) { int err; err = __ip_local_out(net, sk, skb); if (likely(err == 1)) err = dst_output(net, sk, skb); return err; } EXPORT_SYMBOL_GPL(ip_local_out); static inline int ip_select_ttl(const struct inet_sock *inet, const struct dst_entry *dst) { int ttl = READ_ONCE(inet->uc_ttl); if (ttl < 0) ttl = ip4_dst_hoplimit(dst); return ttl; } /* * Add an ip header to a skbuff and send it out. * */ int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk, __be32 saddr, __be32 daddr, struct ip_options_rcu *opt, u8 tos) { const struct inet_sock *inet = inet_sk(sk); struct rtable *rt = skb_rtable(skb); struct net *net = sock_net(sk); struct iphdr *iph; /* Build the IP header. */ skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0)); skb_reset_network_header(skb); iph = ip_hdr(skb); iph->version = 4; iph->ihl = 5; iph->tos = tos; iph->ttl = ip_select_ttl(inet, &rt->dst); iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr); iph->saddr = saddr; iph->protocol = sk->sk_protocol; /* Do not bother generating IPID for small packets (eg SYNACK) */ if (skb->len <= IPV4_MIN_MTU || ip_dont_fragment(sk, &rt->dst)) { iph->frag_off = htons(IP_DF); iph->id = 0; } else { iph->frag_off = 0; /* TCP packets here are SYNACK with fat IPv4/TCP options. * Avoid using the hashed IP ident generator. */ if (sk->sk_protocol == IPPROTO_TCP) iph->id = (__force __be16)get_random_u16(); else __ip_select_ident(net, iph, 1); } if (opt && opt->opt.optlen) { iph->ihl += opt->opt.optlen>>2; ip_options_build(skb, &opt->opt, daddr, rt); } skb->priority = READ_ONCE(sk->sk_priority); if (!skb->mark) skb->mark = READ_ONCE(sk->sk_mark); /* Send it out. */ return ip_local_out(net, skb->sk, skb); } EXPORT_SYMBOL_GPL(ip_build_and_send_pkt); static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct rtable *rt = dst_rtable(dst); struct net_device *dev = dst_dev(dst); unsigned int hh_len = LL_RESERVED_SPACE(dev); struct neighbour *neigh; bool is_v6gw = false; if (rt->rt_type == RTN_MULTICAST) { IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTMCAST, skb->len); } else if (rt->rt_type == RTN_BROADCAST) IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTBCAST, skb->len); /* OUTOCTETS should be counted after fragment */ IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len); if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { skb = skb_expand_head(skb, hh_len); if (!skb) return -ENOMEM; } if (lwtunnel_xmit_redirect(dst->lwtstate)) { int res = lwtunnel_xmit(skb); if (res != LWTUNNEL_XMIT_CONTINUE) return res; } rcu_read_lock(); neigh = ip_neigh_for_gw(rt, skb, &is_v6gw); if (!IS_ERR(neigh)) { int res; sock_confirm_neigh(skb, neigh); /* if crossing protocols, can not use the cached header */ res = neigh_output(neigh, skb, is_v6gw); rcu_read_unlock(); return res; } rcu_read_unlock(); net_dbg_ratelimited("%s: No header cache and no neighbour!\n", __func__); kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL); return PTR_ERR(neigh); } static int ip_finish_output_gso(struct net *net, struct sock *sk, struct sk_buff *skb, unsigned int mtu) { struct sk_buff *segs, *nskb; netdev_features_t features; int ret = 0; /* common case: seglen is <= mtu */ if (skb_gso_validate_network_len(skb, mtu)) return ip_finish_output2(net, sk, skb); /* Slowpath - GSO segment length exceeds the egress MTU. * * This can happen in several cases: * - Forwarding of a TCP GRO skb, when DF flag is not set. * - Forwarding of an skb that arrived on a virtualization interface * (virtio-net/vhost/tap) with TSO/GSO size set by other network * stack. * - Local GSO skb transmitted on an NETIF_F_TSO tunnel stacked over an * interface with a smaller MTU. * - Arriving GRO skb (or GSO skb in a virtualized environment) that is * bridged to a NETIF_F_TSO tunnel stacked over an interface with an * insufficient MTU. */ features = netif_skb_features(skb); BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_GSO_CB_OFFSET); segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); if (IS_ERR_OR_NULL(segs)) { kfree_skb(skb); return -ENOMEM; } consume_skb(skb); skb_list_walk_safe(segs, segs, nskb) { int err; skb_mark_not_on_list(segs); err = ip_fragment(net, sk, segs, mtu, ip_finish_output2); if (err && ret == 0) ret = err; } return ret; } static int __ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { unsigned int mtu; #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) /* Policy lookup after SNAT yielded a new policy */ if (skb_dst(skb)->xfrm) { IPCB(skb)->flags |= IPSKB_REROUTED; return dst_output(net, sk, skb); } #endif mtu = ip_skb_dst_mtu(sk, skb); if (skb_is_gso(skb)) return ip_finish_output_gso(net, sk, skb, mtu); if (skb->len > mtu || IPCB(skb)->frag_max_size) return ip_fragment(net, sk, skb, mtu, ip_finish_output2); return ip_finish_output2(net, sk, skb); } static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { int ret; ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); switch (ret) { case NET_XMIT_SUCCESS: return __ip_finish_output(net, sk, skb); case NET_XMIT_CN: return __ip_finish_output(net, sk, skb) ? : ret; default: kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS); return ret; } } static int ip_mc_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct rtable *new_rt; bool do_cn = false; int ret, err; ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); switch (ret) { case NET_XMIT_CN: do_cn = true; fallthrough; case NET_XMIT_SUCCESS: break; default: kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS); return ret; } /* Reset rt_iif so that inet_iif() will return skb->skb_iif. Setting * this to non-zero causes ipi_ifindex in in_pktinfo to be overwritten, * see ipv4_pktinfo_prepare(). */ new_rt = rt_dst_clone(net->loopback_dev, skb_rtable(skb)); if (new_rt) { new_rt->rt_iif = 0; skb_dst_drop(skb); skb_dst_set(skb, &new_rt->dst); } err = dev_loopback_xmit(net, sk, skb); return (do_cn && err) ? ret : err; } int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct rtable *rt = skb_rtable(skb); struct net_device *dev = rt->dst.dev; /* * If the indicated interface is up and running, send the packet. */ skb->dev = dev; skb->protocol = htons(ETH_P_IP); /* * Multicasts are looped back for other local users */ if (rt->rt_flags&RTCF_MULTICAST) { if (sk_mc_loop(sk) #ifdef CONFIG_IP_MROUTE /* Small optimization: do not loopback not local frames, which returned after forwarding; they will be dropped by ip_mr_input in any case. Note, that local frames are looped back to be delivered to local recipients. This check is duplicated in ip_mr_input at the moment. */ && ((rt->rt_flags & RTCF_LOCAL) || !(IPCB(skb)->flags & IPSKB_FORWARDED)) #endif ) { struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); if (newskb) NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, newskb, NULL, newskb->dev, ip_mc_finish_output); } /* Multicasts with ttl 0 must not go beyond the host */ if (ip_hdr(skb)->ttl == 0) { kfree_skb(skb); return 0; } } if (rt->rt_flags&RTCF_BROADCAST) { struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); if (newskb) NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, newskb, NULL, newskb->dev, ip_mc_finish_output); } return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, skb, NULL, skb->dev, ip_finish_output, !(IPCB(skb)->flags & IPSKB_REROUTED)); } int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev, *indev = skb->dev; int ret_val; rcu_read_lock(); dev = skb_dst_dev_rcu(skb); skb->dev = dev; skb->protocol = htons(ETH_P_IP); ret_val = NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, skb, indev, dev, ip_finish_output, !(IPCB(skb)->flags & IPSKB_REROUTED)); rcu_read_unlock(); return ret_val; } EXPORT_SYMBOL(ip_output); /* * copy saddr and daddr, possibly using 64bit load/stores * Equivalent to : * iph->saddr = fl4->saddr; * iph->daddr = fl4->daddr; */ static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4) { BUILD_BUG_ON(offsetof(typeof(*fl4), daddr) != offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr)); iph->saddr = fl4->saddr; iph->daddr = fl4->daddr; } /* Note: skb->sk can be different from sk, in case of tunnels */ int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, __u8 tos) { struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); struct ip_options_rcu *inet_opt; struct flowi4 *fl4; struct rtable *rt; struct iphdr *iph; int res; /* Skip all of this if the packet is already routed, * f.e. by something like SCTP. */ rcu_read_lock(); inet_opt = rcu_dereference(inet->inet_opt); fl4 = &fl->u.ip4; rt = skb_rtable(skb); if (rt) goto packet_routed; /* Make sure we can route this packet. */ rt = dst_rtable(__sk_dst_check(sk, 0)); if (!rt) { inet_sk_init_flowi4(inet, fl4); /* sctp_v4_xmit() uses its own DSCP value */ fl4->flowi4_dscp = inet_dsfield_to_dscp(tos); /* If this fails, retransmit mechanism of transport layer will * keep trying until route appears or the connection times * itself out. */ rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) goto no_route; sk_setup_caps(sk, &rt->dst); } skb_dst_set_noref(skb, &rt->dst); packet_routed: if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_uses_gateway) goto no_route; /* OK, we know where to send it, allocate and build IP header. */ skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0)); skb_reset_network_header(skb); iph = ip_hdr(skb); *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (tos & 0xff)); if (ip_dont_fragment(sk, &rt->dst) && !skb->ignore_df) iph->frag_off = htons(IP_DF); else iph->frag_off = 0; iph->ttl = ip_select_ttl(inet, &rt->dst); iph->protocol = sk->sk_protocol; ip_copy_addrs(iph, fl4); /* Transport layer set skb->h.foo itself. */ if (inet_opt && inet_opt->opt.optlen) { iph->ihl += inet_opt->opt.optlen >> 2; ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt); } ip_select_ident_segs(net, skb, sk, skb_shinfo(skb)->gso_segs ?: 1); /* TODO : should we use skb->sk here instead of sk ? */ skb->priority = READ_ONCE(sk->sk_priority); skb->mark = READ_ONCE(sk->sk_mark); res = ip_local_out(net, sk, skb); rcu_read_unlock(); return res; no_route: rcu_read_unlock(); IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); kfree_skb_reason(skb, SKB_DROP_REASON_IP_OUTNOROUTES); return -EHOSTUNREACH; } EXPORT_SYMBOL(__ip_queue_xmit); int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl) { return __ip_queue_xmit(sk, skb, fl, READ_ONCE(inet_sk(sk)->tos)); } EXPORT_SYMBOL(ip_queue_xmit); static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) { to->pkt_type = from->pkt_type; to->priority = from->priority; to->protocol = from->protocol; to->skb_iif = from->skb_iif; skb_dst_drop(to); skb_dst_copy(to, from); to->dev = from->dev; to->mark = from->mark; skb_copy_hash(to, from); #ifdef CONFIG_NET_SCHED to->tc_index = from->tc_index; #endif nf_copy(to, from); skb_ext_copy(to, from); #if IS_ENABLED(CONFIG_IP_VS) to->ipvs_property = from->ipvs_property; #endif skb_copy_secmark(to, from); } static int ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, unsigned int mtu, int (*output)(struct net *, struct sock *, struct sk_buff *)) { struct iphdr *iph = ip_hdr(skb); if ((iph->frag_off & htons(IP_DF)) == 0) return ip_do_fragment(net, sk, skb, output); if (unlikely(!skb->ignore_df || (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size > mtu))) { IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); kfree_skb(skb); return -EMSGSIZE; } return ip_do_fragment(net, sk, skb, output); } void ip_fraglist_init(struct sk_buff *skb, struct iphdr *iph, unsigned int hlen, struct ip_fraglist_iter *iter) { unsigned int first_len = skb_pagelen(skb); iter->frag = skb_shinfo(skb)->frag_list; skb_frag_list_init(skb); iter->offset = 0; iter->iph = iph; iter->hlen = hlen; skb->data_len = first_len - skb_headlen(skb); skb->len = first_len; iph->tot_len = htons(first_len); iph->frag_off = htons(IP_MF); ip_send_check(iph); } EXPORT_SYMBOL(ip_fraglist_init); void ip_fraglist_prepare(struct sk_buff *skb, struct ip_fraglist_iter *iter) { unsigned int hlen = iter->hlen; struct iphdr *iph = iter->iph; struct sk_buff *frag; frag = iter->frag; frag->ip_summed = CHECKSUM_NONE; skb_reset_transport_header(frag); __skb_push(frag, hlen); skb_reset_network_header(frag); memcpy(skb_network_header(frag), iph, hlen); iter->iph = ip_hdr(frag); iph = iter->iph; iph->tot_len = htons(frag->len); ip_copy_metadata(frag, skb); iter->offset += skb->len - hlen; iph->frag_off = htons(iter->offset >> 3); if (frag->next) iph->frag_off |= htons(IP_MF); /* Ready, complete checksum */ ip_send_check(iph); } EXPORT_SYMBOL(ip_fraglist_prepare); void ip_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int ll_rs, unsigned int mtu, bool DF, struct ip_frag_state *state) { struct iphdr *iph = ip_hdr(skb); state->DF = DF; state->hlen = hlen; state->ll_rs = ll_rs; state->mtu = mtu; state->left = skb->len - hlen; /* Space per frame */ state->ptr = hlen; /* Where to start from */ state->offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3; state->not_last_frag = iph->frag_off & htons(IP_MF); } EXPORT_SYMBOL(ip_frag_init); static void ip_frag_ipcb(struct sk_buff *from, struct sk_buff *to, bool first_frag) { /* Copy the flags to each fragment. */ IPCB(to)->flags = IPCB(from)->flags; /* ANK: dirty, but effective trick. Upgrade options only if * the segment to be fragmented was THE FIRST (otherwise, * options are already fixed) and make it ONCE * on the initial skb, so that all the following fragments * will inherit fixed options. */ if (first_frag) ip_options_fragment(from); } struct sk_buff *ip_frag_next(struct sk_buff *skb, struct ip_frag_state *state) { unsigned int len = state->left; struct sk_buff *skb2; struct iphdr *iph; /* IF: it doesn't fit, use 'mtu' - the data space left */ if (len > state->mtu) len = state->mtu; /* IF: we are not sending up to and including the packet end then align the next start on an eight byte boundary */ if (len < state->left) { len &= ~7; } /* Allocate buffer */ skb2 = alloc_skb(len + state->hlen + state->ll_rs, GFP_ATOMIC); if (!skb2) return ERR_PTR(-ENOMEM); /* * Set up data on packet */ ip_copy_metadata(skb2, skb); skb_reserve(skb2, state->ll_rs); skb_put(skb2, len + state->hlen); skb_reset_network_header(skb2); skb2->transport_header = skb2->network_header + state->hlen; /* * Charge the memory for the fragment to any owner * it might possess */ if (skb->sk) skb_set_owner_w(skb2, skb->sk); /* * Copy the packet header into the new buffer. */ skb_copy_from_linear_data(skb, skb_network_header(skb2), state->hlen); /* * Copy a block of the IP datagram. */ if (skb_copy_bits(skb, state->ptr, skb_transport_header(skb2), len)) BUG(); state->left -= len; /* * Fill in the new header fields. */ iph = ip_hdr(skb2); iph->frag_off = htons((state->offset >> 3)); if (state->DF) iph->frag_off |= htons(IP_DF); /* * Added AC : If we are fragmenting a fragment that's not the * last fragment then keep MF on each bit */ if (state->left > 0 || state->not_last_frag) iph->frag_off |= htons(IP_MF); state->ptr += len; state->offset += len; iph->tot_len = htons(len + state->hlen); ip_send_check(iph); return skb2; } EXPORT_SYMBOL(ip_frag_next); /* * This IP datagram is too large to be sent in one piece. Break it up into * smaller pieces (each of size equal to IP header plus * a block of the data of the original IP data part) that will yet fit in a * single device frame, and queue such a frame for sending. */ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, int (*output)(struct net *, struct sock *, struct sk_buff *)) { struct iphdr *iph; struct sk_buff *skb2; u8 tstamp_type = skb->tstamp_type; struct rtable *rt = skb_rtable(skb); unsigned int mtu, hlen, ll_rs; struct ip_fraglist_iter iter; ktime_t tstamp = skb->tstamp; struct ip_frag_state state; int err = 0; /* for offloaded checksums cleanup checksum before fragmentation */ if (skb->ip_summed == CHECKSUM_PARTIAL && (err = skb_checksum_help(skb))) goto fail; /* * Point into the IP datagram header. */ iph = ip_hdr(skb); mtu = ip_skb_dst_mtu(sk, skb); if (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size < mtu) mtu = IPCB(skb)->frag_max_size; /* * Setup starting values. */ hlen = iph->ihl * 4; mtu = mtu - hlen; /* Size of data space */ IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE; ll_rs = LL_RESERVED_SPACE(rt->dst.dev); /* When frag_list is given, use it. First, check its validity: * some transformers could create wrong frag_list or break existing * one, it is not prohibited. In this case fall back to copying. * * LATER: this step can be merged to real generation of fragments, * we can switch to copy when see the first bad fragment. */ if (skb_has_frag_list(skb)) { struct sk_buff *frag, *frag2; unsigned int first_len = skb_pagelen(skb); if (first_len - hlen > mtu || ((first_len - hlen) & 7) || ip_is_fragment(iph) || skb_cloned(skb) || skb_headroom(skb) < ll_rs) goto slow_path; skb_walk_frags(skb, frag) { /* Correct geometry. */ if (frag->len > mtu || ((frag->len & 7) && frag->next) || skb_headroom(frag) < hlen + ll_rs) goto slow_path_clean; /* Partially cloned skb? */ if (skb_shared(frag)) goto slow_path_clean; BUG_ON(frag->sk); if (skb->sk) { frag->sk = skb->sk; frag->destructor = sock_wfree; } skb->truesize -= frag->truesize; } /* Everything is OK. Generate! */ ip_fraglist_init(skb, iph, hlen, &iter); for (;;) { /* Prepare header of the next frame, * before previous one went down. */ if (iter.frag) { bool first_frag = (iter.offset == 0); IPCB(iter.frag)->flags = IPCB(skb)->flags; ip_fraglist_prepare(skb, &iter); if (first_frag && IPCB(skb)->opt.optlen) { /* ipcb->opt is not populated for frags * coming from __ip_make_skb(), * ip_options_fragment() needs optlen */ IPCB(iter.frag)->opt.optlen = IPCB(skb)->opt.optlen; ip_options_fragment(iter.frag); ip_send_check(iter.iph); } } skb_set_delivery_time(skb, tstamp, tstamp_type); err = output(net, sk, skb); if (!err) IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES); if (err || !iter.frag) break; skb = ip_fraglist_next(&iter); } if (err == 0) { IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS); return 0; } kfree_skb_list(iter.frag); IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); return err; slow_path_clean: skb_walk_frags(skb, frag2) { if (frag2 == frag) break; frag2->sk = NULL; frag2->destructor = NULL; skb->truesize += frag2->truesize; } } slow_path: /* * Fragment the datagram. */ ip_frag_init(skb, hlen, ll_rs, mtu, IPCB(skb)->flags & IPSKB_FRAG_PMTU, &state); /* * Keep copying data until we run out. */ while (state.left > 0) { bool first_frag = (state.offset == 0); skb2 = ip_frag_next(skb, &state); if (IS_ERR(skb2)) { err = PTR_ERR(skb2); goto fail; } ip_frag_ipcb(skb, skb2, first_frag); /* * Put this fragment into the sending queue. */ skb_set_delivery_time(skb2, tstamp, tstamp_type); err = output(net, sk, skb2); if (err) goto fail; IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES); } consume_skb(skb); IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS); return err; fail: kfree_skb(skb); IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); return err; } EXPORT_SYMBOL(ip_do_fragment); int ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb) { struct msghdr *msg = from; if (skb->ip_summed == CHECKSUM_PARTIAL) { if (!copy_from_iter_full(to, len, &msg->msg_iter)) return -EFAULT; } else { __wsum csum = 0; if (!csum_and_copy_from_iter_full(to, len, &csum, &msg->msg_iter)) return -EFAULT; skb->csum = csum_block_add(skb->csum, csum, odd); } return 0; } EXPORT_SYMBOL(ip_generic_getfrag); static int __ip_append_data(struct sock *sk, struct flowi4 *fl4, struct sk_buff_head *queue, struct inet_cork *cork, struct page_frag *pfrag, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, unsigned int flags) { struct inet_sock *inet = inet_sk(sk); struct ubuf_info *uarg = NULL; struct sk_buff *skb; struct ip_options *opt = cork->opt; int hh_len; int exthdrlen; int mtu; int copy; int err; int offset = 0; bool zc = false; unsigned int maxfraglen, fragheaderlen, maxnonfragsize; int csummode = CHECKSUM_NONE; struct rtable *rt = dst_rtable(cork->dst); bool paged, hold_tskey = false, extra_uref = false; unsigned int wmem_alloc_delta = 0; u32 tskey = 0; skb = skb_peek_tail(queue); exthdrlen = !skb ? rt->dst.header_len : 0; mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize; paged = !!cork->gso_size; hh_len = LL_RESERVED_SPACE(rt->dst.dev); fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; maxnonfragsize = ip_sk_ignore_df(sk) ? IP_MAX_MTU : mtu; if (cork->length + length > maxnonfragsize - fragheaderlen) { ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu - (opt ? opt->optlen : 0)); return -EMSGSIZE; } /* * transhdrlen > 0 means that this is the first fragment and we wish * it won't be fragmented in the future. */ if (transhdrlen && length + fragheaderlen <= mtu && rt->dst.dev->features & (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM) && (!(flags & MSG_MORE) || cork->gso_size) && (!exthdrlen || (rt->dst.dev->features & NETIF_F_HW_ESP_TX_CSUM))) csummode = CHECKSUM_PARTIAL; if ((flags & MSG_ZEROCOPY) && length) { struct msghdr *msg = from; if (getfrag == ip_generic_getfrag && msg->msg_ubuf) { if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb)) return -EINVAL; /* Leave uarg NULL if can't zerocopy, callers should * be able to handle it. */ if ((rt->dst.dev->features & NETIF_F_SG) && csummode == CHECKSUM_PARTIAL) { paged = true; zc = true; uarg = msg->msg_ubuf; } } else if (sock_flag(sk, SOCK_ZEROCOPY)) { uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb), false); if (!uarg) return -ENOBUFS; extra_uref = !skb_zcopy(skb); /* only ref on new uarg */ if (rt->dst.dev->features & NETIF_F_SG && csummode == CHECKSUM_PARTIAL) { paged = true; zc = true; } else { uarg_to_msgzc(uarg)->zerocopy = 0; skb_zcopy_set(skb, uarg, &extra_uref); } } } else if ((flags & MSG_SPLICE_PAGES) && length) { if (inet_test_bit(HDRINCL, sk)) return -EPERM; if (rt->dst.dev->features & NETIF_F_SG && getfrag == ip_generic_getfrag) /* We need an empty buffer to attach stuff to */ paged = true; else flags &= ~MSG_SPLICE_PAGES; } cork->length += length; if (cork->tx_flags & SKBTX_ANY_TSTAMP && READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) { if (cork->flags & IPCORK_TS_OPT_ID) { tskey = cork->ts_opt_id; } else { tskey = atomic_inc_return(&sk->sk_tskey) - 1; hold_tskey = true; } } /* So, what's going on in the loop below? * * We use calculated fragment length to generate chained skb, * each of segments is IP fragment ready for sending to network after * adding appropriate IP header. */ if (!skb) goto alloc_new_skb; while (length > 0) { /* Check if the remaining data fits into current packet. */ copy = mtu - skb->len; if (copy < length) copy = maxfraglen - skb->len; if (copy <= 0) { char *data; unsigned int datalen; unsigned int fraglen; unsigned int fraggap; unsigned int alloclen, alloc_extra; unsigned int pagedlen; struct sk_buff *skb_prev; alloc_new_skb: skb_prev = skb; if (skb_prev) fraggap = skb_prev->len - maxfraglen; else fraggap = 0; /* * If remaining data exceeds the mtu, * we know we need more fragment(s). */ datalen = length + fraggap; if (datalen > mtu - fragheaderlen) datalen = maxfraglen - fragheaderlen; fraglen = datalen + fragheaderlen; pagedlen = 0; alloc_extra = hh_len + 15; alloc_extra += exthdrlen; /* The last fragment gets additional space at tail. * Note, with MSG_MORE we overallocate on fragments, * because we have no idea what fragment will be * the last. */ if (datalen == length + fraggap) alloc_extra += rt->dst.trailer_len; if ((flags & MSG_MORE) && !(rt->dst.dev->features&NETIF_F_SG)) alloclen = mtu; else if (!paged && (fraglen + alloc_extra < SKB_MAX_ALLOC || !(rt->dst.dev->features & NETIF_F_SG))) alloclen = fraglen; else { alloclen = fragheaderlen + transhdrlen; pagedlen = datalen - transhdrlen; } alloclen += alloc_extra; if (transhdrlen) { skb = sock_alloc_send_skb(sk, alloclen, (flags & MSG_DONTWAIT), &err); } else { skb = NULL; if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <= 2 * sk->sk_sndbuf) skb = alloc_skb(alloclen, sk->sk_allocation); if (unlikely(!skb)) err = -ENOBUFS; } if (!skb) goto error; /* * Fill in the control structures */ skb->ip_summed = csummode; skb->csum = 0; skb_reserve(skb, hh_len); /* * Find where to start putting bytes. */ data = skb_put(skb, fraglen + exthdrlen - pagedlen); skb_set_network_header(skb, exthdrlen); skb->transport_header = (skb->network_header + fragheaderlen); data += fragheaderlen + exthdrlen; if (fraggap) { skb->csum = skb_copy_and_csum_bits( skb_prev, maxfraglen, data + transhdrlen, fraggap); skb_prev->csum = csum_sub(skb_prev->csum, skb->csum); data += fraggap; pskb_trim_unique(skb_prev, maxfraglen); } copy = datalen - transhdrlen - fraggap - pagedlen; /* [!] NOTE: copy will be negative if pagedlen>0 * because then the equation reduces to -fraggap. */ if (copy > 0 && INDIRECT_CALL_1(getfrag, ip_generic_getfrag, from, data + transhdrlen, offset, copy, fraggap, skb) < 0) { err = -EFAULT; kfree_skb(skb); goto error; } else if (flags & MSG_SPLICE_PAGES) { copy = 0; } offset += copy; length -= copy + transhdrlen; transhdrlen = 0; exthdrlen = 0; csummode = CHECKSUM_NONE; /* only the initial fragment is time stamped */ skb_shinfo(skb)->tx_flags = cork->tx_flags; cork->tx_flags = 0; skb_shinfo(skb)->tskey = tskey; tskey = 0; skb_zcopy_set(skb, uarg, &extra_uref); if ((flags & MSG_CONFIRM) && !skb_prev) skb_set_dst_pending_confirm(skb, 1); /* * Put the packet on the pending queue. */ if (!skb->destructor) { skb->destructor = sock_wfree; skb->sk = sk; wmem_alloc_delta += skb->truesize; } __skb_queue_tail(queue, skb); continue; } if (copy > length) copy = length; if (!(rt->dst.dev->features&NETIF_F_SG) && skb_tailroom(skb) >= copy) { unsigned int off; off = skb->len; if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag, from, skb_put(skb, copy), offset, copy, off, skb) < 0) { __skb_trim(skb, off); err = -EFAULT; goto error; } } else if (flags & MSG_SPLICE_PAGES) { struct msghdr *msg = from; err = -EIO; if (WARN_ON_ONCE(copy > msg->msg_iter.count)) goto error; err = skb_splice_from_iter(skb, &msg->msg_iter, copy); if (err < 0) goto error; copy = err; if (!(flags & MSG_NO_SHARED_FRAGS)) skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG; wmem_alloc_delta += copy; } else if (!zc) { int i = skb_shinfo(skb)->nr_frags; err = -ENOMEM; if (!sk_page_frag_refill(sk, pfrag)) goto error; skb_zcopy_downgrade_managed(skb); if (!skb_can_coalesce(skb, i, pfrag->page, pfrag->offset)) { err = -EMSGSIZE; if (i == MAX_SKB_FRAGS) goto error; __skb_fill_page_desc(skb, i, pfrag->page, pfrag->offset, 0); skb_shinfo(skb)->nr_frags = ++i; get_page(pfrag->page); } copy = min_t(int, copy, pfrag->size - pfrag->offset); if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag, from, page_address(pfrag->page) + pfrag->offset, offset, copy, skb->len, skb) < 0) goto error_efault; pfrag->offset += copy; skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); skb_len_add(skb, copy); wmem_alloc_delta += copy; } else { err = skb_zerocopy_iter_dgram(skb, from, copy); if (err < 0) goto error; } offset += copy; length -= copy; } if (wmem_alloc_delta) refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); return 0; error_efault: err = -EFAULT; error: net_zcopy_put_abort(uarg, extra_uref); cork->length -= length; IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); if (hold_tskey) atomic_dec(&sk->sk_tskey); return err; } static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, struct ipcm_cookie *ipc, struct rtable **rtp) { struct ip_options_rcu *opt; struct rtable *rt; rt = *rtp; if (unlikely(!rt)) return -EFAULT; cork->fragsize = ip_sk_use_pmtu(sk) ? dst4_mtu(&rt->dst) : READ_ONCE(rt->dst.dev->mtu); if (!inetdev_valid_mtu(cork->fragsize)) return -ENETUNREACH; /* * setup for corking. */ opt = ipc->opt; if (opt) { if (!cork->opt) { cork->opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation); if (unlikely(!cork->opt)) return -ENOBUFS; } memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen); cork->flags |= IPCORK_OPT; cork->addr = ipc->addr; } cork->gso_size = ipc->gso_size; cork->dst = &rt->dst; /* We stole this route, caller should not release it. */ *rtp = NULL; cork->length = 0; cork->ttl = ipc->ttl; cork->tos = ipc->tos; cork->mark = ipc->sockc.mark; cork->priority = ipc->sockc.priority; cork->transmit_time = ipc->sockc.transmit_time; cork->tx_flags = 0; sock_tx_timestamp(sk, &ipc->sockc, &cork->tx_flags); if (ipc->sockc.tsflags & SOCKCM_FLAG_TS_OPT_ID) { cork->flags |= IPCORK_TS_OPT_ID; cork->ts_opt_id = ipc->sockc.ts_opt_id; } return 0; } /* * ip_append_data() can make one large IP datagram from many pieces of * data. Each piece will be held on the socket until * ip_push_pending_frames() is called. Each piece can be a page or * non-page data. * * Not only UDP, other transport protocols - e.g. raw sockets - can use * this interface potentially. * * LATER: length must be adjusted by pad at tail, when it is required. */ int ip_append_data(struct sock *sk, struct flowi4 *fl4, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, struct ipcm_cookie *ipc, struct rtable **rtp, unsigned int flags) { struct inet_sock *inet = inet_sk(sk); int err; if (flags&MSG_PROBE) return 0; if (skb_queue_empty(&sk->sk_write_queue)) { err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp); if (err) return err; } else { transhdrlen = 0; } return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, sk_page_frag(sk), getfrag, from, length, transhdrlen, flags); } static void ip_cork_release(struct inet_cork *cork) { cork->flags &= ~IPCORK_OPT; kfree(cork->opt); cork->opt = NULL; dst_release(cork->dst); cork->dst = NULL; } /* * Combined all pending IP fragments on the socket as one IP datagram * and push them out. */ struct sk_buff *__ip_make_skb(struct sock *sk, struct flowi4 *fl4, struct sk_buff_head *queue, struct inet_cork *cork) { struct sk_buff *skb, *tmp_skb; struct sk_buff **tail_skb; struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); struct ip_options *opt = NULL; struct rtable *rt = dst_rtable(cork->dst); struct iphdr *iph; u8 pmtudisc, ttl; __be16 df = 0; skb = __skb_dequeue(queue); if (!skb) goto out; tail_skb = &(skb_shinfo(skb)->frag_list); /* move skb->data to ip header from ext header */ if (skb->data < skb_network_header(skb)) __skb_pull(skb, skb_network_offset(skb)); while ((tmp_skb = __skb_dequeue(queue)) != NULL) { __skb_pull(tmp_skb, skb_network_header_len(skb)); *tail_skb = tmp_skb; tail_skb = &(tmp_skb->next); skb->len += tmp_skb->len; skb->data_len += tmp_skb->len; skb->truesize += tmp_skb->truesize; tmp_skb->destructor = NULL; tmp_skb->sk = NULL; } /* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow * to fragment the frame generated here. No matter, what transforms * how transforms change size of the packet, it will come out. */ skb->ignore_df = ip_sk_ignore_df(sk); /* DF bit is set when we want to see DF on outgoing frames. * If ignore_df is set too, we still allow to fragment this frame * locally. */ pmtudisc = READ_ONCE(inet->pmtudisc); if (pmtudisc == IP_PMTUDISC_DO || pmtudisc == IP_PMTUDISC_PROBE || (skb->len <= dst4_mtu(&rt->dst) && ip_dont_fragment(sk, &rt->dst))) df = htons(IP_DF); if (cork->flags & IPCORK_OPT) opt = cork->opt; if (cork->ttl != 0) ttl = cork->ttl; else if (rt->rt_type == RTN_MULTICAST) ttl = READ_ONCE(inet->mc_ttl); else ttl = ip_select_ttl(inet, &rt->dst); iph = ip_hdr(skb); iph->version = 4; iph->ihl = 5; iph->tos = (cork->tos != -1) ? cork->tos : READ_ONCE(inet->tos); iph->frag_off = df; iph->ttl = ttl; iph->protocol = sk->sk_protocol; ip_copy_addrs(iph, fl4); ip_select_ident(net, skb, sk); if (opt) { iph->ihl += opt->optlen >> 2; ip_options_build(skb, opt, cork->addr, rt); } skb->priority = cork->priority; skb->mark = cork->mark; if (sk_is_tcp(sk)) skb_set_delivery_time(skb, cork->transmit_time, SKB_CLOCK_MONOTONIC); else skb_set_delivery_type_by_clockid(skb, cork->transmit_time, sk->sk_clockid); /* * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec * on dst refcount */ cork->dst = NULL; skb_dst_set(skb, &rt->dst); if (iph->protocol == IPPROTO_ICMP) { u8 icmp_type; /* For such sockets, transhdrlen is zero when do ip_append_data(), * so icmphdr does not in skb linear region and can not get icmp_type * by icmp_hdr(skb)->type. */ if (sk->sk_type == SOCK_RAW && !(fl4->flowi4_flags & FLOWI_FLAG_KNOWN_NH)) icmp_type = fl4->fl4_icmp_type; else icmp_type = icmp_hdr(skb)->type; icmp_out_count(net, icmp_type); } ip_cork_release(cork); out: return skb; } int ip_send_skb(struct net *net, struct sk_buff *skb) { int err; err = ip_local_out(net, skb->sk, skb); if (err) { if (err > 0) err = net_xmit_errno(err); if (err) IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS); } return err; } int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4) { struct sk_buff *skb; skb = ip_finish_skb(sk, fl4); if (!skb) return 0; /* Netfilter gets whole the not fragmented skb. */ return ip_send_skb(sock_net(sk), skb); } /* * Throw away all pending data on the socket. */ static void __ip_flush_pending_frames(struct sock *sk, struct sk_buff_head *queue, struct inet_cork *cork) { struct sk_buff *skb; while ((skb = __skb_dequeue_tail(queue)) != NULL) kfree_skb(skb); ip_cork_release(cork); } void ip_flush_pending_frames(struct sock *sk) { __ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base); } struct sk_buff *ip_make_skb(struct sock *sk, struct flowi4 *fl4, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, struct ipcm_cookie *ipc, struct rtable **rtp, struct inet_cork *cork, unsigned int flags) { struct sk_buff_head queue; int err; if (flags & MSG_PROBE) return NULL; __skb_queue_head_init(&queue); cork->flags = 0; cork->addr = 0; cork->opt = NULL; err = ip_setup_cork(sk, cork, ipc, rtp); if (err) return ERR_PTR(err); err = __ip_append_data(sk, fl4, &queue, cork, &current->task_frag, getfrag, from, length, transhdrlen, flags); if (err) { __ip_flush_pending_frames(sk, &queue, cork); return ERR_PTR(err); } return __ip_make_skb(sk, fl4, &queue, cork); } /* * Fetch data from kernel space and fill in checksum if needed. */ static int ip_reply_glue_bits(void *dptr, char *to, int offset, int len, int odd, struct sk_buff *skb) { __wsum csum; csum = csum_partial_copy_nocheck(dptr+offset, to, len); skb->csum = csum_block_add(skb->csum, csum, odd); return 0; } /* * Generic function to send a packet as reply to another packet. * Used to send some TCP resets/acks so far. */ void ip_send_unicast_reply(struct sock *sk, const struct sock *orig_sk, struct sk_buff *skb, const struct ip_options *sopt, __be32 daddr, __be32 saddr, const struct ip_reply_arg *arg, unsigned int len, u64 transmit_time, u32 txhash) { DEFINE_RAW_FLEX(struct ip_options_rcu, replyopts, opt.__data, IP_OPTIONS_DATA_FIXED_SIZE); struct ipcm_cookie ipc; struct flowi4 fl4; struct rtable *rt = skb_rtable(skb); struct net *net = sock_net(sk); struct sk_buff *nskb; int err; int oif; if (__ip_options_echo(net, &replyopts->opt, skb, sopt)) return; ipcm_init(&ipc); ipc.addr = daddr; ipc.sockc.transmit_time = transmit_time; if (replyopts->opt.optlen) { ipc.opt = replyopts; if (replyopts->opt.srr) daddr = replyopts->opt.faddr; } oif = arg->bound_dev_if; if (!oif && netif_index_is_l3_master(net, skb->skb_iif)) oif = skb->skb_iif; flowi4_init_output(&fl4, oif, IP4_REPLY_MARK(net, skb->mark) ?: sk->sk_mark, arg->tos & INET_DSCP_MASK, RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol, ip_reply_arg_flowi_flags(arg), daddr, saddr, tcp_hdr(skb)->source, tcp_hdr(skb)->dest, arg->uid); security_skb_classify_flow(skb, flowi4_to_flowi_common(&fl4)); rt = ip_route_output_flow(net, &fl4, sk); if (IS_ERR(rt)) return; inet_sk(sk)->tos = arg->tos; sk->sk_protocol = ip_hdr(skb)->protocol; sk->sk_bound_dev_if = arg->bound_dev_if; sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default); ipc.sockc.mark = fl4.flowi4_mark; err = ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0, &ipc, &rt, MSG_DONTWAIT); if (unlikely(err)) { ip_flush_pending_frames(sk); goto out; } nskb = skb_peek(&sk->sk_write_queue); if (nskb) { if (arg->csumoffset >= 0) *((__sum16 *)skb_transport_header(nskb) + arg->csumoffset) = csum_fold(csum_add(nskb->csum, arg->csum)); nskb->ip_summed = CHECKSUM_NONE; if (orig_sk) { skb_set_owner_edemux(nskb, (struct sock *)orig_sk); psp_reply_set_decrypted(orig_sk, nskb); } if (transmit_time) nskb->tstamp_type = SKB_CLOCK_MONOTONIC; if (txhash) skb_set_hash(nskb, txhash, PKT_HASH_TYPE_L4); ip_push_pending_frames(sk, &fl4); } out: ip_rt_put(rt); } void __init ip_init(void) { ip_rt_init(); inet_initpeers(); #if defined(CONFIG_IP_MULTICAST) igmp_mc_init(); #endif }
12 36 36 31 21 38 38 38 38 2 8 12 1 1 1 1 1 26 1 25 26 19 6 28 2 1 12 26 38 564 563 7 8 7 8 43 43 43 43 43 4 38 38 47 47 14 7 7 339 295 5 39 37 534 534 289 289 285 4 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 // SPDX-License-Identifier: GPL-2.0 /* * Key setup facility for FS encryption support. * * Copyright (C) 2015, Google, Inc. * * Originally written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar. * Heavily modified since then. */ #include <crypto/skcipher.h> #include <linux/export.h> #include <linux/random.h> #include "fscrypt_private.h" struct fscrypt_mode fscrypt_modes[] = { [FSCRYPT_MODE_AES_256_XTS] = { .friendly_name = "AES-256-XTS", .cipher_str = "xts(aes)", .keysize = 64, .security_strength = 32, .ivsize = 16, .blk_crypto_mode = BLK_ENCRYPTION_MODE_AES_256_XTS, }, [FSCRYPT_MODE_AES_256_CTS] = { .friendly_name = "AES-256-CBC-CTS", .cipher_str = "cts(cbc(aes))", .keysize = 32, .security_strength = 32, .ivsize = 16, }, [FSCRYPT_MODE_AES_128_CBC] = { .friendly_name = "AES-128-CBC-ESSIV", .cipher_str = "essiv(cbc(aes),sha256)", .keysize = 16, .security_strength = 16, .ivsize = 16, .blk_crypto_mode = BLK_ENCRYPTION_MODE_AES_128_CBC_ESSIV, }, [FSCRYPT_MODE_AES_128_CTS] = { .friendly_name = "AES-128-CBC-CTS", .cipher_str = "cts(cbc(aes))", .keysize = 16, .security_strength = 16, .ivsize = 16, }, [FSCRYPT_MODE_SM4_XTS] = { .friendly_name = "SM4-XTS", .cipher_str = "xts(sm4)", .keysize = 32, .security_strength = 16, .ivsize = 16, .blk_crypto_mode = BLK_ENCRYPTION_MODE_SM4_XTS, }, [FSCRYPT_MODE_SM4_CTS] = { .friendly_name = "SM4-CBC-CTS", .cipher_str = "cts(cbc(sm4))", .keysize = 16, .security_strength = 16, .ivsize = 16, }, [FSCRYPT_MODE_ADIANTUM] = { .friendly_name = "Adiantum", .cipher_str = "adiantum(xchacha12,aes)", .keysize = 32, .security_strength = 32, .ivsize = 32, .blk_crypto_mode = BLK_ENCRYPTION_MODE_ADIANTUM, }, [FSCRYPT_MODE_AES_256_HCTR2] = { .friendly_name = "AES-256-HCTR2", .cipher_str = "hctr2(aes)", .keysize = 32, .security_strength = 32, .ivsize = 32, }, }; static DEFINE_MUTEX(fscrypt_mode_key_setup_mutex); static struct fscrypt_mode * select_encryption_mode(const union fscrypt_policy *policy, const struct inode *inode) { BUILD_BUG_ON(ARRAY_SIZE(fscrypt_modes) != FSCRYPT_MODE_MAX + 1); if (S_ISREG(inode->i_mode)) return &fscrypt_modes[fscrypt_policy_contents_mode(policy)]; if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) return &fscrypt_modes[fscrypt_policy_fnames_mode(policy)]; WARN_ONCE(1, "fscrypt: filesystem tried to load encryption info for inode %llu, which is not encryptable (file type %d)\n", inode->i_ino, (inode->i_mode & S_IFMT)); return ERR_PTR(-EINVAL); } /* Create a symmetric cipher object for the given encryption mode and key */ static struct crypto_sync_skcipher * fscrypt_allocate_skcipher(struct fscrypt_mode *mode, const u8 *raw_key, const struct inode *inode) { struct crypto_sync_skcipher *tfm; int err; tfm = crypto_alloc_sync_skcipher(mode->cipher_str, 0, FSCRYPT_CRYPTOAPI_MASK); if (IS_ERR(tfm)) { if (PTR_ERR(tfm) == -ENOENT) { fscrypt_warn(inode, "Missing crypto API support for %s (API name: \"%s\")", mode->friendly_name, mode->cipher_str); return ERR_PTR(-ENOPKG); } fscrypt_err(inode, "Error allocating '%s' transform: %ld", mode->cipher_str, PTR_ERR(tfm)); return tfm; } if (!xchg(&mode->logged_cryptoapi_impl, 1)) { /* * fscrypt performance can vary greatly depending on which * crypto algorithm implementation is used. Help people debug * performance problems by logging the ->cra_driver_name the * first time a mode is used. */ pr_info("fscrypt: %s using implementation \"%s\"\n", mode->friendly_name, crypto_skcipher_driver_name(&tfm->base)); } if (WARN_ON_ONCE(crypto_sync_skcipher_ivsize(tfm) != mode->ivsize)) { err = -EINVAL; goto err_free_tfm; } crypto_sync_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS); err = crypto_sync_skcipher_setkey(tfm, raw_key, mode->keysize); if (err) goto err_free_tfm; return tfm; err_free_tfm: crypto_free_sync_skcipher(tfm); return ERR_PTR(err); } /* * Prepare the crypto transform object or blk-crypto key in @prep_key, given the * raw key, encryption mode (@ci->ci_mode), flag indicating which encryption * implementation (fs-layer or blk-crypto) will be used (@ci->ci_inlinecrypt), * and IV generation method (@ci->ci_policy.flags). */ int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key, const u8 *raw_key, const struct fscrypt_inode_info *ci) { struct crypto_sync_skcipher *tfm; if (fscrypt_using_inline_encryption(ci)) return fscrypt_prepare_inline_crypt_key(prep_key, raw_key, ci->ci_mode->keysize, false, ci); tfm = fscrypt_allocate_skcipher(ci->ci_mode, raw_key, ci->ci_inode); if (IS_ERR(tfm)) return PTR_ERR(tfm); /* * Pairs with the smp_load_acquire() in fscrypt_is_key_prepared(). * I.e., here we publish ->tfm with a RELEASE barrier so that * concurrent tasks can ACQUIRE it. Note that this concurrency is only * possible for per-mode keys, not for per-file keys. */ smp_store_release(&prep_key->tfm, tfm); return 0; } /* Destroy a crypto transform object and/or blk-crypto key. */ void fscrypt_destroy_prepared_key(struct super_block *sb, struct fscrypt_prepared_key *prep_key) { crypto_free_sync_skcipher(prep_key->tfm); fscrypt_destroy_inline_crypt_key(sb, prep_key); memzero_explicit(prep_key, sizeof(*prep_key)); } /* Given a per-file encryption key, set up the file's crypto transform object */ int fscrypt_set_per_file_enc_key(struct fscrypt_inode_info *ci, const u8 *raw_key) { ci->ci_owns_key = true; return fscrypt_prepare_key(&ci->ci_enc_key, raw_key, ci); } static int setup_per_mode_enc_key(struct fscrypt_inode_info *ci, struct fscrypt_master_key *mk, struct fscrypt_prepared_key *keys, u8 hkdf_context, bool include_fs_uuid) { const struct inode *inode = ci->ci_inode; const struct super_block *sb = inode->i_sb; struct fscrypt_mode *mode = ci->ci_mode; const u8 mode_num = mode - fscrypt_modes; struct fscrypt_prepared_key *prep_key; u8 mode_key[FSCRYPT_MAX_RAW_KEY_SIZE]; u8 hkdf_info[sizeof(mode_num) + sizeof(sb->s_uuid)]; unsigned int hkdf_infolen = 0; bool use_hw_wrapped_key = false; int err; if (WARN_ON_ONCE(mode_num > FSCRYPT_MODE_MAX)) return -EINVAL; if (mk->mk_secret.is_hw_wrapped && S_ISREG(inode->i_mode)) { /* Using a hardware-wrapped key for file contents encryption */ if (!fscrypt_using_inline_encryption(ci)) { if (sb->s_flags & SB_INLINECRYPT) fscrypt_warn(ci->ci_inode, "Hardware-wrapped key required, but no suitable inline encryption capabilities are available"); else fscrypt_warn(ci->ci_inode, "Hardware-wrapped keys require inline encryption (-o inlinecrypt)"); return -EINVAL; } use_hw_wrapped_key = true; } prep_key = &keys[mode_num]; if (fscrypt_is_key_prepared(prep_key, ci)) { ci->ci_enc_key = *prep_key; return 0; } mutex_lock(&fscrypt_mode_key_setup_mutex); if (fscrypt_is_key_prepared(prep_key, ci)) goto done_unlock; if (use_hw_wrapped_key) { err = fscrypt_prepare_inline_crypt_key(prep_key, mk->mk_secret.bytes, mk->mk_secret.size, true, ci); if (err) goto out_unlock; goto done_unlock; } BUILD_BUG_ON(sizeof(mode_num) != 1); BUILD_BUG_ON(sizeof(sb->s_uuid) != 16); BUILD_BUG_ON(sizeof(hkdf_info) != 17); hkdf_info[hkdf_infolen++] = mode_num; if (include_fs_uuid) { memcpy(&hkdf_info[hkdf_infolen], &sb->s_uuid, sizeof(sb->s_uuid)); hkdf_infolen += sizeof(sb->s_uuid); } fscrypt_hkdf_expand(&mk->mk_secret.hkdf, hkdf_context, hkdf_info, hkdf_infolen, mode_key, mode->keysize); err = fscrypt_prepare_key(prep_key, mode_key, ci); memzero_explicit(mode_key, mode->keysize); if (err) goto out_unlock; done_unlock: ci->ci_enc_key = *prep_key; err = 0; out_unlock: mutex_unlock(&fscrypt_mode_key_setup_mutex); return err; } /* * Derive a SipHash key from the given fscrypt master key and the given * application-specific information string. * * Note that the KDF produces a byte array, but the SipHash APIs expect the key * as a pair of 64-bit words. Therefore, on big endian CPUs we have to do an * endianness swap in order to get the same results as on little endian CPUs. */ static void fscrypt_derive_siphash_key(const struct fscrypt_master_key *mk, u8 context, const u8 *info, unsigned int infolen, siphash_key_t *key) { fscrypt_hkdf_expand(&mk->mk_secret.hkdf, context, info, infolen, (u8 *)key, sizeof(*key)); BUILD_BUG_ON(sizeof(*key) != 16); BUILD_BUG_ON(ARRAY_SIZE(key->key) != 2); le64_to_cpus(&key->key[0]); le64_to_cpus(&key->key[1]); } void fscrypt_derive_dirhash_key(struct fscrypt_inode_info *ci, const struct fscrypt_master_key *mk) { fscrypt_derive_siphash_key(mk, HKDF_CONTEXT_DIRHASH_KEY, ci->ci_nonce, FSCRYPT_FILE_NONCE_SIZE, &ci->ci_dirhash_key); ci->ci_dirhash_key_initialized = true; } void fscrypt_hash_inode_number(struct fscrypt_inode_info *ci, const struct fscrypt_master_key *mk) { WARN_ON_ONCE(ci->ci_inode->i_ino == 0); WARN_ON_ONCE(!mk->mk_ino_hash_key_initialized); ci->ci_hashed_ino = (u32)siphash_1u64(ci->ci_inode->i_ino, &mk->mk_ino_hash_key); } static int fscrypt_setup_iv_ino_lblk_32_key(struct fscrypt_inode_info *ci, struct fscrypt_master_key *mk) { int err; err = setup_per_mode_enc_key(ci, mk, mk->mk_iv_ino_lblk_32_keys, HKDF_CONTEXT_IV_INO_LBLK_32_KEY, true); if (err) return err; /* pairs with smp_store_release() below */ if (!smp_load_acquire(&mk->mk_ino_hash_key_initialized)) { mutex_lock(&fscrypt_mode_key_setup_mutex); if (mk->mk_ino_hash_key_initialized) goto unlock; fscrypt_derive_siphash_key(mk, HKDF_CONTEXT_INODE_HASH_KEY, NULL, 0, &mk->mk_ino_hash_key); /* pairs with smp_load_acquire() above */ smp_store_release(&mk->mk_ino_hash_key_initialized, true); unlock: mutex_unlock(&fscrypt_mode_key_setup_mutex); } /* * New inodes may not have an inode number assigned yet. * Hashing their inode number is delayed until later. */ if (ci->ci_inode->i_ino) fscrypt_hash_inode_number(ci, mk); return 0; } static int fscrypt_setup_v2_file_key(struct fscrypt_inode_info *ci, struct fscrypt_master_key *mk, bool need_dirhash_key) { int err; if (mk->mk_secret.is_hw_wrapped && !(ci->ci_policy.v2.flags & (FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64 | FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32))) { fscrypt_warn(ci->ci_inode, "Hardware-wrapped keys are only supported with IV_INO_LBLK policies"); return -EINVAL; } if (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) { /* * DIRECT_KEY: instead of deriving per-file encryption keys, the * per-file nonce will be included in all the IVs. But unlike * v1 policies, for v2 policies in this case we don't encrypt * with the master key directly but rather derive a per-mode * encryption key. This ensures that the master key is * consistently used only for HKDF, avoiding key reuse issues. */ err = setup_per_mode_enc_key(ci, mk, mk->mk_direct_keys, HKDF_CONTEXT_DIRECT_KEY, false); } else if (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64) { /* * IV_INO_LBLK_64: encryption keys are derived from (master_key, * mode_num, filesystem_uuid), and inode number is included in * the IVs. This format is optimized for use with inline * encryption hardware compliant with the UFS standard. */ err = setup_per_mode_enc_key(ci, mk, mk->mk_iv_ino_lblk_64_keys, HKDF_CONTEXT_IV_INO_LBLK_64_KEY, true); } else if (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) { err = fscrypt_setup_iv_ino_lblk_32_key(ci, mk); } else { u8 derived_key[FSCRYPT_MAX_RAW_KEY_SIZE]; fscrypt_hkdf_expand(&mk->mk_secret.hkdf, HKDF_CONTEXT_PER_FILE_ENC_KEY, ci->ci_nonce, FSCRYPT_FILE_NONCE_SIZE, derived_key, ci->ci_mode->keysize); err = fscrypt_set_per_file_enc_key(ci, derived_key); memzero_explicit(derived_key, ci->ci_mode->keysize); } if (err) return err; /* Derive a secret dirhash key for directories that need it. */ if (need_dirhash_key) fscrypt_derive_dirhash_key(ci, mk); return 0; } /* * Check whether the size of the given master key (@mk) is appropriate for the * encryption settings which a particular file will use (@ci). * * If the file uses a v1 encryption policy, then the master key must be at least * as long as the derived key, as this is a requirement of the v1 KDF. * * Otherwise, the KDF can accept any size key, so we enforce a slightly looser * requirement: we require that the size of the master key be at least the * maximum security strength of any algorithm whose key will be derived from it * (but in practice we only need to consider @ci->ci_mode, since any other * possible subkeys such as DIRHASH and INODE_HASH will never increase the * required key size over @ci->ci_mode). This allows AES-256-XTS keys to be * derived from a 256-bit master key, which is cryptographically sufficient, * rather than requiring a 512-bit master key which is unnecessarily long. (We * still allow 512-bit master keys if the user chooses to use them, though.) */ static bool fscrypt_valid_master_key_size(const struct fscrypt_master_key *mk, const struct fscrypt_inode_info *ci) { unsigned int min_keysize; if (ci->ci_policy.version == FSCRYPT_POLICY_V1) min_keysize = ci->ci_mode->keysize; else min_keysize = ci->ci_mode->security_strength; if (mk->mk_secret.size < min_keysize) { fscrypt_warn(NULL, "key with %s %*phN is too short (got %u bytes, need %u+ bytes)", master_key_spec_type(&mk->mk_spec), master_key_spec_len(&mk->mk_spec), (u8 *)&mk->mk_spec.u, mk->mk_secret.size, min_keysize); return false; } return true; } /* * Find the master key, then set up the inode's actual encryption key. * * If the master key is found in the filesystem-level keyring, then it is * returned in *mk_ret with its semaphore read-locked. This is needed to ensure * that only one task links the fscrypt_inode_info into ->mk_decrypted_inodes * (as multiple tasks may race to create an fscrypt_inode_info for the same * inode), and to synchronize the master key being removed with a new inode * starting to use it. */ static int setup_file_encryption_key(struct fscrypt_inode_info *ci, bool need_dirhash_key, struct fscrypt_master_key **mk_ret) { struct super_block *sb = ci->ci_inode->i_sb; struct fscrypt_key_specifier mk_spec; struct fscrypt_master_key *mk; int err; err = fscrypt_policy_to_key_spec(&ci->ci_policy, &mk_spec); if (err) return err; mk = fscrypt_find_master_key(sb, &mk_spec); if (unlikely(!mk)) { const union fscrypt_policy *dummy_policy = fscrypt_get_dummy_policy(sb); /* * Add the test_dummy_encryption key on-demand. In principle, * it should be added at mount time. Do it here instead so that * the individual filesystems don't need to worry about adding * this key at mount time and cleaning up on mount failure. */ if (dummy_policy && fscrypt_policies_equal(dummy_policy, &ci->ci_policy)) { err = fscrypt_add_test_dummy_key(sb, &mk_spec); if (err) return err; mk = fscrypt_find_master_key(sb, &mk_spec); } } if (unlikely(!mk)) { if (ci->ci_policy.version != FSCRYPT_POLICY_V1) return -ENOKEY; err = fscrypt_select_encryption_impl(ci, false); if (err) return err; /* * As a legacy fallback for v1 policies, search for the key in * the current task's subscribed keyrings too. Don't move this * to before the search of ->s_master_keys, since users * shouldn't be able to override filesystem-level keys. */ return fscrypt_setup_v1_file_key_via_subscribed_keyrings(ci); } down_read(&mk->mk_sem); if (!mk->mk_present) { /* FS_IOC_REMOVE_ENCRYPTION_KEY has been executed on this key */ err = -ENOKEY; goto out_release_key; } if (!fscrypt_valid_master_key_size(mk, ci)) { err = -ENOKEY; goto out_release_key; } err = fscrypt_select_encryption_impl(ci, mk->mk_secret.is_hw_wrapped); if (err) goto out_release_key; switch (ci->ci_policy.version) { case FSCRYPT_POLICY_V1: if (WARN_ON_ONCE(mk->mk_secret.is_hw_wrapped)) { /* * This should never happen, as adding a v1 policy key * that is hardware-wrapped isn't allowed. */ err = -EINVAL; goto out_release_key; } err = fscrypt_setup_v1_file_key(ci, mk->mk_secret.bytes); break; case FSCRYPT_POLICY_V2: err = fscrypt_setup_v2_file_key(ci, mk, need_dirhash_key); break; default: WARN_ON_ONCE(1); err = -EINVAL; break; } if (err) goto out_release_key; *mk_ret = mk; return 0; out_release_key: up_read(&mk->mk_sem); fscrypt_put_master_key(mk); return err; } static void put_crypt_info(struct fscrypt_inode_info *ci) { struct fscrypt_master_key *mk; if (!ci) return; if (ci->ci_direct_key) fscrypt_put_direct_key(ci->ci_direct_key); else if (ci->ci_owns_key) fscrypt_destroy_prepared_key(ci->ci_inode->i_sb, &ci->ci_enc_key); mk = ci->ci_master_key; if (mk) { /* * Remove this inode from the list of inodes that were unlocked * with the master key. In addition, if we're removing the last * inode from an incompletely removed key, then complete the * full removal of the key. */ spin_lock(&mk->mk_decrypted_inodes_lock); list_del(&ci->ci_master_key_link); spin_unlock(&mk->mk_decrypted_inodes_lock); fscrypt_put_master_key_activeref(ci->ci_inode->i_sb, mk); } memzero_explicit(ci, sizeof(*ci)); kmem_cache_free(fscrypt_inode_info_cachep, ci); } static int fscrypt_setup_encryption_info(struct inode *inode, const union fscrypt_policy *policy, const u8 nonce[FSCRYPT_FILE_NONCE_SIZE], bool need_dirhash_key) { struct fscrypt_inode_info *crypt_info; struct fscrypt_mode *mode; struct fscrypt_master_key *mk = NULL; int res; res = fscrypt_initialize(inode->i_sb); if (res) return res; crypt_info = kmem_cache_zalloc(fscrypt_inode_info_cachep, GFP_KERNEL); if (!crypt_info) return -ENOMEM; crypt_info->ci_inode = inode; crypt_info->ci_policy = *policy; memcpy(crypt_info->ci_nonce, nonce, FSCRYPT_FILE_NONCE_SIZE); mode = select_encryption_mode(&crypt_info->ci_policy, inode); if (IS_ERR(mode)) { res = PTR_ERR(mode); goto out; } WARN_ON_ONCE(mode->ivsize > FSCRYPT_MAX_IV_SIZE); crypt_info->ci_mode = mode; crypt_info->ci_data_unit_bits = fscrypt_policy_du_bits(&crypt_info->ci_policy, inode); res = setup_file_encryption_key(crypt_info, need_dirhash_key, &mk); if (res) goto out; /* * For existing inodes, multiple tasks may race to set the inode's * fscrypt info pointer. So use cmpxchg_release(). This pairs with the * smp_load_acquire() in fscrypt_get_inode_info(). I.e., publish the * pointer with a RELEASE barrier so that other tasks can ACQUIRE it. */ if (cmpxchg_release(fscrypt_inode_info_addr(inode), NULL, crypt_info) == NULL) { /* * We won the race and set the inode's fscrypt info to our * crypt_info. Now link it into the master key's inode list. */ if (mk) { crypt_info->ci_master_key = mk; refcount_inc(&mk->mk_active_refs); spin_lock(&mk->mk_decrypted_inodes_lock); list_add(&crypt_info->ci_master_key_link, &mk->mk_decrypted_inodes); spin_unlock(&mk->mk_decrypted_inodes_lock); } crypt_info = NULL; } res = 0; out: if (mk) { up_read(&mk->mk_sem); fscrypt_put_master_key(mk); } put_crypt_info(crypt_info); return res; } /** * fscrypt_get_encryption_info() - set up an inode's encryption key * @inode: the inode to set up the key for. Must be encrypted. * @allow_unsupported: if %true, treat an unsupported encryption policy (or * unrecognized encryption context) the same way as the key * being unavailable, instead of returning an error. Use * %false unless the operation being performed is needed in * order for files (or directories) to be deleted. * * Set up the inode's encryption key, if it hasn't already been done. * * Note: unless the key setup was already done, this isn't %GFP_NOFS-safe. So * generally this shouldn't be called from within a filesystem transaction. * * Return: 0 if the key is now set up, *or* if it couldn't be set up because the * needed master key is absent. (Use fscrypt_has_encryption_key() to * distinguish these cases.) Also can return another -errno code. */ int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported) { int res; union fscrypt_context ctx; union fscrypt_policy policy; if (fscrypt_has_encryption_key(inode)) return 0; res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); if (res < 0) { if (res == -ERANGE && allow_unsupported) return 0; fscrypt_warn(inode, "Error %d getting encryption context", res); return res; } res = fscrypt_policy_from_context(&policy, &ctx, res); if (res) { if (allow_unsupported) return 0; fscrypt_warn(inode, "Unrecognized or corrupt encryption context"); return res; } if (!fscrypt_supported_policy(&policy, inode)) { if (allow_unsupported) return 0; return -EINVAL; } res = fscrypt_setup_encryption_info(inode, &policy, fscrypt_context_nonce(&ctx), IS_CASEFOLDED(inode) && S_ISDIR(inode->i_mode)); if (res == -ENOPKG && allow_unsupported) /* Algorithm unavailable? */ res = 0; if (res == -ENOKEY) res = 0; return res; } /** * fscrypt_prepare_new_inode() - prepare to create a new inode in a directory * @dir: a possibly-encrypted directory * @inode: the new inode. ->i_mode and ->i_blkbits must be set already. * ->i_ino doesn't need to be set yet. * @encrypt_ret: (output) set to %true if the new inode will be encrypted * * If the directory is encrypted, set up its encryption key in preparation for * encrypting the name of the new file. Also, if the new inode will be * encrypted, set up its encryption key too and set *encrypt_ret=true. * * This isn't %GFP_NOFS-safe, and therefore it should be called before starting * any filesystem transaction to create the inode. For this reason, ->i_ino * isn't required to be set yet, as the filesystem may not have set it yet. * * This doesn't persist the new inode's encryption context. That still needs to * be done later by calling fscrypt_set_context(). * * Return: 0 on success, -ENOKEY if a key needs to be set up for @dir or @inode * but the needed master key is absent, or another -errno code */ int fscrypt_prepare_new_inode(struct inode *dir, struct inode *inode, bool *encrypt_ret) { const union fscrypt_policy *policy; u8 nonce[FSCRYPT_FILE_NONCE_SIZE]; policy = fscrypt_policy_to_inherit(dir); if (policy == NULL) return 0; if (IS_ERR(policy)) return PTR_ERR(policy); if (WARN_ON_ONCE(inode->i_blkbits == 0)) return -EINVAL; if (WARN_ON_ONCE(inode->i_mode == 0)) return -EINVAL; /* * Only regular files, directories, and symlinks are encrypted. * Special files like device nodes and named pipes aren't. */ if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode) && !S_ISLNK(inode->i_mode)) return 0; *encrypt_ret = true; get_random_bytes(nonce, FSCRYPT_FILE_NONCE_SIZE); return fscrypt_setup_encryption_info(inode, policy, nonce, IS_CASEFOLDED(dir) && S_ISDIR(inode->i_mode)); } EXPORT_SYMBOL_GPL(fscrypt_prepare_new_inode); /** * fscrypt_put_encryption_info() - free most of an inode's fscrypt data * @inode: an inode being evicted * * Free the inode's fscrypt_inode_info. Filesystems must call this when the * inode is being evicted. An RCU grace period need not have elapsed yet. */ void fscrypt_put_encryption_info(struct inode *inode) { /* * Ideally we'd start with a lightweight IS_ENCRYPTED() check here * before proceeding to retrieve and check the pointer. However, during * inode creation, the fscrypt_inode_info is set before S_ENCRYPTED. If * an error occurs, it needs to be cleaned up regardless. */ struct fscrypt_inode_info **ci_addr = fscrypt_inode_info_addr(inode); put_crypt_info(*ci_addr); *ci_addr = NULL; } EXPORT_SYMBOL(fscrypt_put_encryption_info); /** * fscrypt_free_inode() - free an inode's fscrypt data requiring RCU delay * @inode: an inode being freed * * Free the inode's cached decrypted symlink target, if any. Filesystems must * call this after an RCU grace period, just before they free the inode. */ void fscrypt_free_inode(struct inode *inode) { if (IS_ENCRYPTED(inode) && S_ISLNK(inode->i_mode)) { kfree(inode->i_link); inode->i_link = NULL; } } EXPORT_SYMBOL(fscrypt_free_inode); /** * fscrypt_drop_inode() - check whether the inode's master key has been removed * @inode: an inode being considered for eviction * * Filesystems supporting fscrypt must call this from their ->drop_inode() * method so that encrypted inodes are evicted as soon as they're no longer in * use and their master key has been removed. * * Return: 1 if fscrypt wants the inode to be evicted now, otherwise 0 */ int fscrypt_drop_inode(struct inode *inode) { const struct fscrypt_inode_info *ci = fscrypt_get_inode_info(inode); /* * If ci is NULL, then the inode doesn't have an encryption key set up * so it's irrelevant. If ci_master_key is NULL, then the master key * was provided via the legacy mechanism of the process-subscribed * keyrings, so we don't know whether it's been removed or not. */ if (!ci || !ci->ci_master_key) return 0; /* * With proper, non-racy use of FS_IOC_REMOVE_ENCRYPTION_KEY, all inodes * protected by the key were cleaned by sync_filesystem(). But if * userspace is still using the files, inodes can be dirtied between * then and now. We mustn't lose any writes, so skip dirty inodes here. */ if (inode_state_read(inode) & I_DIRTY_ALL) return 0; /* * We can't take ->mk_sem here, since this runs in atomic context. * Therefore, ->mk_present can change concurrently, and our result may * immediately become outdated. But there's no correctness problem with * unnecessarily evicting. Nor is there a correctness problem with not * evicting while iput() is racing with the key being removed, since * then the thread removing the key will either evict the inode itself * or will correctly detect that it wasn't evicted due to the race. */ return !READ_ONCE(ci->ci_master_key->mk_present); } EXPORT_SYMBOL_GPL(fscrypt_drop_inode);
188 188 1088 2 1092 2 1081 20 426 10 201 382 81 23 37 19 324 446 50 425 156 322 445 543 17 136 491 84 23 40 421 595 72 567 197 2 421 595 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/readdir.c * * Copyright (C) 1995 Linus Torvalds */ #include <linux/stddef.h> #include <linux/kernel.h> #include <linux/export.h> #include <linux/time.h> #include <linux/mm.h> #include <linux/errno.h> #include <linux/stat.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/fsnotify.h> #include <linux/dirent.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/unistd.h> #include <linux/compat.h> #include <linux/uaccess.h> #define dirent_size(dirent, len) offsetof(typeof(*(dirent)), d_name[len]) /* * Some filesystems were never converted to '->iterate_shared()' * and their directory iterators want the inode lock held for * writing. This wrapper allows for converting from the shared * semantics to the exclusive inode use. */ int wrap_directory_iterator(struct file *file, struct dir_context *ctx, int (*iter)(struct file *, struct dir_context *)) { struct inode *inode = file_inode(file); int ret; /* * We'd love to have an 'inode_upgrade_trylock()' operation, * see the comment in mmap_upgrade_trylock() in mm/memory.c. * * But considering this is for "filesystems that never got * converted", it really doesn't matter. * * Also note that since we have to return with the lock held * for reading, we can't use the "killable()" locking here, * since we do need to get the lock even if we're dying. * * We could do the write part killably and then get the read * lock unconditionally if it mattered, but see above on why * this does the very simplistic conversion. */ up_read(&inode->i_rwsem); down_write(&inode->i_rwsem); /* * Since we dropped the inode lock, we should do the * DEADDIR test again. See 'iterate_dir()' below. * * Note that we don't need to re-do the f_pos games, * since the file must be locked wrt f_pos anyway. */ ret = -ENOENT; if (!IS_DEADDIR(inode)) ret = iter(file, ctx); downgrade_write(&inode->i_rwsem); return ret; } EXPORT_SYMBOL(wrap_directory_iterator); /* * Note the "unsafe_put_user()" semantics: we goto a * label for errors. */ #define unsafe_copy_dirent_name(_dst, _src, _len, label) do { \ char __user *dst = (_dst); \ const char *src = (_src); \ size_t len = (_len); \ unsafe_put_user(0, dst+len, label); \ unsafe_copy_to_user(dst, src, len, label); \ } while (0) int iterate_dir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); int res = -ENOTDIR; if (!file->f_op->iterate_shared) goto out; res = security_file_permission(file, MAY_READ); if (res) goto out; res = fsnotify_file_perm(file, MAY_READ); if (res) goto out; res = down_read_killable(&inode->i_rwsem); if (res) goto out; res = -ENOENT; if (!IS_DEADDIR(inode)) { ctx->pos = file->f_pos; res = file->f_op->iterate_shared(file, ctx); file->f_pos = ctx->pos; fsnotify_access(file); file_accessed(file); } inode_unlock_shared(inode); out: return res; } EXPORT_SYMBOL(iterate_dir); /* * POSIX says that a dirent name cannot contain NULL or a '/'. * * It's not 100% clear what we should really do in this case. * The filesystem is clearly corrupted, but returning a hard * error means that you now don't see any of the other names * either, so that isn't a perfect alternative. * * And if you return an error, what error do you use? Several * filesystems seem to have decided on EUCLEAN being the error * code for EFSCORRUPTED, and that may be the error to use. Or * just EIO, which is perhaps more obvious to users. * * In order to see the other file names in the directory, the * caller might want to make this a "soft" error: skip the * entry, and return the error at the end instead. * * Note that this should likely do a "memchr(name, 0, len)" * check too, since that would be filesystem corruption as * well. However, that case can't actually confuse user space, * which has to do a strlen() on the name anyway to find the * filename length, and the above "soft error" worry means * that it's probably better left alone until we have that * issue clarified. * * Note the PATH_MAX check - it's arbitrary but the real * kernel limit on a possible path component, not NAME_MAX, * which is the technical standard limit. */ static int verify_dirent_name(const char *name, int len) { if (len <= 0 || len >= PATH_MAX) return -EIO; if (memchr(name, '/', len)) return -EIO; return 0; } /* * Traditional linux readdir() handling.. * * "count=1" is a special case, meaning that the buffer is one * dirent-structure in size and that the code can't handle more * anyway. Thus the special "fillonedir()" function for that * case (the low-level handlers don't need to care about this). */ #ifdef __ARCH_WANT_OLD_READDIR struct old_linux_dirent { unsigned long d_ino; unsigned long d_offset; unsigned short d_namlen; char d_name[]; }; struct readdir_callback { struct dir_context ctx; struct old_linux_dirent __user * dirent; int result; }; static bool fillonedir(struct dir_context *ctx, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { struct readdir_callback *buf = container_of(ctx, struct readdir_callback, ctx); struct old_linux_dirent __user * dirent; unsigned long d_ino; if (buf->result) return false; buf->result = verify_dirent_name(name, namlen); if (buf->result) return false; d_ino = ino; if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { buf->result = -EOVERFLOW; return false; } buf->result++; dirent = buf->dirent; scoped_user_write_access_size(dirent, dirent_size(dirent, namlen + 1), efault) { unsafe_put_user(d_ino, &dirent->d_ino, efault); unsafe_put_user(offset, &dirent->d_offset, efault); unsafe_put_user(namlen, &dirent->d_namlen, efault); unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault); } return true; efault: buf->result = -EFAULT; return false; } SYSCALL_DEFINE3(old_readdir, unsigned int, fd, struct old_linux_dirent __user *, dirent, unsigned int, count) { int error; CLASS(fd_pos, f)(fd); struct readdir_callback buf = { .ctx.actor = fillonedir, .ctx.count = 1, /* Hint to fs: just one entry. */ .dirent = dirent }; if (fd_empty(f)) return -EBADF; error = iterate_dir(fd_file(f), &buf.ctx); if (buf.result) error = buf.result; return error; } #endif /* __ARCH_WANT_OLD_READDIR */ /* * New, all-improved, singing, dancing, iBCS2-compliant getdents() * interface. */ struct linux_dirent { unsigned long d_ino; unsigned long d_off; unsigned short d_reclen; char d_name[]; }; struct getdents_callback { struct dir_context ctx; struct linux_dirent __user * current_dir; int prev_reclen; int error; }; static bool filldir(struct dir_context *ctx, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { struct linux_dirent __user *dirent, *prev; struct getdents_callback *buf = container_of(ctx, struct getdents_callback, ctx); unsigned long d_ino; int reclen = ALIGN(dirent_size(dirent, namlen + 2), sizeof(long)); int prev_reclen; unsigned int flags = d_type; BUILD_BUG_ON(FILLDIR_FLAG_NOINTR & S_DT_MASK); d_type &= S_DT_MASK; buf->error = verify_dirent_name(name, namlen); if (unlikely(buf->error)) return false; buf->error = -EINVAL; /* only used if we fail.. */ if (reclen > ctx->count) return false; d_ino = ino; if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { buf->error = -EOVERFLOW; return false; } prev_reclen = buf->prev_reclen; if (!(flags & FILLDIR_FLAG_NOINTR) && prev_reclen && signal_pending(current)) return false; dirent = buf->current_dir; prev = (void __user *) dirent - prev_reclen; scoped_user_write_access_size(prev, reclen + prev_reclen, efault) { /* This might be 'dirent->d_off', but if so it will get overwritten */ unsafe_put_user(offset, &prev->d_off, efault); unsafe_put_user(d_ino, &dirent->d_ino, efault); unsafe_put_user(reclen, &dirent->d_reclen, efault); unsafe_put_user(d_type, (char __user *)dirent + reclen - 1, efault); unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault); } buf->current_dir = (void __user *)dirent + reclen; buf->prev_reclen = reclen; ctx->count -= reclen; return true; efault: buf->error = -EFAULT; return false; } SYSCALL_DEFINE3(getdents, unsigned int, fd, struct linux_dirent __user *, dirent, unsigned int, count) { CLASS(fd_pos, f)(fd); struct getdents_callback buf = { .ctx.actor = filldir, .ctx.count = count, .ctx.dt_flags_mask = FILLDIR_FLAG_NOINTR, .current_dir = dirent }; int error; if (fd_empty(f)) return -EBADF; error = iterate_dir(fd_file(f), &buf.ctx); if (error >= 0) error = buf.error; if (buf.prev_reclen) { struct linux_dirent __user * lastdirent; lastdirent = (void __user *)buf.current_dir - buf.prev_reclen; if (put_user(buf.ctx.pos, &lastdirent->d_off)) error = -EFAULT; else error = count - buf.ctx.count; } return error; } struct getdents_callback64 { struct dir_context ctx; struct linux_dirent64 __user * current_dir; int prev_reclen; int error; }; static bool filldir64(struct dir_context *ctx, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { struct linux_dirent64 __user *dirent, *prev; struct getdents_callback64 *buf = container_of(ctx, struct getdents_callback64, ctx); int reclen = ALIGN(dirent_size(dirent, namlen + 1), sizeof(u64)); int prev_reclen; unsigned int flags = d_type; BUILD_BUG_ON(FILLDIR_FLAG_NOINTR & S_DT_MASK); d_type &= S_DT_MASK; buf->error = verify_dirent_name(name, namlen); if (unlikely(buf->error)) return false; buf->error = -EINVAL; /* only used if we fail.. */ if (reclen > ctx->count) return false; prev_reclen = buf->prev_reclen; if (!(flags & FILLDIR_FLAG_NOINTR) && prev_reclen && signal_pending(current)) return false; dirent = buf->current_dir; prev = (void __user *)dirent - prev_reclen; scoped_user_write_access_size(prev, reclen + prev_reclen, efault) { /* This might be 'dirent->d_off', but if so it will get overwritten */ unsafe_put_user(offset, &prev->d_off, efault); unsafe_put_user(ino, &dirent->d_ino, efault); unsafe_put_user(reclen, &dirent->d_reclen, efault); unsafe_put_user(d_type, &dirent->d_type, efault); unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault); } buf->prev_reclen = reclen; buf->current_dir = (void __user *)dirent + reclen; ctx->count -= reclen; return true; efault: buf->error = -EFAULT; return false; } SYSCALL_DEFINE3(getdents64, unsigned int, fd, struct linux_dirent64 __user *, dirent, unsigned int, count) { CLASS(fd_pos, f)(fd); struct getdents_callback64 buf = { .ctx.actor = filldir64, .ctx.count = count, .ctx.dt_flags_mask = FILLDIR_FLAG_NOINTR, .current_dir = dirent }; int error; if (fd_empty(f)) return -EBADF; error = iterate_dir(fd_file(f), &buf.ctx); if (error >= 0) error = buf.error; if (buf.prev_reclen) { struct linux_dirent64 __user * lastdirent; typeof(lastdirent->d_off) d_off = buf.ctx.pos; lastdirent = (void __user *) buf.current_dir - buf.prev_reclen; if (put_user(d_off, &lastdirent->d_off)) error = -EFAULT; else error = count - buf.ctx.count; } return error; } #ifdef CONFIG_COMPAT struct compat_old_linux_dirent { compat_ulong_t d_ino; compat_ulong_t d_offset; unsigned short d_namlen; char d_name[]; }; struct compat_readdir_callback { struct dir_context ctx; struct compat_old_linux_dirent __user *dirent; int result; }; static bool compat_fillonedir(struct dir_context *ctx, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { struct compat_readdir_callback *buf = container_of(ctx, struct compat_readdir_callback, ctx); struct compat_old_linux_dirent __user *dirent; compat_ulong_t d_ino; if (buf->result) return false; buf->result = verify_dirent_name(name, namlen); if (buf->result) return false; d_ino = ino; if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { buf->result = -EOVERFLOW; return false; } buf->result++; dirent = buf->dirent; scoped_user_write_access_size(dirent, dirent_size(dirent, namlen + 1), efault) { unsafe_put_user(d_ino, &dirent->d_ino, efault); unsafe_put_user(offset, &dirent->d_offset, efault); unsafe_put_user(namlen, &dirent->d_namlen, efault); unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault); } return true; efault: buf->result = -EFAULT; return false; } COMPAT_SYSCALL_DEFINE3(old_readdir, unsigned int, fd, struct compat_old_linux_dirent __user *, dirent, unsigned int, count) { int error; CLASS(fd_pos, f)(fd); struct compat_readdir_callback buf = { .ctx.actor = compat_fillonedir, .ctx.count = 1, /* Hint to fs: just one entry. */ .dirent = dirent }; if (fd_empty(f)) return -EBADF; error = iterate_dir(fd_file(f), &buf.ctx); if (buf.result) error = buf.result; return error; } struct compat_linux_dirent { compat_ulong_t d_ino; compat_ulong_t d_off; unsigned short d_reclen; char d_name[]; }; struct compat_getdents_callback { struct dir_context ctx; struct compat_linux_dirent __user *current_dir; int prev_reclen; int error; }; static bool compat_filldir(struct dir_context *ctx, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { struct compat_linux_dirent __user *dirent, *prev; struct compat_getdents_callback *buf = container_of(ctx, struct compat_getdents_callback, ctx); compat_ulong_t d_ino; int reclen = ALIGN(dirent_size(dirent, namlen + 2), sizeof(compat_long_t)); int prev_reclen; unsigned int flags = d_type; BUILD_BUG_ON(FILLDIR_FLAG_NOINTR & S_DT_MASK); d_type &= S_DT_MASK; buf->error = verify_dirent_name(name, namlen); if (unlikely(buf->error)) return false; buf->error = -EINVAL; /* only used if we fail.. */ if (reclen > ctx->count) return false; d_ino = ino; if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { buf->error = -EOVERFLOW; return false; } prev_reclen = buf->prev_reclen; if (!(flags & FILLDIR_FLAG_NOINTR) && prev_reclen && signal_pending(current)) return false; dirent = buf->current_dir; prev = (void __user *) dirent - prev_reclen; scoped_user_write_access_size(prev, reclen + prev_reclen, efault) { unsafe_put_user(offset, &prev->d_off, efault); unsafe_put_user(d_ino, &dirent->d_ino, efault); unsafe_put_user(reclen, &dirent->d_reclen, efault); unsafe_put_user(d_type, (char __user *)dirent + reclen - 1, efault); unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault); } buf->prev_reclen = reclen; buf->current_dir = (void __user *)dirent + reclen; ctx->count -= reclen; return true; efault: buf->error = -EFAULT; return false; } COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd, struct compat_linux_dirent __user *, dirent, unsigned int, count) { CLASS(fd_pos, f)(fd); struct compat_getdents_callback buf = { .ctx.actor = compat_filldir, .ctx.count = count, .ctx.dt_flags_mask = FILLDIR_FLAG_NOINTR, .current_dir = dirent, }; int error; if (fd_empty(f)) return -EBADF; error = iterate_dir(fd_file(f), &buf.ctx); if (error >= 0) error = buf.error; if (buf.prev_reclen) { struct compat_linux_dirent __user * lastdirent; lastdirent = (void __user *)buf.current_dir - buf.prev_reclen; if (put_user(buf.ctx.pos, &lastdirent->d_off)) error = -EFAULT; else error = count - buf.ctx.count; } return error; } #endif
1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 /* * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs * * Pentium III FXSR, SSE support * Gareth Hughes <gareth@valinux.com>, May 2000 */ /* * Handle hardware traps and faults. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/context_tracking.h> #include <linux/interrupt.h> #include <linux/kallsyms.h> #include <linux/kmsan.h> #include <linux/spinlock.h> #include <linux/kprobes.h> #include <linux/uaccess.h> #include <linux/kdebug.h> #include <linux/kgdb.h> #include <linux/kernel.h> #include <linux/export.h> #include <linux/ptrace.h> #include <linux/uprobes.h> #include <linux/string.h> #include <linux/delay.h> #include <linux/errno.h> #include <linux/kexec.h> #include <linux/sched.h> #include <linux/sched/task_stack.h> #include <linux/static_call.h> #include <linux/timer.h> #include <linux/init.h> #include <linux/bug.h> #include <linux/nmi.h> #include <linux/mm.h> #include <linux/smp.h> #include <linux/cpu.h> #include <linux/io.h> #include <linux/hardirq.h> #include <linux/atomic.h> #include <linux/iommu.h> #include <linux/ubsan.h> #include <asm/stacktrace.h> #include <asm/processor.h> #include <asm/debugreg.h> #include <asm/realmode.h> #include <asm/text-patching.h> #include <asm/ftrace.h> #include <asm/traps.h> #include <asm/desc.h> #include <asm/fred.h> #include <asm/fpu/api.h> #include <asm/cpu.h> #include <asm/cpu_entry_area.h> #include <asm/mce.h> #include <asm/fixmap.h> #include <asm/mach_traps.h> #include <asm/alternative.h> #include <asm/fpu/xstate.h> #include <asm/vm86.h> #include <asm/umip.h> #include <asm/insn.h> #include <asm/insn-eval.h> #include <asm/vdso.h> #include <asm/tdx.h> #include <asm/cfi.h> #include <asm/msr.h> #include <asm/vsyscall.h> #ifdef CONFIG_X86_64 #include <asm/x86_init.h> #else #include <asm/processor-flags.h> #include <asm/setup.h> #endif #include <asm/proto.h> DECLARE_BITMAP(system_vectors, NR_VECTORS); __always_inline int is_valid_bugaddr(unsigned long addr) { if (addr < TASK_SIZE_MAX) return 0; /* * We got #UD, if the text isn't readable we'd have gotten * a different exception. */ return *(unsigned short *)addr == INSN_UD2; } /* * Check for UD1 or UD2, accounting for Address Size Override Prefixes. * If it's a UD1, further decode to determine its use: * * FineIBT: d6 udb * FineIBT: f0 75 f9 lock jne . - 6 * UBSan{0}: 67 0f b9 00 ud1 (%eax),%eax * UBSan{10}: 67 0f b9 40 10 ud1 0x10(%eax),%eax * static_call: 0f b9 cc ud1 %esp,%ecx * __WARN_trap: 67 48 0f b9 3a ud1 (%edx),%reg * * Notable, since __WARN_trap can use all registers, the distinction between * UD1 users is through R/M. */ __always_inline int decode_bug(unsigned long addr, s32 *imm, int *len) { unsigned long start = addr; u8 v, reg, rm, rex = 0; int type = BUG_UD1; bool lock = false; if (addr < TASK_SIZE_MAX) return BUG_NONE; for (;;) { v = *(u8 *)(addr++); if (v == INSN_ASOP) continue; if (v == INSN_LOCK) { lock = true; continue; } if ((v & 0xf0) == 0x40) { rex = v; continue; } break; } switch (v) { case 0x70 ... 0x7f: /* Jcc.d8 */ addr += 1; /* d8 */ *len = addr - start; WARN_ON_ONCE(!lock); return BUG_LOCK; case 0xd6: *len = addr - start; return BUG_UDB; case OPCODE_ESCAPE: break; default: return BUG_NONE; } v = *(u8 *)(addr++); if (v == SECOND_BYTE_OPCODE_UD2) { *len = addr - start; return BUG_UD2; } if (v != SECOND_BYTE_OPCODE_UD1) return BUG_NONE; *imm = 0; v = *(u8 *)(addr++); /* ModRM */ if (X86_MODRM_MOD(v) != 3 && X86_MODRM_RM(v) == 4) addr++; /* SIB */ reg = X86_MODRM_REG(v) + 8*!!X86_REX_R(rex); rm = X86_MODRM_RM(v) + 8*!!X86_REX_B(rex); /* Decode immediate, if present */ switch (X86_MODRM_MOD(v)) { case 0: if (X86_MODRM_RM(v) == 5) addr += 4; /* RIP + disp32 */ if (rm == 0) /* (%eax) */ type = BUG_UD1_UBSAN; if (rm == 2) { /* (%edx) */ *imm = reg; type = BUG_UD1_WARN; } break; case 1: *imm = *(s8 *)addr; addr += 1; if (rm == 0) /* (%eax) */ type = BUG_UD1_UBSAN; break; case 2: *imm = *(s32 *)addr; addr += 4; if (rm == 0) /* (%eax) */ type = BUG_UD1_UBSAN; break; case 3: break; } /* record instruction length */ *len = addr - start; return type; } static inline unsigned long pt_regs_val(struct pt_regs *regs, int nr) { int offset = pt_regs_offset(regs, nr); if (WARN_ON_ONCE(offset < -0)) return 0; return *((unsigned long *)((void *)regs + offset)); } #ifdef HAVE_ARCH_BUG_FORMAT_ARGS DEFINE_STATIC_CALL(WARN_trap, __WARN_trap); EXPORT_STATIC_CALL_TRAMP(WARN_trap); /* * Create a va_list from an exception context. */ void *__warn_args(struct arch_va_list *args, struct pt_regs *regs) { /* * Register save area; populate with function call argument registers */ args->regs[0] = regs->di; args->regs[1] = regs->si; args->regs[2] = regs->dx; args->regs[3] = regs->cx; args->regs[4] = regs->r8; args->regs[5] = regs->r9; /* * From the ABI document: * * @gp_offset - the element holds the offset in bytes from * reg_save_area to the place where the next available general purpose * argument register is saved. In case all argument registers have * been exhausted, it is set to the value 48 (6*8). * * @fp_offset - the element holds the offset in bytes from * reg_save_area to the place where the next available floating point * argument is saved. In case all argument registers have been * exhausted, it is set to the value 176 (6*8 + 8*16) * * @overflow_arg_area - this pointer is used to fetch arguments passed * on the stack. It is initialized with the address of the first * argument passed on the stack, if any, and then always updated to * point to the start of the next argument on the stack. * * @reg_save_area - the element points to the start of the register * save area. * * Notably the vararg starts with the second argument and there are no * floating point arguments in the kernel. */ args->args.gp_offset = 1*8; args->args.fp_offset = 6*8 + 8*16; args->args.reg_save_area = &args->regs; args->args.overflow_arg_area = (void *)regs->sp; /* * If the exception came from __WARN_trap, there is a return * address on the stack, skip that. This is why any __WARN_trap() * caller must inhibit tail-call optimization. */ if ((void *)regs->ip == &__WARN_trap) args->args.overflow_arg_area += 8; return &args->args; } #endif /* HAVE_ARCH_BUG_FORMAT */ static nokprobe_inline int do_trap_no_signal(struct task_struct *tsk, int trapnr, const char *str, struct pt_regs *regs, long error_code) { if (v8086_mode(regs)) { /* * Traps 0, 1, 3, 4, and 5 should be forwarded to vm86. * On nmi (interrupt 2), do_trap should not be called. */ if (trapnr < X86_TRAP_UD) { if (!handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, trapnr)) return 0; } } else if (!user_mode(regs)) { if (fixup_exception(regs, trapnr, error_code, 0)) return 0; tsk->thread.error_code = error_code; tsk->thread.trap_nr = trapnr; die(str, regs, error_code); } else { if (fixup_vdso_exception(regs, trapnr, error_code, 0)) return 0; } /* * We want error_code and trap_nr set for userspace faults and * kernelspace faults which result in die(), but not * kernelspace faults which are fixed up. die() gives the * process no chance to handle the signal and notice the * kernel fault information, so that won't result in polluting * the information about previously queued, but not yet * delivered, faults. See also exc_general_protection below. */ tsk->thread.error_code = error_code; tsk->thread.trap_nr = trapnr; return -1; } static void show_signal(struct task_struct *tsk, int signr, const char *type, const char *desc, struct pt_regs *regs, long error_code) { if (show_unhandled_signals && unhandled_signal(tsk, signr) && printk_ratelimit()) { pr_info("%s[%d] %s%s ip:%lx sp:%lx error:%lx", tsk->comm, task_pid_nr(tsk), type, desc, regs->ip, regs->sp, error_code); print_vma_addr(KERN_CONT " in ", regs->ip); pr_cont("\n"); } } static void do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, long error_code, int sicode, void __user *addr) { struct task_struct *tsk = current; if (!do_trap_no_signal(tsk, trapnr, str, regs, error_code)) return; show_signal(tsk, signr, "trap ", str, regs, error_code); if (!sicode) force_sig(signr); else force_sig_fault(signr, sicode, addr); } NOKPROBE_SYMBOL(do_trap); static void do_error_trap(struct pt_regs *regs, long error_code, char *str, unsigned long trapnr, int signr, int sicode, void __user *addr) { RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) != NOTIFY_STOP) { cond_local_irq_enable(regs); do_trap(trapnr, signr, str, regs, error_code, sicode, addr); cond_local_irq_disable(regs); } } /* * Posix requires to provide the address of the faulting instruction for * SIGILL (#UD) and SIGFPE (#DE) in the si_addr member of siginfo_t. * * This address is usually regs->ip, but when an uprobe moved the code out * of line then regs->ip points to the XOL code which would confuse * anything which analyzes the fault address vs. the unmodified binary. If * a trap happened in XOL code then uprobe maps regs->ip back to the * original instruction address. */ static __always_inline void __user *error_get_trap_addr(struct pt_regs *regs) { return (void __user *)uprobe_get_trap_addr(regs); } DEFINE_IDTENTRY(exc_divide_error) { do_error_trap(regs, 0, "divide error", X86_TRAP_DE, SIGFPE, FPE_INTDIV, error_get_trap_addr(regs)); } DEFINE_IDTENTRY(exc_overflow) { do_error_trap(regs, 0, "overflow", X86_TRAP_OF, SIGSEGV, 0, NULL); } #ifdef CONFIG_X86_F00F_BUG void handle_invalid_op(struct pt_regs *regs) #else static inline void handle_invalid_op(struct pt_regs *regs) #endif { do_error_trap(regs, 0, "invalid opcode", X86_TRAP_UD, SIGILL, ILL_ILLOPN, error_get_trap_addr(regs)); } noinstr bool handle_bug(struct pt_regs *regs) { unsigned long addr = regs->ip; bool handled = false; int ud_type, ud_len; s32 ud_imm; ud_type = decode_bug(addr, &ud_imm, &ud_len); if (ud_type == BUG_NONE) return handled; /* * All lies, just get the WARN/BUG out. */ instrumentation_begin(); /* * Normally @regs are unpoisoned by irqentry_enter(), but handle_bug() * is a rare case that uses @regs without passing them to * irqentry_enter(). */ kmsan_unpoison_entry_regs(regs); /* * Since we're emulating a CALL with exceptions, restore the interrupt * state to what it was at the exception site. */ if (regs->flags & X86_EFLAGS_IF) raw_local_irq_enable(); switch (ud_type) { case BUG_UD1_WARN: if (report_bug_entry((void *)pt_regs_val(regs, ud_imm), regs) == BUG_TRAP_TYPE_WARN) handled = true; break; case BUG_UD2: if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN) { handled = true; break; } fallthrough; case BUG_UDB: case BUG_LOCK: if (handle_cfi_failure(regs) == BUG_TRAP_TYPE_WARN) { handled = true; break; } break; case BUG_UD1_UBSAN: if (IS_ENABLED(CONFIG_UBSAN_TRAP)) { pr_crit("%s at %pS\n", report_ubsan_failure(ud_imm), (void *)regs->ip); } break; default: break; } /* * When continuing, and regs->ip hasn't changed, move it to the next * instruction. When not continuing execution, restore the instruction * pointer. */ if (handled) { if (regs->ip == addr) regs->ip += ud_len; } else { regs->ip = addr; } if (regs->flags & X86_EFLAGS_IF) raw_local_irq_disable(); instrumentation_end(); return handled; } DEFINE_IDTENTRY_RAW(exc_invalid_op) { irqentry_state_t state; /* * We use UD2 as a short encoding for 'CALL __WARN', as such * handle it before exception entry to avoid recursive WARN * in case exception entry is the one triggering WARNs. */ if (!user_mode(regs) && handle_bug(regs)) return; state = irqentry_enter(regs); instrumentation_begin(); handle_invalid_op(regs); instrumentation_end(); irqentry_exit(regs, state); } DEFINE_IDTENTRY(exc_coproc_segment_overrun) { do_error_trap(regs, 0, "coprocessor segment overrun", X86_TRAP_OLD_MF, SIGFPE, 0, NULL); } DEFINE_IDTENTRY_ERRORCODE(exc_invalid_tss) { do_error_trap(regs, error_code, "invalid TSS", X86_TRAP_TS, SIGSEGV, 0, NULL); } DEFINE_IDTENTRY_ERRORCODE(exc_segment_not_present) { do_error_trap(regs, error_code, "segment not present", X86_TRAP_NP, SIGBUS, 0, NULL); } DEFINE_IDTENTRY_ERRORCODE(exc_stack_segment) { do_error_trap(regs, error_code, "stack segment", X86_TRAP_SS, SIGBUS, 0, NULL); } DEFINE_IDTENTRY_ERRORCODE(exc_alignment_check) { char *str = "alignment check"; if (notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_AC, SIGBUS) == NOTIFY_STOP) return; if (!user_mode(regs)) die("Split lock detected\n", regs, error_code); local_irq_enable(); if (handle_user_split_lock(regs, error_code)) goto out; do_trap(X86_TRAP_AC, SIGBUS, "alignment check", regs, error_code, BUS_ADRALN, NULL); out: local_irq_disable(); } #ifdef CONFIG_VMAP_STACK __visible void __noreturn handle_stack_overflow(struct pt_regs *regs, unsigned long fault_address, struct stack_info *info) { const char *name = stack_type_name(info->type); printk(KERN_EMERG "BUG: %s stack guard page was hit at %px (stack is %px..%px)\n", name, (void *)fault_address, info->begin, info->end); die("stack guard page", regs, 0); /* Be absolutely certain we don't return. */ panic("%s stack guard hit", name); } #endif /* * Prevent the compiler and/or objtool from marking the !CONFIG_X86_ESPFIX64 * version of exc_double_fault() as noreturn. Otherwise the noreturn mismatch * between configs triggers objtool warnings. * * This is a temporary hack until we have compiler or plugin support for * annotating noreturns. */ #ifdef CONFIG_X86_ESPFIX64 #define always_true() true #else bool always_true(void); bool __weak always_true(void) { return true; } #endif /* * Runs on an IST stack for x86_64 and on a special task stack for x86_32. * * On x86_64, this is more or less a normal kernel entry. Notwithstanding the * SDM's warnings about double faults being unrecoverable, returning works as * expected. Presumably what the SDM actually means is that the CPU may get * the register state wrong on entry, so returning could be a bad idea. * * Various CPU engineers have promised that double faults due to an IRET fault * while the stack is read-only are, in fact, recoverable. * * On x86_32, this is entered through a task gate, and regs are synthesized * from the TSS. Returning is, in principle, okay, but changes to regs will * be lost. If, for some reason, we need to return to a context with modified * regs, the shim code could be adjusted to synchronize the registers. * * The 32bit #DF shim provides CR2 already as an argument. On 64bit it needs * to be read before doing anything else. */ DEFINE_IDTENTRY_DF(exc_double_fault) { static const char str[] = "double fault"; struct task_struct *tsk = current; #ifdef CONFIG_VMAP_STACK unsigned long address = read_cr2(); struct stack_info info; #endif #ifdef CONFIG_X86_ESPFIX64 extern unsigned char native_irq_return_iret[]; /* * If IRET takes a non-IST fault on the espfix64 stack, then we * end up promoting it to a doublefault. In that case, take * advantage of the fact that we're not using the normal (TSS.sp0) * stack right now. We can write a fake #GP(0) frame at TSS.sp0 * and then modify our own IRET frame so that, when we return, * we land directly at the #GP(0) vector with the stack already * set up according to its expectations. * * The net result is that our #GP handler will think that we * entered from usermode with the bad user context. * * No need for nmi_enter() here because we don't use RCU. */ if (((long)regs->sp >> P4D_SHIFT) == ESPFIX_PGD_ENTRY && regs->cs == __KERNEL_CS && regs->ip == (unsigned long)native_irq_return_iret) { struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1; unsigned long *p = (unsigned long *)regs->sp; /* * regs->sp points to the failing IRET frame on the * ESPFIX64 stack. Copy it to the entry stack. This fills * in gpregs->ss through gpregs->ip. * */ gpregs->ip = p[0]; gpregs->cs = p[1]; gpregs->flags = p[2]; gpregs->sp = p[3]; gpregs->ss = p[4]; gpregs->orig_ax = 0; /* Missing (lost) #GP error code */ /* * Adjust our frame so that we return straight to the #GP * vector with the expected RSP value. This is safe because * we won't enable interrupts or schedule before we invoke * general_protection, so nothing will clobber the stack * frame we just set up. * * We will enter general_protection with kernel GSBASE, * which is what the stub expects, given that the faulting * RIP will be the IRET instruction. */ regs->ip = (unsigned long)asm_exc_general_protection; regs->sp = (unsigned long)&gpregs->orig_ax; return; } #endif irqentry_nmi_enter(regs); instrumentation_begin(); notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV); tsk->thread.error_code = error_code; tsk->thread.trap_nr = X86_TRAP_DF; #ifdef CONFIG_VMAP_STACK /* * If we overflow the stack into a guard page, the CPU will fail * to deliver #PF and will send #DF instead. Similarly, if we * take any non-IST exception while too close to the bottom of * the stack, the processor will get a page fault while * delivering the exception and will generate a double fault. * * According to the SDM (footnote in 6.15 under "Interrupt 14 - * Page-Fault Exception (#PF): * * Processors update CR2 whenever a page fault is detected. If a * second page fault occurs while an earlier page fault is being * delivered, the faulting linear address of the second fault will * overwrite the contents of CR2 (replacing the previous * address). These updates to CR2 occur even if the page fault * results in a double fault or occurs during the delivery of a * double fault. * * The logic below has a small possibility of incorrectly diagnosing * some errors as stack overflows. For example, if the IDT or GDT * gets corrupted such that #GP delivery fails due to a bad descriptor * causing #GP and we hit this condition while CR2 coincidentally * points to the stack guard page, we'll think we overflowed the * stack. Given that we're going to panic one way or another * if this happens, this isn't necessarily worth fixing. * * If necessary, we could improve the test by only diagnosing * a stack overflow if the saved RSP points within 47 bytes of * the bottom of the stack: if RSP == tsk_stack + 48 and we * take an exception, the stack is already aligned and there * will be enough room SS, RSP, RFLAGS, CS, RIP, and a * possible error code, so a stack overflow would *not* double * fault. With any less space left, exception delivery could * fail, and, as a practical matter, we've overflowed the * stack even if the actual trigger for the double fault was * something else. */ if (get_stack_guard_info((void *)address, &info)) handle_stack_overflow(regs, address, &info); #endif pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code); die("double fault", regs, error_code); if (always_true()) panic("Machine halted."); instrumentation_end(); } DEFINE_IDTENTRY(exc_bounds) { if (notify_die(DIE_TRAP, "bounds", regs, 0, X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP) return; cond_local_irq_enable(regs); if (!user_mode(regs)) die("bounds", regs, 0); do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, 0, 0, NULL); cond_local_irq_disable(regs); } enum kernel_gp_hint { GP_NO_HINT, GP_NON_CANONICAL, GP_CANONICAL, GP_LASS_VIOLATION, GP_NULL_POINTER, }; static const char * const kernel_gp_hint_help[] = { [GP_NON_CANONICAL] = "probably for non-canonical address", [GP_CANONICAL] = "maybe for address", [GP_LASS_VIOLATION] = "probably LASS violation for address", [GP_NULL_POINTER] = "kernel NULL pointer dereference", }; /* * When an uncaught #GP occurs, try to determine the memory address accessed by * the instruction and return that address to the caller. Also, try to figure * out whether any part of the access to that address was non-canonical or * across privilege levels. */ static enum kernel_gp_hint get_kernel_gp_address(struct pt_regs *regs, unsigned long *addr) { u8 insn_buf[MAX_INSN_SIZE]; struct insn insn; int ret; if (copy_from_kernel_nofault(insn_buf, (void *)regs->ip, MAX_INSN_SIZE)) return GP_NO_HINT; ret = insn_decode_kernel(&insn, insn_buf); if (ret < 0) return GP_NO_HINT; *addr = (unsigned long)insn_get_addr_ref(&insn, regs); if (*addr == -1UL) return GP_NO_HINT; #ifdef CONFIG_X86_64 /* Operand is in the kernel half */ if (*addr >= ~__VIRTUAL_MASK) return GP_CANONICAL; /* The last byte of the operand is not in the user canonical half */ if (*addr + insn.opnd_bytes - 1 > __VIRTUAL_MASK) return GP_NON_CANONICAL; /* * A NULL pointer dereference usually causes a #PF. However, it * can result in a #GP when LASS is active. Provide the same * hint in the rare case that the condition is hit without LASS. */ if (*addr < PAGE_SIZE) return GP_NULL_POINTER; /* * Assume that LASS caused the exception, because the address is * canonical and in the user half. */ if (cpu_feature_enabled(X86_FEATURE_LASS)) return GP_LASS_VIOLATION; #endif return GP_CANONICAL; } #define GPFSTR "general protection fault" static bool fixup_iopl_exception(struct pt_regs *regs) { struct thread_struct *t = &current->thread; unsigned char byte; unsigned long ip; if (!IS_ENABLED(CONFIG_X86_IOPL_IOPERM) || t->iopl_emul != 3) return false; if (insn_get_effective_ip(regs, &ip)) return false; if (get_user(byte, (const char __user *)ip)) return false; if (byte != 0xfa && byte != 0xfb) return false; if (!t->iopl_warn && printk_ratelimit()) { pr_err("%s[%d] attempts to use CLI/STI, pretending it's a NOP, ip:%lx", current->comm, task_pid_nr(current), ip); print_vma_addr(KERN_CONT " in ", ip); pr_cont("\n"); t->iopl_warn = 1; } regs->ip += 1; return true; } /* * The unprivileged ENQCMD instruction generates #GPs if the * IA32_PASID MSR has not been populated. If possible, populate * the MSR from a PASID previously allocated to the mm. */ static bool try_fixup_enqcmd_gp(void) { #ifdef CONFIG_ARCH_HAS_CPU_PASID u32 pasid; /* * MSR_IA32_PASID is managed using XSAVE. Directly * writing to the MSR is only possible when fpregs * are valid and the fpstate is not. This is * guaranteed when handling a userspace exception * in *before* interrupts are re-enabled. */ lockdep_assert_irqs_disabled(); /* * Hardware without ENQCMD will not generate * #GPs that can be fixed up here. */ if (!cpu_feature_enabled(X86_FEATURE_ENQCMD)) return false; /* * If the mm has not been allocated a * PASID, the #GP can not be fixed up. */ if (!mm_valid_pasid(current->mm)) return false; pasid = mm_get_enqcmd_pasid(current->mm); /* * Did this thread already have its PASID activated? * If so, the #GP must be from something else. */ if (current->pasid_activated) return false; wrmsrq(MSR_IA32_PASID, pasid | MSR_IA32_PASID_VALID); current->pasid_activated = 1; return true; #else return false; #endif } static bool gp_try_fixup_and_notify(struct pt_regs *regs, int trapnr, unsigned long error_code, const char *str, unsigned long address) { if (fixup_exception(regs, trapnr, error_code, address)) return true; current->thread.error_code = error_code; current->thread.trap_nr = trapnr; /* * To be potentially processing a kprobe fault and to trust the result * from kprobe_running(), we have to be non-preemptible. */ if (!preemptible() && kprobe_running() && kprobe_fault_handler(regs, trapnr)) return true; return notify_die(DIE_GPF, str, regs, error_code, trapnr, SIGSEGV) == NOTIFY_STOP; } static void gp_user_force_sig_segv(struct pt_regs *regs, int trapnr, unsigned long error_code, const char *str) { current->thread.error_code = error_code; current->thread.trap_nr = trapnr; show_signal(current, SIGSEGV, "", str, regs, error_code); force_sig(SIGSEGV); } DEFINE_IDTENTRY_ERRORCODE(exc_general_protection) { char desc[sizeof(GPFSTR) + 50 + 2*sizeof(unsigned long) + 1] = GPFSTR; enum kernel_gp_hint hint = GP_NO_HINT; unsigned long gp_addr; if (user_mode(regs) && try_fixup_enqcmd_gp()) return; cond_local_irq_enable(regs); if (v8086_mode(regs)) { local_irq_enable(); handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); local_irq_disable(); return; } if (user_mode(regs)) { if (fixup_iopl_exception(regs)) goto exit; if (fixup_vdso_exception(regs, X86_TRAP_GP, error_code, 0)) goto exit; if (fixup_umip_exception(regs)) goto exit; if (emulate_vsyscall_gp(regs)) goto exit; gp_user_force_sig_segv(regs, X86_TRAP_GP, error_code, desc); goto exit; } if (gp_try_fixup_and_notify(regs, X86_TRAP_GP, error_code, desc, 0)) goto exit; if (error_code) snprintf(desc, sizeof(desc), "segment-related " GPFSTR); else hint = get_kernel_gp_address(regs, &gp_addr); if (hint != GP_NO_HINT) snprintf(desc, sizeof(desc), GPFSTR ", %s 0x%lx", kernel_gp_hint_help[hint], gp_addr); /* * KASAN is interested only in the non-canonical case, clear it * otherwise. */ if (hint != GP_NON_CANONICAL) gp_addr = 0; die_addr(desc, regs, error_code, gp_addr); exit: cond_local_irq_disable(regs); } static bool do_int3(struct pt_regs *regs) { int res; #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP if (kgdb_ll_trap(DIE_INT3, "int3", regs, 0, X86_TRAP_BP, SIGTRAP) == NOTIFY_STOP) return true; #endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */ #ifdef CONFIG_KPROBES if (kprobe_int3_handler(regs)) return true; #endif res = notify_die(DIE_INT3, "int3", regs, 0, X86_TRAP_BP, SIGTRAP); return res == NOTIFY_STOP; } NOKPROBE_SYMBOL(do_int3); static void do_int3_user(struct pt_regs *regs) { if (do_int3(regs)) return; cond_local_irq_enable(regs); do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, 0, 0, NULL); cond_local_irq_disable(regs); } DEFINE_IDTENTRY_RAW(exc_int3) { /* * smp_text_poke_int3_handler() is completely self contained code; it does (and * must) *NOT* call out to anything, lest it hits upon yet another * INT3. */ if (smp_text_poke_int3_handler(regs)) return; /* * irqentry_enter_from_user_mode() uses static_branch_{,un}likely() * and therefore can trigger INT3, hence smp_text_poke_int3_handler() must * be done before. If the entry came from kernel mode, then use * nmi_enter() because the INT3 could have been hit in any context * including NMI. */ if (user_mode(regs)) { irqentry_enter_from_user_mode(regs); instrumentation_begin(); do_int3_user(regs); instrumentation_end(); irqentry_exit_to_user_mode(regs); } else { irqentry_state_t irq_state = irqentry_nmi_enter(regs); instrumentation_begin(); if (!do_int3(regs)) die("int3", regs, 0); instrumentation_end(); irqentry_nmi_exit(regs, irq_state); } } #ifdef CONFIG_X86_64 /* * Help handler running on a per-cpu (IST or entry trampoline) stack * to switch to the normal thread stack if the interrupted code was in * user mode. The actual stack switch is done in entry_64.S */ asmlinkage __visible noinstr struct pt_regs *sync_regs(struct pt_regs *eregs) { struct pt_regs *regs = (struct pt_regs *)current_top_of_stack() - 1; if (regs != eregs) *regs = *eregs; return regs; } #ifdef CONFIG_AMD_MEM_ENCRYPT asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *regs) { unsigned long sp, *stack; struct stack_info info; struct pt_regs *regs_ret; /* * In the SYSCALL entry path the RSP value comes from user-space - don't * trust it and switch to the current kernel stack */ if (ip_within_syscall_gap(regs)) { sp = current_top_of_stack(); goto sync; } /* * From here on the RSP value is trusted. Now check whether entry * happened from a safe stack. Not safe are the entry or unknown stacks, * use the fall-back stack instead in this case. */ sp = regs->sp; stack = (unsigned long *)sp; if (!get_stack_info_noinstr(stack, current, &info) || info.type == STACK_TYPE_ENTRY || info.type > STACK_TYPE_EXCEPTION_LAST) sp = __this_cpu_ist_top_va(VC2); sync: /* * Found a safe stack - switch to it as if the entry didn't happen via * IST stack. The code below only copies pt_regs, the real switch happens * in assembly code. */ sp = ALIGN_DOWN(sp, 8) - sizeof(*regs_ret); regs_ret = (struct pt_regs *)sp; *regs_ret = *regs; return regs_ret; } #endif asmlinkage __visible noinstr struct pt_regs *fixup_bad_iret(struct pt_regs *bad_regs) { struct pt_regs tmp, *new_stack; /* * This is called from entry_64.S early in handling a fault * caused by a bad iret to user mode. To handle the fault * correctly, we want to move our stack frame to where it would * be had we entered directly on the entry stack (rather than * just below the IRET frame) and we want to pretend that the * exception came from the IRET target. */ new_stack = (struct pt_regs *)__this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1; /* Copy the IRET target to the temporary storage. */ __memcpy(&tmp.ip, (void *)bad_regs->sp, 5*8); /* Copy the remainder of the stack from the current stack. */ __memcpy(&tmp, bad_regs, offsetof(struct pt_regs, ip)); /* Update the entry stack */ __memcpy(new_stack, &tmp, sizeof(tmp)); BUG_ON(!user_mode(new_stack)); return new_stack; } #endif static bool is_sysenter_singlestep(struct pt_regs *regs) { /* * We don't try for precision here. If we're anywhere in the region of * code that can be single-stepped in the SYSENTER entry path, then * assume that this is a useless single-step trap due to SYSENTER * being invoked with TF set. (We don't know in advance exactly * which instructions will be hit because BTF could plausibly * be set.) */ #ifdef CONFIG_X86_32 return (regs->ip - (unsigned long)__begin_SYSENTER_singlestep_region) < (unsigned long)__end_SYSENTER_singlestep_region - (unsigned long)__begin_SYSENTER_singlestep_region; #elif defined(CONFIG_IA32_EMULATION) return (regs->ip - (unsigned long)entry_SYSENTER_compat) < (unsigned long)__end_entry_SYSENTER_compat - (unsigned long)entry_SYSENTER_compat; #else return false; #endif } static __always_inline unsigned long debug_read_reset_dr6(void) { unsigned long dr6; get_debugreg(dr6, 6); dr6 ^= DR6_RESERVED; /* Flip to positive polarity */ /* * The Intel SDM says: * * Certain debug exceptions may clear bits 0-3 of DR6. * * BLD induced #DB clears DR6.BLD and any other debug * exception doesn't modify DR6.BLD. * * RTM induced #DB clears DR6.RTM and any other debug * exception sets DR6.RTM. * * To avoid confusion in identifying debug exceptions, * debug handlers should set DR6.BLD and DR6.RTM, and * clear other DR6 bits before returning. * * Keep it simple: write DR6 with its architectural reset * value 0xFFFF0FF0, defined as DR6_RESERVED, immediately. */ set_debugreg(DR6_RESERVED, 6); return dr6; } /* * Our handling of the processor debug registers is non-trivial. * We do not clear them on entry and exit from the kernel. Therefore * it is possible to get a watchpoint trap here from inside the kernel. * However, the code in ./ptrace.c has ensured that the user can * only set watchpoints on userspace addresses. Therefore the in-kernel * watchpoint trap can only occur in code which is reading/writing * from user space. Such code must not hold kernel locks (since it * can equally take a page fault), therefore it is safe to call * force_sig_info even though that claims and releases locks. * * Code in ./signal.c ensures that the debug control register * is restored before we deliver any signal, and therefore that * user code runs with the correct debug control register even though * we clear it here. * * Being careful here means that we don't have to be as careful in a * lot of more complicated places (task switching can be a bit lazy * about restoring all the debug state, and ptrace doesn't have to * find every occurrence of the TF bit that could be saved away even * by user code) * * May run on IST stack. */ static bool notify_debug(struct pt_regs *regs, unsigned long *dr6) { /* * Notifiers will clear bits in @dr6 to indicate the event has been * consumed - hw_breakpoint_handler(), single_stop_cont(). * * Notifiers will set bits in @virtual_dr6 to indicate the desire * for signals - ptrace_triggered(), kgdb_hw_overflow_handler(). */ if (notify_die(DIE_DEBUG, "debug", regs, (long)dr6, 0, SIGTRAP) == NOTIFY_STOP) return true; return false; } static noinstr void exc_debug_kernel(struct pt_regs *regs, unsigned long dr6) { /* * Disable breakpoints during exception handling; recursive exceptions * are exceedingly 'fun'. * * Since this function is NOKPROBE, and that also applies to * HW_BREAKPOINT_X, we can't hit a breakpoint before this (XXX except a * HW_BREAKPOINT_W on our stack) * * Entry text is excluded for HW_BP_X and cpu_entry_area, which * includes the entry stack is excluded for everything. * * For FRED, nested #DB should just work fine. But when a watchpoint or * breakpoint is set in the code path which is executed by #DB handler, * it results in an endless recursion and stack overflow. Thus we stay * with the IDT approach, i.e., save DR7 and disable #DB. */ unsigned long dr7 = local_db_save(); irqentry_state_t irq_state = irqentry_nmi_enter(regs); instrumentation_begin(); /* * If something gets miswired and we end up here for a user mode * #DB, we will malfunction. */ WARN_ON_ONCE(user_mode(regs)); if (test_thread_flag(TIF_BLOCKSTEP)) { /* * The SDM says "The processor clears the BTF flag when it * generates a debug exception." but PTRACE_BLOCKSTEP requested * it for userspace, but we just took a kernel #DB, so re-set * BTF. */ unsigned long debugctl; rdmsrq(MSR_IA32_DEBUGCTLMSR, debugctl); debugctl |= DEBUGCTLMSR_BTF; wrmsrq(MSR_IA32_DEBUGCTLMSR, debugctl); } /* * Catch SYSENTER with TF set and clear DR_STEP. If this hit a * watchpoint at the same time then that will still be handled. */ if (!cpu_feature_enabled(X86_FEATURE_FRED) && (dr6 & DR_STEP) && is_sysenter_singlestep(regs)) dr6 &= ~DR_STEP; /* * The kernel doesn't use INT1 */ if (!dr6) goto out; if (notify_debug(regs, &dr6)) goto out; /* * The kernel doesn't use TF single-step outside of: * * - Kprobes, consumed through kprobe_debug_handler() * - KGDB, consumed through notify_debug() * * So if we get here with DR_STEP set, something is wonky. * * A known way to trigger this is through QEMU's GDB stub, * which leaks #DB into the guest and causes IST recursion. */ if (WARN_ON_ONCE(dr6 & DR_STEP)) regs->flags &= ~X86_EFLAGS_TF; out: instrumentation_end(); irqentry_nmi_exit(regs, irq_state); local_db_restore(dr7); } static noinstr void exc_debug_user(struct pt_regs *regs, unsigned long dr6) { bool icebp; /* * If something gets miswired and we end up here for a kernel mode * #DB, we will malfunction. */ WARN_ON_ONCE(!user_mode(regs)); /* * NB: We can't easily clear DR7 here because * irqentry_exit_to_usermode() can invoke ptrace, schedule, access * user memory, etc. This means that a recursive #DB is possible. If * this happens, that #DB will hit exc_debug_kernel() and clear DR7. * Since we're not on the IST stack right now, everything will be * fine. */ irqentry_enter_from_user_mode(regs); instrumentation_begin(); /* * Start the virtual/ptrace DR6 value with just the DR_STEP mask * of the real DR6. ptrace_triggered() will set the DR_TRAPn bits. * * Userspace expects DR_STEP to be visible in ptrace_get_debugreg(6) * even if it is not the result of PTRACE_SINGLESTEP. */ current->thread.virtual_dr6 = (dr6 & DR_STEP); /* * The SDM says "The processor clears the BTF flag when it * generates a debug exception." Clear TIF_BLOCKSTEP to keep * TIF_BLOCKSTEP in sync with the hardware BTF flag. */ clear_thread_flag(TIF_BLOCKSTEP); /* * If dr6 has no reason to give us about the origin of this trap, * then it's very likely the result of an icebp/int01 trap. * User wants a sigtrap for that. */ icebp = !dr6; if (notify_debug(regs, &dr6)) goto out; /* It's safe to allow irq's after DR6 has been saved */ local_irq_enable(); if (v8086_mode(regs)) { handle_vm86_trap((struct kernel_vm86_regs *)regs, 0, X86_TRAP_DB); goto out_irq; } /* #DB for bus lock can only be triggered from userspace. */ if (dr6 & DR_BUS_LOCK) handle_bus_lock(regs); /* Add the virtual_dr6 bits for signals. */ dr6 |= current->thread.virtual_dr6; if (dr6 & (DR_STEP | DR_TRAP_BITS) || icebp) send_sigtrap(regs, 0, get_si_code(dr6)); out_irq: local_irq_disable(); out: instrumentation_end(); irqentry_exit_to_user_mode(regs); } #ifdef CONFIG_X86_64 /* IST stack entry */ DEFINE_IDTENTRY_DEBUG(exc_debug) { exc_debug_kernel(regs, debug_read_reset_dr6()); } /* User entry, runs on regular task stack */ DEFINE_IDTENTRY_DEBUG_USER(exc_debug) { exc_debug_user(regs, debug_read_reset_dr6()); } #ifdef CONFIG_X86_FRED /* * When occurred on different ring level, i.e., from user or kernel * context, #DB needs to be handled on different stack: User #DB on * current task stack, while kernel #DB on a dedicated stack. * * This is exactly how FRED event delivery invokes an exception * handler: ring 3 event on level 0 stack, i.e., current task stack; * ring 0 event on the #DB dedicated stack specified in the * IA32_FRED_STKLVLS MSR. So unlike IDT, the FRED debug exception * entry stub doesn't do stack switch. */ DEFINE_FREDENTRY_DEBUG(exc_debug) { /* * FRED #DB stores DR6 on the stack in the format which * debug_read_reset_dr6() returns for the IDT entry points. */ unsigned long dr6 = fred_event_data(regs); if (user_mode(regs)) exc_debug_user(regs, dr6); else exc_debug_kernel(regs, dr6); } #endif /* CONFIG_X86_FRED */ #else /* 32 bit does not have separate entry points. */ DEFINE_IDTENTRY_RAW(exc_debug) { unsigned long dr6 = debug_read_reset_dr6(); if (user_mode(regs)) exc_debug_user(regs, dr6); else exc_debug_kernel(regs, dr6); } #endif /* * Note that we play around with the 'TS' bit in an attempt to get * the correct behaviour even in the presence of the asynchronous * IRQ13 behaviour */ static void math_error(struct pt_regs *regs, int trapnr) { struct task_struct *task = current; struct fpu *fpu = x86_task_fpu(task); int si_code; char *str = (trapnr == X86_TRAP_MF) ? "fpu exception" : "simd exception"; cond_local_irq_enable(regs); if (!user_mode(regs)) { if (fixup_exception(regs, trapnr, 0, 0)) goto exit; task->thread.error_code = 0; task->thread.trap_nr = trapnr; if (notify_die(DIE_TRAP, str, regs, 0, trapnr, SIGFPE) != NOTIFY_STOP) die(str, regs, 0); goto exit; } /* * Synchronize the FPU register state to the memory register state * if necessary. This allows the exception handler to inspect it. */ fpu_sync_fpstate(fpu); task->thread.trap_nr = trapnr; task->thread.error_code = 0; si_code = fpu__exception_code(fpu, trapnr); /* Retry when we get spurious exceptions: */ if (!si_code) goto exit; if (fixup_vdso_exception(regs, trapnr, 0, 0)) goto exit; force_sig_fault(SIGFPE, si_code, (void __user *)uprobe_get_trap_addr(regs)); exit: cond_local_irq_disable(regs); } DEFINE_IDTENTRY(exc_coprocessor_error) { math_error(regs, X86_TRAP_MF); } DEFINE_IDTENTRY(exc_simd_coprocessor_error) { if (IS_ENABLED(CONFIG_X86_INVD_BUG)) { /* AMD 486 bug: INVD in CPL 0 raises #XF instead of #GP */ if (!static_cpu_has(X86_FEATURE_XMM)) { __exc_general_protection(regs, 0); return; } } math_error(regs, X86_TRAP_XF); } DEFINE_IDTENTRY(exc_spurious_interrupt_bug) { /* * This addresses a Pentium Pro Erratum: * * PROBLEM: If the APIC subsystem is configured in mixed mode with * Virtual Wire mode implemented through the local APIC, an * interrupt vector of 0Fh (Intel reserved encoding) may be * generated by the local APIC (Int 15). This vector may be * generated upon receipt of a spurious interrupt (an interrupt * which is removed before the system receives the INTA sequence) * instead of the programmed 8259 spurious interrupt vector. * * IMPLICATION: The spurious interrupt vector programmed in the * 8259 is normally handled by an operating system's spurious * interrupt handler. However, a vector of 0Fh is unknown to some * operating systems, which would crash if this erratum occurred. * * In theory this could be limited to 32bit, but the handler is not * hurting and who knows which other CPUs suffer from this. */ } static bool handle_xfd_event(struct pt_regs *regs) { u64 xfd_err; int err; if (!IS_ENABLED(CONFIG_X86_64) || !cpu_feature_enabled(X86_FEATURE_XFD)) return false; rdmsrq(MSR_IA32_XFD_ERR, xfd_err); if (!xfd_err) return false; wrmsrq(MSR_IA32_XFD_ERR, 0); /* Die if that happens in kernel space */ if (WARN_ON(!user_mode(regs))) return false; local_irq_enable(); err = xfd_enable_feature(xfd_err); switch (err) { case -EPERM: force_sig_fault(SIGILL, ILL_ILLOPC, error_get_trap_addr(regs)); break; case -EFAULT: force_sig(SIGSEGV); break; } local_irq_disable(); return true; } DEFINE_IDTENTRY(exc_device_not_available) { unsigned long cr0 = read_cr0(); if (handle_xfd_event(regs)) return; #ifdef CONFIG_MATH_EMULATION if (!boot_cpu_has(X86_FEATURE_FPU) && (cr0 & X86_CR0_EM)) { struct math_emu_info info = { }; cond_local_irq_enable(regs); info.regs = regs; math_emulate(&info); cond_local_irq_disable(regs); return; } #endif /* This should not happen. */ if (WARN(cr0 & X86_CR0_TS, "CR0.TS was set")) { /* Try to fix it up and carry on. */ write_cr0(cr0 & ~X86_CR0_TS); } else { /* * Something terrible happened, and we're better off trying * to kill the task than getting stuck in a never-ending * loop of #NM faults. */ die("unexpected #NM exception", regs, 0); } } #ifdef CONFIG_INTEL_TDX_GUEST #define VE_FAULT_STR "VE fault" static void ve_raise_fault(struct pt_regs *regs, long error_code, unsigned long address) { if (user_mode(regs)) { gp_user_force_sig_segv(regs, X86_TRAP_VE, error_code, VE_FAULT_STR); return; } if (gp_try_fixup_and_notify(regs, X86_TRAP_VE, error_code, VE_FAULT_STR, address)) { return; } die_addr(VE_FAULT_STR, regs, error_code, address); } /* * Virtualization Exceptions (#VE) are delivered to TDX guests due to * specific guest actions which may happen in either user space or the * kernel: * * * Specific instructions (WBINVD, for example) * * Specific MSR accesses * * Specific CPUID leaf accesses * * Access to specific guest physical addresses * * In the settings that Linux will run in, virtualization exceptions are * never generated on accesses to normal, TD-private memory that has been * accepted (by BIOS or with tdx_enc_status_changed()). * * Syscall entry code has a critical window where the kernel stack is not * yet set up. Any exception in this window leads to hard to debug issues * and can be exploited for privilege escalation. Exceptions in the NMI * entry code also cause issues. Returning from the exception handler with * IRET will re-enable NMIs and nested NMI will corrupt the NMI stack. * * For these reasons, the kernel avoids #VEs during the syscall gap and * the NMI entry code. Entry code paths do not access TD-shared memory, * MMIO regions, use #VE triggering MSRs, instructions, or CPUID leaves * that might generate #VE. VMM can remove memory from TD at any point, * but access to unaccepted (or missing) private memory leads to VM * termination, not to #VE. * * Similarly to page faults and breakpoints, #VEs are allowed in NMI * handlers once the kernel is ready to deal with nested NMIs. * * During #VE delivery, all interrupts, including NMIs, are blocked until * TDGETVEINFO is called. It prevents #VE nesting until the kernel reads * the VE info. * * If a guest kernel action which would normally cause a #VE occurs in * the interrupt-disabled region before TDGETVEINFO, a #DF (fault * exception) is delivered to the guest which will result in an oops. * * The entry code has been audited carefully for following these expectations. * Changes in the entry code have to be audited for correctness vs. this * aspect. Similarly to #PF, #VE in these places will expose kernel to * privilege escalation or may lead to random crashes. */ DEFINE_IDTENTRY(exc_virtualization_exception) { struct ve_info ve; /* * NMIs/Machine-checks/Interrupts will be in a disabled state * till TDGETVEINFO TDCALL is executed. This ensures that VE * info cannot be overwritten by a nested #VE. */ tdx_get_ve_info(&ve); cond_local_irq_enable(regs); /* * If tdx_handle_virt_exception() could not process * it successfully, treat it as #GP(0) and handle it. */ if (!tdx_handle_virt_exception(regs, &ve)) ve_raise_fault(regs, 0, ve.gla); cond_local_irq_disable(regs); } #endif #ifdef CONFIG_X86_32 DEFINE_IDTENTRY_SW(iret_error) { local_irq_enable(); if (notify_die(DIE_TRAP, "iret exception", regs, 0, X86_TRAP_IRET, SIGILL) != NOTIFY_STOP) { do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, 0, ILL_BADSTK, (void __user *)NULL); } local_irq_disable(); } #endif void __init trap_init(void) { /* Init cpu_entry_area before IST entries are set up */ setup_cpu_entry_areas(); /* Init GHCB memory pages when running as an SEV-ES guest */ sev_es_init_vc_handling(); /* Initialize TSS before setting up traps so ISTs work */ cpu_init_exception_handling(true); /* Setup traps as cpu_init() might #GP */ if (!cpu_feature_enabled(X86_FEATURE_FRED)) idt_setup_traps(); cpu_init(); }
43 8 7487 6697 77 6659 15 8635 3726 8522 3106 3105 15 1116 1119 7 978 333 3 13 46 44 113 9 2745 149 191 28 47 1 4455 2337 2515 364 3885 1 6252 5440 13 8 687 2555 243 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_LIST_H #define _LINUX_LIST_H #include <linux/container_of.h> #include <linux/types.h> #include <linux/stddef.h> #include <linux/poison.h> #include <linux/const.h> #include <asm/barrier.h> /* * Circular doubly linked list implementation. * * Some of the internal functions ("__xxx") are useful when * manipulating whole lists rather than single entries, as * sometimes we already know the next/prev entries and we can * generate better code by using them directly rather than * using the generic single-entry routines. */ /** * LIST_HEAD_INIT - initialize a &struct list_head's links to point to itself * @name: name of the list_head */ #define LIST_HEAD_INIT(name) { &(name), &(name) } /** * LIST_HEAD - definition of a &struct list_head with initialization values * @name: name of the list_head */ #define LIST_HEAD(name) \ struct list_head name = LIST_HEAD_INIT(name) /** * INIT_LIST_HEAD - Initialize a list_head structure * @list: list_head structure to be initialized. * * Initializes the list_head to point to itself. If it is a list header, * the result is an empty list. */ static inline void INIT_LIST_HEAD(struct list_head *list) { WRITE_ONCE(list->next, list); WRITE_ONCE(list->prev, list); } #ifdef CONFIG_LIST_HARDENED #ifdef CONFIG_DEBUG_LIST # define __list_valid_slowpath #else # define __list_valid_slowpath __cold __preserve_most #endif /* * Performs the full set of list corruption checks before __list_add(). * On list corruption reports a warning, and returns false. */ bool __list_valid_slowpath __list_add_valid_or_report(struct list_head *new, struct list_head *prev, struct list_head *next); /* * Performs list corruption checks before __list_add(). Returns false if a * corruption is detected, true otherwise. * * With CONFIG_LIST_HARDENED only, performs minimal list integrity checking * inline to catch non-faulting corruptions, and only if a corruption is * detected calls the reporting function __list_add_valid_or_report(). */ static __always_inline bool __list_add_valid(struct list_head *new, struct list_head *prev, struct list_head *next) { bool ret = true; if (!IS_ENABLED(CONFIG_DEBUG_LIST)) { /* * With the hardening version, elide checking if next and prev * are NULL, since the immediate dereference of them below would * result in a fault if NULL. * * With the reduced set of checks, we can afford to inline the * checks, which also gives the compiler a chance to elide some * of them completely if they can be proven at compile-time. If * one of the pre-conditions does not hold, the slow-path will * show a report which pre-condition failed. */ if (likely(next->prev == prev && prev->next == next && new != prev && new != next)) return true; ret = false; } ret &= __list_add_valid_or_report(new, prev, next); return ret; } /* * Performs the full set of list corruption checks before __list_del_entry(). * On list corruption reports a warning, and returns false. */ bool __list_valid_slowpath __list_del_entry_valid_or_report(struct list_head *entry); /* * Performs list corruption checks before __list_del_entry(). Returns false if a * corruption is detected, true otherwise. * * With CONFIG_LIST_HARDENED only, performs minimal list integrity checking * inline to catch non-faulting corruptions, and only if a corruption is * detected calls the reporting function __list_del_entry_valid_or_report(). */ static __always_inline bool __list_del_entry_valid(struct list_head *entry) { bool ret = true; if (!IS_ENABLED(CONFIG_DEBUG_LIST)) { struct list_head *prev = entry->prev; struct list_head *next = entry->next; /* * With the hardening version, elide checking if next and prev * are NULL, LIST_POISON1 or LIST_POISON2, since the immediate * dereference of them below would result in a fault. */ if (likely(prev->next == entry && next->prev == entry)) return true; ret = false; } ret &= __list_del_entry_valid_or_report(entry); return ret; } #else static inline bool __list_add_valid(struct list_head *new, struct list_head *prev, struct list_head *next) { return true; } static inline bool __list_del_entry_valid(struct list_head *entry) { return true; } #endif /* * Insert a new entry between two known consecutive entries. * * This is only for internal list manipulation where we know * the prev/next entries already! */ static inline void __list_add(struct list_head *new, struct list_head *prev, struct list_head *next) { if (!__list_add_valid(new, prev, next)) return; next->prev = new; new->next = next; new->prev = prev; WRITE_ONCE(prev->next, new); } /** * list_add - add a new entry * @new: new entry to be added * @head: list head to add it after * * Insert a new entry after the specified head. * This is good for implementing stacks. */ static inline void list_add(struct list_head *new, struct list_head *head) { __list_add(new, head, head->next); } /** * list_add_tail - add a new entry * @new: new entry to be added * @head: list head to add it before * * Insert a new entry before the specified head. * This is useful for implementing queues. */ static inline void list_add_tail(struct list_head *new, struct list_head *head) { __list_add(new, head->prev, head); } /* * Delete a list entry by making the prev/next entries * point to each other. * * This is only for internal list manipulation where we know * the prev/next entries already! */ static inline void __list_del(struct list_head * prev, struct list_head * next) { next->prev = prev; WRITE_ONCE(prev->next, next); } /* * Delete a list entry and clear the 'prev' pointer. * * This is a special-purpose list clearing method used in the networking code * for lists allocated as per-cpu, where we don't want to incur the extra * WRITE_ONCE() overhead of a regular list_del_init(). The code that uses this * needs to check the node 'prev' pointer instead of calling list_empty(). */ static inline void __list_del_clearprev(struct list_head *entry) { __list_del(entry->prev, entry->next); entry->prev = NULL; } static inline void __list_del_entry(struct list_head *entry) { if (!__list_del_entry_valid(entry)) return; __list_del(entry->prev, entry->next); } /** * list_del - deletes entry from list. * @entry: the element to delete from the list. * Note: list_empty() on entry does not return true after this, the entry is * in an undefined state. */ static inline void list_del(struct list_head *entry) { __list_del_entry(entry); entry->next = LIST_POISON1; entry->prev = LIST_POISON2; } /** * list_replace - replace old entry by new one * @old : the element to be replaced * @new : the new element to insert * * If @old was empty, it will be overwritten. */ static inline void list_replace(struct list_head *old, struct list_head *new) { new->next = old->next; new->next->prev = new; new->prev = old->prev; new->prev->next = new; } /** * list_replace_init - replace old entry by new one and initialize the old one * @old : the element to be replaced * @new : the new element to insert * * If @old was empty, it will be overwritten. */ static inline void list_replace_init(struct list_head *old, struct list_head *new) { list_replace(old, new); INIT_LIST_HEAD(old); } /** * list_swap - replace entry1 with entry2 and re-add entry1 at entry2's position * @entry1: the location to place entry2 * @entry2: the location to place entry1 */ static inline void list_swap(struct list_head *entry1, struct list_head *entry2) { struct list_head *pos = entry2->prev; list_del(entry2); list_replace(entry1, entry2); if (pos == entry1) pos = entry2; list_add(entry1, pos); } /** * list_del_init - deletes entry from list and reinitialize it. * @entry: the element to delete from the list. */ static inline void list_del_init(struct list_head *entry) { __list_del_entry(entry); INIT_LIST_HEAD(entry); } /** * list_move - delete from one list and add as another's head * @list: the entry to move * @head: the head that will precede our entry */ static inline void list_move(struct list_head *list, struct list_head *head) { __list_del_entry(list); list_add(list, head); } /** * list_move_tail - delete from one list and add as another's tail * @list: the entry to move * @head: the head that will follow our entry */ static inline void list_move_tail(struct list_head *list, struct list_head *head) { __list_del_entry(list); list_add_tail(list, head); } /** * list_bulk_move_tail - move a subsection of a list to its tail * @head: the head that will follow our entry * @first: first entry to move * @last: last entry to move, can be the same as first * * Move all entries between @first and including @last before @head. * All three entries must belong to the same linked list. */ static inline void list_bulk_move_tail(struct list_head *head, struct list_head *first, struct list_head *last) { first->prev->next = last->next; last->next->prev = first->prev; head->prev->next = first; first->prev = head->prev; last->next = head; head->prev = last; } /** * list_is_first -- tests whether @list is the first entry in list @head * @list: the entry to test * @head: the head of the list */ static inline int list_is_first(const struct list_head *list, const struct list_head *head) { return list->prev == head; } /** * list_is_last - tests whether @list is the last entry in list @head * @list: the entry to test * @head: the head of the list */ static inline int list_is_last(const struct list_head *list, const struct list_head *head) { return list->next == head; } /** * list_is_head - tests whether @list is the list @head * @list: the entry to test * @head: the head of the list */ static inline int list_is_head(const struct list_head *list, const struct list_head *head) { return list == head; } /** * list_empty - tests whether a list is empty * @head: the list to test. */ static inline int list_empty(const struct list_head *head) { return READ_ONCE(head->next) == head; } /** * list_del_init_careful - deletes entry from list and reinitialize it. * @entry: the element to delete from the list. * * This is the same as list_del_init(), except designed to be used * together with list_empty_careful() in a way to guarantee ordering * of other memory operations. * * Any memory operations done before a list_del_init_careful() are * guaranteed to be visible after a list_empty_careful() test. */ static inline void list_del_init_careful(struct list_head *entry) { __list_del_entry(entry); WRITE_ONCE(entry->prev, entry); smp_store_release(&entry->next, entry); } /** * list_empty_careful - tests whether a list is empty and not being modified * @head: the list to test * * Description: * tests whether a list is empty _and_ checks that no other CPU might be * in the process of modifying either member (next or prev) * * NOTE: using list_empty_careful() without synchronization * can only be safe if the only activity that can happen * to the list entry is list_del_init(). Eg. it cannot be used * if another CPU could re-list_add() it. */ static inline int list_empty_careful(const struct list_head *head) { struct list_head *next = smp_load_acquire(&head->next); return list_is_head(next, head) && (next == READ_ONCE(head->prev)); } /** * list_rotate_left - rotate the list to the left * @head: the head of the list */ static inline void list_rotate_left(struct list_head *head) { struct list_head *first; if (!list_empty(head)) { first = head->next; list_move_tail(first, head); } } /** * list_rotate_to_front() - Rotate list to specific item. * @list: The desired new front of the list. * @head: The head of the list. * * Rotates list so that @list becomes the new front of the list. */ static inline void list_rotate_to_front(struct list_head *list, struct list_head *head) { /* * Deletes the list head from the list denoted by @head and * places it as the tail of @list, this effectively rotates the * list so that @list is at the front. */ list_move_tail(head, list); } /** * list_is_singular - tests whether a list has just one entry. * @head: the list to test. */ static inline int list_is_singular(const struct list_head *head) { return !list_empty(head) && (head->next == head->prev); } static inline void __list_cut_position(struct list_head *list, struct list_head *head, struct list_head *entry) { struct list_head *new_first = entry->next; list->next = head->next; list->next->prev = list; list->prev = entry; entry->next = list; head->next = new_first; new_first->prev = head; } /** * list_cut_position - cut a list into two * @list: a new list to add all removed entries * @head: a list with entries * @entry: an entry within head, could be the head itself * and if so we won't cut the list * * This helper moves the initial part of @head, up to and * including @entry, from @head to @list. You should * pass on @entry an element you know is on @head. @list * should be an empty list or a list you do not care about * losing its data. * */ static inline void list_cut_position(struct list_head *list, struct list_head *head, struct list_head *entry) { if (list_empty(head)) return; if (list_is_singular(head) && !list_is_head(entry, head) && (entry != head->next)) return; if (list_is_head(entry, head)) INIT_LIST_HEAD(list); else __list_cut_position(list, head, entry); } /** * list_cut_before - cut a list into two, before given entry * @list: a new list to add all removed entries * @head: a list with entries * @entry: an entry within head, could be the head itself * * This helper moves the initial part of @head, up to but * excluding @entry, from @head to @list. You should pass * in @entry an element you know is on @head. @list should * be an empty list or a list you do not care about losing * its data. * If @entry == @head, all entries on @head are moved to * @list. */ static inline void list_cut_before(struct list_head *list, struct list_head *head, struct list_head *entry) { if (head->next == entry) { INIT_LIST_HEAD(list); return; } list->next = head->next; list->next->prev = list; list->prev = entry->prev; list->prev->next = list; head->next = entry; entry->prev = head; } static inline void __list_splice(const struct list_head *list, struct list_head *prev, struct list_head *next) { struct list_head *first = list->next; struct list_head *last = list->prev; first->prev = prev; prev->next = first; last->next = next; next->prev = last; } /** * list_splice - join two lists, this is designed for stacks * @list: the new list to add. * @head: the place to add it in the first list. */ static inline void list_splice(const struct list_head *list, struct list_head *head) { if (!list_empty(list)) __list_splice(list, head, head->next); } /** * list_splice_tail - join two lists, each list being a queue * @list: the new list to add. * @head: the place to add it in the first list. */ static inline void list_splice_tail(struct list_head *list, struct list_head *head) { if (!list_empty(list)) __list_splice(list, head->prev, head); } /** * list_splice_init - join two lists and reinitialise the emptied list. * @list: the new list to add. * @head: the place to add it in the first list. * * The list at @list is reinitialised */ static inline void list_splice_init(struct list_head *list, struct list_head *head) { if (!list_empty(list)) { __list_splice(list, head, head->next); INIT_LIST_HEAD(list); } } /** * list_splice_tail_init - join two lists and reinitialise the emptied list * @list: the new list to add. * @head: the place to add it in the first list. * * Each of the lists is a queue. * The list at @list is reinitialised */ static inline void list_splice_tail_init(struct list_head *list, struct list_head *head) { if (!list_empty(list)) { __list_splice(list, head->prev, head); INIT_LIST_HEAD(list); } } /** * list_entry - get the struct for this entry * @ptr: the &struct list_head pointer. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. */ #define list_entry(ptr, type, member) \ container_of(ptr, type, member) /** * list_first_entry - get the first element from a list * @ptr: the list head to take the element from. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * Note, that list is expected to be not empty. */ #define list_first_entry(ptr, type, member) \ list_entry((ptr)->next, type, member) /** * list_last_entry - get the last element from a list * @ptr: the list head to take the element from. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * Note, that list is expected to be not empty. */ #define list_last_entry(ptr, type, member) \ list_entry((ptr)->prev, type, member) /** * list_first_entry_or_null - get the first element from a list * @ptr: the list head to take the element from. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * Note that if the list is empty, it returns NULL. */ #define list_first_entry_or_null(ptr, type, member) ({ \ struct list_head *head__ = (ptr); \ struct list_head *pos__ = READ_ONCE(head__->next); \ pos__ != head__ ? list_entry(pos__, type, member) : NULL; \ }) /** * list_last_entry_or_null - get the last element from a list * @ptr: the list head to take the element from. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * Note that if the list is empty, it returns NULL. */ #define list_last_entry_or_null(ptr, type, member) ({ \ struct list_head *head__ = (ptr); \ struct list_head *pos__ = READ_ONCE(head__->prev); \ pos__ != head__ ? list_entry(pos__, type, member) : NULL; \ }) /** * list_next_entry - get the next element in list * @pos: the type * to cursor * @member: the name of the list_head within the struct. */ #define list_next_entry(pos, member) \ list_entry((pos)->member.next, typeof(*(pos)), member) /** * list_next_entry_circular - get the next element in list * @pos: the type * to cursor. * @head: the list head to take the element from. * @member: the name of the list_head within the struct. * * Wraparound if pos is the last element (return the first element). * Note, that list is expected to be not empty. */ #define list_next_entry_circular(pos, head, member) \ (list_is_last(&(pos)->member, head) ? \ list_first_entry(head, typeof(*(pos)), member) : list_next_entry(pos, member)) /** * list_prev_entry - get the prev element in list * @pos: the type * to cursor * @member: the name of the list_head within the struct. */ #define list_prev_entry(pos, member) \ list_entry((pos)->member.prev, typeof(*(pos)), member) /** * list_prev_entry_circular - get the prev element in list * @pos: the type * to cursor. * @head: the list head to take the element from. * @member: the name of the list_head within the struct. * * Wraparound if pos is the first element (return the last element). * Note, that list is expected to be not empty. */ #define list_prev_entry_circular(pos, head, member) \ (list_is_first(&(pos)->member, head) ? \ list_last_entry(head, typeof(*(pos)), member) : list_prev_entry(pos, member)) /** * list_for_each - iterate over a list * @pos: the &struct list_head to use as a loop cursor. * @head: the head for your list. */ #define list_for_each(pos, head) \ for (pos = (head)->next; !list_is_head(pos, (head)); pos = pos->next) /** * list_for_each_continue - continue iteration over a list * @pos: the &struct list_head to use as a loop cursor. * @head: the head for your list. * * Continue to iterate over a list, continuing after the current position. */ #define list_for_each_continue(pos, head) \ for (pos = pos->next; !list_is_head(pos, (head)); pos = pos->next) /** * list_for_each_prev - iterate over a list backwards * @pos: the &struct list_head to use as a loop cursor. * @head: the head for your list. */ #define list_for_each_prev(pos, head) \ for (pos = (head)->prev; !list_is_head(pos, (head)); pos = pos->prev) /** * list_for_each_safe - iterate over a list safe against removal of list entry * @pos: the &struct list_head to use as a loop cursor. * @n: another &struct list_head to use as temporary storage * @head: the head for your list. */ #define list_for_each_safe(pos, n, head) \ for (pos = (head)->next, n = pos->next; \ !list_is_head(pos, (head)); \ pos = n, n = pos->next) /** * list_for_each_prev_safe - iterate over a list backwards safe against removal of list entry * @pos: the &struct list_head to use as a loop cursor. * @n: another &struct list_head to use as temporary storage * @head: the head for your list. */ #define list_for_each_prev_safe(pos, n, head) \ for (pos = (head)->prev, n = pos->prev; \ !list_is_head(pos, (head)); \ pos = n, n = pos->prev) /** * list_count_nodes - count nodes in the list * @head: the head for your list. */ static inline size_t list_count_nodes(struct list_head *head) { struct list_head *pos; size_t count = 0; list_for_each(pos, head) count++; return count; } /** * list_entry_is_head - test if the entry points to the head of the list * @pos: the type * to cursor * @head: the head for your list. * @member: the name of the list_head within the struct. */ #define list_entry_is_head(pos, head, member) \ list_is_head(&pos->member, (head)) /** * list_for_each_entry - iterate over list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. */ #define list_for_each_entry(pos, head, member) \ for (pos = list_first_entry(head, typeof(*pos), member); \ !list_entry_is_head(pos, head, member); \ pos = list_next_entry(pos, member)) /** * list_for_each_entry_reverse - iterate backwards over list of given type. * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. */ #define list_for_each_entry_reverse(pos, head, member) \ for (pos = list_last_entry(head, typeof(*pos), member); \ !list_entry_is_head(pos, head, member); \ pos = list_prev_entry(pos, member)) /** * list_prepare_entry - prepare a pos entry for use in list_for_each_entry_continue() * @pos: the type * to use as a start point * @head: the head of the list * @member: the name of the list_head within the struct. * * Prepares a pos entry for use as a start point in list_for_each_entry_continue(). */ #define list_prepare_entry(pos, head, member) \ ((pos) ? : list_entry(head, typeof(*pos), member)) /** * list_for_each_entry_continue - continue iteration over list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. * * Continue to iterate over list of given type, continuing after * the current position. */ #define list_for_each_entry_continue(pos, head, member) \ for (pos = list_next_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = list_next_entry(pos, member)) /** * list_for_each_entry_continue_reverse - iterate backwards from the given point * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. * * Start to iterate over list of given type backwards, continuing after * the current position. */ #define list_for_each_entry_continue_reverse(pos, head, member) \ for (pos = list_prev_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = list_prev_entry(pos, member)) /** * list_for_each_entry_from - iterate over list of given type from the current point * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. * * Iterate over list of given type, continuing from current position. */ #define list_for_each_entry_from(pos, head, member) \ for (; !list_entry_is_head(pos, head, member); \ pos = list_next_entry(pos, member)) /** * list_for_each_entry_from_reverse - iterate backwards over list of given type * from the current point * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. * * Iterate backwards over list of given type, continuing from current position. */ #define list_for_each_entry_from_reverse(pos, head, member) \ for (; !list_entry_is_head(pos, head, member); \ pos = list_prev_entry(pos, member)) /** * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry * @pos: the type * to use as a loop cursor. * @n: another type * to use as temporary storage * @head: the head for your list. * @member: the name of the list_head within the struct. */ #define list_for_each_entry_safe(pos, n, head, member) \ for (pos = list_first_entry(head, typeof(*pos), member), \ n = list_next_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = n, n = list_next_entry(n, member)) /** * list_for_each_entry_safe_continue - continue list iteration safe against removal * @pos: the type * to use as a loop cursor. * @n: another type * to use as temporary storage * @head: the head for your list. * @member: the name of the list_head within the struct. * * Iterate over list of given type, continuing after current point, * safe against removal of list entry. */ #define list_for_each_entry_safe_continue(pos, n, head, member) \ for (pos = list_next_entry(pos, member), \ n = list_next_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = n, n = list_next_entry(n, member)) /** * list_for_each_entry_safe_from - iterate over list from current point safe against removal * @pos: the type * to use as a loop cursor. * @n: another type * to use as temporary storage * @head: the head for your list. * @member: the name of the list_head within the struct. * * Iterate over list of given type from current point, safe against * removal of list entry. */ #define list_for_each_entry_safe_from(pos, n, head, member) \ for (n = list_next_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = n, n = list_next_entry(n, member)) /** * list_for_each_entry_safe_reverse - iterate backwards over list safe against removal * @pos: the type * to use as a loop cursor. * @n: another type * to use as temporary storage * @head: the head for your list. * @member: the name of the list_head within the struct. * * Iterate backwards over list of given type, safe against removal * of list entry. */ #define list_for_each_entry_safe_reverse(pos, n, head, member) \ for (pos = list_last_entry(head, typeof(*pos), member), \ n = list_prev_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = n, n = list_prev_entry(n, member)) /** * list_safe_reset_next - reset a stale list_for_each_entry_safe loop * @pos: the loop cursor used in the list_for_each_entry_safe loop * @n: temporary storage used in list_for_each_entry_safe * @member: the name of the list_head within the struct. * * list_safe_reset_next is not safe to use in general if the list may be * modified concurrently (eg. the lock is dropped in the loop body). An * exception to this is if the cursor element (pos) is pinned in the list, * and list_safe_reset_next is called after re-taking the lock and before * completing the current iteration of the loop body. */ #define list_safe_reset_next(pos, n, member) \ n = list_next_entry(pos, member) /* * Double linked lists with a single pointer list head. * Mostly useful for hash tables where the two pointer list head is * too wasteful. * You lose the ability to access the tail in O(1). */ #define HLIST_HEAD_INIT { .first = NULL } #define HLIST_HEAD(name) struct hlist_head name = { .first = NULL } #define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL) static inline void INIT_HLIST_NODE(struct hlist_node *h) { h->next = NULL; h->pprev = NULL; } /** * hlist_unhashed - Has node been removed from list and reinitialized? * @h: Node to be checked * * Not that not all removal functions will leave a node in unhashed * state. For example, hlist_nulls_del_init_rcu() does leave the * node in unhashed state, but hlist_nulls_del() does not. */ static inline int hlist_unhashed(const struct hlist_node *h) { return !h->pprev; } /** * hlist_unhashed_lockless - Version of hlist_unhashed for lockless use * @h: Node to be checked * * This variant of hlist_unhashed() must be used in lockless contexts * to avoid potential load-tearing. The READ_ONCE() is paired with the * various WRITE_ONCE() in hlist helpers that are defined below. */ static inline int hlist_unhashed_lockless(const struct hlist_node *h) { return !READ_ONCE(h->pprev); } /** * hlist_empty - Is the specified hlist_head structure an empty hlist? * @h: Structure to check. */ static inline int hlist_empty(const struct hlist_head *h) { return !READ_ONCE(h->first); } static inline void __hlist_del(struct hlist_node *n) { struct hlist_node *next = n->next; struct hlist_node **pprev = n->pprev; WRITE_ONCE(*pprev, next); if (next) WRITE_ONCE(next->pprev, pprev); } /** * hlist_del - Delete the specified hlist_node from its list * @n: Node to delete. * * Note that this function leaves the node in hashed state. Use * hlist_del_init() or similar instead to unhash @n. */ static inline void hlist_del(struct hlist_node *n) { __hlist_del(n); n->next = LIST_POISON1; n->pprev = LIST_POISON2; } /** * hlist_del_init - Delete the specified hlist_node from its list and initialize * @n: Node to delete. * * Note that this function leaves the node in unhashed state. */ static inline void hlist_del_init(struct hlist_node *n) { if (!hlist_unhashed(n)) { __hlist_del(n); INIT_HLIST_NODE(n); } } /** * hlist_add_head - add a new entry at the beginning of the hlist * @n: new entry to be added * @h: hlist head to add it after * * Insert a new entry after the specified head. * This is good for implementing stacks. */ static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h) { struct hlist_node *first = h->first; WRITE_ONCE(n->next, first); if (first) WRITE_ONCE(first->pprev, &n->next); WRITE_ONCE(h->first, n); WRITE_ONCE(n->pprev, &h->first); } /** * hlist_add_before - add a new entry before the one specified * @n: new entry to be added * @next: hlist node to add it before, which must be non-NULL */ static inline void hlist_add_before(struct hlist_node *n, struct hlist_node *next) { WRITE_ONCE(n->pprev, next->pprev); WRITE_ONCE(n->next, next); WRITE_ONCE(next->pprev, &n->next); WRITE_ONCE(*(n->pprev), n); } /** * hlist_add_behind - add a new entry after the one specified * @n: new entry to be added * @prev: hlist node to add it after, which must be non-NULL */ static inline void hlist_add_behind(struct hlist_node *n, struct hlist_node *prev) { WRITE_ONCE(n->next, prev->next); WRITE_ONCE(prev->next, n); WRITE_ONCE(n->pprev, &prev->next); if (n->next) WRITE_ONCE(n->next->pprev, &n->next); } /** * hlist_add_fake - create a fake hlist consisting of a single headless node * @n: Node to make a fake list out of * * This makes @n appear to be its own predecessor on a headless hlist. * The point of this is to allow things like hlist_del() to work correctly * in cases where there is no list. */ static inline void hlist_add_fake(struct hlist_node *n) { n->pprev = &n->next; } /** * hlist_fake: Is this node a fake hlist? * @h: Node to check for being a self-referential fake hlist. */ static inline bool hlist_fake(struct hlist_node *h) { return h->pprev == &h->next; } /** * hlist_is_singular_node - is node the only element of the specified hlist? * @n: Node to check for singularity. * @h: Header for potentially singular list. * * Check whether the node is the only node of the head without * accessing head, thus avoiding unnecessary cache misses. */ static inline bool hlist_is_singular_node(struct hlist_node *n, struct hlist_head *h) { return !n->next && n->pprev == &h->first; } /** * hlist_move_list - Move an hlist * @old: hlist_head for old list. * @new: hlist_head for new list. * * Move a list from one list head to another. Fixup the pprev * reference of the first entry if it exists. */ static inline void hlist_move_list(struct hlist_head *old, struct hlist_head *new) { new->first = old->first; if (new->first) new->first->pprev = &new->first; old->first = NULL; } /** * hlist_splice_init() - move all entries from one list to another * @from: hlist_head from which entries will be moved * @last: last entry on the @from list * @to: hlist_head to which entries will be moved * * @to can be empty, @from must contain at least @last. */ static inline void hlist_splice_init(struct hlist_head *from, struct hlist_node *last, struct hlist_head *to) { if (to->first) to->first->pprev = &last->next; last->next = to->first; to->first = from->first; from->first->pprev = &to->first; from->first = NULL; } #define hlist_entry(ptr, type, member) container_of(ptr,type,member) #define hlist_for_each(pos, head) \ for (pos = (head)->first; pos ; pos = pos->next) #define hlist_for_each_safe(pos, n, head) \ for (pos = (head)->first; pos && ({ n = pos->next; 1; }); \ pos = n) #define hlist_entry_safe(ptr, type, member) \ ({ typeof(ptr) ____ptr = (ptr); \ ____ptr ? hlist_entry(____ptr, type, member) : NULL; \ }) /** * hlist_for_each_entry - iterate over list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry(pos, head, member) \ for (pos = hlist_entry_safe((head)->first, typeof(*(pos)), member);\ pos; \ pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) /** * hlist_for_each_entry_continue - iterate over a hlist continuing after current point * @pos: the type * to use as a loop cursor. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry_continue(pos, member) \ for (pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member);\ pos; \ pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) /** * hlist_for_each_entry_from - iterate over a hlist continuing from current point * @pos: the type * to use as a loop cursor. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry_from(pos, member) \ for (; pos; \ pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) /** * hlist_for_each_entry_safe - iterate over list of given type safe against removal of list entry * @pos: the type * to use as a loop cursor. * @n: a &struct hlist_node to use as temporary storage * @head: the head for your list. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry_safe(pos, n, head, member) \ for (pos = hlist_entry_safe((head)->first, typeof(*pos), member);\ pos && ({ n = pos->member.next; 1; }); \ pos = hlist_entry_safe(n, typeof(*pos), member)) /** * hlist_count_nodes - count nodes in the hlist * @head: the head for your hlist. */ static inline size_t hlist_count_nodes(struct hlist_head *head) { struct hlist_node *pos; size_t count = 0; hlist_for_each(pos, head) count++; return count; } #endif
5 26 239 239 239 220 239 220 295 295 77 291 300 337 336 336 337 337 337 209 345 60 10 345 345 367 369 327 345 345 327 327 327 326 327 369 368 369 369 369 10 327 326 345 345 369 16 369 369 10 9 26 345 369 1 369 327 345 20 20 367 13 9 13 12 10 232 4 231 232 232 356 356 239 231 227 227 378 379 357 239 291 74 73 10 351 352 351 352 379 379 24 19 12 11 12 1 1 1 1 337 337 2 231 329 22 332 336 24 1 9 23 11 14 1 41 41 20 1 1 1 1 1 3 1 2 3 337 336 336 337 337 337 24 337 338 337 336 24 337 338 219 220 223 224 222 222 220 220 220 219 220 222 14 14 226 226 226 232 232 232 231 23 16 16 23 9 23 22 14 23 23 26 26 23 26 23 341 293 293 288 288 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2006 Silicon Graphics, Inc. * All Rights Reserved. */ #include "xfs_platform.h" #include <linux/backing-dev.h> #include <linux/dax.h> #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_trace.h" #include "xfs_log.h" #include "xfs_log_recover.h" #include "xfs_log_priv.h" #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_ag.h" #include "xfs_buf_mem.h" #include "xfs_notify_failure.h" struct kmem_cache *xfs_buf_cache; /* * Locking orders * * xfs_buf_stale: * b_sema (caller holds) * b_lockref.lock * lru_lock * * xfs_buf_rele: * b_lockref.lock * lru_lock * * xfs_buftarg_drain_rele * lru_lock * b_lockref.lock (trylock due to inversion) * * xfs_buftarg_isolate * lru_lock * b_lockref.lock (trylock due to inversion) */ static void xfs_buf_submit(struct xfs_buf *bp); static int xfs_buf_iowait(struct xfs_buf *bp); static inline bool xfs_buf_is_uncached(struct xfs_buf *bp) { return bp->b_rhash_key == XFS_BUF_DADDR_NULL; } /* * When we mark a buffer stale, we remove the buffer from the LRU and clear the * b_lru_ref count so that the buffer is freed immediately when the buffer * reference count falls to zero. If the buffer is already on the LRU, we need * to remove the reference that LRU holds on the buffer. * * This prevents build-up of stale buffers on the LRU. */ void xfs_buf_stale( struct xfs_buf *bp) { ASSERT(xfs_buf_islocked(bp)); bp->b_flags |= XBF_STALE; /* * Clear the delwri status so that a delwri queue walker will not * flush this buffer to disk now that it is stale. The delwri queue has * a reference to the buffer, so this is safe to do. */ bp->b_flags &= ~_XBF_DELWRI_Q; spin_lock(&bp->b_lockref.lock); atomic_set(&bp->b_lru_ref, 0); if (!__lockref_is_dead(&bp->b_lockref)) list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru); spin_unlock(&bp->b_lockref.lock); } static void xfs_buf_free_callback( struct callback_head *cb) { struct xfs_buf *bp = container_of(cb, struct xfs_buf, b_rcu); if (bp->b_maps != &bp->__b_map) kfree(bp->b_maps); kmem_cache_free(xfs_buf_cache, bp); } static void xfs_buf_free( struct xfs_buf *bp) { unsigned int size = BBTOB(bp->b_length); might_sleep(); trace_xfs_buf_free(bp, _RET_IP_); ASSERT(list_empty(&bp->b_lru)); if (!xfs_buftarg_is_mem(bp->b_target) && size >= PAGE_SIZE) mm_account_reclaimed_pages(howmany(size, PAGE_SHIFT)); if (is_vmalloc_addr(bp->b_addr)) vfree(bp->b_addr); else if (bp->b_flags & _XBF_KMEM) kfree(bp->b_addr); else folio_put(virt_to_folio(bp->b_addr)); call_rcu(&bp->b_rcu, xfs_buf_free_callback); } static int xfs_buf_alloc_kmem( struct xfs_buf *bp, size_t size, gfp_t gfp_mask) { ASSERT(is_power_of_2(size)); ASSERT(size < PAGE_SIZE); bp->b_addr = kmalloc(size, gfp_mask | __GFP_NOFAIL); if (!bp->b_addr) return -ENOMEM; /* * Slab guarantees that we get back naturally aligned allocations for * power of two sizes. Keep this check as the canary in the coal mine * if anything changes in slab. */ if (WARN_ON_ONCE(!IS_ALIGNED((unsigned long)bp->b_addr, size))) { kfree(bp->b_addr); bp->b_addr = NULL; return -ENOMEM; } bp->b_flags |= _XBF_KMEM; trace_xfs_buf_backing_kmem(bp, _RET_IP_); return 0; } /* * Allocate backing memory for a buffer. * * For tmpfs-backed buffers used by in-memory btrees this directly maps the * tmpfs page cache folios. * * For real file system buffers there are three different kinds backing memory: * * The first type backs the buffer by a kmalloc allocation. This is done for * less than PAGE_SIZE allocations to avoid wasting memory. * * The second type is a single folio buffer - this may be a high order folio or * just a single page sized folio, but either way they get treated the same way * by the rest of the code - the buffer memory spans a single contiguous memory * region that we don't have to map and unmap to access the data directly. * * The third type of buffer is the vmalloc()d buffer. This provides the buffer * with the required contiguous memory region but backed by discontiguous * physical pages. */ static int xfs_buf_alloc_backing_mem( struct xfs_buf *bp, xfs_buf_flags_t flags) { size_t size = BBTOB(bp->b_length); gfp_t gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOWARN; struct folio *folio; if (xfs_buftarg_is_mem(bp->b_target)) return xmbuf_map_backing_mem(bp); /* Assure zeroed buffer for non-read cases. */ if (!(flags & XBF_READ)) gfp_mask |= __GFP_ZERO; if (flags & XBF_READ_AHEAD) gfp_mask |= __GFP_NORETRY; /* * For buffers smaller than PAGE_SIZE use a kmalloc allocation if that * is properly aligned. The slab allocator now guarantees an aligned * allocation for all power of two sizes, which matches most of the * smaller than PAGE_SIZE buffers used by XFS. */ if (size < PAGE_SIZE && is_power_of_2(size)) return xfs_buf_alloc_kmem(bp, size, gfp_mask); /* * Don't bother with the retry loop for single PAGE allocations: vmalloc * won't do any better. */ if (size <= PAGE_SIZE) gfp_mask |= __GFP_NOFAIL; /* * Optimistically attempt a single high order folio allocation for * larger than PAGE_SIZE buffers. * * Allocating a high order folio makes the assumption that buffers are a * power-of-2 size, matching the power-of-2 folios sizes available. * * The exception here are user xattr data buffers, which can be arbitrarily * sized up to 64kB plus structure metadata, skip straight to the vmalloc * path for them instead of wasting memory here. */ if (size > PAGE_SIZE) { if (!is_power_of_2(size)) goto fallback; gfp_mask &= ~__GFP_DIRECT_RECLAIM; gfp_mask |= __GFP_NORETRY; } folio = folio_alloc(gfp_mask, get_order(size)); if (!folio) { if (size <= PAGE_SIZE) return -ENOMEM; trace_xfs_buf_backing_fallback(bp, _RET_IP_); goto fallback; } bp->b_addr = folio_address(folio); trace_xfs_buf_backing_folio(bp, _RET_IP_); return 0; fallback: for (;;) { bp->b_addr = __vmalloc(size, gfp_mask); if (bp->b_addr) break; if (flags & XBF_READ_AHEAD) return -ENOMEM; XFS_STATS_INC(bp->b_mount, xb_page_retries); memalloc_retry_wait(gfp_mask); } trace_xfs_buf_backing_vmalloc(bp, _RET_IP_); return 0; } static int xfs_buf_alloc( struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, xfs_buf_flags_t flags, struct xfs_buf **bpp) { struct xfs_buf *bp; int error; int i; *bpp = NULL; bp = kmem_cache_zalloc(xfs_buf_cache, GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); /* * We don't want certain flags to appear in b_flags unless they are * specifically set by later operations on the buffer. */ flags &= ~(XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD); /* * A new buffer is held and locked by the owner. This ensures that the * buffer is owned by the caller and racing RCU lookups right after * inserting into the hash table are safe (and will have to wait for * the unlock to do anything non-trivial). */ lockref_init(&bp->b_lockref); sema_init(&bp->b_sema, 0); /* held, no waiters */ atomic_set(&bp->b_lru_ref, 1); init_completion(&bp->b_iowait); INIT_LIST_HEAD(&bp->b_lru); INIT_LIST_HEAD(&bp->b_list); INIT_LIST_HEAD(&bp->b_li_list); bp->b_target = target; bp->b_mount = target->bt_mount; bp->b_flags = flags; bp->b_rhash_key = map[0].bm_bn; bp->b_length = 0; bp->b_map_count = nmaps; if (nmaps == 1) bp->b_maps = &bp->__b_map; else bp->b_maps = kzalloc_objs(struct xfs_buf_map, nmaps, GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); for (i = 0; i < nmaps; i++) { bp->b_maps[i].bm_bn = map[i].bm_bn; bp->b_maps[i].bm_len = map[i].bm_len; bp->b_length += map[i].bm_len; } atomic_set(&bp->b_pin_count, 0); init_waitqueue_head(&bp->b_waiters); XFS_STATS_INC(bp->b_mount, xb_create); trace_xfs_buf_init(bp, _RET_IP_); error = xfs_buf_alloc_backing_mem(bp, flags); if (error) { xfs_buf_free(bp); return error; } *bpp = bp; return 0; } /* * Finding and Reading Buffers */ static int _xfs_buf_obj_cmp( struct rhashtable_compare_arg *arg, const void *obj) { const struct xfs_buf_map *map = arg->key; const struct xfs_buf *bp = obj; /* * The key hashing in the lookup path depends on the key being the * first element of the compare_arg, make sure to assert this. */ BUILD_BUG_ON(offsetof(struct xfs_buf_map, bm_bn) != 0); if (bp->b_rhash_key != map->bm_bn) return 1; if (unlikely(bp->b_length != map->bm_len)) { /* * found a block number match. If the range doesn't * match, the only way this is allowed is if the buffer * in the cache is stale and the transaction that made * it stale has not yet committed. i.e. we are * reallocating a busy extent. Skip this buffer and * continue searching for an exact match. * * Note: If we're scanning for incore buffers to stale, don't * complain if we find non-stale buffers. */ if (!(map->bm_flags & XBM_LIVESCAN)) ASSERT(bp->b_flags & XBF_STALE); return 1; } return 0; } static const struct rhashtable_params xfs_buf_hash_params = { .min_size = 32, /* empty AGs have minimal footprint */ .nelem_hint = 16, .key_len = sizeof(xfs_daddr_t), .key_offset = offsetof(struct xfs_buf, b_rhash_key), .head_offset = offsetof(struct xfs_buf, b_rhash_head), .automatic_shrinking = true, .obj_cmpfn = _xfs_buf_obj_cmp, }; static int xfs_buf_map_verify( struct xfs_buftarg *btp, struct xfs_buf_map *map) { /* Check for IOs smaller than the sector size / not sector aligned */ ASSERT(!(BBTOB(map->bm_len) < btp->bt_meta_sectorsize)); ASSERT(!(BBTOB(map->bm_bn) & (xfs_off_t)btp->bt_meta_sectormask)); /* * Corrupted block numbers can get through to here, unfortunately, so we * have to check that the buffer falls within the filesystem bounds. */ if (map->bm_bn < 0 || map->bm_bn >= btp->bt_nr_sectors) { xfs_alert(btp->bt_mount, "%s: daddr 0x%llx out of range, EOFS 0x%llx", __func__, map->bm_bn, btp->bt_nr_sectors); WARN_ON(1); return -EFSCORRUPTED; } return 0; } static int xfs_buf_find_lock( struct xfs_buf *bp, xfs_buf_flags_t flags) { if (flags & XBF_TRYLOCK) { if (!xfs_buf_trylock(bp)) { XFS_STATS_INC(bp->b_mount, xb_busy_locked); return -EAGAIN; } } else { xfs_buf_lock(bp); XFS_STATS_INC(bp->b_mount, xb_get_locked_waited); } /* * if the buffer is stale, clear all the external state associated with * it. We need to keep flags such as how we allocated the buffer memory * intact here. */ if (bp->b_flags & XBF_STALE) { if (flags & XBF_LIVESCAN) { xfs_buf_unlock(bp); return -ENOENT; } ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); bp->b_flags &= _XBF_KMEM; bp->b_ops = NULL; } return 0; } static inline int xfs_buf_lookup( struct xfs_buftarg *btp, struct xfs_buf_map *map, xfs_buf_flags_t flags, struct xfs_buf **bpp) { struct xfs_buf *bp; int error; rcu_read_lock(); bp = rhashtable_lookup(&btp->bt_hash, map, xfs_buf_hash_params); if (!bp || !lockref_get_not_dead(&bp->b_lockref)) { rcu_read_unlock(); return -ENOENT; } rcu_read_unlock(); error = xfs_buf_find_lock(bp, flags); if (error) { xfs_buf_rele(bp); return error; } trace_xfs_buf_find(bp, flags, _RET_IP_); *bpp = bp; return 0; } /* * Insert the new_bp into the hash table. This consumes the perag reference * taken for the lookup regardless of the result of the insert. */ static int xfs_buf_find_insert( struct xfs_buftarg *btp, struct xfs_perag *pag, struct xfs_buf_map *cmap, struct xfs_buf_map *map, int nmaps, xfs_buf_flags_t flags, struct xfs_buf **bpp) { struct xfs_buf *new_bp; struct xfs_buf *bp; int error; error = xfs_buf_alloc(btp, map, nmaps, flags, &new_bp); if (error) goto out_drop_pag; /* The new buffer keeps the perag reference until it is freed. */ new_bp->b_pag = pag; rcu_read_lock(); bp = rhashtable_lookup_get_insert_fast(&btp->bt_hash, &new_bp->b_rhash_head, xfs_buf_hash_params); if (IS_ERR(bp)) { rcu_read_unlock(); error = PTR_ERR(bp); goto out_free_buf; } if (bp && lockref_get_not_dead(&bp->b_lockref)) { /* found an existing buffer */ rcu_read_unlock(); error = xfs_buf_find_lock(bp, flags); if (error) xfs_buf_rele(bp); else *bpp = bp; goto out_free_buf; } rcu_read_unlock(); *bpp = new_bp; return 0; out_free_buf: xfs_buf_free(new_bp); out_drop_pag: if (pag) xfs_perag_put(pag); return error; } static inline struct xfs_perag * xfs_buftarg_get_pag( struct xfs_buftarg *btp, const struct xfs_buf_map *map) { struct xfs_mount *mp = btp->bt_mount; if (xfs_buftarg_is_mem(btp)) return NULL; return xfs_perag_get(mp, xfs_daddr_to_agno(mp, map->bm_bn)); } /* * Assembles a buffer covering the specified range. The code is optimised for * cache hits, as metadata intensive workloads will see 3 orders of magnitude * more hits than misses. */ int xfs_buf_get_map( struct xfs_buftarg *btp, struct xfs_buf_map *map, int nmaps, xfs_buf_flags_t flags, struct xfs_buf **bpp) { struct xfs_perag *pag; struct xfs_buf *bp = NULL; struct xfs_buf_map cmap = { .bm_bn = map[0].bm_bn }; int error; int i; if (flags & XBF_LIVESCAN) cmap.bm_flags |= XBM_LIVESCAN; for (i = 0; i < nmaps; i++) cmap.bm_len += map[i].bm_len; error = xfs_buf_map_verify(btp, &cmap); if (error) return error; pag = xfs_buftarg_get_pag(btp, &cmap); error = xfs_buf_lookup(btp, &cmap, flags, &bp); if (error && error != -ENOENT) goto out_put_perag; /* cache hits always outnumber misses by at least 10:1 */ if (unlikely(!bp)) { XFS_STATS_INC(btp->bt_mount, xb_miss_locked); if (flags & XBF_INCORE) goto out_put_perag; /* xfs_buf_find_insert() consumes the perag reference. */ error = xfs_buf_find_insert(btp, pag, &cmap, map, nmaps, flags, &bp); if (error) return error; } else { XFS_STATS_INC(btp->bt_mount, xb_get_locked); if (pag) xfs_perag_put(pag); } /* * Clear b_error if this is a lookup from a caller that doesn't expect * valid data to be found in the buffer. */ if (!(flags & XBF_READ)) xfs_buf_ioerror(bp, 0); XFS_STATS_INC(btp->bt_mount, xb_get); trace_xfs_buf_get(bp, flags, _RET_IP_); *bpp = bp; return 0; out_put_perag: if (pag) xfs_perag_put(pag); return error; } int _xfs_buf_read( struct xfs_buf *bp) { ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL); bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD | XBF_DONE); bp->b_flags |= XBF_READ; xfs_buf_submit(bp); return xfs_buf_iowait(bp); } /* * Reverify a buffer found in cache without an attached ->b_ops. * * If the caller passed an ops structure and the buffer doesn't have ops * assigned, set the ops and use it to verify the contents. If verification * fails, clear XBF_DONE. We assume the buffer has no recorded errors and is * already in XBF_DONE state on entry. * * Under normal operations, every in-core buffer is verified on read I/O * completion. There are two scenarios that can lead to in-core buffers without * an assigned ->b_ops. The first is during log recovery of buffers on a V4 * filesystem, though these buffers are purged at the end of recovery. The * other is online repair, which intentionally reads with a NULL buffer ops to * run several verifiers across an in-core buffer in order to establish buffer * type. If repair can't establish that, the buffer will be left in memory * with NULL buffer ops. */ int xfs_buf_reverify( struct xfs_buf *bp, const struct xfs_buf_ops *ops) { ASSERT(bp->b_flags & XBF_DONE); ASSERT(bp->b_error == 0); if (!ops || bp->b_ops) return 0; bp->b_ops = ops; bp->b_ops->verify_read(bp); if (bp->b_error) bp->b_flags &= ~XBF_DONE; return bp->b_error; } int xfs_buf_read_map( struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, xfs_buf_flags_t flags, struct xfs_buf **bpp, const struct xfs_buf_ops *ops, xfs_failaddr_t fa) { struct xfs_buf *bp; int error; ASSERT(!(flags & (XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD))); flags |= XBF_READ; *bpp = NULL; error = xfs_buf_get_map(target, map, nmaps, flags, &bp); if (error) return error; trace_xfs_buf_read(bp, flags, _RET_IP_); if (!(bp->b_flags & XBF_DONE)) { /* Initiate the buffer read and wait. */ XFS_STATS_INC(target->bt_mount, xb_get_read); bp->b_ops = ops; error = _xfs_buf_read(bp); } else { /* Buffer already read; all we need to do is check it. */ error = xfs_buf_reverify(bp, ops); /* We do not want read in the flags */ bp->b_flags &= ~XBF_READ; ASSERT(bp->b_ops != NULL || ops == NULL); } /* * If we've had a read error, then the contents of the buffer are * invalid and should not be used. To ensure that a followup read tries * to pull the buffer from disk again, we clear the XBF_DONE flag and * mark the buffer stale. This ensures that anyone who has a current * reference to the buffer will interpret it's contents correctly and * future cache lookups will also treat it as an empty, uninitialised * buffer. */ if (error) { /* * Check against log shutdown for error reporting because * metadata writeback may require a read first and we need to * report errors in metadata writeback until the log is shut * down. High level transaction read functions already check * against mount shutdown, anyway, so we only need to be * concerned about low level IO interactions here. */ if (!xlog_is_shutdown(target->bt_mount->m_log)) xfs_buf_ioerror_alert(bp, fa); bp->b_flags &= ~XBF_DONE; xfs_buf_stale(bp); xfs_buf_relse(bp); /* bad CRC means corrupted metadata */ if (error == -EFSBADCRC) error = -EFSCORRUPTED; return error; } *bpp = bp; return 0; } /* * If we are not low on memory then do the readahead in a deadlock * safe manner. */ void xfs_buf_readahead_map( struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, const struct xfs_buf_ops *ops) { const xfs_buf_flags_t flags = XBF_READ | XBF_ASYNC | XBF_READ_AHEAD; struct xfs_buf *bp; /* * Currently we don't have a good means or justification for performing * xmbuf_map_page asynchronously, so we don't do readahead. */ if (xfs_buftarg_is_mem(target)) return; if (xfs_buf_get_map(target, map, nmaps, flags | XBF_TRYLOCK, &bp)) return; trace_xfs_buf_readahead(bp, 0, _RET_IP_); if (bp->b_flags & XBF_DONE) { xfs_buf_reverify(bp, ops); xfs_buf_relse(bp); return; } XFS_STATS_INC(target->bt_mount, xb_get_read); bp->b_ops = ops; bp->b_flags &= ~(XBF_WRITE | XBF_DONE); bp->b_flags |= flags; percpu_counter_inc(&target->bt_readahead_count); xfs_buf_submit(bp); } /* * Read an uncached buffer from disk. Allocates and returns a locked * buffer containing the disk contents or nothing. Uncached buffers always have * a cache index of XFS_BUF_DADDR_NULL so we can easily determine if the buffer * is cached or uncached during fault diagnosis. */ int xfs_buf_read_uncached( struct xfs_buftarg *target, xfs_daddr_t daddr, size_t numblks, struct xfs_buf **bpp, const struct xfs_buf_ops *ops) { struct xfs_buf *bp; int error; *bpp = NULL; error = xfs_buf_get_uncached(target, numblks, &bp); if (error) return error; /* set up the buffer for a read IO */ ASSERT(bp->b_map_count == 1); bp->b_rhash_key = XFS_BUF_DADDR_NULL; bp->b_maps[0].bm_bn = daddr; bp->b_flags |= XBF_READ; bp->b_ops = ops; xfs_buf_submit(bp); error = xfs_buf_iowait(bp); if (error) { xfs_buf_relse(bp); return error; } *bpp = bp; return 0; } int xfs_buf_get_uncached( struct xfs_buftarg *target, size_t numblks, struct xfs_buf **bpp) { int error; DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks); error = xfs_buf_alloc(target, &map, 1, 0, bpp); if (!error) trace_xfs_buf_get_uncached(*bpp, _RET_IP_); return error; } /* * Increment reference count on buffer, to hold the buffer concurrently * with another thread which may release (free) the buffer asynchronously. * Must hold the buffer already to call this function. */ void xfs_buf_hold( struct xfs_buf *bp) { trace_xfs_buf_hold(bp, _RET_IP_); lockref_get(&bp->b_lockref); } static void xfs_buf_destroy( struct xfs_buf *bp) { ASSERT(__lockref_is_dead(&bp->b_lockref)); ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); if (!xfs_buf_is_uncached(bp)) { rhashtable_remove_fast(&bp->b_target->bt_hash, &bp->b_rhash_head, xfs_buf_hash_params); if (bp->b_pag) xfs_perag_put(bp->b_pag); } xfs_buf_free(bp); } /* * Release a hold on the specified buffer. */ void xfs_buf_rele( struct xfs_buf *bp) { trace_xfs_buf_rele(bp, _RET_IP_); if (lockref_put_or_lock(&bp->b_lockref)) return; if (!--bp->b_lockref.count) { if (xfs_buf_is_uncached(bp) || !atomic_read(&bp->b_lru_ref)) goto kill; list_lru_add_obj(&bp->b_target->bt_lru, &bp->b_lru); } spin_unlock(&bp->b_lockref.lock); return; kill: lockref_mark_dead(&bp->b_lockref); list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru); spin_unlock(&bp->b_lockref.lock); xfs_buf_destroy(bp); } /* * Lock a buffer object, if it is not already locked. * * If we come across a stale, pinned, locked buffer, we know that we are * being asked to lock a buffer that has been reallocated. Because it is * pinned, we know that the log has not been pushed to disk and hence it * will still be locked. Rather than continuing to have trylock attempts * fail until someone else pushes the log, push it ourselves before * returning. This means that the xfsaild will not get stuck trying * to push on stale inode buffers. */ int xfs_buf_trylock( struct xfs_buf *bp) { int locked; locked = down_trylock(&bp->b_sema) == 0; if (locked) trace_xfs_buf_trylock(bp, _RET_IP_); else trace_xfs_buf_trylock_fail(bp, _RET_IP_); return locked; } /* * Lock a buffer object. * * If we come across a stale, pinned, locked buffer, we know that we * are being asked to lock a buffer that has been reallocated. Because * it is pinned, we know that the log has not been pushed to disk and * hence it will still be locked. Rather than sleeping until someone * else pushes the log, push it ourselves before trying to get the lock. */ void xfs_buf_lock( struct xfs_buf *bp) { trace_xfs_buf_lock(bp, _RET_IP_); if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE)) xfs_log_force(bp->b_mount, 0); down(&bp->b_sema); trace_xfs_buf_lock_done(bp, _RET_IP_); } void xfs_buf_unlock( struct xfs_buf *bp) { ASSERT(xfs_buf_islocked(bp)); up(&bp->b_sema); trace_xfs_buf_unlock(bp, _RET_IP_); } STATIC void xfs_buf_wait_unpin( struct xfs_buf *bp) { DECLARE_WAITQUEUE (wait, current); if (atomic_read(&bp->b_pin_count) == 0) return; add_wait_queue(&bp->b_waiters, &wait); for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); if (atomic_read(&bp->b_pin_count) == 0) break; io_schedule(); } remove_wait_queue(&bp->b_waiters, &wait); set_current_state(TASK_RUNNING); } static void xfs_buf_ioerror_alert_ratelimited( struct xfs_buf *bp) { static unsigned long lasttime; static struct xfs_buftarg *lasttarg; if (bp->b_target != lasttarg || time_after(jiffies, (lasttime + 5*HZ))) { lasttime = jiffies; xfs_buf_ioerror_alert(bp, __this_address); } lasttarg = bp->b_target; } /* * Account for this latest trip around the retry handler, and decide if * we've failed enough times to constitute a permanent failure. */ static bool xfs_buf_ioerror_permanent( struct xfs_buf *bp, struct xfs_error_cfg *cfg) { struct xfs_mount *mp = bp->b_mount; if (cfg->max_retries != XFS_ERR_RETRY_FOREVER && ++bp->b_retries > cfg->max_retries) return true; if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER && time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time)) return true; /* At unmount we may treat errors differently */ if (xfs_is_unmounting(mp) && mp->m_fail_unmount) return true; return false; } /* * On a sync write or shutdown we just want to stale the buffer and let the * caller handle the error in bp->b_error appropriately. * * If the write was asynchronous then no one will be looking for the error. If * this is the first failure of this type, clear the error state and write the * buffer out again. This means we always retry an async write failure at least * once, but we also need to set the buffer up to behave correctly now for * repeated failures. * * If we get repeated async write failures, then we take action according to the * error configuration we have been set up to use. * * Returns true if this function took care of error handling and the caller must * not touch the buffer again. Return false if the caller should proceed with * normal I/O completion handling. */ static bool xfs_buf_ioend_handle_error( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_mount; struct xfs_error_cfg *cfg; struct xfs_log_item *lip; /* * If we've already shutdown the journal because of I/O errors, there's * no point in giving this a retry. */ if (xlog_is_shutdown(mp->m_log)) goto out_stale; xfs_buf_ioerror_alert_ratelimited(bp); /* * We're not going to bother about retrying this during recovery. * One strike! */ if (bp->b_flags & _XBF_LOGRECOVERY) { xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); return false; } /* * Synchronous writes will have callers process the error. */ if (!(bp->b_flags & XBF_ASYNC)) goto out_stale; trace_xfs_buf_iodone_async(bp, _RET_IP_); cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error); if (bp->b_last_error != bp->b_error || !(bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL))) { bp->b_last_error = bp->b_error; if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER && !bp->b_first_retry_time) bp->b_first_retry_time = jiffies; goto resubmit; } /* * Permanent error - we need to trigger a shutdown if we haven't already * to indicate that inconsistency will result from this action. */ if (xfs_buf_ioerror_permanent(bp, cfg)) { xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); goto out_stale; } /* Still considered a transient error. Caller will schedule retries. */ list_for_each_entry(lip, &bp->b_li_list, li_bio_list) { set_bit(XFS_LI_FAILED, &lip->li_flags); clear_bit(XFS_LI_FLUSHING, &lip->li_flags); } xfs_buf_ioerror(bp, 0); xfs_buf_relse(bp); return true; resubmit: xfs_buf_ioerror(bp, 0); bp->b_flags |= (XBF_DONE | XBF_WRITE_FAIL); reinit_completion(&bp->b_iowait); xfs_buf_submit(bp); return true; out_stale: xfs_buf_stale(bp); bp->b_flags |= XBF_DONE; bp->b_flags &= ~XBF_WRITE; trace_xfs_buf_error_relse(bp, _RET_IP_); return false; } /* returns false if the caller needs to resubmit the I/O, else true */ static bool __xfs_buf_ioend( struct xfs_buf *bp) { trace_xfs_buf_iodone(bp, _RET_IP_); if (bp->b_flags & XBF_READ) { if (!bp->b_error && is_vmalloc_addr(bp->b_addr)) invalidate_kernel_vmap_range(bp->b_addr, roundup(BBTOB(bp->b_length), PAGE_SIZE)); if (!bp->b_error && bp->b_ops) bp->b_ops->verify_read(bp); if (!bp->b_error) bp->b_flags |= XBF_DONE; if (bp->b_flags & XBF_READ_AHEAD) percpu_counter_dec(&bp->b_target->bt_readahead_count); } else { if (!bp->b_error) { bp->b_flags &= ~XBF_WRITE_FAIL; bp->b_flags |= XBF_DONE; } if (unlikely(bp->b_error) && xfs_buf_ioend_handle_error(bp)) return false; /* clear the retry state */ bp->b_last_error = 0; bp->b_retries = 0; bp->b_first_retry_time = 0; /* * Note that for things like remote attribute buffers, there may * not be a buffer log item here, so processing the buffer log * item must remain optional. */ if (bp->b_log_item) xfs_buf_item_done(bp); if (bp->b_iodone) bp->b_iodone(bp); } bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD | _XBF_LOGRECOVERY); return true; } static void xfs_buf_ioend( struct xfs_buf *bp) { if (!__xfs_buf_ioend(bp)) return; if (bp->b_flags & XBF_ASYNC) xfs_buf_relse(bp); else complete(&bp->b_iowait); } static void xfs_buf_ioend_work( struct work_struct *work) { struct xfs_buf *bp = container_of(work, struct xfs_buf, b_ioend_work); if (__xfs_buf_ioend(bp)) xfs_buf_relse(bp); } void __xfs_buf_ioerror( struct xfs_buf *bp, int error, xfs_failaddr_t failaddr) { ASSERT(error <= 0 && error >= -1000); bp->b_error = error; trace_xfs_buf_ioerror(bp, error, failaddr); } void xfs_buf_ioerror_alert( struct xfs_buf *bp, xfs_failaddr_t func) { xfs_buf_alert_ratelimited(bp, "XFS: metadata IO error", "metadata I/O error in \"%pS\" at daddr 0x%llx len %d error %d", func, (uint64_t)xfs_buf_daddr(bp), bp->b_length, -bp->b_error); } /* * To simulate an I/O failure, the buffer must be locked and held with at least * two references. * * The buf item reference is dropped via ioend processing. The second reference * is owned by the caller and is dropped on I/O completion if the buffer is * XBF_ASYNC. */ void xfs_buf_ioend_fail( struct xfs_buf *bp) { bp->b_flags &= ~XBF_DONE; xfs_buf_stale(bp); xfs_buf_ioerror(bp, -EIO); xfs_buf_ioend(bp); } int xfs_bwrite( struct xfs_buf *bp) { int error; ASSERT(xfs_buf_islocked(bp)); bp->b_flags |= XBF_WRITE; bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q | XBF_DONE); xfs_buf_submit(bp); error = xfs_buf_iowait(bp); if (error) xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR); return error; } static void xfs_buf_bio_end_io( struct bio *bio) { struct xfs_buf *bp = bio->bi_private; if (bio->bi_status) xfs_buf_ioerror(bp, blk_status_to_errno(bio->bi_status)); else if ((bp->b_flags & XBF_WRITE) && (bp->b_flags & XBF_ASYNC) && XFS_TEST_ERROR(bp->b_mount, XFS_ERRTAG_BUF_IOERROR)) xfs_buf_ioerror(bp, -EIO); if (bp->b_flags & XBF_ASYNC) { INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work); queue_work(bp->b_mount->m_buf_workqueue, &bp->b_ioend_work); } else { complete(&bp->b_iowait); } bio_put(bio); } static inline blk_opf_t xfs_buf_bio_op( struct xfs_buf *bp) { blk_opf_t op; if (bp->b_flags & XBF_WRITE) { op = REQ_OP_WRITE; } else { op = REQ_OP_READ; if (bp->b_flags & XBF_READ_AHEAD) op |= REQ_RAHEAD; } return op | REQ_META; } static void xfs_buf_submit_bio( struct xfs_buf *bp) { unsigned int len = BBTOB(bp->b_length); unsigned int nr_vecs = bio_add_max_vecs(bp->b_addr, len); unsigned int map = 0; struct blk_plug plug; struct bio *bio; bio = bio_alloc(bp->b_target->bt_bdev, nr_vecs, xfs_buf_bio_op(bp), GFP_NOIO); if (is_vmalloc_addr(bp->b_addr)) bio_add_vmalloc(bio, bp->b_addr, len); else bio_add_virt_nofail(bio, bp->b_addr, len); bio->bi_private = bp; bio->bi_end_io = xfs_buf_bio_end_io; /* * If there is more than one map segment, split out a new bio for each * map except of the last one. The last map is handled by the * remainder of the original bio outside the loop. */ blk_start_plug(&plug); for (map = 0; map < bp->b_map_count - 1; map++) { struct bio *split; split = bio_split(bio, bp->b_maps[map].bm_len, GFP_NOFS, &fs_bio_set); split->bi_iter.bi_sector = bp->b_maps[map].bm_bn; bio_chain(split, bio); submit_bio(split); } bio->bi_iter.bi_sector = bp->b_maps[map].bm_bn; submit_bio(bio); blk_finish_plug(&plug); } /* * Wait for I/O completion of a sync buffer and return the I/O error code. */ static int xfs_buf_iowait( struct xfs_buf *bp) { ASSERT(!(bp->b_flags & XBF_ASYNC)); do { trace_xfs_buf_iowait(bp, _RET_IP_); wait_for_completion(&bp->b_iowait); trace_xfs_buf_iowait_done(bp, _RET_IP_); } while (!__xfs_buf_ioend(bp)); return bp->b_error; } /* * Run the write verifier callback function if it exists. If this fails, mark * the buffer with an error and do not dispatch the I/O. */ static bool xfs_buf_verify_write( struct xfs_buf *bp) { if (bp->b_ops) { bp->b_ops->verify_write(bp); if (bp->b_error) return false; } else if (bp->b_rhash_key != XFS_BUF_DADDR_NULL) { /* * Non-crc filesystems don't attach verifiers during log * recovery, so don't warn for such filesystems. */ if (xfs_has_crc(bp->b_mount)) { xfs_warn(bp->b_mount, "%s: no buf ops on daddr 0x%llx len %d", __func__, xfs_buf_daddr(bp), bp->b_length); xfs_hex_dump(bp->b_addr, XFS_CORRUPTION_DUMP_LEN); dump_stack(); } } return true; } /* * Buffer I/O submission path, read or write. Asynchronous submission transfers * the buffer lock ownership and the current reference to the IO. It is not * safe to reference the buffer after a call to this function unless the caller * holds an additional reference itself. */ static void xfs_buf_submit( struct xfs_buf *bp) { trace_xfs_buf_submit(bp, _RET_IP_); ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); /* * On log shutdown we stale and complete the buffer immediately. We can * be called to read the superblock before the log has been set up, so * be careful checking the log state. * * Checking the mount shutdown state here can result in the log tail * moving inappropriately on disk as the log may not yet be shut down. * i.e. failing this buffer on mount shutdown can remove it from the AIL * and move the tail of the log forwards without having written this * buffer to disk. This corrupts the log tail state in memory, and * because the log may not be shut down yet, it can then be propagated * to disk before the log is shutdown. Hence we check log shutdown * state here rather than mount state to avoid corrupting the log tail * on shutdown. */ if (bp->b_mount->m_log && xlog_is_shutdown(bp->b_mount->m_log)) { xfs_buf_ioend_fail(bp); return; } if (bp->b_flags & XBF_WRITE) xfs_buf_wait_unpin(bp); /* * Make sure we capture only current IO errors rather than stale errors * left over from previous use of the buffer (e.g. failed readahead). */ bp->b_error = 0; if ((bp->b_flags & XBF_WRITE) && !xfs_buf_verify_write(bp)) { xfs_force_shutdown(bp->b_mount, SHUTDOWN_CORRUPT_INCORE); xfs_buf_ioend(bp); return; } /* In-memory targets are directly mapped, no I/O required. */ if (xfs_buftarg_is_mem(bp->b_target)) { xfs_buf_ioend(bp); return; } xfs_buf_submit_bio(bp); } /* * Log a message about and stale a buffer that a caller has decided is corrupt. * * This function should be called for the kinds of metadata corruption that * cannot be detect from a verifier, such as incorrect inter-block relationship * data. Do /not/ call this function from a verifier function. * * The buffer must be XBF_DONE prior to the call. Afterwards, the buffer will * be marked stale, but b_error will not be set. The caller is responsible for * releasing the buffer or fixing it. */ void __xfs_buf_mark_corrupt( struct xfs_buf *bp, xfs_failaddr_t fa) { ASSERT(bp->b_flags & XBF_DONE); xfs_buf_corruption_error(bp, fa); xfs_buf_stale(bp); } /* * Handling of buffer targets (buftargs). */ /* * Wait for any bufs with callbacks that have been submitted but have not yet * returned. These buffers will have an elevated hold count, so wait on those * while freeing all the buffers only held by the LRU. */ static enum lru_status xfs_buftarg_drain_rele( struct list_head *item, struct list_lru_one *lru, void *arg) { struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru); struct list_head *dispose = arg; if (!spin_trylock(&bp->b_lockref.lock)) return LRU_SKIP; if (bp->b_lockref.count > 0) { /* need to wait, so skip it this pass */ spin_unlock(&bp->b_lockref.lock); trace_xfs_buf_drain_buftarg(bp, _RET_IP_); return LRU_SKIP; } lockref_mark_dead(&bp->b_lockref); list_lru_isolate_move(lru, item, dispose); spin_unlock(&bp->b_lockref.lock); return LRU_REMOVED; } /* * Wait for outstanding I/O on the buftarg to complete. */ void xfs_buftarg_wait( struct xfs_buftarg *btp) { /* * First wait for all in-flight readahead buffers to be released. This is * critical as new buffers do not make the LRU until they are released. * * Next, flush the buffer workqueue to ensure all completion processing * has finished. Just waiting on buffer locks is not sufficient for * async IO as the reference count held over IO is not released until * after the buffer lock is dropped. Hence we need to ensure here that * all reference counts have been dropped before we start walking the * LRU list. */ while (percpu_counter_sum(&btp->bt_readahead_count)) delay(100); flush_workqueue(btp->bt_mount->m_buf_workqueue); } void xfs_buftarg_drain( struct xfs_buftarg *btp) { LIST_HEAD(dispose); int loop = 0; bool write_fail = false; xfs_buftarg_wait(btp); /* loop until there is nothing left on the lru list. */ while (list_lru_count(&btp->bt_lru)) { list_lru_walk(&btp->bt_lru, xfs_buftarg_drain_rele, &dispose, LONG_MAX); while (!list_empty(&dispose)) { struct xfs_buf *bp; bp = list_first_entry(&dispose, struct xfs_buf, b_lru); list_del_init(&bp->b_lru); if (bp->b_flags & XBF_WRITE_FAIL) { write_fail = true; xfs_buf_alert_ratelimited(bp, "XFS: Corruption Alert", "Corruption Alert: Buffer at daddr 0x%llx had permanent write failures!", (long long)xfs_buf_daddr(bp)); } xfs_buf_destroy(bp); } if (loop++ != 0) delay(100); } /* * If one or more failed buffers were freed, that means dirty metadata * was thrown away. This should only ever happen after I/O completion * handling has elevated I/O error(s) to permanent failures and shuts * down the journal. */ if (write_fail) { ASSERT(xlog_is_shutdown(btp->bt_mount->m_log)); xfs_alert(btp->bt_mount, "Please run xfs_repair to determine the extent of the problem."); } } static enum lru_status xfs_buftarg_isolate( struct list_head *item, struct list_lru_one *lru, void *arg) { struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru); struct list_head *dispose = arg; /* * We are inverting the lru lock vs bp->b_lockref.lock order here, so * use a trylock. If we fail to get the lock, just skip the buffer. */ if (!spin_trylock(&bp->b_lockref.lock)) return LRU_SKIP; /* * If the buffer is in use, remove it from the LRU for now. We can't * free it while someone is using it, and we should also not count * eviction passed for it, just as if it hadn't been added to the LRU * yet. */ if (bp->b_lockref.count > 0) { list_lru_isolate(lru, &bp->b_lru); spin_unlock(&bp->b_lockref.lock); return LRU_REMOVED; } /* * Decrement the b_lru_ref count unless the value is already * zero. If the value is already zero, we need to reclaim the * buffer, otherwise it gets another trip through the LRU. */ if (atomic_add_unless(&bp->b_lru_ref, -1, 0)) { spin_unlock(&bp->b_lockref.lock); return LRU_ROTATE; } lockref_mark_dead(&bp->b_lockref); list_lru_isolate_move(lru, item, dispose); spin_unlock(&bp->b_lockref.lock); return LRU_REMOVED; } static unsigned long xfs_buftarg_shrink_scan( struct shrinker *shrink, struct shrink_control *sc) { struct xfs_buftarg *btp = shrink->private_data; LIST_HEAD(dispose); unsigned long freed; freed = list_lru_shrink_walk(&btp->bt_lru, sc, xfs_buftarg_isolate, &dispose); while (!list_empty(&dispose)) { struct xfs_buf *bp; bp = list_first_entry(&dispose, struct xfs_buf, b_lru); list_del_init(&bp->b_lru); xfs_buf_destroy(bp); } return freed; } static unsigned long xfs_buftarg_shrink_count( struct shrinker *shrink, struct shrink_control *sc) { struct xfs_buftarg *btp = shrink->private_data; return list_lru_shrink_count(&btp->bt_lru, sc); } void xfs_destroy_buftarg( struct xfs_buftarg *btp) { shrinker_free(btp->bt_shrinker); ASSERT(percpu_counter_sum(&btp->bt_readahead_count) == 0); percpu_counter_destroy(&btp->bt_readahead_count); list_lru_destroy(&btp->bt_lru); rhashtable_destroy(&btp->bt_hash); } void xfs_free_buftarg( struct xfs_buftarg *btp) { xfs_destroy_buftarg(btp); fs_put_dax(btp->bt_daxdev, btp->bt_mount); /* the main block device is closed by kill_block_super */ if (btp->bt_bdev != btp->bt_mount->m_super->s_bdev) bdev_fput(btp->bt_file); kfree(btp); } /* * Configure this buffer target for hardware-assisted atomic writes if the * underlying block device supports is congruent with the filesystem geometry. */ static inline void xfs_configure_buftarg_atomic_writes( struct xfs_buftarg *btp) { struct xfs_mount *mp = btp->bt_mount; unsigned int min_bytes, max_bytes; min_bytes = bdev_atomic_write_unit_min_bytes(btp->bt_bdev); max_bytes = bdev_atomic_write_unit_max_bytes(btp->bt_bdev); /* * Ignore atomic write geometry that is nonsense or doesn't even cover * a single fsblock. */ if (min_bytes > max_bytes || min_bytes > mp->m_sb.sb_blocksize || max_bytes < mp->m_sb.sb_blocksize) { min_bytes = 0; max_bytes = 0; } btp->bt_awu_min = min_bytes; btp->bt_awu_max = max_bytes; } /* Configure a buffer target that abstracts a block device. */ int xfs_configure_buftarg( struct xfs_buftarg *btp, unsigned int sectorsize, xfs_rfsblock_t nr_blocks) { struct xfs_mount *mp = btp->bt_mount; if (btp->bt_bdev) { int error; error = bdev_validate_blocksize(btp->bt_bdev, sectorsize); if (error) { xfs_warn(mp, "Cannot use blocksize %u on device %pg, err %d", sectorsize, btp->bt_bdev, error); return -EINVAL; } if (bdev_can_atomic_write(btp->bt_bdev)) xfs_configure_buftarg_atomic_writes(btp); } btp->bt_meta_sectorsize = sectorsize; btp->bt_meta_sectormask = sectorsize - 1; /* m_blkbb_log is not set up yet */ btp->bt_nr_sectors = nr_blocks << (mp->m_sb.sb_blocklog - BBSHIFT); return 0; } int xfs_init_buftarg( struct xfs_buftarg *btp, size_t logical_sectorsize, const char *descr) { /* The maximum size of the buftarg is only known once the sb is read. */ btp->bt_nr_sectors = XFS_BUF_DADDR_MAX; /* Set up device logical sector size mask */ btp->bt_logical_sectorsize = logical_sectorsize; btp->bt_logical_sectormask = logical_sectorsize - 1; /* * Buffer IO error rate limiting. Limit it to no more than 10 messages * per 30 seconds so as to not spam logs too much on repeated errors. */ ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ, DEFAULT_RATELIMIT_BURST); if (rhashtable_init(&btp->bt_hash, &xfs_buf_hash_params)) return -ENOMEM; if (list_lru_init(&btp->bt_lru)) goto out_destroy_hash; if (percpu_counter_init(&btp->bt_readahead_count, 0, GFP_KERNEL)) goto out_destroy_lru; btp->bt_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE, "xfs-buf:%s", descr); if (!btp->bt_shrinker) goto out_destroy_io_count; btp->bt_shrinker->count_objects = xfs_buftarg_shrink_count; btp->bt_shrinker->scan_objects = xfs_buftarg_shrink_scan; btp->bt_shrinker->private_data = btp; shrinker_register(btp->bt_shrinker); return 0; out_destroy_io_count: percpu_counter_destroy(&btp->bt_readahead_count); out_destroy_lru: list_lru_destroy(&btp->bt_lru); out_destroy_hash: rhashtable_destroy(&btp->bt_hash); return -ENOMEM; } struct xfs_buftarg * xfs_alloc_buftarg( struct xfs_mount *mp, struct file *bdev_file) { struct xfs_buftarg *btp; const struct dax_holder_operations *ops = NULL; int error; #if defined(CONFIG_FS_DAX) && defined(CONFIG_MEMORY_FAILURE) ops = &xfs_dax_holder_operations; #endif btp = kzalloc_obj(*btp, GFP_KERNEL | __GFP_NOFAIL); btp->bt_mount = mp; btp->bt_file = bdev_file; btp->bt_bdev = file_bdev(bdev_file); btp->bt_dev = btp->bt_bdev->bd_dev; btp->bt_daxdev = fs_dax_get_by_bdev(btp->bt_bdev, &btp->bt_dax_part_off, mp, ops); /* * Flush and invalidate all devices' pagecaches before reading any * metadata because XFS doesn't use the bdev pagecache. */ error = sync_blockdev(btp->bt_bdev); if (error) goto error_free; /* * When allocating the buftargs we have not yet read the super block and * thus don't know the file system sector size yet. */ btp->bt_meta_sectorsize = bdev_logical_block_size(btp->bt_bdev); btp->bt_meta_sectormask = btp->bt_meta_sectorsize - 1; error = xfs_init_buftarg(btp, btp->bt_meta_sectorsize, mp->m_super->s_id); if (error) goto error_free; return btp; error_free: fs_put_dax(btp->bt_daxdev, mp); kfree(btp); return ERR_PTR(error); } static inline void xfs_buf_list_del( struct xfs_buf *bp) { list_del_init(&bp->b_list); wake_up_var(&bp->b_list); } /* * Cancel a delayed write list. * * Remove each buffer from the list, clear the delwri queue flag and drop the * associated buffer reference. */ void xfs_buf_delwri_cancel( struct list_head *list) { struct xfs_buf *bp; while (!list_empty(list)) { bp = list_first_entry(list, struct xfs_buf, b_list); xfs_buf_lock(bp); bp->b_flags &= ~_XBF_DELWRI_Q; xfs_buf_list_del(bp); xfs_buf_relse(bp); } } /* * Add a buffer to the delayed write list. * * This queues a buffer for writeout if it hasn't already been. Note that * neither this routine nor the buffer list submission functions perform * any internal synchronization. It is expected that the lists are thread-local * to the callers. * * Returns true if we queued up the buffer, or false if it already had * been on the buffer list. */ bool xfs_buf_delwri_queue( struct xfs_buf *bp, struct list_head *list) { ASSERT(xfs_buf_islocked(bp)); ASSERT(!(bp->b_flags & XBF_READ)); /* * If the buffer is already marked delwri it already is queued up * by someone else for imediate writeout. Just ignore it in that * case. */ if (bp->b_flags & _XBF_DELWRI_Q) { trace_xfs_buf_delwri_queued(bp, _RET_IP_); return false; } trace_xfs_buf_delwri_queue(bp, _RET_IP_); /* * If a buffer gets written out synchronously or marked stale while it * is on a delwri list we lazily remove it. To do this, the other party * clears the _XBF_DELWRI_Q flag but otherwise leaves the buffer alone. * It remains referenced and on the list. In a rare corner case it * might get readded to a delwri list after the synchronous writeout, in * which case we need just need to re-add the flag here. */ bp->b_flags |= _XBF_DELWRI_Q; if (list_empty(&bp->b_list)) { xfs_buf_hold(bp); list_add_tail(&bp->b_list, list); } return true; } /* * Queue a buffer to this delwri list as part of a data integrity operation. * If the buffer is on any other delwri list, we'll wait for that to clear * so that the caller can submit the buffer for IO and wait for the result. * Callers must ensure the buffer is not already on the list. */ void xfs_buf_delwri_queue_here( struct xfs_buf *bp, struct list_head *buffer_list) { /* * We need this buffer to end up on the /caller's/ delwri list, not any * old list. This can happen if the buffer is marked stale (which * clears DELWRI_Q) after the AIL queues the buffer to its list but * before the AIL has a chance to submit the list. */ while (!list_empty(&bp->b_list)) { xfs_buf_unlock(bp); wait_var_event(&bp->b_list, list_empty(&bp->b_list)); xfs_buf_lock(bp); } ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); xfs_buf_delwri_queue(bp, buffer_list); } /* * Compare function is more complex than it needs to be because * the return value is only 32 bits and we are doing comparisons * on 64 bit values */ static int xfs_buf_cmp( void *priv, const struct list_head *a, const struct list_head *b) { struct xfs_buf *ap = container_of(a, struct xfs_buf, b_list); struct xfs_buf *bp = container_of(b, struct xfs_buf, b_list); xfs_daddr_t diff; diff = ap->b_maps[0].bm_bn - bp->b_maps[0].bm_bn; if (diff < 0) return -1; if (diff > 0) return 1; return 0; } static bool xfs_buf_delwri_submit_prep( struct xfs_buf *bp) { /* * Someone else might have written the buffer synchronously or marked it * stale in the meantime. In that case only the _XBF_DELWRI_Q flag got * cleared, and we have to drop the reference and remove it from the * list here. */ if (!(bp->b_flags & _XBF_DELWRI_Q)) { xfs_buf_list_del(bp); xfs_buf_relse(bp); return false; } trace_xfs_buf_delwri_split(bp, _RET_IP_); bp->b_flags &= ~_XBF_DELWRI_Q; bp->b_flags |= XBF_WRITE; return true; } /* * Write out a buffer list asynchronously. * * This will take the @buffer_list, write all non-locked and non-pinned buffers * out and not wait for I/O completion on any of the buffers. This interface * is only safely useable for callers that can track I/O completion by higher * level means, e.g. AIL pushing as the @buffer_list is consumed in this * function. * * Note: this function will skip buffers it would block on, and in doing so * leaves them on @buffer_list so they can be retried on a later pass. As such, * it is up to the caller to ensure that the buffer list is fully submitted or * cancelled appropriately when they are finished with the list. Failure to * cancel or resubmit the list until it is empty will result in leaked buffers * at unmount time. */ int xfs_buf_delwri_submit_nowait( struct list_head *buffer_list) { struct xfs_buf *bp, *n; int pinned = 0; struct blk_plug plug; list_sort(NULL, buffer_list, xfs_buf_cmp); blk_start_plug(&plug); list_for_each_entry_safe(bp, n, buffer_list, b_list) { if (!xfs_buf_trylock(bp)) continue; if (xfs_buf_ispinned(bp)) { xfs_buf_unlock(bp); pinned++; continue; } if (!xfs_buf_delwri_submit_prep(bp)) continue; bp->b_flags |= XBF_ASYNC; xfs_buf_list_del(bp); xfs_buf_submit(bp); } blk_finish_plug(&plug); return pinned; } /* * Write out a buffer list synchronously. * * This will take the @buffer_list, write all buffers out and wait for I/O * completion on all of the buffers. @buffer_list is consumed by the function, * so callers must have some other way of tracking buffers if they require such * functionality. */ int xfs_buf_delwri_submit( struct list_head *buffer_list) { LIST_HEAD (wait_list); int error = 0, error2; struct xfs_buf *bp, *n; struct blk_plug plug; list_sort(NULL, buffer_list, xfs_buf_cmp); blk_start_plug(&plug); list_for_each_entry_safe(bp, n, buffer_list, b_list) { xfs_buf_lock(bp); if (!xfs_buf_delwri_submit_prep(bp)) continue; bp->b_flags &= ~XBF_ASYNC; list_move_tail(&bp->b_list, &wait_list); xfs_buf_submit(bp); } blk_finish_plug(&plug); /* Wait for IO to complete. */ while (!list_empty(&wait_list)) { bp = list_first_entry(&wait_list, struct xfs_buf, b_list); xfs_buf_list_del(bp); /* * Wait on the locked buffer, check for errors and unlock and * release the delwri queue reference. */ error2 = xfs_buf_iowait(bp); xfs_buf_relse(bp); if (!error) error = error2; } return error; } void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref) { /* * Set the lru reference count to 0 based on the error injection tag. * This allows userspace to disrupt buffer caching for debug/testing * purposes. */ if (XFS_TEST_ERROR(bp->b_mount, XFS_ERRTAG_BUF_LRU_REF)) lru_ref = 0; atomic_set(&bp->b_lru_ref, lru_ref); } /* * Verify an on-disk magic value against the magic value specified in the * verifier structure. The verifier magic is in disk byte order so the caller is * expected to pass the value directly from disk. */ bool xfs_verify_magic( struct xfs_buf *bp, __be32 dmagic) { struct xfs_mount *mp = bp->b_mount; int idx; idx = xfs_has_crc(mp); if (WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx])) return false; return dmagic == bp->b_ops->magic[idx]; } /* * Verify an on-disk magic value against the magic value specified in the * verifier structure. The verifier magic is in disk byte order so the caller is * expected to pass the value directly from disk. */ bool xfs_verify_magic16( struct xfs_buf *bp, __be16 dmagic) { struct xfs_mount *mp = bp->b_mount; int idx; idx = xfs_has_crc(mp); if (WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx])) return false; return dmagic == bp->b_ops->magic16[idx]; }
12 11 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 2 2 10 8 10 7 8 10 9 10 9 3 1 5 6 4 4 1 4 5 8 9 11 6 8 4 1 1 1 7 3 4 2 4 1 1 4 13 2 2 13 13 13 6 11 2 2 1 6 2 4 6 8 2 6 2 6 2 2 2 2 2 2 1 1 3 3 3 3 2 2 1 2 2 1 1 1 1 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 1 1 1 1 8 8 8 7 3 3 2 3 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 // SPDX-License-Identifier: GPL-2.0-or-later /* * ALSA sequencer Client Manager * Copyright (c) 1998-2001 by Frank van de Pol <fvdpol@coil.demon.nl> * Jaroslav Kysela <perex@perex.cz> * Takashi Iwai <tiwai@suse.de> */ #include <linux/init.h> #include <linux/export.h> #include <linux/slab.h> #include <sound/core.h> #include <sound/minors.h> #include <linux/kmod.h> #include <sound/seq_kernel.h> #include <sound/ump.h> #include "seq_clientmgr.h" #include "seq_memory.h" #include "seq_queue.h" #include "seq_timer.h" #include "seq_info.h" #include "seq_system.h" #include "seq_ump_convert.h" #include <sound/seq_device.h> #ifdef CONFIG_COMPAT #include <linux/compat.h> #endif /* Client Manager * this module handles the connections of userland and kernel clients * */ /* * There are four ranges of client numbers (last two shared): * 0..15: global clients * 16..127: statically allocated client numbers for cards 0..27 * 128..191: dynamically allocated client numbers for cards 28..31 * 128..191: dynamically allocated client numbers for applications */ /* number of kernel non-card clients */ #define SNDRV_SEQ_GLOBAL_CLIENTS 16 /* clients per cards, for static clients */ #define SNDRV_SEQ_CLIENTS_PER_CARD 4 /* dynamically allocated client numbers (both kernel drivers and user space) */ #define SNDRV_SEQ_DYNAMIC_CLIENTS_BEGIN 128 #define SNDRV_SEQ_LFLG_INPUT 0x0001 #define SNDRV_SEQ_LFLG_OUTPUT 0x0002 #define SNDRV_SEQ_LFLG_OPEN (SNDRV_SEQ_LFLG_INPUT|SNDRV_SEQ_LFLG_OUTPUT) static DEFINE_SPINLOCK(clients_lock); static DEFINE_MUTEX(register_mutex); /* * client table */ static char clienttablock[SNDRV_SEQ_MAX_CLIENTS]; static struct snd_seq_client *clienttab[SNDRV_SEQ_MAX_CLIENTS]; static struct snd_seq_usage client_usage; /* * prototypes */ static int bounce_error_event(struct snd_seq_client *client, struct snd_seq_event *event, int err, int atomic, int hop); static int snd_seq_deliver_single_event(struct snd_seq_client *client, struct snd_seq_event *event, int atomic, int hop); #if IS_ENABLED(CONFIG_SND_SEQ_UMP) static void free_ump_info(struct snd_seq_client *client); #endif /* */ static inline unsigned short snd_seq_file_flags(struct file *file) { switch (file->f_mode & (FMODE_READ | FMODE_WRITE)) { case FMODE_WRITE: return SNDRV_SEQ_LFLG_OUTPUT; case FMODE_READ: return SNDRV_SEQ_LFLG_INPUT; default: return SNDRV_SEQ_LFLG_OPEN; } } static inline int snd_seq_write_pool_allocated(struct snd_seq_client *client) { return snd_seq_total_cells(client->pool) > 0; } /* return pointer to client structure for specified id */ static struct snd_seq_client *clientptr(int clientid) { if (clientid < 0 || clientid >= SNDRV_SEQ_MAX_CLIENTS) { pr_debug("ALSA: seq: oops. Trying to get pointer to client %d\n", clientid); return NULL; } return clienttab[clientid]; } static struct snd_seq_client *client_use_ptr(int clientid, bool load_module) { struct snd_seq_client *client; if (clientid < 0 || clientid >= SNDRV_SEQ_MAX_CLIENTS) { pr_debug("ALSA: seq: oops. Trying to get pointer to client %d\n", clientid); return NULL; } scoped_guard(spinlock_irqsave, &clients_lock) { client = clientptr(clientid); if (client) return snd_seq_client_ref(client); if (clienttablock[clientid]) return NULL; } #ifdef CONFIG_MODULES if (load_module) { static DECLARE_BITMAP(client_requested, SNDRV_SEQ_GLOBAL_CLIENTS); static DECLARE_BITMAP(card_requested, SNDRV_CARDS); if (clientid < SNDRV_SEQ_GLOBAL_CLIENTS) { int idx; if (!test_and_set_bit(clientid, client_requested)) { for (idx = 0; idx < 15; idx++) { if (seq_client_load[idx] < 0) break; if (seq_client_load[idx] == clientid) { request_module("snd-seq-client-%i", clientid); break; } } } } else if (clientid < SNDRV_SEQ_DYNAMIC_CLIENTS_BEGIN) { int card = (clientid - SNDRV_SEQ_GLOBAL_CLIENTS) / SNDRV_SEQ_CLIENTS_PER_CARD; if (card < snd_ecards_limit) { if (!test_and_set_bit(card, card_requested)) snd_request_card(card); snd_seq_device_load_drivers(); } } scoped_guard(spinlock_irqsave, &clients_lock) { client = clientptr(clientid); if (client) return snd_seq_client_ref(client); } } #endif return NULL; } /* get snd_seq_client object for the given id quickly */ struct snd_seq_client *snd_seq_client_use_ptr(int clientid) { return client_use_ptr(clientid, false); } /* get snd_seq_client object for the given id; * if not found, retry after loading the modules */ static struct snd_seq_client *client_load_and_use_ptr(int clientid) { return client_use_ptr(clientid, IS_ENABLED(CONFIG_MODULES)); } static void usage_alloc(struct snd_seq_usage *res, int num) { res->cur += num; if (res->cur > res->peak) res->peak = res->cur; } static void usage_free(struct snd_seq_usage *res, int num) { res->cur -= num; } /* initialise data structures */ int __init client_init_data(void) { /* zap out the client table */ memset(&clienttablock, 0, sizeof(clienttablock)); memset(&clienttab, 0, sizeof(clienttab)); return 0; } static struct snd_seq_client *seq_create_client1(int client_index, int poolsize) { int c; struct snd_seq_client *client; /* init client data */ client = kzalloc(sizeof(*client), GFP_KERNEL); if (client == NULL) return NULL; client->pool = snd_seq_pool_new(poolsize); if (client->pool == NULL) { kfree(client); return NULL; } client->type = NO_CLIENT; snd_use_lock_init(&client->use_lock); rwlock_init(&client->ports_lock); mutex_init(&client->ports_mutex); INIT_LIST_HEAD(&client->ports_list_head); mutex_init(&client->ioctl_mutex); client->ump_endpoint_port = -1; /* find free slot in the client table */ scoped_guard(spinlock_irq, &clients_lock) { if (client_index < 0) { for (c = SNDRV_SEQ_DYNAMIC_CLIENTS_BEGIN; c < SNDRV_SEQ_MAX_CLIENTS; c++) { if (clienttab[c] || clienttablock[c]) continue; clienttab[client->number = c] = client; return client; } } else { if (clienttab[client_index] == NULL && !clienttablock[client_index]) { clienttab[client->number = client_index] = client; return client; } } } snd_seq_pool_delete(&client->pool); kfree(client); return NULL; /* no free slot found or busy, return failure code */ } static int seq_free_client1(struct snd_seq_client *client) { if (!client) return 0; scoped_guard(spinlock_irq, &clients_lock) { clienttablock[client->number] = 1; clienttab[client->number] = NULL; } snd_seq_delete_all_ports(client); snd_seq_queue_client_leave(client->number); snd_use_lock_sync(&client->use_lock); if (client->pool) snd_seq_pool_delete(&client->pool); scoped_guard(spinlock_irq, &clients_lock) { clienttablock[client->number] = 0; } return 0; } static void seq_free_client(struct snd_seq_client * client) { scoped_guard(mutex, &register_mutex) { switch (client->type) { case NO_CLIENT: pr_warn("ALSA: seq: Trying to free unused client %d\n", client->number); break; case USER_CLIENT: case KERNEL_CLIENT: seq_free_client1(client); usage_free(&client_usage, 1); break; default: pr_err("ALSA: seq: Trying to free client %d with undefined type = %d\n", client->number, client->type); } } snd_seq_system_client_ev_client_exit(client->number); } /* -------------------------------------------------------- */ /* create a user client */ static int snd_seq_open(struct inode *inode, struct file *file) { int c, mode; /* client id */ struct snd_seq_client *client; struct snd_seq_user_client *user; int err; err = stream_open(inode, file); if (err < 0) return err; scoped_guard(mutex, &register_mutex) { client = seq_create_client1(-1, SNDRV_SEQ_DEFAULT_EVENTS); if (!client) return -ENOMEM; /* failure code */ mode = snd_seq_file_flags(file); if (mode & SNDRV_SEQ_LFLG_INPUT) client->accept_input = 1; if (mode & SNDRV_SEQ_LFLG_OUTPUT) client->accept_output = 1; user = &client->data.user; user->fifo = NULL; user->fifo_pool_size = 0; if (mode & SNDRV_SEQ_LFLG_INPUT) { user->fifo_pool_size = SNDRV_SEQ_DEFAULT_CLIENT_EVENTS; user->fifo = snd_seq_fifo_new(user->fifo_pool_size); if (user->fifo == NULL) { seq_free_client1(client); kfree(client); return -ENOMEM; } } usage_alloc(&client_usage, 1); client->type = USER_CLIENT; } c = client->number; file->private_data = client; /* fill client data */ user->file = file; sprintf(client->name, "Client-%d", c); client->data.user.owner = get_pid(task_pid(current)); /* make others aware this new client */ snd_seq_system_client_ev_client_start(c); return 0; } /* delete a user client */ static int snd_seq_release(struct inode *inode, struct file *file) { struct snd_seq_client *client = file->private_data; if (client) { seq_free_client(client); if (client->data.user.fifo) snd_seq_fifo_delete(&client->data.user.fifo); #if IS_ENABLED(CONFIG_SND_SEQ_UMP) free_ump_info(client); #endif put_pid(client->data.user.owner); kfree(client); } return 0; } static bool event_is_compatible(const struct snd_seq_client *client, const struct snd_seq_event *ev) { if (snd_seq_ev_is_ump(ev) && !client->midi_version) return false; if (snd_seq_ev_is_ump(ev) && snd_seq_ev_is_variable(ev)) return false; return true; } /* handle client read() */ /* possible error values: * -ENXIO invalid client or file open mode * -ENOSPC FIFO overflow (the flag is cleared after this error report) * -EINVAL no enough user-space buffer to write the whole event * -EFAULT seg. fault during copy to user space */ static ssize_t snd_seq_read(struct file *file, char __user *buf, size_t count, loff_t *offset) { struct snd_seq_client *client = file->private_data; struct snd_seq_fifo *fifo; size_t aligned_size; int err; long result = 0; struct snd_seq_event_cell *cell; if (!(snd_seq_file_flags(file) & SNDRV_SEQ_LFLG_INPUT)) return -ENXIO; if (!access_ok(buf, count)) return -EFAULT; /* check client structures are in place */ if (snd_BUG_ON(!client)) return -ENXIO; if (!client->accept_input) return -ENXIO; fifo = client->data.user.fifo; if (!fifo) return -ENXIO; if (atomic_read(&fifo->overflow) > 0) { /* buffer overflow is detected */ snd_seq_fifo_clear(fifo); /* return error code */ return -ENOSPC; } cell = NULL; err = 0; guard(snd_seq_fifo)(fifo); if (IS_ENABLED(CONFIG_SND_SEQ_UMP) && client->midi_version > 0) aligned_size = sizeof(struct snd_seq_ump_event); else aligned_size = sizeof(struct snd_seq_event); /* while data available in queue */ while (count >= aligned_size) { int nonblock; nonblock = (file->f_flags & O_NONBLOCK) || result > 0; err = snd_seq_fifo_cell_out(fifo, &cell, nonblock); if (err < 0) break; if (!event_is_compatible(client, &cell->event)) { snd_seq_cell_free(cell); cell = NULL; continue; } if (snd_seq_ev_is_variable(&cell->event)) { struct snd_seq_ump_event tmpev; memcpy(&tmpev, &cell->event, aligned_size); tmpev.data.ext.len &= ~SNDRV_SEQ_EXT_MASK; if (copy_to_user(buf, &tmpev, aligned_size)) { err = -EFAULT; break; } count -= aligned_size; buf += aligned_size; err = snd_seq_expand_var_event(&cell->event, count, (char __force *)buf, 0, aligned_size); if (err < 0) break; result += err; count -= err; buf += err; } else { if (copy_to_user(buf, &cell->event, aligned_size)) { err = -EFAULT; break; } count -= aligned_size; buf += aligned_size; } snd_seq_cell_free(cell); cell = NULL; /* to be sure */ result += aligned_size; } if (err < 0) { if (cell) snd_seq_fifo_cell_putback(fifo, cell); if (err == -EAGAIN && result > 0) err = 0; } return (err < 0) ? err : result; } /* * check access permission to the port */ static int check_port_perm(struct snd_seq_client_port *port, unsigned int flags) { if ((port->capability & flags) != flags) return 0; return flags; } /* * check if the destination client is available, and return the pointer */ static struct snd_seq_client *get_event_dest_client(struct snd_seq_event *event) { struct snd_seq_client *dest __free(snd_seq_client) = snd_seq_client_use_ptr(event->dest.client); if (dest == NULL) return NULL; if (! dest->accept_input) return NULL; if (snd_seq_ev_is_ump(event)) return no_free_ptr(dest); /* ok - no filter checks */ if ((dest->filter & SNDRV_SEQ_FILTER_USE_EVENT) && ! test_bit(event->type, dest->event_filter)) return NULL; return no_free_ptr(dest); /* ok - accessible */ } /* * Return the error event. * * If the receiver client is a user client, the original event is * encapsulated in SNDRV_SEQ_EVENT_BOUNCE as variable length event. If * the original event is also variable length, the external data is * copied after the event record. * If the receiver client is a kernel client, the original event is * quoted in SNDRV_SEQ_EVENT_KERNEL_ERROR, since this requires no extra * kmalloc. */ static int bounce_error_event(struct snd_seq_client *client, struct snd_seq_event *event, int err, int atomic, int hop) { struct snd_seq_event bounce_ev; int result; if (client == NULL || ! (client->filter & SNDRV_SEQ_FILTER_BOUNCE) || ! client->accept_input) return 0; /* ignored */ /* set up quoted error */ memset(&bounce_ev, 0, sizeof(bounce_ev)); bounce_ev.type = SNDRV_SEQ_EVENT_KERNEL_ERROR; bounce_ev.flags = SNDRV_SEQ_EVENT_LENGTH_FIXED; bounce_ev.queue = SNDRV_SEQ_QUEUE_DIRECT; bounce_ev.source.client = SNDRV_SEQ_CLIENT_SYSTEM; bounce_ev.source.port = SNDRV_SEQ_PORT_SYSTEM_ANNOUNCE; bounce_ev.dest.client = client->number; bounce_ev.dest.port = event->source.port; bounce_ev.data.quote.origin = event->dest; bounce_ev.data.quote.event = event; bounce_ev.data.quote.value = -err; /* use positive value */ result = snd_seq_deliver_single_event(NULL, &bounce_ev, atomic, hop + 1); if (result < 0) { client->event_lost++; return result; } return result; } /* * rewrite the time-stamp of the event record with the curren time * of the given queue. * return non-zero if updated. */ static int update_timestamp_of_queue(struct snd_seq_event *event, int queue, int real_time) { struct snd_seq_queue *q __free(snd_seq_queue) = queueptr(queue); if (! q) return 0; event->queue = queue; event->flags &= ~SNDRV_SEQ_TIME_STAMP_MASK; if (real_time) { event->time.time = snd_seq_timer_get_cur_time(q->timer, true); event->flags |= SNDRV_SEQ_TIME_STAMP_REAL; } else { event->time.tick = snd_seq_timer_get_cur_tick(q->timer); event->flags |= SNDRV_SEQ_TIME_STAMP_TICK; } return 1; } /* deliver a single event; called from below and UMP converter */ int __snd_seq_deliver_single_event(struct snd_seq_client *dest, struct snd_seq_client_port *dest_port, struct snd_seq_event *event, int atomic, int hop) { switch (dest->type) { case USER_CLIENT: if (!dest->data.user.fifo) return 0; return snd_seq_fifo_event_in(dest->data.user.fifo, event); case KERNEL_CLIENT: if (!dest_port->event_input) return 0; return dest_port->event_input(event, snd_seq_ev_is_direct(event), dest_port->private_data, atomic, hop); } return 0; } /* deliver a single event; called from snd_seq_deliver_single_event() */ static int _snd_seq_deliver_single_event(struct snd_seq_client *client, struct snd_seq_event *event, int atomic, int hop) { struct snd_seq_client *dest __free(snd_seq_client) = get_event_dest_client(event); if (dest == NULL) return -ENOENT; struct snd_seq_client_port *dest_port __free(snd_seq_port) = snd_seq_port_use_ptr(dest, event->dest.port); if (dest_port == NULL) return -ENOENT; /* check permission */ if (!check_port_perm(dest_port, SNDRV_SEQ_PORT_CAP_WRITE)) return -EPERM; if (dest_port->timestamping) update_timestamp_of_queue(event, dest_port->time_queue, dest_port->time_real); #if IS_ENABLED(CONFIG_SND_SEQ_UMP) if (snd_seq_ev_is_ump(event)) { if (!(dest->filter & SNDRV_SEQ_FILTER_NO_CONVERT)) return snd_seq_deliver_from_ump(client, dest, dest_port, event, atomic, hop); else if (dest->type == USER_CLIENT && !snd_seq_client_is_ump(dest)) return 0; // drop the event } else if (snd_seq_client_is_ump(dest)) { if (!(dest->filter & SNDRV_SEQ_FILTER_NO_CONVERT)) return snd_seq_deliver_to_ump(client, dest, dest_port, event, atomic, hop); } #endif /* CONFIG_SND_SEQ_UMP */ return __snd_seq_deliver_single_event(dest, dest_port, event, atomic, hop); } /* * deliver an event to the specified destination. * if filter is non-zero, client filter bitmap is tested. * * RETURN VALUE: 0 : if succeeded * <0 : error */ static int snd_seq_deliver_single_event(struct snd_seq_client *client, struct snd_seq_event *event, int atomic, int hop) { int result = _snd_seq_deliver_single_event(client, event, atomic, hop); if (result < 0 && !snd_seq_ev_is_direct(event)) return bounce_error_event(client, event, result, atomic, hop); return result; } /* * send the event to all subscribers: */ static int __deliver_to_subscribers(struct snd_seq_client *client, struct snd_seq_event *event, int port, int atomic, int hop) { struct snd_seq_subscribers *subs; int err, result = 0, num_ev = 0; union __snd_seq_event event_saved; size_t saved_size; struct snd_seq_port_subs_info *grp; if (port < 0) return 0; struct snd_seq_client_port *src_port __free(snd_seq_port) = snd_seq_port_use_ptr(client, port); if (!src_port) return 0; /* save original event record */ saved_size = snd_seq_event_packet_size(event); memcpy(&event_saved, event, saved_size); grp = &src_port->c_src; /* lock list */ if (atomic) read_lock(&grp->list_lock); else down_read_nested(&grp->list_mutex, hop); list_for_each_entry(subs, &grp->list_head, src_list) { /* both ports ready? */ if (atomic_read(&subs->ref_count) != 2) continue; event->dest = subs->info.dest; if (subs->info.flags & SNDRV_SEQ_PORT_SUBS_TIMESTAMP) /* convert time according to flag with subscription */ update_timestamp_of_queue(event, subs->info.queue, subs->info.flags & SNDRV_SEQ_PORT_SUBS_TIME_REAL); err = snd_seq_deliver_single_event(client, event, atomic, hop); if (err < 0) { /* save first error that occurs and continue */ if (!result) result = err; continue; } num_ev++; /* restore original event record */ memcpy(event, &event_saved, saved_size); } if (atomic) read_unlock(&grp->list_lock); else up_read(&grp->list_mutex); memcpy(event, &event_saved, saved_size); return (result < 0) ? result : num_ev; } static int deliver_to_subscribers(struct snd_seq_client *client, struct snd_seq_event *event, int atomic, int hop) { int ret; #if IS_ENABLED(CONFIG_SND_SEQ_UMP) int ret2; #endif ret = __deliver_to_subscribers(client, event, event->source.port, atomic, hop); #if IS_ENABLED(CONFIG_SND_SEQ_UMP) if (!snd_seq_client_is_ump(client) || client->ump_endpoint_port < 0) return ret; /* If it's an event from EP port (and with a UMP group), * deliver to subscribers of the corresponding UMP group port, too. * Or, if it's from non-EP port, deliver to subscribers of EP port, too. */ if (event->source.port == client->ump_endpoint_port) ret2 = __deliver_to_subscribers(client, event, snd_seq_ump_group_port(event), atomic, hop); else ret2 = __deliver_to_subscribers(client, event, client->ump_endpoint_port, atomic, hop); if (ret2 < 0) return ret2; #endif return ret; } /* deliver an event to the destination port(s). * if the event is to subscribers or broadcast, the event is dispatched * to multiple targets. * * RETURN VALUE: n > 0 : the number of delivered events. * n == 0 : the event was not passed to any client. * n < 0 : error - event was not processed. */ static int snd_seq_deliver_event(struct snd_seq_client *client, struct snd_seq_event *event, int atomic, int hop) { int result; hop++; if (hop >= SNDRV_SEQ_MAX_HOPS) { pr_debug("ALSA: seq: too long delivery path (%d:%d->%d:%d)\n", event->source.client, event->source.port, event->dest.client, event->dest.port); return -EMLINK; } if (snd_seq_ev_is_variable(event) && snd_BUG_ON(atomic && (event->data.ext.len & SNDRV_SEQ_EXT_USRPTR))) return -EINVAL; if (event->queue == SNDRV_SEQ_ADDRESS_SUBSCRIBERS || event->dest.client == SNDRV_SEQ_ADDRESS_SUBSCRIBERS) result = deliver_to_subscribers(client, event, atomic, hop); else result = snd_seq_deliver_single_event(client, event, atomic, hop); return result; } /* * dispatch an event cell: * This function is called only from queue check routines in timer * interrupts or after enqueued. * The event cell shall be released or re-queued in this function. * * RETURN VALUE: n > 0 : the number of delivered events. * n == 0 : the event was not passed to any client. * n < 0 : error - event was not processed. */ int snd_seq_dispatch_event(struct snd_seq_event_cell *cell, int atomic, int hop) { int result; if (snd_BUG_ON(!cell)) return -EINVAL; struct snd_seq_client *client __free(snd_seq_client) = snd_seq_client_use_ptr(cell->event.source.client); if (client == NULL) { snd_seq_cell_free(cell); /* release this cell */ return -EINVAL; } if (!snd_seq_ev_is_ump(&cell->event) && cell->event.type == SNDRV_SEQ_EVENT_NOTE) { /* NOTE event: * the event cell is re-used as a NOTE-OFF event and * enqueued again. */ struct snd_seq_event tmpev, *ev; /* reserve this event to enqueue note-off later */ tmpev = cell->event; tmpev.type = SNDRV_SEQ_EVENT_NOTEON; result = snd_seq_deliver_event(client, &tmpev, atomic, hop); /* * This was originally a note event. We now re-use the * cell for the note-off event. */ ev = &cell->event; ev->type = SNDRV_SEQ_EVENT_NOTEOFF; ev->flags |= SNDRV_SEQ_PRIORITY_HIGH; /* add the duration time */ switch (ev->flags & SNDRV_SEQ_TIME_STAMP_MASK) { case SNDRV_SEQ_TIME_STAMP_TICK: cell->event.time.tick += ev->data.note.duration; break; case SNDRV_SEQ_TIME_STAMP_REAL: /* unit for duration is ms */ ev->time.time.tv_nsec += 1000000 * (ev->data.note.duration % 1000); ev->time.time.tv_sec += ev->data.note.duration / 1000 + ev->time.time.tv_nsec / 1000000000; ev->time.time.tv_nsec %= 1000000000; break; } ev->data.note.velocity = ev->data.note.off_velocity; /* Now queue this cell as the note off event */ if (snd_seq_enqueue_event(cell, atomic, hop) < 0) snd_seq_cell_free(cell); /* release this cell */ } else { /* Normal events: * event cell is freed after processing the event */ result = snd_seq_deliver_event(client, &cell->event, atomic, hop); snd_seq_cell_free(cell); } return result; } /* Allocate a cell from client pool and enqueue it to queue: * if pool is empty and blocking is TRUE, sleep until a new cell is * available. */ static int snd_seq_client_enqueue_event(struct snd_seq_client *client, struct snd_seq_event *event, struct file *file, int blocking, int atomic, int hop, struct mutex *mutexp) { struct snd_seq_event_cell *cell; int err; /* special queue values - force direct passing */ if (event->queue == SNDRV_SEQ_ADDRESS_SUBSCRIBERS) { event->dest.client = SNDRV_SEQ_ADDRESS_SUBSCRIBERS; event->queue = SNDRV_SEQ_QUEUE_DIRECT; } else if (event->dest.client == SNDRV_SEQ_ADDRESS_SUBSCRIBERS) { /* check presence of source port */ struct snd_seq_client_port *src_port __free(snd_seq_port) = snd_seq_port_use_ptr(client, event->source.port); if (!src_port) return -EINVAL; } /* direct event processing without enqueued */ if (snd_seq_ev_is_direct(event)) { if (!snd_seq_ev_is_ump(event) && event->type == SNDRV_SEQ_EVENT_NOTE) return -EINVAL; /* this event must be enqueued! */ return snd_seq_deliver_event(client, event, atomic, hop); } /* Not direct, normal queuing */ if (snd_seq_queue_is_used(event->queue, client->number) <= 0) return -EINVAL; /* invalid queue */ if (! snd_seq_write_pool_allocated(client)) return -ENXIO; /* queue is not allocated */ /* allocate an event cell */ err = snd_seq_event_dup(client->pool, event, &cell, !blocking || atomic, file, mutexp); if (err < 0) return err; /* we got a cell. enqueue it. */ err = snd_seq_enqueue_event(cell, atomic, hop); if (err < 0) { snd_seq_cell_free(cell); return err; } return 0; } /* * check validity of event type and data length. * return non-zero if invalid. */ static int check_event_type_and_length(struct snd_seq_event *ev) { switch (snd_seq_ev_length_type(ev)) { case SNDRV_SEQ_EVENT_LENGTH_FIXED: if (snd_seq_ev_is_variable_type(ev)) return -EINVAL; break; case SNDRV_SEQ_EVENT_LENGTH_VARIABLE: if (! snd_seq_ev_is_variable_type(ev) || (ev->data.ext.len & ~SNDRV_SEQ_EXT_MASK) >= SNDRV_SEQ_MAX_EVENT_LEN) return -EINVAL; break; case SNDRV_SEQ_EVENT_LENGTH_VARUSR: if (! snd_seq_ev_is_direct(ev)) return -EINVAL; break; } return 0; } /* handle write() */ /* possible error values: * -ENXIO invalid client or file open mode * -ENOMEM malloc failed * -EFAULT seg. fault during copy from user space * -EINVAL invalid event * -EAGAIN no space in output pool * -EINTR interrupts while sleep * -EMLINK too many hops * others depends on return value from driver callback */ static ssize_t snd_seq_write(struct file *file, const char __user *buf, size_t count, loff_t *offset) { struct snd_seq_client *client = file->private_data; int written = 0, len; int err, handled; union __snd_seq_event __event; struct snd_seq_event *ev = &__event.legacy; if (!(snd_seq_file_flags(file) & SNDRV_SEQ_LFLG_OUTPUT)) return -ENXIO; /* check client structures are in place */ if (snd_BUG_ON(!client)) return -ENXIO; if (!client->accept_output || client->pool == NULL) return -ENXIO; repeat: handled = 0; /* allocate the pool now if the pool is not allocated yet */ mutex_lock(&client->ioctl_mutex); if (client->pool->size > 0 && !snd_seq_write_pool_allocated(client)) { err = snd_seq_pool_init(client->pool); if (err < 0) goto out; } /* only process whole events */ err = -EINVAL; while (count >= sizeof(struct snd_seq_event)) { /* Read in the event header from the user */ len = sizeof(struct snd_seq_event); if (copy_from_user(ev, buf, len)) { err = -EFAULT; break; } /* read in the rest bytes for UMP events */ if (snd_seq_ev_is_ump(ev)) { if (count < sizeof(struct snd_seq_ump_event)) break; if (copy_from_user((char *)ev + len, buf + len, sizeof(struct snd_seq_ump_event) - len)) { err = -EFAULT; break; } len = sizeof(struct snd_seq_ump_event); } ev->source.client = client->number; /* fill in client number */ /* Check for extension data length */ if (check_event_type_and_length(ev)) { err = -EINVAL; break; } if (!event_is_compatible(client, ev)) { err = -EINVAL; break; } /* check for special events */ if (!snd_seq_ev_is_ump(ev)) { if (ev->type == SNDRV_SEQ_EVENT_NONE) goto __skip_event; else if (snd_seq_ev_is_reserved(ev)) { err = -EINVAL; break; } } if (snd_seq_ev_is_variable(ev)) { int extlen = ev->data.ext.len & ~SNDRV_SEQ_EXT_MASK; if ((size_t)(extlen + len) > count) { /* back out, will get an error this time or next */ err = -EINVAL; break; } /* set user space pointer */ ev->data.ext.len = extlen | SNDRV_SEQ_EXT_USRPTR; ev->data.ext.ptr = (char __force *)buf + len; len += extlen; /* increment data length */ } else { #ifdef CONFIG_COMPAT if (client->convert32 && snd_seq_ev_is_varusr(ev)) ev->data.ext.ptr = (void __force *)compat_ptr(ev->data.raw32.d[1]); #endif } /* ok, enqueue it */ err = snd_seq_client_enqueue_event(client, ev, file, !(file->f_flags & O_NONBLOCK), 0, 0, &client->ioctl_mutex); if (err < 0) break; handled++; __skip_event: /* Update pointers and counts */ count -= len; buf += len; written += len; /* let's have a coffee break if too many events are queued */ if (++handled >= 200) { mutex_unlock(&client->ioctl_mutex); goto repeat; } } out: mutex_unlock(&client->ioctl_mutex); return written ? written : err; } /* * handle polling */ static __poll_t snd_seq_poll(struct file *file, poll_table * wait) { struct snd_seq_client *client = file->private_data; __poll_t mask = 0; /* check client structures are in place */ if (snd_BUG_ON(!client)) return EPOLLERR; if ((snd_seq_file_flags(file) & SNDRV_SEQ_LFLG_INPUT) && client->data.user.fifo) { /* check if data is available in the outqueue */ if (snd_seq_fifo_poll_wait(client->data.user.fifo, file, wait)) mask |= EPOLLIN | EPOLLRDNORM; } if (snd_seq_file_flags(file) & SNDRV_SEQ_LFLG_OUTPUT) { /* check if data is available in the pool */ if (snd_seq_pool_poll_wait(client->pool, file, wait)) mask |= EPOLLOUT | EPOLLWRNORM; } return mask; } /*-----------------------------------------------------*/ static int snd_seq_ioctl_pversion(struct snd_seq_client *client, void *arg) { int *pversion = arg; *pversion = SNDRV_SEQ_VERSION; return 0; } static int snd_seq_ioctl_user_pversion(struct snd_seq_client *client, void *arg) { client->user_pversion = *(unsigned int *)arg; return 0; } static int snd_seq_ioctl_client_id(struct snd_seq_client *client, void *arg) { int *client_id = arg; *client_id = client->number; return 0; } /* SYSTEM_INFO ioctl() */ static int snd_seq_ioctl_system_info(struct snd_seq_client *client, void *arg) { struct snd_seq_system_info *info = arg; memset(info, 0, sizeof(*info)); /* fill the info fields */ info->queues = SNDRV_SEQ_MAX_QUEUES; info->clients = SNDRV_SEQ_MAX_CLIENTS; info->ports = SNDRV_SEQ_MAX_PORTS; info->channels = 256; /* fixed limit */ info->cur_clients = client_usage.cur; info->cur_queues = snd_seq_queue_get_cur_queues(); return 0; } /* RUNNING_MODE ioctl() */ static int snd_seq_ioctl_running_mode(struct snd_seq_client *client, void *arg) { struct snd_seq_running_info *info = arg; /* requested client number */ struct snd_seq_client *cptr __free(snd_seq_client) = client_load_and_use_ptr(info->client); if (cptr == NULL) return -ENOENT; /* don't change !!! */ #ifdef SNDRV_BIG_ENDIAN if (!info->big_endian) return -EINVAL; #else if (info->big_endian) return -EINVAL; #endif if (info->cpu_mode > sizeof(long)) return -EINVAL; cptr->convert32 = (info->cpu_mode < sizeof(long)); return 0; } /* CLIENT_INFO ioctl() */ static void get_client_info(struct snd_seq_client *cptr, struct snd_seq_client_info *info) { info->client = cptr->number; /* fill the info fields */ info->type = cptr->type; strscpy(info->name, cptr->name); info->filter = cptr->filter; info->event_lost = cptr->event_lost; memcpy(info->event_filter, cptr->event_filter, 32); info->group_filter = cptr->group_filter; info->num_ports = cptr->num_ports; if (cptr->type == USER_CLIENT) info->pid = pid_vnr(cptr->data.user.owner); else info->pid = -1; if (cptr->type == KERNEL_CLIENT) info->card = cptr->data.kernel.card ? cptr->data.kernel.card->number : -1; else info->card = -1; info->midi_version = cptr->midi_version; memset(info->reserved, 0, sizeof(info->reserved)); } static int snd_seq_ioctl_get_client_info(struct snd_seq_client *client, void *arg) { struct snd_seq_client_info *client_info = arg; /* requested client number */ struct snd_seq_client *cptr __free(snd_seq_client) = client_load_and_use_ptr(client_info->client); if (cptr == NULL) return -ENOENT; /* don't change !!! */ get_client_info(cptr, client_info); return 0; } /* CLIENT_INFO ioctl() */ static int snd_seq_ioctl_set_client_info(struct snd_seq_client *client, void *arg) { struct snd_seq_client_info *client_info = arg; /* it is not allowed to set the info fields for an another client */ if (client->number != client_info->client) return -EPERM; /* also client type must be set now */ if (client->type != client_info->type) return -EINVAL; if (client->user_pversion >= SNDRV_PROTOCOL_VERSION(1, 0, 3)) { /* check validity of midi_version field */ if (client_info->midi_version > SNDRV_SEQ_CLIENT_UMP_MIDI_2_0) return -EINVAL; /* check if UMP is supported in kernel */ if (!IS_ENABLED(CONFIG_SND_SEQ_UMP) && client_info->midi_version > 0) return -EINVAL; } /* fill the info fields */ if (client_info->name[0]) strscpy(client->name, client_info->name, sizeof(client->name)); client->filter = client_info->filter; client->event_lost = client_info->event_lost; if (client->user_pversion >= SNDRV_PROTOCOL_VERSION(1, 0, 3)) client->midi_version = client_info->midi_version; memcpy(client->event_filter, client_info->event_filter, 32); client->group_filter = client_info->group_filter & SND_SEQ_GROUP_FILTER_MASK; /* notify the change */ snd_seq_system_client_ev_client_change(client->number); return 0; } /* * CREATE PORT ioctl() */ static int snd_seq_ioctl_create_port(struct snd_seq_client *client, void *arg) { struct snd_seq_port_info *info = arg; struct snd_seq_client_port *port; struct snd_seq_port_callback *callback; int port_idx, err; /* it is not allowed to create the port for an another client */ if (info->addr.client != client->number) return -EPERM; if (client->type == USER_CLIENT && info->kernel) return -EINVAL; if ((info->capability & SNDRV_SEQ_PORT_CAP_UMP_ENDPOINT) && client->ump_endpoint_port >= 0) return -EBUSY; if (info->flags & SNDRV_SEQ_PORT_FLG_GIVEN_PORT) port_idx = info->addr.port; else port_idx = -1; if (port_idx >= SNDRV_SEQ_ADDRESS_UNKNOWN) return -EINVAL; err = snd_seq_create_port(client, port_idx, &port); if (err < 0) return err; if (client->type == KERNEL_CLIENT) { callback = info->kernel; if (callback) { if (callback->owner) port->owner = callback->owner; port->private_data = callback->private_data; port->private_free = callback->private_free; port->event_input = callback->event_input; port->c_src.open = callback->subscribe; port->c_src.close = callback->unsubscribe; port->c_dest.open = callback->use; port->c_dest.close = callback->unuse; } } info->addr = port->addr; snd_seq_set_port_info(port, info); if (info->capability & SNDRV_SEQ_PORT_CAP_UMP_ENDPOINT) client->ump_endpoint_port = port->addr.port; snd_seq_system_client_ev_port_start(port->addr.client, port->addr.port); snd_seq_port_unlock(port); return 0; } /* * DELETE PORT ioctl() */ static int snd_seq_ioctl_delete_port(struct snd_seq_client *client, void *arg) { struct snd_seq_port_info *info = arg; int err; /* it is not allowed to remove the port for an another client */ if (info->addr.client != client->number) return -EPERM; err = snd_seq_delete_port(client, info->addr.port); if (err >= 0) { if (client->ump_endpoint_port == info->addr.port) client->ump_endpoint_port = -1; snd_seq_system_client_ev_port_exit(client->number, info->addr.port); } return err; } /* * GET_PORT_INFO ioctl() (on any client) */ static int snd_seq_ioctl_get_port_info(struct snd_seq_client *client, void *arg) { struct snd_seq_port_info *info = arg; struct snd_seq_client *cptr __free(snd_seq_client) = client_load_and_use_ptr(info->addr.client); if (cptr == NULL) return -ENXIO; struct snd_seq_client_port *port __free(snd_seq_port) = snd_seq_port_use_ptr(cptr, info->addr.port); if (port == NULL) return -ENOENT; /* don't change */ /* get port info */ snd_seq_get_port_info(port, info); return 0; } /* * SET_PORT_INFO ioctl() (only ports on this/own client) */ static int snd_seq_ioctl_set_port_info(struct snd_seq_client *client, void *arg) { struct snd_seq_port_info *info = arg; if (info->addr.client != client->number) /* only set our own ports ! */ return -EPERM; struct snd_seq_client_port *port __free(snd_seq_port) = snd_seq_port_use_ptr(client, info->addr.port); if (port) { snd_seq_set_port_info(port, info); /* notify the change */ snd_seq_system_client_ev_port_change(info->addr.client, info->addr.port); } return 0; } /* * port subscription (connection) */ #define PERM_RD (SNDRV_SEQ_PORT_CAP_READ|SNDRV_SEQ_PORT_CAP_SUBS_READ) #define PERM_WR (SNDRV_SEQ_PORT_CAP_WRITE|SNDRV_SEQ_PORT_CAP_SUBS_WRITE) static int check_subscription_permission(struct snd_seq_client *client, struct snd_seq_client_port *sport, struct snd_seq_client_port *dport, struct snd_seq_port_subscribe *subs) { if (client->number != subs->sender.client && client->number != subs->dest.client) { /* connection by third client - check export permission */ if (check_port_perm(sport, SNDRV_SEQ_PORT_CAP_NO_EXPORT)) return -EPERM; if (check_port_perm(dport, SNDRV_SEQ_PORT_CAP_NO_EXPORT)) return -EPERM; } /* check read permission */ /* if sender or receiver is the subscribing client itself, * no permission check is necessary */ if (client->number != subs->sender.client) { if (! check_port_perm(sport, PERM_RD)) return -EPERM; } /* check write permission */ if (client->number != subs->dest.client) { if (! check_port_perm(dport, PERM_WR)) return -EPERM; } return 0; } /* * send an subscription notify event to user client: * client must be user client. */ int snd_seq_client_notify_subscription(int client, int port, struct snd_seq_port_subscribe *info, int evtype) { struct snd_seq_event event; memset(&event, 0, sizeof(event)); event.type = evtype; event.data.connect.dest = info->dest; event.data.connect.sender = info->sender; return snd_seq_system_notify(client, port, &event, false); /* non-atomic */ } /* * add to port's subscription list IOCTL interface */ static int snd_seq_ioctl_subscribe_port(struct snd_seq_client *client, void *arg) { struct snd_seq_port_subscribe *subs = arg; int result; struct snd_seq_client *receiver __free(snd_seq_client) = client_load_and_use_ptr(subs->dest.client); if (!receiver) return -EINVAL; struct snd_seq_client *sender __free(snd_seq_client) = client_load_and_use_ptr(subs->sender.client); if (!sender) return -EINVAL; struct snd_seq_client_port *sport __free(snd_seq_port) = snd_seq_port_use_ptr(sender, subs->sender.port); if (!sport) return -EINVAL; struct snd_seq_client_port *dport __free(snd_seq_port) = snd_seq_port_use_ptr(receiver, subs->dest.port); if (!dport) return -EINVAL; result = check_subscription_permission(client, sport, dport, subs); if (result < 0) return result; /* connect them */ result = snd_seq_port_connect(client, sender, sport, receiver, dport, subs); if (! result) /* broadcast announce */ snd_seq_client_notify_subscription(SNDRV_SEQ_ADDRESS_SUBSCRIBERS, 0, subs, SNDRV_SEQ_EVENT_PORT_SUBSCRIBED); return result; } /* * remove from port's subscription list */ static int snd_seq_ioctl_unsubscribe_port(struct snd_seq_client *client, void *arg) { struct snd_seq_port_subscribe *subs = arg; int result; struct snd_seq_client *receiver __free(snd_seq_client) = snd_seq_client_use_ptr(subs->dest.client); if (!receiver) return -ENXIO; struct snd_seq_client *sender __free(snd_seq_client) = snd_seq_client_use_ptr(subs->sender.client); if (!sender) return -ENXIO; struct snd_seq_client_port *sport __free(snd_seq_port) = snd_seq_port_use_ptr(sender, subs->sender.port); if (!sport) return -ENXIO; struct snd_seq_client_port *dport __free(snd_seq_port) = snd_seq_port_use_ptr(receiver, subs->dest.port); if (!dport) return -ENXIO; result = check_subscription_permission(client, sport, dport, subs); if (result < 0) return result; result = snd_seq_port_disconnect(client, sender, sport, receiver, dport, subs); if (! result) /* broadcast announce */ snd_seq_client_notify_subscription(SNDRV_SEQ_ADDRESS_SUBSCRIBERS, 0, subs, SNDRV_SEQ_EVENT_PORT_UNSUBSCRIBED); return result; } /* CREATE_QUEUE ioctl() */ static int snd_seq_ioctl_create_queue(struct snd_seq_client *client, void *arg) { struct snd_seq_queue_info *info = arg; struct snd_seq_queue *q __free(snd_seq_queue) = snd_seq_queue_alloc(client->number, info->locked, info->flags); if (IS_ERR(q)) return PTR_ERR(q); info->queue = q->queue; info->locked = q->locked; info->owner = q->owner; /* set queue name */ if (!info->name[0]) snprintf(info->name, sizeof(info->name), "Queue-%d", q->queue); strscpy(q->name, info->name, sizeof(q->name)); return 0; } /* DELETE_QUEUE ioctl() */ static int snd_seq_ioctl_delete_queue(struct snd_seq_client *client, void *arg) { struct snd_seq_queue_info *info = arg; return snd_seq_queue_delete(client->number, info->queue); } /* GET_QUEUE_INFO ioctl() */ static int snd_seq_ioctl_get_queue_info(struct snd_seq_client *client, void *arg) { struct snd_seq_queue_info *info = arg; struct snd_seq_queue *q __free(snd_seq_queue) = queueptr(info->queue); if (q == NULL) return -EINVAL; memset(info, 0, sizeof(*info)); info->queue = q->queue; info->owner = q->owner; info->locked = q->locked; strscpy(info->name, q->name, sizeof(info->name)); return 0; } /* SET_QUEUE_INFO ioctl() */ static int snd_seq_ioctl_set_queue_info(struct snd_seq_client *client, void *arg) { struct snd_seq_queue_info *info = arg; if (info->owner != client->number) return -EINVAL; /* change owner/locked permission */ if (snd_seq_queue_check_access(info->queue, client->number)) { if (snd_seq_queue_set_owner(info->queue, client->number, info->locked) < 0) return -EPERM; if (info->locked) snd_seq_queue_use(info->queue, client->number, 1); } else { return -EPERM; } struct snd_seq_queue *q __free(snd_seq_queue) = queueptr(info->queue); if (! q) return -EINVAL; if (q->owner != client->number) return -EPERM; strscpy(q->name, info->name, sizeof(q->name)); return 0; } /* GET_NAMED_QUEUE ioctl() */ static int snd_seq_ioctl_get_named_queue(struct snd_seq_client *client, void *arg) { struct snd_seq_queue_info *info = arg; struct snd_seq_queue *q __free(snd_seq_queue) = snd_seq_queue_find_name(info->name); if (q == NULL) return -EINVAL; info->queue = q->queue; info->owner = q->owner; info->locked = q->locked; return 0; } /* GET_QUEUE_STATUS ioctl() */ static int snd_seq_ioctl_get_queue_status(struct snd_seq_client *client, void *arg) { struct snd_seq_queue_status *status = arg; struct snd_seq_timer *tmr; struct snd_seq_queue *queue __free(snd_seq_queue) = queueptr(status->queue); if (queue == NULL) return -EINVAL; memset(status, 0, sizeof(*status)); status->queue = queue->queue; tmr = queue->timer; status->events = queue->tickq->cells + queue->timeq->cells; status->time = snd_seq_timer_get_cur_time(tmr, true); status->tick = snd_seq_timer_get_cur_tick(tmr); status->running = tmr->running; status->flags = queue->flags; return 0; } /* GET_QUEUE_TEMPO ioctl() */ static int snd_seq_ioctl_get_queue_tempo(struct snd_seq_client *client, void *arg) { struct snd_seq_queue_tempo *tempo = arg; struct snd_seq_timer *tmr; struct snd_seq_queue *queue __free(snd_seq_queue) = queueptr(tempo->queue); if (queue == NULL) return -EINVAL; memset(tempo, 0, sizeof(*tempo)); tempo->queue = queue->queue; tmr = queue->timer; tempo->tempo = tmr->tempo; tempo->ppq = tmr->ppq; tempo->skew_value = tmr->skew; tempo->skew_base = tmr->skew_base; if (client->user_pversion >= SNDRV_PROTOCOL_VERSION(1, 0, 4)) tempo->tempo_base = tmr->tempo_base; return 0; } /* SET_QUEUE_TEMPO ioctl() */ int snd_seq_set_queue_tempo(int client, struct snd_seq_queue_tempo *tempo) { if (!snd_seq_queue_check_access(tempo->queue, client)) return -EPERM; return snd_seq_queue_timer_set_tempo(tempo->queue, client, tempo); } EXPORT_SYMBOL(snd_seq_set_queue_tempo); static int snd_seq_ioctl_set_queue_tempo(struct snd_seq_client *client, void *arg) { struct snd_seq_queue_tempo *tempo = arg; int result; if (client->user_pversion < SNDRV_PROTOCOL_VERSION(1, 0, 4)) tempo->tempo_base = 0; result = snd_seq_set_queue_tempo(client->number, tempo); return result < 0 ? result : 0; } /* GET_QUEUE_TIMER ioctl() */ static int snd_seq_ioctl_get_queue_timer(struct snd_seq_client *client, void *arg) { struct snd_seq_queue_timer *timer = arg; struct snd_seq_timer *tmr; struct snd_seq_queue *queue __free(snd_seq_queue) = queueptr(timer->queue); if (queue == NULL) return -EINVAL; guard(mutex)(&queue->timer_mutex); tmr = queue->timer; memset(timer, 0, sizeof(*timer)); timer->queue = queue->queue; timer->type = tmr->type; if (tmr->type == SNDRV_SEQ_TIMER_ALSA) { timer->u.alsa.id = tmr->alsa_id; timer->u.alsa.resolution = tmr->preferred_resolution; } return 0; } /* SET_QUEUE_TIMER ioctl() */ static int snd_seq_ioctl_set_queue_timer(struct snd_seq_client *client, void *arg) { struct snd_seq_queue_timer *timer = arg; int result = 0; if (timer->type != SNDRV_SEQ_TIMER_ALSA) return -EINVAL; if (snd_seq_queue_check_access(timer->queue, client->number)) { struct snd_seq_timer *tmr; struct snd_seq_queue *q __free(snd_seq_queue) = queueptr(timer->queue); if (q == NULL) return -ENXIO; guard(mutex)(&q->timer_mutex); tmr = q->timer; snd_seq_queue_timer_close(timer->queue); tmr->type = timer->type; if (tmr->type == SNDRV_SEQ_TIMER_ALSA) { tmr->alsa_id = timer->u.alsa.id; tmr->preferred_resolution = timer->u.alsa.resolution; } result = snd_seq_queue_timer_open(timer->queue); } else { return -EPERM; } return result; } /* GET_QUEUE_CLIENT ioctl() */ static int snd_seq_ioctl_get_queue_client(struct snd_seq_client *client, void *arg) { struct snd_seq_queue_client *info = arg; int used; used = snd_seq_queue_is_used(info->queue, client->number); if (used < 0) return -EINVAL; info->used = used; info->client = client->number; return 0; } /* SET_QUEUE_CLIENT ioctl() */ static int snd_seq_ioctl_set_queue_client(struct snd_seq_client *client, void *arg) { struct snd_seq_queue_client *info = arg; int err; if (info->used >= 0) { err = snd_seq_queue_use(info->queue, client->number, info->used); if (err < 0) return err; } return snd_seq_ioctl_get_queue_client(client, arg); } /* GET_CLIENT_POOL ioctl() */ static int snd_seq_ioctl_get_client_pool(struct snd_seq_client *client, void *arg) { struct snd_seq_client_pool *info = arg; struct snd_seq_client *cptr __free(snd_seq_client) = client_load_and_use_ptr(info->client); if (cptr == NULL) return -ENOENT; memset(info, 0, sizeof(*info)); info->client = cptr->number; info->output_pool = cptr->pool->size; info->output_room = cptr->pool->room; info->output_free = info->output_pool; info->output_free = snd_seq_unused_cells(cptr->pool); if (cptr->type == USER_CLIENT) { info->input_pool = cptr->data.user.fifo_pool_size; info->input_free = info->input_pool; info->input_free = snd_seq_fifo_unused_cells(cptr->data.user.fifo); } else { info->input_pool = 0; info->input_free = 0; } return 0; } /* SET_CLIENT_POOL ioctl() */ static int snd_seq_ioctl_set_client_pool(struct snd_seq_client *client, void *arg) { struct snd_seq_client_pool *info = arg; int rc; if (client->number != info->client) return -EINVAL; /* can't change other clients */ if (info->output_pool >= 1 && info->output_pool <= SNDRV_SEQ_MAX_EVENTS && (! snd_seq_write_pool_allocated(client) || info->output_pool != client->pool->size)) { if (snd_seq_write_pool_allocated(client)) { /* is the pool in use? */ if (atomic_read(&client->pool->counter)) return -EBUSY; /* remove all existing cells */ snd_seq_pool_mark_closing(client->pool); snd_seq_pool_done(client->pool); } client->pool->size = info->output_pool; rc = snd_seq_pool_init(client->pool); if (rc < 0) return rc; } if (client->type == USER_CLIENT && client->data.user.fifo != NULL && info->input_pool >= 1 && info->input_pool <= SNDRV_SEQ_MAX_CLIENT_EVENTS && info->input_pool != client->data.user.fifo_pool_size) { /* change pool size */ rc = snd_seq_fifo_resize(client->data.user.fifo, info->input_pool); if (rc < 0) return rc; client->data.user.fifo_pool_size = info->input_pool; } if (info->output_room >= 1 && info->output_room <= client->pool->size) { client->pool->room = info->output_room; } return snd_seq_ioctl_get_client_pool(client, arg); } /* REMOVE_EVENTS ioctl() */ static int snd_seq_ioctl_remove_events(struct snd_seq_client *client, void *arg) { struct snd_seq_remove_events *info = arg; /* * Input mostly not implemented XXX. */ if (info->remove_mode & SNDRV_SEQ_REMOVE_INPUT) { /* * No restrictions so for a user client we can clear * the whole fifo */ if (client->type == USER_CLIENT && client->data.user.fifo) snd_seq_fifo_clear(client->data.user.fifo); } if (info->remove_mode & SNDRV_SEQ_REMOVE_OUTPUT) snd_seq_queue_remove_cells(client->number, info); return 0; } /* * get subscription info */ static int snd_seq_ioctl_get_subscription(struct snd_seq_client *client, void *arg) { struct snd_seq_port_subscribe *subs = arg; struct snd_seq_client *sender __free(snd_seq_client) = client_load_and_use_ptr(subs->sender.client); if (!sender) return -EINVAL; struct snd_seq_client_port *sport __free(snd_seq_port) = snd_seq_port_use_ptr(sender, subs->sender.port); if (!sport) return -EINVAL; return snd_seq_port_get_subscription(&sport->c_src, &subs->dest, subs); } /* * get subscription info - check only its presence */ static int snd_seq_ioctl_query_subs(struct snd_seq_client *client, void *arg) { struct snd_seq_query_subs *subs = arg; struct snd_seq_port_subs_info *group; struct list_head *p; int i; struct snd_seq_client *cptr __free(snd_seq_client) = client_load_and_use_ptr(subs->root.client); if (!cptr) return -ENXIO; struct snd_seq_client_port *port __free(snd_seq_port) = snd_seq_port_use_ptr(cptr, subs->root.port); if (!port) return -ENXIO; switch (subs->type) { case SNDRV_SEQ_QUERY_SUBS_READ: group = &port->c_src; break; case SNDRV_SEQ_QUERY_SUBS_WRITE: group = &port->c_dest; break; default: return -ENXIO; } guard(rwsem_read)(&group->list_mutex); /* search for the subscriber */ subs->num_subs = group->count; i = 0; list_for_each(p, &group->list_head) { if (i++ == subs->index) { /* found! */ struct snd_seq_subscribers *s; if (subs->type == SNDRV_SEQ_QUERY_SUBS_READ) { s = list_entry(p, struct snd_seq_subscribers, src_list); subs->addr = s->info.dest; } else { s = list_entry(p, struct snd_seq_subscribers, dest_list); subs->addr = s->info.sender; } subs->flags = s->info.flags; subs->queue = s->info.queue; return 0; } } return -ENOENT; } /* * query next client */ static int snd_seq_ioctl_query_next_client(struct snd_seq_client *client, void *arg) { struct snd_seq_client_info *info = arg; /* search for next client */ if (info->client < INT_MAX) info->client++; if (info->client < 0) info->client = 0; for (; info->client < SNDRV_SEQ_MAX_CLIENTS; info->client++) { struct snd_seq_client *cptr __free(snd_seq_client) = client_load_and_use_ptr(info->client); if (cptr) { get_client_info(cptr, info); return 0; /* found */ } } return -ENOENT; } /* * query next port */ static int snd_seq_ioctl_query_next_port(struct snd_seq_client *client, void *arg) { struct snd_seq_port_info *info = arg; struct snd_seq_client *cptr __free(snd_seq_client) = client_load_and_use_ptr(info->addr.client); if (cptr == NULL) return -ENXIO; /* search for next port */ info->addr.port++; struct snd_seq_client_port *port __free(snd_seq_port) = snd_seq_port_query_nearest(cptr, info); if (port == NULL) return -ENOENT; /* get port info */ info->addr = port->addr; snd_seq_get_port_info(port, info); return 0; } #if IS_ENABLED(CONFIG_SND_SEQ_UMP) #define NUM_UMP_INFOS (SNDRV_UMP_MAX_BLOCKS + 1) static void free_ump_info(struct snd_seq_client *client) { int i; if (!client->ump_info) return; for (i = 0; i < NUM_UMP_INFOS; i++) kfree(client->ump_info[i]); kfree(client->ump_info); client->ump_info = NULL; } static void terminate_ump_info_strings(void *p, int type) { if (type == SNDRV_SEQ_CLIENT_UMP_INFO_ENDPOINT) { struct snd_ump_endpoint_info *ep = p; ep->name[sizeof(ep->name) - 1] = 0; } else { struct snd_ump_block_info *bp = p; bp->name[sizeof(bp->name) - 1] = 0; } } #ifdef CONFIG_SND_PROC_FS static void dump_ump_info(struct snd_info_buffer *buffer, struct snd_seq_client *client) { struct snd_ump_endpoint_info *ep; struct snd_ump_block_info *bp; int i; if (!client->ump_info) return; ep = client->ump_info[SNDRV_SEQ_CLIENT_UMP_INFO_ENDPOINT]; if (ep && *ep->name) snd_iprintf(buffer, " UMP Endpoint: \"%s\"\n", ep->name); for (i = 0; i < SNDRV_UMP_MAX_BLOCKS; i++) { bp = client->ump_info[i + 1]; if (bp && *bp->name) { snd_iprintf(buffer, " UMP Block %d: \"%s\" [%s]\n", i, bp->name, bp->active ? "Active" : "Inactive"); snd_iprintf(buffer, " Groups: %d-%d\n", bp->first_group + 1, bp->first_group + bp->num_groups); } } } #endif /* UMP-specific ioctls -- called directly without data copy */ static int snd_seq_ioctl_client_ump_info(struct snd_seq_client *caller, unsigned int cmd, unsigned long arg) { struct snd_seq_client_ump_info __user *argp = (struct snd_seq_client_ump_info __user *)arg; int client, type, err = 0; size_t size; void *p; if (get_user(client, &argp->client) || get_user(type, &argp->type)) return -EFAULT; if (cmd == SNDRV_SEQ_IOCTL_SET_CLIENT_UMP_INFO && caller->number != client) return -EPERM; if (type < 0 || type >= NUM_UMP_INFOS) return -EINVAL; if (type == SNDRV_SEQ_CLIENT_UMP_INFO_ENDPOINT) size = sizeof(struct snd_ump_endpoint_info); else size = sizeof(struct snd_ump_block_info); struct snd_seq_client *cptr __free(snd_seq_client) = client_load_and_use_ptr(client); if (!cptr) return -ENOENT; scoped_guard(mutex, &cptr->ioctl_mutex) { if (!cptr->midi_version) { err = -EBADFD; break; } if (cmd == SNDRV_SEQ_IOCTL_GET_CLIENT_UMP_INFO) { if (!cptr->ump_info) p = NULL; else p = cptr->ump_info[type]; if (!p) { err = -ENODEV; break; } if (copy_to_user(argp->info, p, size)) { err = -EFAULT; break; } } else { if (cptr->type != USER_CLIENT) { err = -EBADFD; break; } if (!cptr->ump_info) { cptr->ump_info = kcalloc(NUM_UMP_INFOS, sizeof(void *), GFP_KERNEL); if (!cptr->ump_info) { err = -ENOMEM; break; } } p = memdup_user(argp->info, size); if (IS_ERR(p)) { err = PTR_ERR(p); break; } kfree(cptr->ump_info[type]); terminate_ump_info_strings(p, type); cptr->ump_info[type] = p; } } if (!err && cmd == SNDRV_SEQ_IOCTL_SET_CLIENT_UMP_INFO) { if (type == SNDRV_SEQ_CLIENT_UMP_INFO_ENDPOINT) snd_seq_system_ump_notify(client, 0, SNDRV_SEQ_EVENT_UMP_EP_CHANGE, false); else snd_seq_system_ump_notify(client, type - 1, SNDRV_SEQ_EVENT_UMP_BLOCK_CHANGE, false); } return err; } #endif /* -------------------------------------------------------- */ static const struct ioctl_handler { unsigned int cmd; int (*func)(struct snd_seq_client *client, void *arg); } ioctl_handlers[] = { { SNDRV_SEQ_IOCTL_PVERSION, snd_seq_ioctl_pversion }, { SNDRV_SEQ_IOCTL_USER_PVERSION, snd_seq_ioctl_user_pversion }, { SNDRV_SEQ_IOCTL_CLIENT_ID, snd_seq_ioctl_client_id }, { SNDRV_SEQ_IOCTL_SYSTEM_INFO, snd_seq_ioctl_system_info }, { SNDRV_SEQ_IOCTL_RUNNING_MODE, snd_seq_ioctl_running_mode }, { SNDRV_SEQ_IOCTL_GET_CLIENT_INFO, snd_seq_ioctl_get_client_info }, { SNDRV_SEQ_IOCTL_SET_CLIENT_INFO, snd_seq_ioctl_set_client_info }, { SNDRV_SEQ_IOCTL_CREATE_PORT, snd_seq_ioctl_create_port }, { SNDRV_SEQ_IOCTL_DELETE_PORT, snd_seq_ioctl_delete_port }, { SNDRV_SEQ_IOCTL_GET_PORT_INFO, snd_seq_ioctl_get_port_info }, { SNDRV_SEQ_IOCTL_SET_PORT_INFO, snd_seq_ioctl_set_port_info }, { SNDRV_SEQ_IOCTL_SUBSCRIBE_PORT, snd_seq_ioctl_subscribe_port }, { SNDRV_SEQ_IOCTL_UNSUBSCRIBE_PORT, snd_seq_ioctl_unsubscribe_port }, { SNDRV_SEQ_IOCTL_CREATE_QUEUE, snd_seq_ioctl_create_queue }, { SNDRV_SEQ_IOCTL_DELETE_QUEUE, snd_seq_ioctl_delete_queue }, { SNDRV_SEQ_IOCTL_GET_QUEUE_INFO, snd_seq_ioctl_get_queue_info }, { SNDRV_SEQ_IOCTL_SET_QUEUE_INFO, snd_seq_ioctl_set_queue_info }, { SNDRV_SEQ_IOCTL_GET_NAMED_QUEUE, snd_seq_ioctl_get_named_queue }, { SNDRV_SEQ_IOCTL_GET_QUEUE_STATUS, snd_seq_ioctl_get_queue_status }, { SNDRV_SEQ_IOCTL_GET_QUEUE_TEMPO, snd_seq_ioctl_get_queue_tempo }, { SNDRV_SEQ_IOCTL_SET_QUEUE_TEMPO, snd_seq_ioctl_set_queue_tempo }, { SNDRV_SEQ_IOCTL_GET_QUEUE_TIMER, snd_seq_ioctl_get_queue_timer }, { SNDRV_SEQ_IOCTL_SET_QUEUE_TIMER, snd_seq_ioctl_set_queue_timer }, { SNDRV_SEQ_IOCTL_GET_QUEUE_CLIENT, snd_seq_ioctl_get_queue_client }, { SNDRV_SEQ_IOCTL_SET_QUEUE_CLIENT, snd_seq_ioctl_set_queue_client }, { SNDRV_SEQ_IOCTL_GET_CLIENT_POOL, snd_seq_ioctl_get_client_pool }, { SNDRV_SEQ_IOCTL_SET_CLIENT_POOL, snd_seq_ioctl_set_client_pool }, { SNDRV_SEQ_IOCTL_GET_SUBSCRIPTION, snd_seq_ioctl_get_subscription }, { SNDRV_SEQ_IOCTL_QUERY_NEXT_CLIENT, snd_seq_ioctl_query_next_client }, { SNDRV_SEQ_IOCTL_QUERY_NEXT_PORT, snd_seq_ioctl_query_next_port }, { SNDRV_SEQ_IOCTL_REMOVE_EVENTS, snd_seq_ioctl_remove_events }, { SNDRV_SEQ_IOCTL_QUERY_SUBS, snd_seq_ioctl_query_subs }, { 0, NULL }, }; static long snd_seq_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct snd_seq_client *client = file->private_data; /* To use kernel stack for ioctl data. */ union { int pversion; int client_id; struct snd_seq_system_info system_info; struct snd_seq_running_info running_info; struct snd_seq_client_info client_info; struct snd_seq_port_info port_info; struct snd_seq_port_subscribe port_subscribe; struct snd_seq_queue_info queue_info; struct snd_seq_queue_status queue_status; struct snd_seq_queue_tempo tempo; struct snd_seq_queue_timer queue_timer; struct snd_seq_queue_client queue_client; struct snd_seq_client_pool client_pool; struct snd_seq_remove_events remove_events; struct snd_seq_query_subs query_subs; } buf; const struct ioctl_handler *handler; unsigned long size; int err; if (snd_BUG_ON(!client)) return -ENXIO; #if IS_ENABLED(CONFIG_SND_SEQ_UMP) /* exception - handling large data */ switch (cmd) { case SNDRV_SEQ_IOCTL_GET_CLIENT_UMP_INFO: case SNDRV_SEQ_IOCTL_SET_CLIENT_UMP_INFO: return snd_seq_ioctl_client_ump_info(client, cmd, arg); } #endif for (handler = ioctl_handlers; handler->cmd > 0; ++handler) { if (handler->cmd == cmd) break; } if (handler->cmd == 0) return -ENOTTY; memset(&buf, 0, sizeof(buf)); /* * All of ioctl commands for ALSA sequencer get an argument of size * within 13 bits. We can safely pick up the size from the command. */ size = _IOC_SIZE(handler->cmd); if (handler->cmd & IOC_IN) { if (copy_from_user(&buf, (const void __user *)arg, size)) return -EFAULT; } scoped_guard(mutex, &client->ioctl_mutex) { err = handler->func(client, &buf); } if (err >= 0) { /* Some commands includes a bug in 'dir' field. */ if (handler->cmd == SNDRV_SEQ_IOCTL_SET_QUEUE_CLIENT || handler->cmd == SNDRV_SEQ_IOCTL_SET_CLIENT_POOL || (handler->cmd & IOC_OUT)) if (copy_to_user((void __user *)arg, &buf, size)) return -EFAULT; } return err; } #ifdef CONFIG_COMPAT #include "seq_compat.c" #else #define snd_seq_ioctl_compat NULL #endif /* -------------------------------------------------------- */ /* exported to kernel modules */ int snd_seq_create_kernel_client(struct snd_card *card, int client_index, const char *name_fmt, ...) { struct snd_seq_client *client; va_list args; if (snd_BUG_ON(in_interrupt())) return -EBUSY; if (card && client_index >= SNDRV_SEQ_CLIENTS_PER_CARD) return -EINVAL; if (card == NULL && client_index >= SNDRV_SEQ_GLOBAL_CLIENTS) return -EINVAL; scoped_guard(mutex, &register_mutex) { if (card) { client_index += SNDRV_SEQ_GLOBAL_CLIENTS + card->number * SNDRV_SEQ_CLIENTS_PER_CARD; if (client_index >= SNDRV_SEQ_DYNAMIC_CLIENTS_BEGIN) client_index = -1; } /* empty write queue as default */ client = seq_create_client1(client_index, 0); if (client == NULL) return -EBUSY; /* failure code */ usage_alloc(&client_usage, 1); client->accept_input = 1; client->accept_output = 1; client->data.kernel.card = card; client->user_pversion = SNDRV_SEQ_VERSION; va_start(args, name_fmt); vsnprintf(client->name, sizeof(client->name), name_fmt, args); va_end(args); client->type = KERNEL_CLIENT; } /* make others aware this new client */ snd_seq_system_client_ev_client_start(client->number); /* return client number to caller */ return client->number; } EXPORT_SYMBOL(snd_seq_create_kernel_client); /* exported to kernel modules */ int snd_seq_delete_kernel_client(int client) { struct snd_seq_client *ptr; if (snd_BUG_ON(in_interrupt())) return -EBUSY; ptr = clientptr(client); if (ptr == NULL) return -EINVAL; seq_free_client(ptr); kfree(ptr); return 0; } EXPORT_SYMBOL(snd_seq_delete_kernel_client); /* * exported, called by kernel clients to enqueue events (w/o blocking) * * RETURN VALUE: zero if succeed, negative if error */ int snd_seq_kernel_client_enqueue(int client, struct snd_seq_event *ev, struct file *file, bool blocking) { if (snd_BUG_ON(!ev)) return -EINVAL; if (!snd_seq_ev_is_ump(ev)) { if (ev->type == SNDRV_SEQ_EVENT_NONE) return 0; /* ignore this */ if (ev->type == SNDRV_SEQ_EVENT_KERNEL_ERROR) return -EINVAL; /* quoted events can't be enqueued */ } /* fill in client number */ ev->source.client = client; if (check_event_type_and_length(ev)) return -EINVAL; struct snd_seq_client *cptr __free(snd_seq_client) = client_load_and_use_ptr(client); if (cptr == NULL) return -EINVAL; if (!cptr->accept_output) { return -EPERM; } else { /* send it */ guard(mutex)(&cptr->ioctl_mutex); return snd_seq_client_enqueue_event(cptr, ev, file, blocking, false, 0, &cptr->ioctl_mutex); } } EXPORT_SYMBOL(snd_seq_kernel_client_enqueue); /* * exported, called by kernel clients to dispatch events directly to other * clients, bypassing the queues. Event time-stamp will be updated. * * RETURN VALUE: negative = delivery failed, * zero, or positive: the number of delivered events */ int snd_seq_kernel_client_dispatch(int client, struct snd_seq_event * ev, int atomic, int hop) { if (snd_BUG_ON(!ev)) return -EINVAL; /* fill in client number */ ev->queue = SNDRV_SEQ_QUEUE_DIRECT; ev->source.client = client; if (check_event_type_and_length(ev)) return -EINVAL; struct snd_seq_client *cptr __free(snd_seq_client) = snd_seq_client_use_ptr(client); if (cptr == NULL) return -EINVAL; if (!cptr->accept_output) return -EPERM; else return snd_seq_deliver_event(cptr, ev, atomic, hop); } EXPORT_SYMBOL(snd_seq_kernel_client_dispatch); static int call_seq_client_ctl(struct snd_seq_client *client, unsigned int cmd, void *arg) { const struct ioctl_handler *handler; for (handler = ioctl_handlers; handler->cmd > 0; ++handler) { if (handler->cmd == cmd) return handler->func(client, arg); } pr_debug("ALSA: seq unknown ioctl() 0x%x (type='%c', number=0x%02x)\n", cmd, _IOC_TYPE(cmd), _IOC_NR(cmd)); return -ENOTTY; } /** * snd_seq_kernel_client_ctl - operate a command for a client with data in * kernel space. * @clientid: A numerical ID for a client. * @cmd: An ioctl(2) command for ALSA sequencer operation. * @arg: A pointer to data in kernel space. * * Against its name, both kernel/application client can be handled by this * kernel API. A pointer of 'arg' argument should be in kernel space. * * Return: 0 at success. Negative error code at failure. */ int snd_seq_kernel_client_ctl(int clientid, unsigned int cmd, void *arg) { struct snd_seq_client *client; client = clientptr(clientid); if (client == NULL) return -ENXIO; return call_seq_client_ctl(client, cmd, arg); } EXPORT_SYMBOL(snd_seq_kernel_client_ctl); /* a similar like above but taking locks; used only from OSS sequencer layer */ int snd_seq_kernel_client_ioctl(int clientid, unsigned int cmd, void *arg) { struct snd_seq_client *client __free(snd_seq_client) = client_load_and_use_ptr(clientid); if (!client) return -ENXIO; guard(mutex)(&client->ioctl_mutex); return call_seq_client_ctl(client, cmd, arg); } EXPORT_SYMBOL_GPL(snd_seq_kernel_client_ioctl); /* exported (for OSS emulator) */ int snd_seq_kernel_client_write_poll(int clientid, struct file *file, poll_table *wait) { struct snd_seq_client *client; client = clientptr(clientid); if (client == NULL) return -ENXIO; if (snd_seq_pool_poll_wait(client->pool, file, wait)) return 1; return 0; } EXPORT_SYMBOL(snd_seq_kernel_client_write_poll); /* get a sequencer client object; for internal use from a kernel client */ struct snd_seq_client *snd_seq_kernel_client_get(int id) { return snd_seq_client_use_ptr(id); } EXPORT_SYMBOL_GPL(snd_seq_kernel_client_get); /* put a sequencer client object; for internal use from a kernel client */ void snd_seq_kernel_client_put(struct snd_seq_client *cptr) { if (cptr) snd_seq_client_unref(cptr); } EXPORT_SYMBOL_GPL(snd_seq_kernel_client_put); /*---------------------------------------------------------------------------*/ #ifdef CONFIG_SND_PROC_FS /* * /proc interface */ static void snd_seq_info_dump_subscribers(struct snd_info_buffer *buffer, struct snd_seq_port_subs_info *group, int is_src, char *msg) { struct list_head *p; struct snd_seq_subscribers *s; int count = 0; guard(rwsem_read)(&group->list_mutex); if (list_empty(&group->list_head)) return; snd_iprintf(buffer, msg); list_for_each(p, &group->list_head) { if (is_src) s = list_entry(p, struct snd_seq_subscribers, src_list); else s = list_entry(p, struct snd_seq_subscribers, dest_list); if (count++) snd_iprintf(buffer, ", "); snd_iprintf(buffer, "%d:%d", is_src ? s->info.dest.client : s->info.sender.client, is_src ? s->info.dest.port : s->info.sender.port); if (s->info.flags & SNDRV_SEQ_PORT_SUBS_TIMESTAMP) snd_iprintf(buffer, "[%c:%d]", ((s->info.flags & SNDRV_SEQ_PORT_SUBS_TIME_REAL) ? 'r' : 't'), s->info.queue); if (group->exclusive) snd_iprintf(buffer, "[ex]"); } snd_iprintf(buffer, "\n"); } #define FLAG_PERM_RD(perm) ((perm) & SNDRV_SEQ_PORT_CAP_READ ? ((perm) & SNDRV_SEQ_PORT_CAP_SUBS_READ ? 'R' : 'r') : '-') #define FLAG_PERM_WR(perm) ((perm) & SNDRV_SEQ_PORT_CAP_WRITE ? ((perm) & SNDRV_SEQ_PORT_CAP_SUBS_WRITE ? 'W' : 'w') : '-') #define FLAG_PERM_EX(perm) ((perm) & SNDRV_SEQ_PORT_CAP_NO_EXPORT ? '-' : 'e') #define FLAG_PERM_DUPLEX(perm) ((perm) & SNDRV_SEQ_PORT_CAP_DUPLEX ? 'X' : '-') static const char *port_direction_name(unsigned char dir) { static const char *names[4] = { "-", "In", "Out", "In/Out" }; if (dir > SNDRV_SEQ_PORT_DIR_BIDIRECTION) return "Invalid"; return names[dir]; } static void snd_seq_info_dump_ports(struct snd_info_buffer *buffer, struct snd_seq_client *client) { struct snd_seq_client_port *p; guard(mutex)(&client->ports_mutex); list_for_each_entry(p, &client->ports_list_head, list) { if (p->capability & SNDRV_SEQ_PORT_CAP_INACTIVE) continue; snd_iprintf(buffer, " Port %3d : \"%s\" (%c%c%c%c) [%s]", p->addr.port, p->name, FLAG_PERM_RD(p->capability), FLAG_PERM_WR(p->capability), FLAG_PERM_EX(p->capability), FLAG_PERM_DUPLEX(p->capability), port_direction_name(p->direction)); #if IS_ENABLED(CONFIG_SND_SEQ_UMP) if (snd_seq_client_is_midi2(client) && p->is_midi1) snd_iprintf(buffer, " [MIDI1]"); #endif snd_iprintf(buffer, "\n"); snd_seq_info_dump_subscribers(buffer, &p->c_src, 1, " Connecting To: "); snd_seq_info_dump_subscribers(buffer, &p->c_dest, 0, " Connected From: "); } } static const char *midi_version_string(unsigned int version) { switch (version) { case SNDRV_SEQ_CLIENT_LEGACY_MIDI: return "Legacy"; case SNDRV_SEQ_CLIENT_UMP_MIDI_1_0: return "UMP MIDI1"; case SNDRV_SEQ_CLIENT_UMP_MIDI_2_0: return "UMP MIDI2"; default: return "Unknown"; } } /* exported to seq_info.c */ void snd_seq_info_clients_read(struct snd_info_entry *entry, struct snd_info_buffer *buffer) { int c; snd_iprintf(buffer, "Client info\n"); snd_iprintf(buffer, " cur clients : %d\n", client_usage.cur); snd_iprintf(buffer, " peak clients : %d\n", client_usage.peak); snd_iprintf(buffer, " max clients : %d\n", SNDRV_SEQ_MAX_CLIENTS); snd_iprintf(buffer, "\n"); /* list the client table */ for (c = 0; c < SNDRV_SEQ_MAX_CLIENTS; c++) { struct snd_seq_client *client __free(snd_seq_client) = client_load_and_use_ptr(c); if (client == NULL) continue; if (client->type == NO_CLIENT) continue; guard(mutex)(&client->ioctl_mutex); snd_iprintf(buffer, "Client %3d : \"%s\" [%s %s]\n", c, client->name, client->type == USER_CLIENT ? "User" : "Kernel", midi_version_string(client->midi_version)); #if IS_ENABLED(CONFIG_SND_SEQ_UMP) dump_ump_info(buffer, client); #endif snd_seq_info_dump_ports(buffer, client); if (snd_seq_write_pool_allocated(client)) { snd_iprintf(buffer, " Output pool :\n"); snd_seq_info_pool(buffer, client->pool, " "); } if (client->type == USER_CLIENT && client->data.user.fifo && client->data.user.fifo->pool) { snd_iprintf(buffer, " Input pool :\n"); snd_seq_info_pool(buffer, client->data.user.fifo->pool, " "); } } } #endif /* CONFIG_SND_PROC_FS */ /*---------------------------------------------------------------------------*/ /* * REGISTRATION PART */ static const struct file_operations snd_seq_f_ops = { .owner = THIS_MODULE, .read = snd_seq_read, .write = snd_seq_write, .open = snd_seq_open, .release = snd_seq_release, .poll = snd_seq_poll, .unlocked_ioctl = snd_seq_ioctl, .compat_ioctl = snd_seq_ioctl_compat, }; static struct device *seq_dev; /* * register sequencer device */ int __init snd_sequencer_device_init(void) { int err; err = snd_device_alloc(&seq_dev, NULL); if (err < 0) return err; dev_set_name(seq_dev, "seq"); scoped_guard(mutex, &register_mutex) { err = snd_register_device(SNDRV_DEVICE_TYPE_SEQUENCER, NULL, 0, &snd_seq_f_ops, NULL, seq_dev); } if (err < 0) { put_device(seq_dev); return err; } return 0; } /* * unregister sequencer device */ void snd_sequencer_device_done(void) { snd_unregister_device(seq_dev); put_device(seq_dev); }
12 243 243 358 340 23 15 357 200 356 358 13 13 5 10 17 488 487 16 397 409 483 366 3 16 16 16 4 9 16 388 388 3 2 1 1 310 308 12 12 12 12 6 5 1 4 1 1 1 1 1 1 1 1 100 97 101 97 13 93 18 1 1 1 1 1 1 1 1 1 1 13 13 13 13 13 13 9 8 5 1 7 5 5 5 5 5 12 13 13 12 12 12 12 12 6 6 93 11 2 96 93 6 30 29 30 14 10 7 82 19 6 82 78 8 1 24 5 15 15 6 14 15 11 15 15 32 33 17 33 10 33 29 28 12 1 4 13 5 9 1 2 2 5 7 5 3 3 3 3 3 5 5 5 1 4 5 2 5 2 5 33 2 3 29 29 3 3 22 22 17 2 19 19 17 1 19 4 14 3 3 2 7 7 2 4 11 7 5 11 10 1 11 1 275 2 39 1 6 13 2 233 15 235 13 3 238 237 11 1 11 240 1 1 1 228 1 59 228 229 1 5 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 // SPDX-License-Identifier: GPL-2.0 /* * * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved. * */ #include <linux/fiemap.h> #include <linux/fs.h> #include <linux/minmax.h> #include <linux/vmalloc.h> #include "debug.h" #include "ntfs.h" #include "ntfs_fs.h" #ifdef CONFIG_NTFS3_LZX_XPRESS #include "lib/lib.h" #endif static struct mft_inode *ni_ins_mi(struct ntfs_inode *ni, struct rb_root *tree, CLST ino, struct rb_node *ins) { struct rb_node **p = &tree->rb_node; struct rb_node *pr = NULL; while (*p) { struct mft_inode *mi; pr = *p; mi = rb_entry(pr, struct mft_inode, node); if (mi->rno > ino) p = &pr->rb_left; else if (mi->rno < ino) p = &pr->rb_right; else return mi; } if (!ins) return NULL; rb_link_node(ins, pr, p); rb_insert_color(ins, tree); return rb_entry(ins, struct mft_inode, node); } /* * ni_find_mi - Find mft_inode by record number. */ static struct mft_inode *ni_find_mi(struct ntfs_inode *ni, CLST rno) { return ni_ins_mi(ni, &ni->mi_tree, rno, NULL); } /* * ni_add_mi - Add new mft_inode into ntfs_inode. */ static void ni_add_mi(struct ntfs_inode *ni, struct mft_inode *mi) { ni_ins_mi(ni, &ni->mi_tree, mi->rno, &mi->node); } /* * ni_remove_mi - Remove mft_inode from ntfs_inode. */ void ni_remove_mi(struct ntfs_inode *ni, struct mft_inode *mi) { rb_erase(&mi->node, &ni->mi_tree); } /* * ni_std - Return: Pointer into std_info from primary record. */ struct ATTR_STD_INFO *ni_std(struct ntfs_inode *ni) { const struct ATTRIB *attr; attr = mi_find_attr(ni, &ni->mi, NULL, ATTR_STD, NULL, 0, NULL); return attr ? resident_data_ex(attr, sizeof(struct ATTR_STD_INFO)) : NULL; } /* * ni_std5 * * Return: Pointer into std_info from primary record. */ struct ATTR_STD_INFO5 *ni_std5(struct ntfs_inode *ni) { const struct ATTRIB *attr; attr = mi_find_attr(ni, &ni->mi, NULL, ATTR_STD, NULL, 0, NULL); return attr ? resident_data_ex(attr, sizeof(struct ATTR_STD_INFO5)) : NULL; } /* * ni_clear - Clear resources allocated by ntfs_inode. */ void ni_clear(struct ntfs_inode *ni) { struct rb_node *node; if (!ni->vfs_inode.i_nlink && ni->mi.mrec && is_rec_inuse(ni->mi.mrec) && !(ni->mi.sbi->flags & NTFS_FLAGS_LOG_REPLAYING)) ni_delete_all(ni); al_destroy(ni); for (node = rb_first(&ni->mi_tree); node;) { struct rb_node *next = rb_next(node); struct mft_inode *mi = rb_entry(node, struct mft_inode, node); rb_erase(node, &ni->mi_tree); mi_put(mi); node = next; } /* Bad inode always has mode == S_IFREG. */ if (ni->ni_flags & NI_FLAG_DIR) indx_clear(&ni->dir); else { run_close(&ni->file.run); ntfs_sub_da(ni->mi.sbi, run_len(&ni->file.run_da)); run_close(&ni->file.run_da); #ifdef CONFIG_NTFS3_LZX_XPRESS if (ni->file.offs_folio) { /* On-demand allocated page for offsets. */ folio_put(ni->file.offs_folio); ni->file.offs_folio = NULL; } #endif } mi_clear(&ni->mi); } /* * ni_load_mi_ex - Find mft_inode by record number. */ int ni_load_mi_ex(struct ntfs_inode *ni, CLST rno, struct mft_inode **mi) { int err; struct mft_inode *r; r = ni_find_mi(ni, rno); if (r) goto out; err = mi_get(ni->mi.sbi, rno, &r); if (err) { _ntfs_bad_inode(&ni->vfs_inode); return err; } ni_add_mi(ni, r); out: if (mi) *mi = r; return 0; } /* * ni_load_mi - Load mft_inode corresponded list_entry. */ int ni_load_mi(struct ntfs_inode *ni, const struct ATTR_LIST_ENTRY *le, struct mft_inode **mi) { CLST rno; if (!le) { *mi = &ni->mi; return 0; } rno = ino_get(&le->ref); if (rno == ni->mi.rno) { *mi = &ni->mi; return 0; } return ni_load_mi_ex(ni, rno, mi); } /* * ni_find_attr * * Return: Attribute and record this attribute belongs to. */ struct ATTRIB *ni_find_attr(struct ntfs_inode *ni, struct ATTRIB *attr, struct ATTR_LIST_ENTRY **le_o, enum ATTR_TYPE type, const __le16 *name, u8 name_len, const CLST *vcn, struct mft_inode **mi) { struct ATTR_LIST_ENTRY *le; struct mft_inode *m; if (!ni->attr_list.size || (!name_len && (type == ATTR_LIST || type == ATTR_STD))) { if (le_o) *le_o = NULL; if (mi) *mi = &ni->mi; /* Look for required attribute in primary record. */ return mi_find_attr(ni, &ni->mi, attr, type, name, name_len, NULL); } /* First look for list entry of required type. */ le = al_find_ex(ni, le_o ? *le_o : NULL, type, name, name_len, vcn); if (!le) return NULL; if (le_o) *le_o = le; /* Load record that contains this attribute. */ if (ni_load_mi(ni, le, &m)) return NULL; /* Look for required attribute. */ attr = mi_find_attr(ni, m, NULL, type, name, name_len, &le->id); if (!attr) goto out; if (!attr->non_res) { if (vcn && *vcn) goto out; } else if (!vcn) { if (attr->nres.svcn) goto out; } else if (le64_to_cpu(attr->nres.svcn) > *vcn || *vcn > le64_to_cpu(attr->nres.evcn)) { goto out; } if (mi) *mi = m; return attr; out: _ntfs_bad_inode(&ni->vfs_inode); return NULL; } /* * ni_enum_attr_ex - Enumerates attributes in ntfs_inode. */ struct ATTRIB *ni_enum_attr_ex(struct ntfs_inode *ni, struct ATTRIB *attr, struct ATTR_LIST_ENTRY **le, struct mft_inode **mi) { struct mft_inode *mi2; struct ATTR_LIST_ENTRY *le2; /* Do we have an attribute list? */ if (!ni->attr_list.size) { *le = NULL; if (mi) *mi = &ni->mi; /* Enum attributes in primary record. */ return mi_enum_attr(ni, &ni->mi, attr); } /* Get next list entry. */ le2 = *le = al_enumerate(ni, attr ? *le : NULL); if (!le2) return NULL; /* Load record that contains the required attribute. */ if (ni_load_mi(ni, le2, &mi2)) return NULL; if (mi) *mi = mi2; /* Find attribute in loaded record. */ return rec_find_attr_le(ni, mi2, le2); } /* * ni_load_all_mi - Load all subrecords. */ int ni_load_all_mi(struct ntfs_inode *ni) { int err; struct ATTR_LIST_ENTRY *le; if (!ni->attr_list.size) return 0; le = NULL; while ((le = al_enumerate(ni, le))) { CLST rno = ino_get(&le->ref); if (rno == ni->mi.rno) continue; err = ni_load_mi_ex(ni, rno, NULL); if (err) return err; } return 0; } /* * ni_add_subrecord - Allocate + format + attach a new subrecord. */ bool ni_add_subrecord(struct ntfs_inode *ni, CLST rno, struct mft_inode **mi) { struct mft_inode *m; m = kzalloc_obj(struct mft_inode, GFP_NOFS); if (!m) return false; if (mi_format_new(m, ni->mi.sbi, rno, 0, ni->mi.rno == MFT_REC_MFT)) { mi_put(m); return false; } mi_get_ref(&ni->mi, &m->mrec->parent_ref); *mi = ni_ins_mi(ni, &ni->mi_tree, m->rno, &m->node); if (*mi != m) mi_put(m); return true; } /* * ni_remove_attr - Remove all attributes for the given type/name/id. */ int ni_remove_attr(struct ntfs_inode *ni, enum ATTR_TYPE type, const __le16 *name, u8 name_len, bool base_only, const __le16 *id) { int err; struct ATTRIB *attr; struct ATTR_LIST_ENTRY *le; struct mft_inode *mi; u32 type_in; int diff; if (base_only || type == ATTR_LIST || !ni->attr_list.size) { attr = mi_find_attr(ni, &ni->mi, NULL, type, name, name_len, id); if (!attr) return -ENOENT; mi_remove_attr(ni, &ni->mi, attr); return 0; } type_in = le32_to_cpu(type); le = NULL; for (;;) { le = al_enumerate(ni, le); if (!le) return 0; next_le2: diff = le32_to_cpu(le->type) - type_in; if (diff < 0) continue; if (diff > 0) return 0; if (le->name_len != name_len) continue; if (name_len && memcmp(le_name(le), name, name_len * sizeof(short))) continue; if (id && le->id != *id) continue; err = ni_load_mi(ni, le, &mi); if (err) return err; al_remove_le(ni, le); attr = mi_find_attr(ni, mi, NULL, type, name, name_len, id); if (!attr) return -ENOENT; mi_remove_attr(ni, mi, attr); if (PtrOffset(ni->attr_list.le, le) >= ni->attr_list.size) return 0; goto next_le2; } } /* * ni_ins_new_attr - Insert the attribute into record. * * Return: Not full constructed attribute or NULL if not possible to create. */ static struct ATTRIB * ni_ins_new_attr(struct ntfs_inode *ni, struct mft_inode *mi, struct ATTR_LIST_ENTRY *le, enum ATTR_TYPE type, const __le16 *name, u8 name_len, u32 asize, u16 name_off, CLST svcn, struct ATTR_LIST_ENTRY **ins_le) { int err; struct ATTRIB *attr; bool le_added = false; struct MFT_REF ref; mi_get_ref(mi, &ref); if (type != ATTR_LIST && !le && ni->attr_list.size) { err = al_add_le(ni, type, name, name_len, svcn, cpu_to_le16(-1), &ref, &le); if (err) { /* No memory or no space. */ return ERR_PTR(err); } le_added = true; /* * al_add_le -> attr_set_size (list) -> ni_expand_list * which moves some attributes out of primary record * this means that name may point into moved memory * reinit 'name' from le. */ name = le->name; } attr = mi_insert_attr(ni, mi, type, name, name_len, asize, name_off); if (!attr) { if (le_added) al_remove_le(ni, le); return NULL; } if (type == ATTR_LIST) { /* Attr list is not in list entry array. */ goto out; } if (!le) goto out; /* Update ATTRIB Id and record reference. */ le->id = attr->id; ni->attr_list.dirty = true; le->ref = ref; out: if (ins_le) *ins_le = le; return attr; } /* * ni_repack * * Random write access to sparsed or compressed file may result to * not optimized packed runs. * Here is the place to optimize it. */ static int ni_repack(struct ntfs_inode *ni) { #if 1 return 0; #else int err = 0; struct ntfs_sb_info *sbi = ni->mi.sbi; struct mft_inode *mi, *mi_p = NULL; struct ATTRIB *attr = NULL, *attr_p; struct ATTR_LIST_ENTRY *le = NULL, *le_p; CLST alloc = 0; u8 cluster_bits = sbi->cluster_bits; CLST svcn, evcn = 0, svcn_p, evcn_p, next_svcn; u32 roff, rs = sbi->record_size; struct runs_tree run; run_init(&run); while ((attr = ni_enum_attr_ex(ni, attr, &le, &mi))) { if (!attr->non_res) continue; svcn = le64_to_cpu(attr->nres.svcn); if (svcn != le64_to_cpu(le->vcn)) { err = -EINVAL; break; } if (!svcn) { alloc = le64_to_cpu(attr->nres.alloc_size) >> cluster_bits; mi_p = NULL; } else if (svcn != evcn + 1) { err = -EINVAL; break; } evcn = le64_to_cpu(attr->nres.evcn); if (svcn > evcn + 1) { err = -EINVAL; break; } if (!mi_p) { /* Do not try if not enough free space. */ if (le32_to_cpu(mi->mrec->used) + 8 >= rs) continue; /* Do not try if last attribute segment. */ if (evcn + 1 == alloc) continue; run_close(&run); } roff = le16_to_cpu(attr->nres.run_off); if (roff > le32_to_cpu(attr->size)) { err = -EINVAL; break; } err = run_unpack(&run, sbi, ni->mi.rno, svcn, evcn, svcn, Add2Ptr(attr, roff), le32_to_cpu(attr->size) - roff); if (err < 0) break; if (!mi_p) { mi_p = mi; attr_p = attr; svcn_p = svcn; evcn_p = evcn; le_p = le; err = 0; continue; } /* * Run contains data from two records: mi_p and mi * Try to pack in one. */ err = mi_pack_runs(mi_p, attr_p, &run, evcn + 1 - svcn_p); if (err) break; next_svcn = le64_to_cpu(attr_p->nres.evcn) + 1; if (next_svcn >= evcn + 1) { /* We can remove this attribute segment. */ al_remove_le(ni, le); mi_remove_attr(NULL, mi, attr); le = le_p; continue; } attr->nres.svcn = le->vcn = cpu_to_le64(next_svcn); mi->dirty = true; ni->attr_list.dirty = true; if (evcn + 1 == alloc) { err = mi_pack_runs(mi, attr, &run, evcn + 1 - next_svcn); if (err) break; mi_p = NULL; } else { mi_p = mi; attr_p = attr; svcn_p = next_svcn; evcn_p = evcn; le_p = le; run_truncate_head(&run, next_svcn); } } if (err) { ntfs_inode_warn(&ni->vfs_inode, "repack problem"); ntfs_set_state(sbi, NTFS_DIRTY_ERROR); /* Pack loaded but not packed runs. */ if (mi_p) mi_pack_runs(mi_p, attr_p, &run, evcn_p + 1 - svcn_p); } run_close(&run); return err; #endif } /* * ni_try_remove_attr_list * * Can we remove attribute list? * Check the case when primary record contains enough space for all attributes. */ static int ni_try_remove_attr_list(struct ntfs_inode *ni) { int err = 0; struct ntfs_sb_info *sbi = ni->mi.sbi; struct ATTRIB *attr, *attr_list, *attr_ins; struct ATTR_LIST_ENTRY *le; struct mft_inode *mi; u32 asize, free; struct MFT_REF ref; struct MFT_REC *mrec; __le16 id; if (!ni->attr_list.dirty) return 0; err = ni_repack(ni); if (err) return err; attr_list = mi_find_attr(ni, &ni->mi, NULL, ATTR_LIST, NULL, 0, NULL); if (!attr_list) return 0; asize = le32_to_cpu(attr_list->size); /* Free space in primary record without attribute list. */ free = sbi->record_size - le32_to_cpu(ni->mi.mrec->used) + asize; mi_get_ref(&ni->mi, &ref); le = NULL; while ((le = al_enumerate(ni, le))) { if (!memcmp(&le->ref, &ref, sizeof(ref))) continue; if (le->vcn) return 0; mi = ni_find_mi(ni, ino_get(&le->ref)); if (!mi) return 0; attr = mi_find_attr(ni, mi, NULL, le->type, le_name(le), le->name_len, &le->id); if (!attr) return 0; asize = le32_to_cpu(attr->size); if (asize > free) return 0; free -= asize; } /* Make a copy of primary record to restore if error. */ mrec = kmemdup(ni->mi.mrec, sbi->record_size, GFP_NOFS); if (!mrec) return 0; /* Not critical. */ /* It seems that attribute list can be removed from primary record. */ mi_remove_attr(NULL, &ni->mi, attr_list); /* * Repeat the cycle above and copy all attributes to primary record. * Do not remove original attributes from subrecords! * It should be success! */ le = NULL; while ((le = al_enumerate(ni, le))) { if (!memcmp(&le->ref, &ref, sizeof(ref))) continue; mi = ni_find_mi(ni, ino_get(&le->ref)); if (!mi) { /* Should never happened, 'cause already checked. */ goto out; } attr = mi_find_attr(ni, mi, NULL, le->type, le_name(le), le->name_len, &le->id); if (!attr) { /* Should never happened, 'cause already checked. */ goto out; } asize = le32_to_cpu(attr->size); /* Insert into primary record. */ attr_ins = mi_insert_attr(ni, &ni->mi, le->type, le_name(le), le->name_len, asize, le16_to_cpu(attr->name_off)); if (!attr_ins) { /* * No space in primary record (already checked). */ goto out; } /* Copy all except id. */ id = attr_ins->id; memcpy(attr_ins, attr, asize); attr_ins->id = id; } /* * Repeat the cycle above and remove all attributes from subrecords. */ le = NULL; while ((le = al_enumerate(ni, le))) { if (!memcmp(&le->ref, &ref, sizeof(ref))) continue; mi = ni_find_mi(ni, ino_get(&le->ref)); if (!mi) continue; attr = mi_find_attr(ni, mi, NULL, le->type, le_name(le), le->name_len, &le->id); if (!attr) continue; /* Remove from original record. */ mi_remove_attr(NULL, mi, attr); } run_deallocate(sbi, &ni->attr_list.run, true); run_close(&ni->attr_list.run); ni->attr_list.size = 0; kvfree(ni->attr_list.le); ni->attr_list.le = NULL; ni->attr_list.dirty = false; kfree(mrec); return 0; out: /* Restore primary record. */ swap(mrec, ni->mi.mrec); kfree(mrec); return 0; } /* * ni_create_attr_list - Generates an attribute list for this primary record. */ int ni_create_attr_list(struct ntfs_inode *ni) { struct ntfs_sb_info *sbi = ni->mi.sbi; int err; u32 lsize; struct ATTRIB *attr; struct ATTRIB *arr_move[7]; struct ATTR_LIST_ENTRY *le, *le_b[7]; struct MFT_REC *rec; bool is_mft; CLST rno = 0; struct mft_inode *mi; u32 free_b, nb, to_free, rs; u16 sz; is_mft = ni->mi.rno == MFT_REC_MFT; rec = ni->mi.mrec; rs = sbi->record_size; /* * Skip estimating exact memory requirement. * Looks like one record_size is always enough. */ le = kzalloc(al_aligned(rs), GFP_NOFS); if (!le) return -ENOMEM; mi_get_ref(&ni->mi, &le->ref); ni->attr_list.le = le; attr = NULL; nb = 0; free_b = 0; attr = NULL; for (; (attr = mi_enum_attr(ni, &ni->mi, attr)); le = Add2Ptr(le, sz)) { sz = le_size(attr->name_len); le->type = attr->type; le->size = cpu_to_le16(sz); le->name_len = attr->name_len; le->name_off = offsetof(struct ATTR_LIST_ENTRY, name); le->vcn = 0; if (le != ni->attr_list.le) le->ref = ni->attr_list.le->ref; le->id = attr->id; if (attr->name_len) memcpy(le->name, attr_name(attr), sizeof(short) * attr->name_len); else if (attr->type == ATTR_STD) continue; else if (attr->type == ATTR_LIST) continue; else if (is_mft && attr->type == ATTR_DATA) continue; if (!nb || nb < ARRAY_SIZE(arr_move)) { le_b[nb] = le; arr_move[nb++] = attr; free_b += le32_to_cpu(attr->size); } } lsize = PtrOffset(ni->attr_list.le, le); ni->attr_list.size = lsize; to_free = le32_to_cpu(rec->used) + lsize + SIZEOF_RESIDENT; if (to_free <= rs) { to_free = 0; } else { to_free -= rs; if (to_free > free_b) { err = -EINVAL; goto out; } } /* Allocate child MFT. */ err = ntfs_look_free_mft(sbi, &rno, is_mft, ni, &mi); if (err) goto out; err = -EINVAL; /* Call mi_remove_attr() in reverse order to keep pointers 'arr_move' valid. */ while (to_free > 0) { struct ATTRIB *b = arr_move[--nb]; u32 asize = le32_to_cpu(b->size); u16 name_off = le16_to_cpu(b->name_off); attr = mi_insert_attr(ni, mi, b->type, Add2Ptr(b, name_off), b->name_len, asize, name_off); if (!attr) goto out; mi_get_ref(mi, &le_b[nb]->ref); le_b[nb]->id = attr->id; /* Copy all except id. */ memcpy(attr, b, asize); attr->id = le_b[nb]->id; /* Remove from primary record. */ if (!mi_remove_attr(NULL, &ni->mi, b)) goto out; if (to_free <= asize) break; to_free -= asize; if (!nb) goto out; } attr = mi_insert_attr(ni, &ni->mi, ATTR_LIST, NULL, 0, lsize + SIZEOF_RESIDENT, SIZEOF_RESIDENT); if (!attr) goto out; attr->non_res = 0; attr->flags = 0; attr->res.data_size = cpu_to_le32(lsize); attr->res.data_off = SIZEOF_RESIDENT_LE; attr->res.flags = 0; attr->res.res = 0; memcpy(resident_data_ex(attr, lsize), ni->attr_list.le, lsize); ni->attr_list.dirty = false; mark_inode_dirty(&ni->vfs_inode); return 0; out: kvfree(ni->attr_list.le); ni->attr_list.le = NULL; ni->attr_list.size = 0; return err; } /* * ni_ins_attr_ext - Add an external attribute to the ntfs_inode. */ static int ni_ins_attr_ext(struct ntfs_inode *ni, struct ATTR_LIST_ENTRY *le, enum ATTR_TYPE type, const __le16 *name, u8 name_len, u32 asize, CLST svcn, u16 name_off, bool force_ext, struct ATTRIB **ins_attr, struct mft_inode **ins_mi, struct ATTR_LIST_ENTRY **ins_le) { struct ATTRIB *attr; struct mft_inode *mi; CLST rno; u64 vbo; struct rb_node *node; int err; bool is_mft, is_mft_data; struct ntfs_sb_info *sbi = ni->mi.sbi; is_mft = ni->mi.rno == MFT_REC_MFT; is_mft_data = is_mft && type == ATTR_DATA && !name_len; if (asize > sbi->max_bytes_per_attr) { err = -EINVAL; goto out; } /* * Standard information and attr_list cannot be made external. * The Log File cannot have any external attributes. */ if (type == ATTR_STD || type == ATTR_LIST || ni->mi.rno == MFT_REC_LOG) { err = -EINVAL; goto out; } /* Create attribute list if it is not already existed. */ if (!ni->attr_list.size) { err = ni_create_attr_list(ni); if (err) goto out; } vbo = is_mft_data ? ((u64)svcn << sbi->cluster_bits) : 0; if (force_ext) goto insert_ext; /* Load all subrecords into memory. */ err = ni_load_all_mi(ni); if (err) goto out; /* Check each of loaded subrecord. */ for (node = rb_first(&ni->mi_tree); node; node = rb_next(node)) { mi = rb_entry(node, struct mft_inode, node); if (is_mft_data && (mi_enum_attr(ni, mi, NULL) || vbo <= ((u64)mi->rno << sbi->record_bits))) { /* We can't accept this record 'cause MFT's bootstrapping. */ continue; } if (is_mft && mi_find_attr(ni, mi, NULL, ATTR_DATA, NULL, 0, NULL)) { /* * This child record already has a ATTR_DATA. * So it can't accept any other records. */ continue; } if ((type != ATTR_NAME || name_len) && mi_find_attr(ni, mi, NULL, type, name, name_len, NULL)) { /* Only indexed attributes can share same record. */ continue; } /* * Do not try to insert this attribute * if there is no room in record. */ if (le32_to_cpu(mi->mrec->used) + asize > sbi->record_size) continue; /* Try to insert attribute into this subrecord. */ attr = ni_ins_new_attr(ni, mi, le, type, name, name_len, asize, name_off, svcn, ins_le); if (!attr) continue; if (IS_ERR(attr)) return PTR_ERR(attr); if (ins_attr) *ins_attr = attr; if (ins_mi) *ins_mi = mi; return 0; } insert_ext: /* We have to allocate a new child subrecord. */ err = ntfs_look_free_mft(sbi, &rno, is_mft_data, ni, &mi); if (err) goto out; if (is_mft_data && vbo <= ((u64)rno << sbi->record_bits)) { err = -EINVAL; goto out1; } attr = ni_ins_new_attr(ni, mi, le, type, name, name_len, asize, name_off, svcn, ins_le); if (!attr) { err = -EINVAL; goto out2; } if (IS_ERR(attr)) { err = PTR_ERR(attr); goto out2; } if (ins_attr) *ins_attr = attr; if (ins_mi) *ins_mi = mi; return 0; out2: ni_remove_mi(ni, mi); out1: mi_put(mi); ntfs_mark_rec_free(sbi, rno, is_mft); out: return err; } /* * ni_insert_attr - Insert an attribute into the file. * * If the primary record has room, it will just insert the attribute. * If not, it may make the attribute external. * For $MFT::Data it may make room for the attribute by * making other attributes external. * * NOTE: * The ATTR_LIST and ATTR_STD cannot be made external. * This function does not fill new attribute full. * It only fills 'size'/'type'/'id'/'name_len' fields. */ static int ni_insert_attr(struct ntfs_inode *ni, enum ATTR_TYPE type, const __le16 *name, u8 name_len, u32 asize, u16 name_off, CLST svcn, struct ATTRIB **ins_attr, struct mft_inode **ins_mi, struct ATTR_LIST_ENTRY **ins_le) { struct ntfs_sb_info *sbi = ni->mi.sbi; int err; struct ATTRIB *attr, *eattr; struct MFT_REC *rec; bool is_mft; struct ATTR_LIST_ENTRY *le; u32 list_reserve, max_free, free, used, t32; __le16 id; u16 t16; is_mft = ni->mi.rno == MFT_REC_MFT; rec = ni->mi.mrec; list_reserve = SIZEOF_NONRESIDENT + 3 * (1 + 2 * sizeof(u32)); used = le32_to_cpu(rec->used); free = sbi->record_size - used; if (is_mft && type != ATTR_LIST) { /* Reserve space for the ATTRIB list. */ if (free < list_reserve) free = 0; else free -= list_reserve; } if (asize <= free) { attr = ni_ins_new_attr(ni, &ni->mi, NULL, type, name, name_len, asize, name_off, svcn, ins_le); if (IS_ERR(attr)) { err = PTR_ERR(attr); goto out; } if (attr) { if (ins_attr) *ins_attr = attr; if (ins_mi) *ins_mi = &ni->mi; err = 0; goto out; } } if (!is_mft || type != ATTR_DATA || svcn) { /* This ATTRIB will be external. */ err = ni_ins_attr_ext(ni, NULL, type, name, name_len, asize, svcn, name_off, false, ins_attr, ins_mi, ins_le); goto out; } /* * Here we have: "is_mft && type == ATTR_DATA && !svcn" * * The first chunk of the $MFT::Data ATTRIB must be the base record. * Evict as many other attributes as possible. */ max_free = free; /* Estimate the result of moving all possible attributes away. */ attr = NULL; while ((attr = mi_enum_attr(ni, &ni->mi, attr))) { if (attr->type == ATTR_STD) continue; if (attr->type == ATTR_LIST) continue; max_free += le32_to_cpu(attr->size); } if (max_free < asize + list_reserve) { /* Impossible to insert this attribute into primary record. */ err = -EINVAL; goto out; } /* Start real attribute moving. */ attr = NULL; for (;;) { attr = mi_enum_attr(ni, &ni->mi, attr); if (!attr) { /* We should never be here 'cause we have already check this case. */ err = -EINVAL; goto out; } /* Skip attributes that MUST be primary record. */ if (attr->type == ATTR_STD || attr->type == ATTR_LIST) continue; le = NULL; if (ni->attr_list.size) { le = al_find_le(ni, NULL, attr); if (!le) { /* Really this is a serious bug. */ err = -EINVAL; goto out; } } t32 = le32_to_cpu(attr->size); t16 = le16_to_cpu(attr->name_off); err = ni_ins_attr_ext(ni, le, attr->type, Add2Ptr(attr, t16), attr->name_len, t32, attr_svcn(attr), t16, false, &eattr, NULL, NULL); if (err) return err; id = eattr->id; memcpy(eattr, attr, t32); eattr->id = id; /* Remove from primary record. */ mi_remove_attr(NULL, &ni->mi, attr); /* attr now points to next attribute. */ if (attr->type == ATTR_END) goto out; } while (asize + list_reserve > sbi->record_size - le32_to_cpu(rec->used)) ; attr = ni_ins_new_attr(ni, &ni->mi, NULL, type, name, name_len, asize, name_off, svcn, ins_le); if (!attr) { err = -EINVAL; goto out; } if (IS_ERR(attr)) { err = PTR_ERR(attr); goto out; } if (ins_attr) *ins_attr = attr; if (ins_mi) *ins_mi = &ni->mi; out: return err; } /* ni_expand_mft_list - Split ATTR_DATA of $MFT. */ static int ni_expand_mft_list(struct ntfs_inode *ni) { int err = 0; struct runs_tree *run = &ni->file.run; u32 asize, run_size, done = 0; struct ATTRIB *attr; struct rb_node *node; CLST mft_min, mft_new, svcn, evcn, plen; struct mft_inode *mi, *mi_min, *mi_new; struct ntfs_sb_info *sbi = ni->mi.sbi; /* Find the nearest MFT. */ mft_min = 0; mft_new = 0; mi_min = NULL; for (node = rb_first(&ni->mi_tree); node; node = rb_next(node)) { mi = rb_entry(node, struct mft_inode, node); attr = mi_enum_attr(ni, mi, NULL); if (!attr) { mft_min = mi->rno; mi_min = mi; break; } } if (ntfs_look_free_mft(sbi, &mft_new, true, ni, &mi_new)) { mft_new = 0; /* Really this is not critical. */ } else if (mft_min > mft_new) { mft_min = mft_new; mi_min = mi_new; } else { ntfs_mark_rec_free(sbi, mft_new, true); mft_new = 0; ni_remove_mi(ni, mi_new); } attr = mi_find_attr(ni, &ni->mi, NULL, ATTR_DATA, NULL, 0, NULL); if (!attr) { err = -EINVAL; goto out; } asize = le32_to_cpu(attr->size); evcn = le64_to_cpu(attr->nres.evcn); svcn = bytes_to_cluster(sbi, (u64)(mft_min + 1) << sbi->record_bits); if (evcn + 1 >= svcn) { err = -EINVAL; goto out; } /* * Split primary attribute [0 evcn] in two parts [0 svcn) + [svcn evcn]. * * Update first part of ATTR_DATA in 'primary MFT. */ err = run_pack(run, 0, svcn, Add2Ptr(attr, SIZEOF_NONRESIDENT), asize - SIZEOF_NONRESIDENT, &plen); if (err < 0) goto out; run_size = ALIGN(err, 8); err = 0; if (plen < svcn) { err = -EINVAL; goto out; } attr->nres.evcn = cpu_to_le64(svcn - 1); attr->size = cpu_to_le32(run_size + SIZEOF_NONRESIDENT); /* 'done' - How many bytes of primary MFT becomes free. */ done = asize - run_size - SIZEOF_NONRESIDENT; le32_sub_cpu(&ni->mi.mrec->used, done); /* Estimate packed size (run_buf=NULL). */ err = run_pack(run, svcn, evcn + 1 - svcn, NULL, sbi->record_size, &plen); if (err < 0) goto out; run_size = ALIGN(err, 8); err = 0; if (plen < evcn + 1 - svcn) { err = -EINVAL; goto out; } /* * This function may implicitly call expand attr_list. * Insert second part of ATTR_DATA in 'mi_min'. */ attr = ni_ins_new_attr(ni, mi_min, NULL, ATTR_DATA, NULL, 0, SIZEOF_NONRESIDENT + run_size, SIZEOF_NONRESIDENT, svcn, NULL); if (!attr) { err = -EINVAL; goto out; } if (IS_ERR(attr)) { err = PTR_ERR(attr); goto out; } attr->non_res = 1; attr->name_off = SIZEOF_NONRESIDENT_LE; attr->flags = 0; /* This function can't fail - cause already checked above. */ run_pack(run, svcn, evcn + 1 - svcn, Add2Ptr(attr, SIZEOF_NONRESIDENT), run_size, &plen); attr->nres.svcn = cpu_to_le64(svcn); attr->nres.evcn = cpu_to_le64(evcn); attr->nres.run_off = cpu_to_le16(SIZEOF_NONRESIDENT); out: if (mft_new) { ntfs_mark_rec_free(sbi, mft_new, true); ni_remove_mi(ni, mi_new); } return !err && !done ? -EOPNOTSUPP : err; } /* * ni_expand_list - Move all possible attributes out of primary record. */ int ni_expand_list(struct ntfs_inode *ni) { int err = 0; u32 asize, done = 0; struct ATTRIB *attr, *ins_attr; struct ATTR_LIST_ENTRY *le; bool is_mft = ni->mi.rno == MFT_REC_MFT; struct MFT_REF ref; mi_get_ref(&ni->mi, &ref); le = NULL; while ((le = al_enumerate(ni, le))) { if (le->type == ATTR_STD) continue; if (memcmp(&ref, &le->ref, sizeof(struct MFT_REF))) continue; if (is_mft && le->type == ATTR_DATA) continue; /* Find attribute in primary record. */ attr = rec_find_attr_le(ni, &ni->mi, le); if (!attr) { err = -EINVAL; goto out; } asize = le32_to_cpu(attr->size); /* Always insert into new record to avoid collisions (deep recursive). */ err = ni_ins_attr_ext(ni, le, attr->type, attr_name(attr), attr->name_len, asize, attr_svcn(attr), le16_to_cpu(attr->name_off), true, &ins_attr, NULL, NULL); if (err) goto out; memcpy(ins_attr, attr, asize); ins_attr->id = le->id; /* Remove from primary record. */ mi_remove_attr(NULL, &ni->mi, attr); done += asize; goto out; } if (!is_mft) { err = -EFBIG; /* Attr list is too big(?) */ goto out; } /* Split MFT data as much as possible. */ err = ni_expand_mft_list(ni); out: return !err && !done ? -EOPNOTSUPP : err; } /* * ni_insert_nonresident - Insert new nonresident attribute. */ int ni_insert_nonresident(struct ntfs_inode *ni, enum ATTR_TYPE type, const __le16 *name, u8 name_len, const struct runs_tree *run, CLST svcn, CLST len, __le16 flags, struct ATTRIB **new_attr, struct mft_inode **mi, struct ATTR_LIST_ENTRY **le) { int err; CLST plen; struct ATTRIB *attr; bool is_ext = (flags & (ATTR_FLAG_SPARSED | ATTR_FLAG_COMPRESSED)) && !svcn; u32 name_size = ALIGN(name_len * sizeof(short), 8); u32 name_off = is_ext ? SIZEOF_NONRESIDENT_EX : SIZEOF_NONRESIDENT; u32 run_off = name_off + name_size; u32 run_size, asize; struct ntfs_sb_info *sbi = ni->mi.sbi; /* Estimate packed size (run_buf=NULL). */ err = run_pack(run, svcn, len, NULL, sbi->max_bytes_per_attr - run_off, &plen); if (err < 0) goto out; run_size = ALIGN(err, 8); if (plen < len) { err = -EINVAL; goto out; } asize = run_off + run_size; if (asize > sbi->max_bytes_per_attr) { err = -EINVAL; goto out; } err = ni_insert_attr(ni, type, name, name_len, asize, name_off, svcn, &attr, mi, le); if (err) goto out; attr->non_res = 1; attr->name_off = cpu_to_le16(name_off); attr->flags = flags; /* This function can't fail - cause already checked above. */ run_pack(run, svcn, len, Add2Ptr(attr, run_off), run_size, &plen); attr->nres.svcn = cpu_to_le64(svcn); attr->nres.evcn = cpu_to_le64((u64)svcn + len - 1); if (new_attr) *new_attr = attr; *(__le64 *)&attr->nres.run_off = cpu_to_le64(run_off); attr->nres.alloc_size = svcn ? 0 : cpu_to_le64((u64)len << ni->mi.sbi->cluster_bits); attr->nres.data_size = attr->nres.alloc_size; attr->nres.valid_size = attr->nres.alloc_size; if (is_ext) { if (flags & ATTR_FLAG_COMPRESSED) attr->nres.c_unit = NTFS_LZNT_CUNIT; attr->nres.total_size = attr->nres.alloc_size; } out: return err; } /* * ni_insert_resident - Inserts new resident attribute. */ int ni_insert_resident(struct ntfs_inode *ni, u32 data_size, enum ATTR_TYPE type, const __le16 *name, u8 name_len, struct ATTRIB **new_attr, struct mft_inode **mi, struct ATTR_LIST_ENTRY **le) { int err; u32 name_size = ALIGN(name_len * sizeof(short), 8); u32 asize = SIZEOF_RESIDENT + name_size + ALIGN(data_size, 8); struct ATTRIB *attr; err = ni_insert_attr(ni, type, name, name_len, asize, SIZEOF_RESIDENT, 0, &attr, mi, le); if (err) return err; attr->non_res = 0; attr->flags = 0; attr->res.data_size = cpu_to_le32(data_size); attr->res.data_off = cpu_to_le16(SIZEOF_RESIDENT + name_size); if (type == ATTR_NAME) { attr->res.flags = RESIDENT_FLAG_INDEXED; /* is_attr_indexed(attr)) == true */ le16_add_cpu(&ni->mi.mrec->hard_links, 1); ni->mi.dirty = true; } attr->res.res = 0; if (new_attr) *new_attr = attr; return 0; } /* * ni_remove_attr_le - Remove attribute from record. */ void ni_remove_attr_le(struct ntfs_inode *ni, struct ATTRIB *attr, struct mft_inode *mi, struct ATTR_LIST_ENTRY *le) { mi_remove_attr(ni, mi, attr); if (le) al_remove_le(ni, le); } /* * ni_delete_all - Remove all attributes and frees allocates space. * * ntfs_evict_inode->ntfs_clear_inode->ni_delete_all (if no links). */ int ni_delete_all(struct ntfs_inode *ni) { int err; struct ATTR_LIST_ENTRY *le = NULL; struct ATTRIB *attr = NULL; struct rb_node *node; u16 roff; u32 asize; CLST svcn, evcn; struct ntfs_sb_info *sbi = ni->mi.sbi; bool nt3 = is_ntfs3(sbi); struct MFT_REF ref; while ((attr = ni_enum_attr_ex(ni, attr, &le, NULL))) { if (!nt3 || attr->name_len) { ; } else if (attr->type == ATTR_REPARSE) { mi_get_ref(&ni->mi, &ref); ntfs_remove_reparse(sbi, 0, &ref); } else if (attr->type == ATTR_ID && !attr->non_res && le32_to_cpu(attr->res.data_size) >= sizeof(struct GUID)) { ntfs_objid_remove(sbi, resident_data(attr)); } if (!attr->non_res) continue; svcn = le64_to_cpu(attr->nres.svcn); evcn = le64_to_cpu(attr->nres.evcn); if (evcn + 1 <= svcn) continue; asize = le32_to_cpu(attr->size); roff = le16_to_cpu(attr->nres.run_off); if (roff > asize) { /* ni_enum_attr_ex checks this case. */ continue; } /* run==1 means unpack and deallocate. */ run_unpack_ex(RUN_DEALLOCATE, sbi, ni->mi.rno, svcn, evcn, svcn, Add2Ptr(attr, roff), asize - roff); } if (ni->attr_list.size) { run_deallocate(ni->mi.sbi, &ni->attr_list.run, true); al_destroy(ni); } /* Free all subrecords. */ for (node = rb_first(&ni->mi_tree); node;) { struct rb_node *next = rb_next(node); struct mft_inode *mi = rb_entry(node, struct mft_inode, node); clear_rec_inuse(mi->mrec); mi->dirty = true; mi_write(mi, 0); ntfs_mark_rec_free(sbi, mi->rno, false); ni_remove_mi(ni, mi); mi_put(mi); node = next; } /* Free base record. */ clear_rec_inuse(ni->mi.mrec); ni->mi.dirty = true; err = mi_write(&ni->mi, 0); ntfs_mark_rec_free(sbi, ni->mi.rno, false); return err; } /* ni_fname_name * * Return: File name attribute by its value. */ struct ATTR_FILE_NAME *ni_fname_name(struct ntfs_inode *ni, const struct le_str *uni, const struct MFT_REF *home_dir, struct mft_inode **mi, struct ATTR_LIST_ENTRY **le) { struct ATTRIB *attr = NULL; struct ATTR_FILE_NAME *fname; if (le) *le = NULL; /* Enumerate all names. */ next: attr = ni_find_attr(ni, attr, le, ATTR_NAME, NULL, 0, NULL, mi); if (!attr) return NULL; fname = resident_data_ex(attr, SIZEOF_ATTRIBUTE_FILENAME); if (!fname) goto next; if (home_dir && memcmp(home_dir, &fname->home, sizeof(*home_dir))) goto next; if (!uni) return fname; if (uni->len != fname->name_len) goto next; if (ntfs_cmp_names(uni->name, uni->len, fname->name, uni->len, NULL, false)) goto next; return fname; } /* * ni_fname_type * * Return: File name attribute with given type. */ struct ATTR_FILE_NAME *ni_fname_type(struct ntfs_inode *ni, u8 name_type, struct mft_inode **mi, struct ATTR_LIST_ENTRY **le) { struct ATTRIB *attr = NULL; struct ATTR_FILE_NAME *fname; *le = NULL; if (name_type == FILE_NAME_POSIX) return NULL; /* Enumerate all names. */ for (;;) { attr = ni_find_attr(ni, attr, le, ATTR_NAME, NULL, 0, NULL, mi); if (!attr) return NULL; fname = resident_data_ex(attr, SIZEOF_ATTRIBUTE_FILENAME); if (fname && name_type == fname->type) return fname; } } /* * ni_new_attr_flags * * Process compressed/sparsed in special way. * NOTE: You need to set ni->std_fa = new_fa * after this function to keep internal structures in consistency. */ int ni_new_attr_flags(struct ntfs_inode *ni, enum FILE_ATTRIBUTE new_fa) { struct ATTRIB *attr; struct mft_inode *mi; __le16 new_aflags; u32 new_asize; attr = ni_find_attr(ni, NULL, NULL, ATTR_DATA, NULL, 0, NULL, &mi); if (!attr) return -EINVAL; new_aflags = attr->flags; if (new_fa & FILE_ATTRIBUTE_SPARSE_FILE) new_aflags |= ATTR_FLAG_SPARSED; else new_aflags &= ~ATTR_FLAG_SPARSED; if (new_fa & FILE_ATTRIBUTE_COMPRESSED) new_aflags |= ATTR_FLAG_COMPRESSED; else new_aflags &= ~ATTR_FLAG_COMPRESSED; if (new_aflags == attr->flags) return 0; if ((new_aflags & (ATTR_FLAG_COMPRESSED | ATTR_FLAG_SPARSED)) == (ATTR_FLAG_COMPRESSED | ATTR_FLAG_SPARSED)) { ntfs_inode_warn(&ni->vfs_inode, "file can't be sparsed and compressed"); return -EOPNOTSUPP; } if (!attr->non_res) goto out; if (attr->nres.data_size) { ntfs_inode_warn( &ni->vfs_inode, "one can change sparsed/compressed only for empty files"); return -EOPNOTSUPP; } /* Resize nonresident empty attribute in-place only. */ new_asize = (new_aflags & (ATTR_FLAG_COMPRESSED | ATTR_FLAG_SPARSED)) ? (SIZEOF_NONRESIDENT_EX + 8) : (SIZEOF_NONRESIDENT + 8); if (!mi_resize_attr(mi, attr, new_asize - le32_to_cpu(attr->size))) return -EOPNOTSUPP; if (new_aflags & ATTR_FLAG_SPARSED) { attr->name_off = SIZEOF_NONRESIDENT_EX_LE; /* Windows uses 16 clusters per frame but supports one cluster per frame too. */ attr->nres.c_unit = 0; ni->vfs_inode.i_mapping->a_ops = &ntfs_aops; } else if (new_aflags & ATTR_FLAG_COMPRESSED) { attr->name_off = SIZEOF_NONRESIDENT_EX_LE; /* The only allowed: 16 clusters per frame. */ attr->nres.c_unit = NTFS_LZNT_CUNIT; ni->vfs_inode.i_mapping->a_ops = &ntfs_aops_cmpr; } else { attr->name_off = SIZEOF_NONRESIDENT_LE; /* Normal files. */ attr->nres.c_unit = 0; ni->vfs_inode.i_mapping->a_ops = &ntfs_aops; } attr->nres.run_off = attr->name_off; out: attr->flags = new_aflags; mi->dirty = true; return 0; } /* * ni_parse_reparse * * buffer - memory for reparse buffer header */ enum REPARSE_SIGN ni_parse_reparse(struct ntfs_inode *ni, struct ATTRIB *attr, struct REPARSE_DATA_BUFFER *buffer) { const struct REPARSE_DATA_BUFFER *rp = NULL; u8 bits; u16 len; typeof(rp->CompressReparseBuffer) *cmpr; /* Try to estimate reparse point. */ if (!attr->non_res) { rp = resident_data_ex(attr, sizeof(struct REPARSE_DATA_BUFFER)); } else if (le64_to_cpu(attr->nres.data_size) >= sizeof(struct REPARSE_DATA_BUFFER)) { struct runs_tree run; run_init(&run); if (!attr_load_runs_vcn(ni, ATTR_REPARSE, NULL, 0, &run, 0) && !ntfs_read_run_nb(ni->mi.sbi, &run, 0, buffer, sizeof(struct REPARSE_DATA_BUFFER), NULL)) { rp = buffer; } run_close(&run); } if (!rp) return REPARSE_NONE; len = le16_to_cpu(rp->ReparseDataLength); switch (rp->ReparseTag) { case (IO_REPARSE_TAG_MICROSOFT | IO_REPARSE_TAG_SYMBOLIC_LINK): break; /* Symbolic link. */ case IO_REPARSE_TAG_MOUNT_POINT: break; /* Mount points and junctions. */ case IO_REPARSE_TAG_SYMLINK: break; case IO_REPARSE_TAG_COMPRESS: /* * WOF - Windows Overlay Filter - Used to compress files with * LZX/Xpress. * * Unlike native NTFS file compression, the Windows * Overlay Filter supports only read operations. This means * that it doesn't need to sector-align each compressed chunk, * so the compressed data can be packed more tightly together. * If you open the file for writing, the WOF just decompresses * the entire file, turning it back into a plain file. * * Ntfs3 driver decompresses the entire file only on write or * change size requests. */ cmpr = &rp->CompressReparseBuffer; if (len < sizeof(*cmpr) || cmpr->WofVersion != WOF_CURRENT_VERSION || cmpr->WofProvider != WOF_PROVIDER_SYSTEM || cmpr->ProviderVer != WOF_PROVIDER_CURRENT_VERSION) { return REPARSE_NONE; } switch (cmpr->CompressionFormat) { case WOF_COMPRESSION_XPRESS4K: bits = 0xc; // 4k break; case WOF_COMPRESSION_XPRESS8K: bits = 0xd; // 8k break; case WOF_COMPRESSION_XPRESS16K: bits = 0xe; // 16k break; case WOF_COMPRESSION_LZX32K: bits = 0xf; // 32k break; default: bits = 0x10; // 64k break; } ni_set_ext_compress_bits(ni, bits); return REPARSE_COMPRESSED; case IO_REPARSE_TAG_DEDUP: ni->ni_flags |= NI_FLAG_DEDUPLICATED; return REPARSE_DEDUPLICATED; default: if (rp->ReparseTag & IO_REPARSE_TAG_NAME_SURROGATE) break; return REPARSE_NONE; } if (buffer != rp) memcpy(buffer, rp, sizeof(struct REPARSE_DATA_BUFFER)); /* Looks like normal symlink. */ return REPARSE_LINK; } static struct folio *ntfs_lock_new_page(struct address_space *mapping, pgoff_t index, gfp_t gfp) { struct folio *folio = __filemap_get_folio(mapping, index, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); if (IS_ERR(folio)) return folio; if (!folio_test_uptodate(folio)) { struct page *page = folio_file_page(folio, index); if (IS_ERR(page)) return ERR_CAST(page); return page_folio(page); } /* Use a temporary page to avoid data corruption */ folio_unlock(folio); folio_put(folio); folio = folio_alloc(gfp, 0); if (!folio) return ERR_PTR(-ENOMEM); __folio_set_locked(folio); return folio; } /* * ni_read_folio_cmpr * * When decompressing, we typically obtain more than one page per reference. * We inject the additional pages into the page cache. */ int ni_read_folio_cmpr(struct ntfs_inode *ni, struct folio *folio) { int err; struct ntfs_sb_info *sbi = ni->mi.sbi; struct address_space *mapping = folio->mapping; pgoff_t index; u64 frame_vbo, vbo = folio_pos(folio); struct page **pages = NULL; /* Array of at most 16 pages. stack? */ u8 frame_bits; CLST frame; u32 i, idx, frame_size, pages_per_frame; gfp_t gfp_mask; struct page *pg; struct folio *f; if (vbo >= i_size_read(&ni->vfs_inode)) { folio_zero_range(folio, 0, folio_size(folio)); folio_mark_uptodate(folio); err = 0; goto out; } if (ni->ni_flags & NI_FLAG_COMPRESSED_MASK) { /* Xpress or LZX. */ frame_bits = ni_ext_compress_bits(ni); } else { /* LZNT compression. */ frame_bits = NTFS_LZNT_CUNIT + sbi->cluster_bits; } frame_size = 1u << frame_bits; frame = vbo >> frame_bits; frame_vbo = (u64)frame << frame_bits; idx = (vbo - frame_vbo) >> PAGE_SHIFT; pages_per_frame = frame_size >> PAGE_SHIFT; pages = kzalloc_objs(struct page *, pages_per_frame, GFP_NOFS); if (!pages) { err = -ENOMEM; goto out; } pages[idx] = &folio->page; index = frame_vbo >> PAGE_SHIFT; gfp_mask = mapping_gfp_mask(mapping); for (i = 0; i < pages_per_frame; i++, index++) { if (i == idx) continue; f = ntfs_lock_new_page(mapping, index, gfp_mask); if (IS_ERR(f)) { err = PTR_ERR(f); goto out1; } pages[i] = &f->page; } ni_lock(ni); err = ni_read_frame(ni, frame_vbo, pages, pages_per_frame, 0); ni_unlock(ni); out1: for (i = 0; i < pages_per_frame; i++) { pg = pages[i]; if (i == idx || !pg) continue; unlock_page(pg); put_page(pg); } out: /* At this point, err contains 0 or -EIO depending on the "critical" page. */ kfree(pages); folio_unlock(folio); return err; } #ifdef CONFIG_NTFS3_LZX_XPRESS /* * ni_decompress_file - Decompress LZX/Xpress compressed file. * * Remove ATTR_DATA::WofCompressedData. * Remove ATTR_REPARSE. */ int ni_decompress_file(struct ntfs_inode *ni) { struct ntfs_sb_info *sbi = ni->mi.sbi; struct inode *inode = &ni->vfs_inode; loff_t i_size = i_size_read(inode); struct address_space *mapping = inode->i_mapping; gfp_t gfp_mask = mapping_gfp_mask(mapping); struct page **pages = NULL; struct ATTR_LIST_ENTRY *le; struct ATTRIB *attr; CLST vcn, cend, lcn, clen, end; pgoff_t index; u64 vbo; u8 frame_bits; u32 i, frame_size, pages_per_frame, bytes; struct mft_inode *mi; int err; /* Clusters for decompressed data. */ cend = bytes_to_cluster(sbi, i_size); if (!i_size) goto remove_wof; /* Check in advance. */ if (cend > wnd_zeroes(&sbi->used.bitmap)) { err = -ENOSPC; goto out; } frame_bits = ni_ext_compress_bits(ni); frame_size = 1u << frame_bits; pages_per_frame = frame_size >> PAGE_SHIFT; pages = kzalloc_objs(struct page *, pages_per_frame, GFP_NOFS); if (!pages) { err = -ENOMEM; goto out; } /* * Step 1: Decompress data and copy to new allocated clusters. */ index = 0; for (vbo = 0; vbo < i_size; vbo += bytes) { bool new; bytes = vbo + frame_size > i_size ? (i_size - vbo) : frame_size; end = bytes_to_cluster(sbi, vbo + bytes); for (vcn = vbo >> sbi->cluster_bits; vcn < end; vcn += clen) { err = attr_data_get_block(ni, vcn, cend - vcn, &lcn, &clen, &new, false, NULL, false); if (err) goto out; } for (i = 0; i < pages_per_frame; i++, index++) { struct folio *f; f = ntfs_lock_new_page(mapping, index, gfp_mask); if (IS_ERR(f)) { while (i--) { unlock_page(pages[i]); put_page(pages[i]); } err = PTR_ERR(f); goto out; } pages[i] = &f->page; } err = ni_read_frame(ni, vbo, pages, pages_per_frame, 1); for (i = 0; i < pages_per_frame; i++) { unlock_page(pages[i]); put_page(pages[i]); } if (err) goto out; cond_resched(); } remove_wof: /* * Step 2: Deallocate attributes ATTR_DATA::WofCompressedData * and ATTR_REPARSE. */ attr = NULL; le = NULL; while ((attr = ni_enum_attr_ex(ni, attr, &le, NULL))) { CLST svcn, evcn; u32 asize, roff; if (attr->type == ATTR_REPARSE) { struct MFT_REF ref; mi_get_ref(&ni->mi, &ref); ntfs_remove_reparse(sbi, 0, &ref); } if (!attr->non_res) continue; if (attr->type != ATTR_REPARSE && (attr->type != ATTR_DATA || attr->name_len != ARRAY_SIZE(WOF_NAME) || memcmp(attr_name(attr), WOF_NAME, sizeof(WOF_NAME)))) continue; svcn = le64_to_cpu(attr->nres.svcn); evcn = le64_to_cpu(attr->nres.evcn); if (evcn + 1 <= svcn) continue; asize = le32_to_cpu(attr->size); roff = le16_to_cpu(attr->nres.run_off); if (roff > asize) { err = -EINVAL; goto out; } /*run==1 Means unpack and deallocate. */ run_unpack_ex(RUN_DEALLOCATE, sbi, ni->mi.rno, svcn, evcn, svcn, Add2Ptr(attr, roff), asize - roff); } /* * Step 3: Remove attribute ATTR_DATA::WofCompressedData. */ err = ni_remove_attr(ni, ATTR_DATA, WOF_NAME, ARRAY_SIZE(WOF_NAME), false, NULL); if (err) goto out; /* * Step 4: Remove ATTR_REPARSE. */ err = ni_remove_attr(ni, ATTR_REPARSE, NULL, 0, false, NULL); if (err) goto out; /* * Step 5: Remove sparse flag from data attribute. */ attr = ni_find_attr(ni, NULL, NULL, ATTR_DATA, NULL, 0, NULL, &mi); if (!attr) { err = -EINVAL; goto out; } if (attr->non_res && is_attr_sparsed(attr)) { /* Sparsed attribute header is 8 bytes bigger than normal. */ struct MFT_REC *rec = mi->mrec; u32 used = le32_to_cpu(rec->used); u32 asize = le32_to_cpu(attr->size); u16 roff = le16_to_cpu(attr->nres.run_off); char *rbuf = Add2Ptr(attr, roff); memmove(rbuf - 8, rbuf, used - PtrOffset(rec, rbuf)); attr->size = cpu_to_le32(asize - 8); attr->flags &= ~ATTR_FLAG_SPARSED; attr->nres.run_off = cpu_to_le16(roff - 8); attr->nres.c_unit = 0; rec->used = cpu_to_le32(used - 8); mi->dirty = true; ni->std_fa &= ~(FILE_ATTRIBUTE_SPARSE_FILE | FILE_ATTRIBUTE_REPARSE_POINT); mark_inode_dirty(inode); } /* Clear cached flag. */ ni->ni_flags &= ~NI_FLAG_COMPRESSED_MASK; if (ni->file.offs_folio) { folio_put(ni->file.offs_folio); ni->file.offs_folio = NULL; } mapping->a_ops = &ntfs_aops; out: kfree(pages); if (err) _ntfs_bad_inode(inode); return err; } /* * decompress_lzx_xpress - External compression LZX/Xpress. */ static int decompress_lzx_xpress(struct ntfs_sb_info *sbi, const char *cmpr, size_t cmpr_size, void *unc, size_t unc_size, u32 frame_size) { int err; void *ctx; if (cmpr_size == unc_size) { /* Frame not compressed. */ memcpy(unc, cmpr, unc_size); return 0; } err = 0; if (frame_size == 0x8000) { mutex_lock(&sbi->compress.mtx_lzx); /* LZX: Frame compressed. */ ctx = sbi->compress.lzx; if (!ctx) { /* Lazy initialize LZX decompress context. */ ctx = lzx_allocate_decompressor(); if (!ctx) { err = -ENOMEM; goto out1; } sbi->compress.lzx = ctx; } if (lzx_decompress(ctx, cmpr, cmpr_size, unc, unc_size)) { /* Treat all errors as "invalid argument". */ err = -EINVAL; } out1: mutex_unlock(&sbi->compress.mtx_lzx); } else { /* XPRESS: Frame compressed. */ mutex_lock(&sbi->compress.mtx_xpress); ctx = sbi->compress.xpress; if (!ctx) { /* Lazy initialize Xpress decompress context. */ ctx = xpress_allocate_decompressor(); if (!ctx) { err = -ENOMEM; goto out2; } sbi->compress.xpress = ctx; } if (xpress_decompress(ctx, cmpr, cmpr_size, unc, unc_size)) { /* Treat all errors as "invalid argument". */ err = -EINVAL; } out2: mutex_unlock(&sbi->compress.mtx_xpress); } return err; } #endif /* * ni_read_frame * * Pages - Array of locked pages. */ int ni_read_frame(struct ntfs_inode *ni, u64 frame_vbo, struct page **pages, u32 pages_per_frame, int copy) { int err; struct ntfs_sb_info *sbi = ni->mi.sbi; u8 cluster_bits = sbi->cluster_bits; char *frame_ondisk = NULL; char *frame_mem = NULL; struct ATTR_LIST_ENTRY *le = NULL; struct runs_tree *run = &ni->file.run; u64 valid_size = ni->i_valid; u64 vbo_disk; size_t unc_size = 0; u32 frame_size, i, ondisk_size; struct page *pg; struct ATTRIB *attr; CLST frame, clst_data; /* * To simplify decompress algorithm do vmap for source * and target pages. */ frame_size = pages_per_frame << PAGE_SHIFT; frame_mem = vmap(pages, pages_per_frame, VM_MAP, PAGE_KERNEL); if (!frame_mem) { err = -ENOMEM; goto out; } attr = ni_find_attr(ni, NULL, &le, ATTR_DATA, NULL, 0, NULL, NULL); if (!attr) { err = -ENOENT; goto out1; } if (!attr->non_res) { u32 data_size = le32_to_cpu(attr->res.data_size); memset(frame_mem, 0, frame_size); if (frame_vbo < data_size) { ondisk_size = data_size - frame_vbo; memcpy(frame_mem, resident_data(attr) + frame_vbo, min(ondisk_size, frame_size)); } err = 0; goto out1; } if (frame_vbo >= valid_size) { memset(frame_mem, 0, frame_size); err = 0; goto out1; } if (ni->ni_flags & NI_FLAG_COMPRESSED_MASK) { #ifndef CONFIG_NTFS3_LZX_XPRESS err = -EOPNOTSUPP; goto out1; #else loff_t i_size = i_size_read(&ni->vfs_inode); u32 frame_bits = ni_ext_compress_bits(ni); u64 frame64 = frame_vbo >> frame_bits; u64 frames, vbo_data; if (frame_size != (1u << frame_bits)) { err = -EINVAL; goto out1; } switch (frame_size) { case 0x1000: case 0x2000: case 0x4000: case 0x8000: break; default: /* Unknown compression. */ err = -EOPNOTSUPP; goto out1; } attr = ni_find_attr(ni, attr, &le, ATTR_DATA, WOF_NAME, ARRAY_SIZE(WOF_NAME), NULL, NULL); if (!attr) { ntfs_inode_err( &ni->vfs_inode, "external compressed file should contains data attribute \"WofCompressedData\""); err = -EINVAL; goto out1; } if (!attr->non_res) { run = NULL; } else { run = run_alloc(); if (!run) { err = -ENOMEM; goto out1; } } frames = (i_size - 1) >> frame_bits; err = attr_wof_frame_info(ni, attr, run, frame64, frames, frame_bits, &ondisk_size, &vbo_data); if (err) goto out1; if (frame64 == frames) { unc_size = 1 + ((i_size - 1) & (frame_size - 1)); ondisk_size = attr_size(attr) - vbo_data; } else { unc_size = frame_size; } if (ondisk_size > frame_size) { err = -EINVAL; goto out1; } if (!attr->non_res) { if (vbo_data + ondisk_size > le32_to_cpu(attr->res.data_size)) { err = -EINVAL; goto out1; } err = decompress_lzx_xpress( sbi, Add2Ptr(resident_data(attr), vbo_data), ondisk_size, frame_mem, unc_size, frame_size); goto out1; } vbo_disk = vbo_data; /* Load all runs to read [vbo_disk-vbo_to). */ err = attr_load_runs_range(ni, ATTR_DATA, WOF_NAME, ARRAY_SIZE(WOF_NAME), run, vbo_disk, vbo_data + ondisk_size); if (err) goto out1; #endif } else if (is_attr_compressed(attr)) { /* LZNT compression. */ if (sbi->cluster_size > NTFS_LZNT_MAX_CLUSTER) { err = -EOPNOTSUPP; goto out1; } if (attr->nres.c_unit != NTFS_LZNT_CUNIT) { err = -EOPNOTSUPP; goto out1; } down_write(&ni->file.run_lock); run_truncate_around(run, le64_to_cpu(attr->nres.svcn)); frame = frame_vbo >> (cluster_bits + NTFS_LZNT_CUNIT); err = attr_is_frame_compressed(ni, attr, frame, &clst_data, run); up_write(&ni->file.run_lock); if (err) goto out1; if (!clst_data) { memset(frame_mem, 0, frame_size); goto out1; } frame_size = sbi->cluster_size << NTFS_LZNT_CUNIT; ondisk_size = clst_data << cluster_bits; if (clst_data >= NTFS_LZNT_CLUSTERS) { /* Frame is not compressed. */ down_read(&ni->file.run_lock); err = ntfs_read_run(sbi, run, frame_mem, frame_vbo, ondisk_size); up_read(&ni->file.run_lock); goto out1; } vbo_disk = frame_vbo; } else { __builtin_unreachable(); err = -EINVAL; goto out1; } /* Allocate memory to read compressed data to. */ frame_ondisk = kvmalloc(ondisk_size, GFP_KERNEL); if (!frame_ondisk) { err = -ENOMEM; goto out1; } /* Read 'ondisk_size' bytes from disk. */ down_read(&ni->file.run_lock); err = ntfs_read_run(sbi, run, frame_ondisk, vbo_disk, ondisk_size); up_read(&ni->file.run_lock); if (err) goto out2; #ifdef CONFIG_NTFS3_LZX_XPRESS if (run != &ni->file.run) { /* LZX or XPRESS */ err = decompress_lzx_xpress(sbi, frame_ondisk, ondisk_size, frame_mem, unc_size, frame_size); } else #endif { /* LZNT - Native NTFS compression. */ unc_size = decompress_lznt(frame_ondisk, ondisk_size, frame_mem, frame_size); if ((ssize_t)unc_size < 0) err = unc_size; else if (!unc_size || unc_size > frame_size) err = -EINVAL; } if (!err && valid_size < frame_vbo + frame_size) { size_t ok = valid_size - frame_vbo; memset(frame_mem + ok, 0, frame_size - ok); } out2: kvfree(frame_ondisk); out1: #ifdef CONFIG_NTFS3_LZX_XPRESS if (run != &ni->file.run) run_free(run); if (!err && copy) { /* We are called from 'ni_decompress_file' */ /* Copy decompressed LZX or XPRESS data into new place. */ down_read(&ni->file.run_lock); err = ntfs_write_run(sbi, &ni->file.run, frame_mem, frame_vbo, frame_size); up_read(&ni->file.run_lock); } #endif vunmap(frame_mem); out: for (i = 0; i < pages_per_frame; i++) { pg = pages[i]; SetPageUptodate(pg); } return err; } /* * ni_write_frame * * Pages - Array of locked pages. */ int ni_write_frame(struct ntfs_inode *ni, struct page **pages, u32 pages_per_frame) { int err; struct ntfs_sb_info *sbi = ni->mi.sbi; struct folio *folio = page_folio(pages[0]); u8 frame_bits = NTFS_LZNT_CUNIT + sbi->cluster_bits; u32 frame_size = sbi->cluster_size << NTFS_LZNT_CUNIT; u64 frame_vbo = folio_pos(folio); CLST frame = frame_vbo >> frame_bits; char *frame_ondisk = NULL; struct ATTR_LIST_ENTRY *le = NULL; char *frame_mem; struct ATTRIB *attr; struct mft_inode *mi; size_t compr_size, ondisk_size; struct lznt *lznt; attr = ni_find_attr(ni, NULL, &le, ATTR_DATA, NULL, 0, NULL, &mi); if (!attr) { err = -ENOENT; goto out; } if (WARN_ON(!is_attr_compressed(attr))) { err = -EINVAL; goto out; } if (sbi->cluster_size > NTFS_LZNT_MAX_CLUSTER) { err = -EOPNOTSUPP; goto out; } if (!attr->non_res) { down_write(&ni->file.run_lock); err = attr_make_nonresident(ni, attr, le, mi, le32_to_cpu(attr->res.data_size), &ni->file.run, &attr, pages[0]); up_write(&ni->file.run_lock); if (err) goto out; } if (attr->nres.c_unit != NTFS_LZNT_CUNIT) { err = -EOPNOTSUPP; goto out; } /* Allocate memory to write compressed data to. */ frame_ondisk = kvmalloc(frame_size, GFP_KERNEL); if (!frame_ondisk) { err = -ENOMEM; goto out; } /* Map in-memory frame for read-only. */ frame_mem = vmap(pages, pages_per_frame, VM_MAP, PAGE_KERNEL_RO); if (!frame_mem) { err = -ENOMEM; goto out1; } mutex_lock(&sbi->compress.mtx_lznt); lznt = NULL; if (!sbi->compress.lznt) { /* * LZNT implements two levels of compression: * 0 - Standard compression * 1 - Best compression, requires a lot of cpu * use mount option? */ lznt = get_lznt_ctx(0); if (!lznt) { mutex_unlock(&sbi->compress.mtx_lznt); err = -ENOMEM; goto out2; } sbi->compress.lznt = lznt; lznt = NULL; } /* Compress: frame_mem -> frame_ondisk */ compr_size = compress_lznt(frame_mem, frame_size, frame_ondisk, frame_size, sbi->compress.lznt); mutex_unlock(&sbi->compress.mtx_lznt); kfree(lznt); if (compr_size + sbi->cluster_size > frame_size) { /* Frame is not compressed. */ compr_size = frame_size; ondisk_size = frame_size; } else if (compr_size) { /* Frame is compressed. */ ondisk_size = ntfs_up_cluster(sbi, compr_size); memset(frame_ondisk + compr_size, 0, ondisk_size - compr_size); } else { /* Frame is sparsed. */ ondisk_size = 0; } down_write(&ni->file.run_lock); run_truncate_around(&ni->file.run, le64_to_cpu(attr->nres.svcn)); err = attr_allocate_frame(ni, frame, compr_size, ni->i_valid); up_write(&ni->file.run_lock); if (err) goto out2; if (!ondisk_size) goto out2; down_read(&ni->file.run_lock); err = ntfs_write_run(sbi, &ni->file.run, ondisk_size < frame_size ? frame_ondisk : frame_mem, frame_vbo, ondisk_size); up_read(&ni->file.run_lock); out2: vunmap(frame_mem); out1: kvfree(frame_ondisk); out: return err; } /* * ni_remove_name - Removes name 'de' from MFT and from directory. * 'de2' and 'undo_step' are used to restore MFT/dir, if error occurs. */ int ni_remove_name(struct ntfs_inode *dir_ni, struct ntfs_inode *ni, struct NTFS_DE *de, struct NTFS_DE **de2, int *undo_step) { int err; struct ntfs_sb_info *sbi = ni->mi.sbi; struct ATTR_FILE_NAME *de_name = (struct ATTR_FILE_NAME *)(de + 1); struct ATTR_FILE_NAME *fname; struct ATTR_LIST_ENTRY *le; struct mft_inode *mi; u16 de_key_size = le16_to_cpu(de->key_size); u8 name_type; *undo_step = 0; /* Find name in record. */ mi_get_ref(&dir_ni->mi, &de_name->home); fname = ni_fname_name(ni, (struct le_str *)&de_name->name_len, &de_name->home, &mi, &le); if (!fname) return -ENOENT; memcpy(&de_name->dup, &fname->dup, sizeof(struct NTFS_DUP_INFO)); name_type = paired_name(fname->type); /* Mark ntfs as dirty. It will be cleared at umount. */ ntfs_set_state(sbi, NTFS_DIRTY_DIRTY); /* Step 1: Remove name from directory. */ err = indx_delete_entry(&dir_ni->dir, dir_ni, fname, de_key_size, sbi); if (err) return err; /* Step 2: Remove name from MFT. */ ni_remove_attr_le(ni, attr_from_name(fname), mi, le); *undo_step = 2; /* Get paired name. */ fname = ni_fname_type(ni, name_type, &mi, &le); if (fname) { u16 de2_key_size = fname_full_size(fname); *de2 = Add2Ptr(de, 1024); (*de2)->key_size = cpu_to_le16(de2_key_size); memcpy(*de2 + 1, fname, de2_key_size); /* Step 3: Remove paired name from directory. */ err = indx_delete_entry(&dir_ni->dir, dir_ni, fname, de2_key_size, sbi); if (err) return err; /* Step 4: Remove paired name from MFT. */ ni_remove_attr_le(ni, attr_from_name(fname), mi, le); *undo_step = 4; } return 0; } /* * ni_remove_name_undo - Paired function for ni_remove_name. * * Return: True if ok */ bool ni_remove_name_undo(struct ntfs_inode *dir_ni, struct ntfs_inode *ni, struct NTFS_DE *de, struct NTFS_DE *de2, int undo_step) { struct ntfs_sb_info *sbi = ni->mi.sbi; struct ATTRIB *attr; u16 de_key_size; switch (undo_step) { case 4: de_key_size = le16_to_cpu(de2->key_size); if (ni_insert_resident(ni, de_key_size, ATTR_NAME, NULL, 0, &attr, NULL, NULL)) return false; memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), de2 + 1, de_key_size); mi_get_ref(&ni->mi, &de2->ref); de2->size = cpu_to_le16(ALIGN(de_key_size, 8) + sizeof(struct NTFS_DE)); de2->flags = 0; de2->res = 0; if (indx_insert_entry(&dir_ni->dir, dir_ni, de2, sbi, NULL, 1)) return false; fallthrough; case 2: de_key_size = le16_to_cpu(de->key_size); if (ni_insert_resident(ni, de_key_size, ATTR_NAME, NULL, 0, &attr, NULL, NULL)) return false; memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), de + 1, de_key_size); mi_get_ref(&ni->mi, &de->ref); if (indx_insert_entry(&dir_ni->dir, dir_ni, de, sbi, NULL, 1)) return false; } return true; } /* * ni_add_name - Add new name into MFT and into directory. */ int ni_add_name(struct ntfs_inode *dir_ni, struct ntfs_inode *ni, struct NTFS_DE *de) { int err; struct ntfs_sb_info *sbi = ni->mi.sbi; struct ATTRIB *attr; struct ATTR_LIST_ENTRY *le; struct mft_inode *mi; struct ATTR_FILE_NAME *fname; struct ATTR_FILE_NAME *de_name = (struct ATTR_FILE_NAME *)(de + 1); u16 de_key_size = le16_to_cpu(de->key_size); if (sbi->options->windows_names && !valid_windows_name(sbi, (struct le_str *)&de_name->name_len)) return -EINVAL; /* If option "hide_dot_files" then set hidden attribute for dot files. */ if (ni->mi.sbi->options->hide_dot_files) { if (de_name->name_len > 0 && le16_to_cpu(de_name->name[0]) == '.') ni->std_fa |= FILE_ATTRIBUTE_HIDDEN; else ni->std_fa &= ~FILE_ATTRIBUTE_HIDDEN; } mi_get_ref(&ni->mi, &de->ref); mi_get_ref(&dir_ni->mi, &de_name->home); /* Fill duplicate from any ATTR_NAME. */ fname = ni_fname_name(ni, NULL, NULL, NULL, NULL); if (fname) memcpy(&de_name->dup, &fname->dup, sizeof(fname->dup)); de_name->dup.fa = ni->std_fa; /* Insert new name into MFT. */ err = ni_insert_resident(ni, de_key_size, ATTR_NAME, NULL, 0, &attr, &mi, &le); if (err) return err; memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), de_name, de_key_size); /* Insert new name into directory. */ err = indx_insert_entry(&dir_ni->dir, dir_ni, de, sbi, NULL, 0); if (err) ni_remove_attr_le(ni, attr, mi, le); return err; } /* * ni_rename - Remove one name and insert new name. */ int ni_rename(struct ntfs_inode *dir_ni, struct ntfs_inode *new_dir_ni, struct ntfs_inode *ni, struct NTFS_DE *de, struct NTFS_DE *new_de) { int err; struct NTFS_DE *de2 = NULL; int undo = 0; /* * There are two possible ways to rename: * 1) Add new name and remove old name. * 2) Remove old name and add new name. * * In most cases (not all!) adding new name into MFT and into directory can * allocate additional cluster(s). * Second way may result to bad inode if we can't add new name * and then can't restore (add) old name. */ /* * Way 1 - Add new + remove old. */ err = ni_add_name(new_dir_ni, ni, new_de); if (!err) { err = ni_remove_name(dir_ni, ni, de, &de2, &undo); WARN_ON(err && ni_remove_name(new_dir_ni, ni, new_de, &de2, &undo)); } /* * Way 2 - Remove old + add new. */ /* * err = ni_remove_name(dir_ni, ni, de, &de2, &undo); * if (!err) { * err = ni_add_name(new_dir_ni, ni, new_de); * if (err && !ni_remove_name_undo(dir_ni, ni, de, de2, undo)) * *is_bad = true; * } */ return err; } /* * ni_is_dirty - Return: True if 'ni' requires ni_write_inode. */ bool ni_is_dirty(struct inode *inode) { struct ntfs_inode *ni = ntfs_i(inode); struct rb_node *node; if (ni->mi.dirty || ni->attr_list.dirty || (ni->ni_flags & NI_FLAG_UPDATE_PARENT)) return true; for (node = rb_first(&ni->mi_tree); node; node = rb_next(node)) { if (rb_entry(node, struct mft_inode, node)->dirty) return true; } return false; } /* * ni_seek_data_or_hole * * Helper function for ntfs_llseek( SEEK_DATA/SEEK_HOLE ) */ loff_t ni_seek_data_or_hole(struct ntfs_inode *ni, loff_t offset, bool data) { int err; u8 cluster_bits = ni->mi.sbi->cluster_bits; CLST vcn, lcn, clen; loff_t vbo; /* Enumerate all fragments. */ for (vcn = offset >> cluster_bits;; vcn += clen) { err = attr_data_get_block(ni, vcn, 1, &lcn, &clen, NULL, false, NULL, false); if (err) { return err; } if (lcn == RESIDENT_LCN) { /* clen - resident size in bytes. clen == ni->vfs_inode.i_size */ if (offset >= clen) { /* check eof. */ return -ENXIO; } if (data) { return offset; } return clen; } if (lcn == EOF_LCN) { if (data) { return -ENXIO; } /* implicit hole at the end of file. */ return ni->vfs_inode.i_size; } if (data) { /* * Adjust the file offset to the next location in the file greater than * or equal to offset containing data. If offset points to data, then * the file offset is set to offset. */ if (lcn != SPARSE_LCN) { vbo = (u64)vcn << cluster_bits; return max(vbo, offset); } } else { /* * Adjust the file offset to the next hole in the file greater than or * equal to offset. If offset points into the middle of a hole, then the * file offset is set to offset. If there is no hole past offset, then the * file offset is adjusted to the end of the file * (i.e., there is an implicit hole at the end of any file). */ if (lcn == SPARSE_LCN && /* native compression hole begins at aligned vcn. */ (!(ni->std_fa & FILE_ATTRIBUTE_COMPRESSED) || !(vcn & (NTFS_LZNT_CLUSTERS - 1)))) { vbo = (u64)vcn << cluster_bits; return max(vbo, offset); } } if (!clen) { /* Corrupted file. */ return -EINVAL; } } } /* * ni_write_parents * * Helper function for ntfs_file_fsync. */ int ni_write_parents(struct ntfs_inode *ni, int sync) { int err = 0; struct ATTRIB *attr = NULL; struct ATTR_LIST_ENTRY *le = NULL; struct ntfs_sb_info *sbi = ni->mi.sbi; struct super_block *sb = sbi->sb; while ((attr = ni_find_attr(ni, attr, &le, ATTR_NAME, NULL, 0, NULL, NULL))) { struct inode *dir; struct ATTR_FILE_NAME *fname; fname = resident_data_ex(attr, SIZEOF_ATTRIBUTE_FILENAME); if (!fname) continue; /* Check simple case when parent inode equals current inode. */ if (ino_get(&fname->home) == ni->vfs_inode.i_ino) { if (MFT_REC_ROOT != ni->vfs_inode.i_ino) { ntfs_set_state(sbi, NTFS_DIRTY_ERROR); err = -EINVAL; } continue; } dir = ntfs_iget5(sb, &fname->home, NULL); if (IS_ERR(dir)) { ntfs_inode_warn( &ni->vfs_inode, "failed to open parent directory r=%lx to write", (long)ino_get(&fname->home)); continue; } if (!is_bad_inode(dir)) { int err2 = write_inode_now(dir, sync); if (!err) err = err2; } iput(dir); } return err; } /* * ni_update_parent * * Update duplicate info of ATTR_FILE_NAME in MFT and in parent directories. */ static bool ni_update_parent(struct ntfs_inode *ni, struct NTFS_DUP_INFO *dup, int sync) { struct ATTRIB *attr; struct mft_inode *mi; struct ATTR_LIST_ENTRY *le = NULL; struct ntfs_sb_info *sbi = ni->mi.sbi; struct super_block *sb = sbi->sb; bool re_dirty = false; if (ni->mi.mrec->flags & RECORD_FLAG_DIR) { dup->fa |= FILE_ATTRIBUTE_DIRECTORY; attr = NULL; dup->alloc_size = 0; dup->data_size = 0; } else { dup->fa &= ~FILE_ATTRIBUTE_DIRECTORY; attr = ni_find_attr(ni, NULL, &le, ATTR_DATA, NULL, 0, NULL, &mi); if (!attr) { dup->alloc_size = dup->data_size = 0; } else if (!attr->non_res) { u32 data_size = le32_to_cpu(attr->res.data_size); dup->alloc_size = cpu_to_le64(ALIGN(data_size, 8)); dup->data_size = cpu_to_le64(data_size); } else { u64 new_valid = ni->i_valid; u64 data_size = le64_to_cpu(attr->nres.data_size); __le64 valid_le; dup->alloc_size = is_attr_ext(attr) ? attr->nres.total_size : attr->nres.alloc_size; dup->data_size = attr->nres.data_size; if (new_valid > data_size) new_valid = data_size; valid_le = cpu_to_le64(new_valid); if (valid_le != attr->nres.valid_size) { attr->nres.valid_size = valid_le; mi->dirty = true; } } } dup->extend_data = 0; if (dup->fa & FILE_ATTRIBUTE_REPARSE_POINT) { attr = ni_find_attr(ni, NULL, NULL, ATTR_REPARSE, NULL, 0, NULL, NULL); if (attr) { const struct REPARSE_POINT *rp; rp = resident_data_ex(attr, sizeof(struct REPARSE_POINT)); /* If ATTR_REPARSE exists 'rp' can't be NULL. */ if (rp) dup->extend_data = rp->ReparseTag; } } else if (ni->ni_flags & NI_FLAG_EA) { attr = ni_find_attr(ni, attr, &le, ATTR_EA_INFO, NULL, 0, NULL, NULL); if (attr) { const struct EA_INFO *info; info = resident_data_ex(attr, sizeof(struct EA_INFO)); /* If ATTR_EA_INFO exists 'info' can't be NULL. */ if (info) dup->extend_data = info->size; } } attr = NULL; le = NULL; while ((attr = ni_find_attr(ni, attr, &le, ATTR_NAME, NULL, 0, NULL, &mi))) { struct inode *dir; struct ATTR_FILE_NAME *fname; fname = resident_data_ex(attr, SIZEOF_ATTRIBUTE_FILENAME); if (!fname || !memcmp(&fname->dup, dup, sizeof(fname->dup))) continue; /* Check simple case when parent inode equals current inode. */ if (ino_get(&fname->home) == ni->vfs_inode.i_ino) { ntfs_set_state(sbi, NTFS_DIRTY_ERROR); continue; } /* ntfs_iget5 may sleep. */ dir = ntfs_iget5(sb, &fname->home, NULL); if (IS_ERR(dir)) { ntfs_inode_warn( &ni->vfs_inode, "failed to open parent directory r=%lx to update", (long)ino_get(&fname->home)); continue; } if (!is_bad_inode(dir)) { struct ntfs_inode *dir_ni = ntfs_i(dir); if (!ni_trylock(dir_ni)) { re_dirty = true; } else { indx_update_dup(dir_ni, sbi, fname, dup, sync); ni_unlock(dir_ni); memcpy(&fname->dup, dup, sizeof(fname->dup)); mi->dirty = true; } } iput(dir); } return re_dirty; } /* * ni_write_inode - Write MFT base record and all subrecords to disk. */ int ni_write_inode(struct inode *inode, int sync, const char *hint) { int err = 0, err2; struct ntfs_inode *ni = ntfs_i(inode); struct super_block *sb = inode->i_sb; struct ntfs_sb_info *sbi = sb->s_fs_info; bool re_dirty = false; struct ATTR_STD_INFO *std; struct rb_node *node, *next; struct NTFS_DUP_INFO dup; if (is_bad_inode(inode) || sb_rdonly(sb)) return 0; /* Avoid any operation if inode is bad. */ if (unlikely(is_bad_ni(ni))) return -EINVAL; if (unlikely(ntfs3_forced_shutdown(sb))) return -EIO; if (!ni_trylock(ni)) { /* 'ni' is under modification, skip for now. */ mark_inode_dirty_sync(inode); return 0; } if (!ni->mi.mrec) goto out; if (is_rec_inuse(ni->mi.mrec) && !(sbi->flags & NTFS_FLAGS_LOG_REPLAYING) && inode->i_nlink) { bool modified = false; struct timespec64 ts; /* Update times in standard attribute. */ std = ni_std(ni); if (!std) { err = -EINVAL; goto out; } /* Update the access times if they have changed. */ ts = inode_get_mtime(inode); dup.m_time = kernel2nt(&ts); if (std->m_time != dup.m_time) { std->m_time = dup.m_time; modified = true; } ts = inode_get_ctime(inode); dup.c_time = kernel2nt(&ts); if (std->c_time != dup.c_time) { std->c_time = dup.c_time; modified = true; } ts = inode_get_atime(inode); dup.a_time = kernel2nt(&ts); if (std->a_time != dup.a_time) { std->a_time = dup.a_time; modified = true; } dup.fa = ni->std_fa; if (std->fa != dup.fa) { std->fa = dup.fa; modified = true; } /* std attribute is always in primary MFT record. */ if (modified) ni->mi.dirty = true; if (!ntfs_is_meta_file(sbi, inode->i_ino) && (modified || (ni->ni_flags & NI_FLAG_UPDATE_PARENT)) /* Avoid __wait_on_freeing_inode(inode). */ && (sb->s_flags & SB_ACTIVE)) { dup.cr_time = std->cr_time; /* Not critical if this function fail. */ re_dirty = ni_update_parent(ni, &dup, sync); if (re_dirty) ni->ni_flags |= NI_FLAG_UPDATE_PARENT; else ni->ni_flags &= ~NI_FLAG_UPDATE_PARENT; } /* Update attribute list. */ if (ni->attr_list.size && ni->attr_list.dirty) { if (inode->i_ino != MFT_REC_MFT || sync) { err = ni_try_remove_attr_list(ni); if (err) goto out; } err = al_update(ni, sync); if (err) goto out; } } for (node = rb_first(&ni->mi_tree); node; node = next) { struct mft_inode *mi = rb_entry(node, struct mft_inode, node); bool is_empty; next = rb_next(node); if (!mi->dirty) continue; is_empty = !mi_enum_attr(ni, mi, NULL); if (is_empty) clear_rec_inuse(mi->mrec); err2 = mi_write(mi, sync); if (!err && err2) err = err2; if (is_empty) { ntfs_mark_rec_free(sbi, mi->rno, false); rb_erase(node, &ni->mi_tree); mi_put(mi); } } if (ni->mi.dirty) { err2 = mi_write(&ni->mi, sync); if (!err && err2) err = err2; } out: ni_unlock(ni); if (err) { ntfs_inode_err(inode, "%s failed, %d.", hint, err); ntfs_set_state(sbi, NTFS_DIRTY_ERROR); return err; } if (re_dirty) mark_inode_dirty_sync(inode); return 0; } /* * Force to allocate all delay allocated clusters. */ int ni_allocate_da_blocks(struct ntfs_inode *ni) { int err; ni_lock(ni); down_write(&ni->file.run_lock); err = ni_allocate_da_blocks_locked(ni); up_write(&ni->file.run_lock); ni_unlock(ni); return err; } /* * Force to allocate all delay allocated clusters. */ int ni_allocate_da_blocks_locked(struct ntfs_inode *ni) { int err = 0; if (!ni->file.run_da.count) return 0; if (is_sparsed(ni)) { CLST vcn, lcn, clen, alen; bool new; /* * Sparse file allocates clusters in 'attr_data_get_block_locked' */ while (run_get_entry(&ni->file.run_da, 0, &vcn, &lcn, &clen)) { /* TODO: zero=true? */ err = attr_data_get_block_locked(ni, vcn, clen, &lcn, &alen, &new, true, NULL, true); if (err) break; if (!new) { err = -EINVAL; break; } } } else { /* * Normal file allocates clusters in 'attr_set_size' */ err = attr_set_size_ex(ni, ATTR_DATA, NULL, 0, &ni->file.run, ni->vfs_inode.i_size, &ni->i_valid, false, NULL, true); } return err; }
3829 3825 968 3 308 98 14 3577 43 3264 701 3828 412 3574 3872 3874 284 65 7 2 9 7 3617 3881 3874 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 // SPDX-License-Identifier: GPL-2.0 /* * security/tomoyo/mount.c * * Copyright (C) 2005-2011 NTT DATA CORPORATION */ #include <linux/slab.h> #include <uapi/linux/mount.h> #include "common.h" /* String table for special mount operations. */ static const char * const tomoyo_mounts[TOMOYO_MAX_SPECIAL_MOUNT] = { [TOMOYO_MOUNT_BIND] = "--bind", [TOMOYO_MOUNT_MOVE] = "--move", [TOMOYO_MOUNT_REMOUNT] = "--remount", [TOMOYO_MOUNT_MAKE_UNBINDABLE] = "--make-unbindable", [TOMOYO_MOUNT_MAKE_PRIVATE] = "--make-private", [TOMOYO_MOUNT_MAKE_SLAVE] = "--make-slave", [TOMOYO_MOUNT_MAKE_SHARED] = "--make-shared", }; /** * tomoyo_audit_mount_log - Audit mount log. * * @r: Pointer to "struct tomoyo_request_info". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_audit_mount_log(struct tomoyo_request_info *r) __must_hold_shared(&tomoyo_ss) { return tomoyo_supervisor(r, "file mount %s %s %s 0x%lX\n", r->param.mount.dev->name, r->param.mount.dir->name, r->param.mount.type->name, r->param.mount.flags); } /** * tomoyo_check_mount_acl - Check permission for path path path number operation. * * @r: Pointer to "struct tomoyo_request_info". * @ptr: Pointer to "struct tomoyo_acl_info". * * Returns true if granted, false otherwise. */ static bool tomoyo_check_mount_acl(struct tomoyo_request_info *r, const struct tomoyo_acl_info *ptr) { const struct tomoyo_mount_acl *acl = container_of(ptr, typeof(*acl), head); return tomoyo_compare_number_union(r->param.mount.flags, &acl->flags) && tomoyo_compare_name_union(r->param.mount.type, &acl->fs_type) && tomoyo_compare_name_union(r->param.mount.dir, &acl->dir_name) && (!r->param.mount.need_dev || tomoyo_compare_name_union(r->param.mount.dev, &acl->dev_name)); } /** * tomoyo_mount_acl - Check permission for mount() operation. * * @r: Pointer to "struct tomoyo_request_info". * @dev_name: Name of device file. Maybe NULL. * @dir: Pointer to "struct path". * @type: Name of filesystem type. * @flags: Mount options. * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ static int tomoyo_mount_acl(struct tomoyo_request_info *r, const char *dev_name, const struct path *dir, const char *type, unsigned long flags) __must_hold_shared(&tomoyo_ss) { struct tomoyo_obj_info obj = { }; struct path path; struct file_system_type *fstype = NULL; const char *requested_type = NULL; const char *requested_dir_name = NULL; const char *requested_dev_name = NULL; struct tomoyo_path_info rtype; struct tomoyo_path_info rdev; struct tomoyo_path_info rdir; int need_dev = 0; int error = -ENOMEM; r->obj = &obj; /* Get fstype. */ requested_type = tomoyo_encode(type); if (!requested_type) goto out; rtype.name = requested_type; tomoyo_fill_path_info(&rtype); /* Get mount point. */ obj.path2 = *dir; requested_dir_name = tomoyo_realpath_from_path(dir); if (!requested_dir_name) { error = -ENOMEM; goto out; } rdir.name = requested_dir_name; tomoyo_fill_path_info(&rdir); /* Compare fs name. */ if (type == tomoyo_mounts[TOMOYO_MOUNT_REMOUNT]) { /* dev_name is ignored. */ } else if (type == tomoyo_mounts[TOMOYO_MOUNT_MAKE_UNBINDABLE] || type == tomoyo_mounts[TOMOYO_MOUNT_MAKE_PRIVATE] || type == tomoyo_mounts[TOMOYO_MOUNT_MAKE_SLAVE] || type == tomoyo_mounts[TOMOYO_MOUNT_MAKE_SHARED]) { /* dev_name is ignored. */ } else if (type == tomoyo_mounts[TOMOYO_MOUNT_BIND] || type == tomoyo_mounts[TOMOYO_MOUNT_MOVE]) { need_dev = -1; /* dev_name is a directory */ } else { fstype = get_fs_type(type); if (!fstype) { error = -ENODEV; goto out; } if (fstype->fs_flags & FS_REQUIRES_DEV) /* dev_name is a block device file. */ need_dev = 1; } if (need_dev) { /* Get mount point or device file. */ if (!dev_name || kern_path(dev_name, LOOKUP_FOLLOW, &path)) { error = -ENOENT; goto out; } obj.path1 = path; requested_dev_name = tomoyo_realpath_from_path(&path); if (!requested_dev_name) { error = -ENOENT; goto out; } } else { /* Map dev_name to "<NULL>" if no dev_name given. */ if (!dev_name) dev_name = "<NULL>"; requested_dev_name = tomoyo_encode(dev_name); if (!requested_dev_name) { error = -ENOMEM; goto out; } } rdev.name = requested_dev_name; tomoyo_fill_path_info(&rdev); r->param_type = TOMOYO_TYPE_MOUNT_ACL; r->param.mount.need_dev = need_dev; r->param.mount.dev = &rdev; r->param.mount.dir = &rdir; r->param.mount.type = &rtype; r->param.mount.flags = flags; do { tomoyo_check_acl(r, tomoyo_check_mount_acl); error = tomoyo_audit_mount_log(r); } while (error == TOMOYO_RETRY_REQUEST); out: kfree(requested_dev_name); kfree(requested_dir_name); if (fstype) put_filesystem(fstype); kfree(requested_type); /* Drop refcount obtained by kern_path(). */ if (obj.path1.dentry) path_put(&obj.path1); return error; } /** * tomoyo_mount_permission - Check permission for mount() operation. * * @dev_name: Name of device file. Maybe NULL. * @path: Pointer to "struct path". * @type: Name of filesystem type. Maybe NULL. * @flags: Mount options. * @data_page: Optional data. Maybe NULL. * * Returns 0 on success, negative value otherwise. */ int tomoyo_mount_permission(const char *dev_name, const struct path *path, const char *type, unsigned long flags, void *data_page) { struct tomoyo_request_info r; int error; int idx; if (tomoyo_init_request_info(&r, NULL, TOMOYO_MAC_FILE_MOUNT) == TOMOYO_CONFIG_DISABLED) return 0; if ((flags & MS_MGC_MSK) == MS_MGC_VAL) flags &= ~MS_MGC_MSK; if (flags & MS_REMOUNT) { type = tomoyo_mounts[TOMOYO_MOUNT_REMOUNT]; flags &= ~MS_REMOUNT; } else if (flags & MS_BIND) { type = tomoyo_mounts[TOMOYO_MOUNT_BIND]; flags &= ~MS_BIND; } else if (flags & MS_SHARED) { if (flags & (MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) return -EINVAL; type = tomoyo_mounts[TOMOYO_MOUNT_MAKE_SHARED]; flags &= ~MS_SHARED; } else if (flags & MS_PRIVATE) { if (flags & (MS_SHARED | MS_SLAVE | MS_UNBINDABLE)) return -EINVAL; type = tomoyo_mounts[TOMOYO_MOUNT_MAKE_PRIVATE]; flags &= ~MS_PRIVATE; } else if (flags & MS_SLAVE) { if (flags & (MS_SHARED | MS_PRIVATE | MS_UNBINDABLE)) return -EINVAL; type = tomoyo_mounts[TOMOYO_MOUNT_MAKE_SLAVE]; flags &= ~MS_SLAVE; } else if (flags & MS_UNBINDABLE) { if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE)) return -EINVAL; type = tomoyo_mounts[TOMOYO_MOUNT_MAKE_UNBINDABLE]; flags &= ~MS_UNBINDABLE; } else if (flags & MS_MOVE) { type = tomoyo_mounts[TOMOYO_MOUNT_MOVE]; flags &= ~MS_MOVE; } if (!type) type = "<NULL>"; idx = tomoyo_read_lock(); error = tomoyo_mount_acl(&r, dev_name, path, type, flags); tomoyo_read_unlock(idx); return error; }
6423 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Landlock - Credential hooks * * Copyright © 2019-2020 Mickaël Salaün <mic@digikod.net> * Copyright © 2019-2020 ANSSI * Copyright © 2021-2025 Microsoft Corporation */ #ifndef _SECURITY_LANDLOCK_CRED_H #define _SECURITY_LANDLOCK_CRED_H #include <linux/container_of.h> #include <linux/cred.h> #include <linux/init.h> #include <linux/rcupdate.h> #include "access.h" #include "limits.h" #include "ruleset.h" #include "setup.h" /** * struct landlock_cred_security - Credential security blob * * This structure is packed to minimize the size of struct * landlock_file_security. However, it is always aligned in the LSM cred blob, * see lsm_set_blob_size(). * * When updating this, also update landlock_cred_copy() if needed. */ struct landlock_cred_security { /** * @domain: Immutable ruleset enforced on a task. */ struct landlock_ruleset *domain; #ifdef CONFIG_AUDIT /** * @domain_exec: Bitmask identifying the domain layers that were enforced by * the current task's executed file (i.e. no new execve(2) since * landlock_restrict_self(2)). */ u16 domain_exec; /** * @log_subdomains_off: Set if the domain descendants's log_status should be * set to %LANDLOCK_LOG_DISABLED. This is not a landlock_hierarchy * configuration because it applies to future descendant domains and it does * not require a current domain. */ u8 log_subdomains_off : 1; #endif /* CONFIG_AUDIT */ } __packed; #ifdef CONFIG_AUDIT /* Makes sure all layer executions can be stored. */ static_assert(BITS_PER_TYPE(typeof_member(struct landlock_cred_security, domain_exec)) >= LANDLOCK_MAX_NUM_LAYERS); #endif /* CONFIG_AUDIT */ static inline struct landlock_cred_security * landlock_cred(const struct cred *cred) { return cred->security + landlock_blob_sizes.lbs_cred; } static inline void landlock_cred_copy(struct landlock_cred_security *dst, const struct landlock_cred_security *src) { landlock_put_ruleset(dst->domain); *dst = *src; landlock_get_ruleset(src->domain); } static inline struct landlock_ruleset *landlock_get_current_domain(void) { return landlock_cred(current_cred())->domain; } /* * The call needs to come from an RCU read-side critical section. */ static inline const struct landlock_ruleset * landlock_get_task_domain(const struct task_struct *const task) { return landlock_cred(__task_cred(task))->domain; } static inline bool landlocked(const struct task_struct *const task) { bool has_dom; if (task == current) return !!landlock_get_current_domain(); rcu_read_lock(); has_dom = !!landlock_get_task_domain(task); rcu_read_unlock(); return has_dom; } /** * landlock_get_applicable_subject - Return the subject's Landlock credential * if its enforced domain applies to (i.e. * handles) at least one of the access rights * specified in @masks * * @cred: credential * @masks: access masks * @handle_layer: returned youngest layer handling a subset of @masks. Not set * if the function returns NULL. * * Return: landlock_cred(@cred) if any access rights specified in @masks is * handled, or NULL otherwise. */ static inline const struct landlock_cred_security * landlock_get_applicable_subject(const struct cred *const cred, const struct access_masks masks, size_t *const handle_layer) { const union access_masks_all masks_all = { .masks = masks, }; const struct landlock_ruleset *domain; ssize_t layer_level; if (!cred) return NULL; domain = landlock_cred(cred)->domain; if (!domain) return NULL; for (layer_level = domain->num_layers - 1; layer_level >= 0; layer_level--) { union access_masks_all layer = { .masks = domain->access_masks[layer_level], }; if (layer.all & masks_all.all) { if (handle_layer) *handle_layer = layer_level; return landlock_cred(cred); } } return NULL; } __init void landlock_add_cred_hooks(void); #endif /* _SECURITY_LANDLOCK_CRED_H */
1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 // SPDX-License-Identifier: GPL-2.0-or-later /* * Bridge netlink control interface * * Authors: * Stephen Hemminger <shemminger@osdl.org> */ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/etherdevice.h> #include <net/rtnetlink.h> #include <net/net_namespace.h> #include <net/sock.h> #include <uapi/linux/if_bridge.h> #include "br_private.h" #include "br_private_stp.h" #include "br_private_cfm.h" #include "br_private_tunnel.h" #include "br_private_mcast_eht.h" static int __get_num_vlan_infos(struct net_bridge_vlan_group *vg, u32 filter_mask) { struct net_bridge_vlan *v; u16 vid_range_start = 0, vid_range_end = 0, vid_range_flags = 0; u16 flags, pvid; int num_vlans = 0; if (!(filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED)) return 0; pvid = br_get_pvid(vg); /* Count number of vlan infos */ list_for_each_entry_rcu(v, &vg->vlan_list, vlist) { flags = 0; /* only a context, bridge vlan not activated */ if (!br_vlan_should_use(v)) continue; if (v->vid == pvid) flags |= BRIDGE_VLAN_INFO_PVID; if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED) flags |= BRIDGE_VLAN_INFO_UNTAGGED; if (vid_range_start == 0) { goto initvars; } else if ((v->vid - vid_range_end) == 1 && flags == vid_range_flags) { vid_range_end = v->vid; continue; } else { if ((vid_range_end - vid_range_start) > 0) num_vlans += 2; else num_vlans += 1; } initvars: vid_range_start = v->vid; vid_range_end = v->vid; vid_range_flags = flags; } if (vid_range_start != 0) { if ((vid_range_end - vid_range_start) > 0) num_vlans += 2; else num_vlans += 1; } return num_vlans; } static int br_get_num_vlan_infos(struct net_bridge_vlan_group *vg, u32 filter_mask) { int num_vlans; if (!vg) return 0; if (filter_mask & RTEXT_FILTER_BRVLAN) return vg->num_vlans; rcu_read_lock(); num_vlans = __get_num_vlan_infos(vg, filter_mask); rcu_read_unlock(); return num_vlans; } static size_t br_get_link_af_size_filtered(const struct net_device *dev, u32 filter_mask) { struct net_bridge_vlan_group *vg = NULL; struct net_bridge_port *p = NULL; struct net_bridge *br = NULL; u32 num_cfm_peer_mep_infos; u32 num_cfm_mep_infos; size_t vinfo_sz = 0; int num_vlan_infos; rcu_read_lock(); if (netif_is_bridge_port(dev)) { p = br_port_get_check_rcu(dev); if (p) vg = nbp_vlan_group_rcu(p); } else if (netif_is_bridge_master(dev)) { br = netdev_priv(dev); vg = br_vlan_group_rcu(br); } num_vlan_infos = br_get_num_vlan_infos(vg, filter_mask); rcu_read_unlock(); if (p && (p->flags & BR_VLAN_TUNNEL)) vinfo_sz += br_get_vlan_tunnel_info_size(vg); /* Each VLAN is returned in bridge_vlan_info along with flags */ vinfo_sz += num_vlan_infos * nla_total_size(sizeof(struct bridge_vlan_info)); if (p && vg && (filter_mask & RTEXT_FILTER_MST)) vinfo_sz += br_mst_info_size(vg); if (!(filter_mask & RTEXT_FILTER_CFM_STATUS)) return vinfo_sz; if (!br) return vinfo_sz; /* CFM status info must be added */ br_cfm_mep_count(br, &num_cfm_mep_infos); br_cfm_peer_mep_count(br, &num_cfm_peer_mep_infos); vinfo_sz += nla_total_size(0); /* IFLA_BRIDGE_CFM */ /* For each status struct the MEP instance (u32) is added */ /* MEP instance (u32) + br_cfm_mep_status */ vinfo_sz += num_cfm_mep_infos * /*IFLA_BRIDGE_CFM_MEP_STATUS_INSTANCE */ (nla_total_size(sizeof(u32)) /* IFLA_BRIDGE_CFM_MEP_STATUS_OPCODE_UNEXP_SEEN */ + nla_total_size(sizeof(u32)) /* IFLA_BRIDGE_CFM_MEP_STATUS_VERSION_UNEXP_SEEN */ + nla_total_size(sizeof(u32)) /* IFLA_BRIDGE_CFM_MEP_STATUS_RX_LEVEL_LOW_SEEN */ + nla_total_size(sizeof(u32))); /* MEP instance (u32) + br_cfm_cc_peer_status */ vinfo_sz += num_cfm_peer_mep_infos * /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_INSTANCE */ (nla_total_size(sizeof(u32)) /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_PEER_MEPID */ + nla_total_size(sizeof(u32)) /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_CCM_DEFECT */ + nla_total_size(sizeof(u32)) /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_RDI */ + nla_total_size(sizeof(u32)) /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_PORT_TLV_VALUE */ + nla_total_size(sizeof(u8)) /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_IF_TLV_VALUE */ + nla_total_size(sizeof(u8)) /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_SEEN */ + nla_total_size(sizeof(u32)) /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_TLV_SEEN */ + nla_total_size(sizeof(u32)) /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_SEQ_UNEXP_SEEN */ + nla_total_size(sizeof(u32))); return vinfo_sz; } static inline size_t br_port_info_size(void) { return nla_total_size(1) /* IFLA_BRPORT_STATE */ + nla_total_size(2) /* IFLA_BRPORT_PRIORITY */ + nla_total_size(4) /* IFLA_BRPORT_COST */ + nla_total_size(1) /* IFLA_BRPORT_MODE */ + nla_total_size(1) /* IFLA_BRPORT_GUARD */ + nla_total_size(1) /* IFLA_BRPORT_PROTECT */ + nla_total_size(1) /* IFLA_BRPORT_FAST_LEAVE */ + nla_total_size(1) /* IFLA_BRPORT_MCAST_TO_UCAST */ + nla_total_size(1) /* IFLA_BRPORT_LEARNING */ + nla_total_size(1) /* IFLA_BRPORT_UNICAST_FLOOD */ + nla_total_size(1) /* IFLA_BRPORT_MCAST_FLOOD */ + nla_total_size(1) /* IFLA_BRPORT_BCAST_FLOOD */ + nla_total_size(1) /* IFLA_BRPORT_PROXYARP */ + nla_total_size(1) /* IFLA_BRPORT_PROXYARP_WIFI */ + nla_total_size(1) /* IFLA_BRPORT_VLAN_TUNNEL */ + nla_total_size(1) /* IFLA_BRPORT_NEIGH_SUPPRESS */ + nla_total_size(1) /* IFLA_BRPORT_ISOLATED */ + nla_total_size(1) /* IFLA_BRPORT_LOCKED */ + nla_total_size(1) /* IFLA_BRPORT_MAB */ + nla_total_size(1) /* IFLA_BRPORT_NEIGH_VLAN_SUPPRESS */ + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_ROOT_ID */ + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_BRIDGE_ID */ + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_DESIGNATED_PORT */ + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_DESIGNATED_COST */ + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_ID */ + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_NO */ + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_TOPOLOGY_CHANGE_ACK */ + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_CONFIG_PENDING */ + nla_total_size_64bit(sizeof(u64)) /* IFLA_BRPORT_MESSAGE_AGE_TIMER */ + nla_total_size_64bit(sizeof(u64)) /* IFLA_BRPORT_FORWARD_DELAY_TIMER */ + nla_total_size_64bit(sizeof(u64)) /* IFLA_BRPORT_HOLD_TIMER */ #ifdef CONFIG_BRIDGE_IGMP_SNOOPING + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MULTICAST_ROUTER */ + nla_total_size(sizeof(u32)) /* IFLA_BRPORT_MCAST_N_GROUPS */ + nla_total_size(sizeof(u32)) /* IFLA_BRPORT_MCAST_MAX_GROUPS */ #endif + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_GROUP_FWD_MASK */ + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MRP_RING_OPEN */ + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MRP_IN_OPEN */ + nla_total_size(sizeof(u32)) /* IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT */ + nla_total_size(sizeof(u32)) /* IFLA_BRPORT_MCAST_EHT_HOSTS_CNT */ + nla_total_size(sizeof(u32)) /* IFLA_BRPORT_BACKUP_NHID */ + 0; } static inline size_t br_nlmsg_size(struct net_device *dev, u32 filter_mask) { return NLMSG_ALIGN(sizeof(struct ifinfomsg)) + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */ + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */ + nla_total_size(4) /* IFLA_MASTER */ + nla_total_size(4) /* IFLA_MTU */ + nla_total_size(4) /* IFLA_LINK */ + nla_total_size(1) /* IFLA_OPERSTATE */ + nla_total_size(br_port_info_size()) /* IFLA_PROTINFO */ + nla_total_size(br_get_link_af_size_filtered(dev, filter_mask)) /* IFLA_AF_SPEC */ + nla_total_size(4); /* IFLA_BRPORT_BACKUP_PORT */ } static int br_port_fill_attrs(struct sk_buff *skb, const struct net_bridge_port *p) { u8 mode = !!(p->flags & BR_HAIRPIN_MODE); struct net_bridge_port *backup_p; u64 timerval; if (nla_put_u8(skb, IFLA_BRPORT_STATE, p->state) || nla_put_u16(skb, IFLA_BRPORT_PRIORITY, p->priority) || nla_put_u32(skb, IFLA_BRPORT_COST, p->path_cost) || nla_put_u8(skb, IFLA_BRPORT_MODE, mode) || nla_put_u8(skb, IFLA_BRPORT_GUARD, !!(p->flags & BR_BPDU_GUARD)) || nla_put_u8(skb, IFLA_BRPORT_PROTECT, !!(p->flags & BR_ROOT_BLOCK)) || nla_put_u8(skb, IFLA_BRPORT_FAST_LEAVE, !!(p->flags & BR_MULTICAST_FAST_LEAVE)) || nla_put_u8(skb, IFLA_BRPORT_MCAST_TO_UCAST, !!(p->flags & BR_MULTICAST_TO_UNICAST)) || nla_put_u8(skb, IFLA_BRPORT_LEARNING, !!(p->flags & BR_LEARNING)) || nla_put_u8(skb, IFLA_BRPORT_UNICAST_FLOOD, !!(p->flags & BR_FLOOD)) || nla_put_u8(skb, IFLA_BRPORT_MCAST_FLOOD, !!(p->flags & BR_MCAST_FLOOD)) || nla_put_u8(skb, IFLA_BRPORT_BCAST_FLOOD, !!(p->flags & BR_BCAST_FLOOD)) || nla_put_u8(skb, IFLA_BRPORT_PROXYARP, !!(p->flags & BR_PROXYARP)) || nla_put_u8(skb, IFLA_BRPORT_PROXYARP_WIFI, !!(p->flags & BR_PROXYARP_WIFI)) || nla_put(skb, IFLA_BRPORT_ROOT_ID, sizeof(struct ifla_bridge_id), &p->designated_root) || nla_put(skb, IFLA_BRPORT_BRIDGE_ID, sizeof(struct ifla_bridge_id), &p->designated_bridge) || nla_put_u16(skb, IFLA_BRPORT_DESIGNATED_PORT, p->designated_port) || nla_put_u16(skb, IFLA_BRPORT_DESIGNATED_COST, p->designated_cost) || nla_put_u16(skb, IFLA_BRPORT_ID, p->port_id) || nla_put_u16(skb, IFLA_BRPORT_NO, p->port_no) || nla_put_u8(skb, IFLA_BRPORT_TOPOLOGY_CHANGE_ACK, p->topology_change_ack) || nla_put_u8(skb, IFLA_BRPORT_CONFIG_PENDING, p->config_pending) || nla_put_u8(skb, IFLA_BRPORT_VLAN_TUNNEL, !!(p->flags & BR_VLAN_TUNNEL)) || nla_put_u16(skb, IFLA_BRPORT_GROUP_FWD_MASK, p->group_fwd_mask) || nla_put_u8(skb, IFLA_BRPORT_NEIGH_SUPPRESS, !!(p->flags & BR_NEIGH_SUPPRESS)) || nla_put_u8(skb, IFLA_BRPORT_MRP_RING_OPEN, !!(p->flags & BR_MRP_LOST_CONT)) || nla_put_u8(skb, IFLA_BRPORT_MRP_IN_OPEN, !!(p->flags & BR_MRP_LOST_IN_CONT)) || nla_put_u8(skb, IFLA_BRPORT_ISOLATED, !!(p->flags & BR_ISOLATED)) || nla_put_u8(skb, IFLA_BRPORT_LOCKED, !!(p->flags & BR_PORT_LOCKED)) || nla_put_u8(skb, IFLA_BRPORT_MAB, !!(p->flags & BR_PORT_MAB)) || nla_put_u8(skb, IFLA_BRPORT_NEIGH_VLAN_SUPPRESS, !!(p->flags & BR_NEIGH_VLAN_SUPPRESS))) return -EMSGSIZE; timerval = br_timer_value(&p->message_age_timer); if (nla_put_u64_64bit(skb, IFLA_BRPORT_MESSAGE_AGE_TIMER, timerval, IFLA_BRPORT_PAD)) return -EMSGSIZE; timerval = br_timer_value(&p->forward_delay_timer); if (nla_put_u64_64bit(skb, IFLA_BRPORT_FORWARD_DELAY_TIMER, timerval, IFLA_BRPORT_PAD)) return -EMSGSIZE; timerval = br_timer_value(&p->hold_timer); if (nla_put_u64_64bit(skb, IFLA_BRPORT_HOLD_TIMER, timerval, IFLA_BRPORT_PAD)) return -EMSGSIZE; #ifdef CONFIG_BRIDGE_IGMP_SNOOPING if (nla_put_u8(skb, IFLA_BRPORT_MULTICAST_ROUTER, p->multicast_ctx.multicast_router) || nla_put_u32(skb, IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT, p->multicast_eht_hosts_limit) || nla_put_u32(skb, IFLA_BRPORT_MCAST_EHT_HOSTS_CNT, p->multicast_eht_hosts_cnt) || nla_put_u32(skb, IFLA_BRPORT_MCAST_N_GROUPS, br_multicast_ngroups_get(&p->multicast_ctx)) || nla_put_u32(skb, IFLA_BRPORT_MCAST_MAX_GROUPS, br_multicast_ngroups_get_max(&p->multicast_ctx))) return -EMSGSIZE; #endif /* we might be called only with br->lock */ rcu_read_lock(); backup_p = rcu_dereference(p->backup_port); if (backup_p) nla_put_u32(skb, IFLA_BRPORT_BACKUP_PORT, backup_p->dev->ifindex); rcu_read_unlock(); if (p->backup_nhid && nla_put_u32(skb, IFLA_BRPORT_BACKUP_NHID, p->backup_nhid)) return -EMSGSIZE; return 0; } static int br_fill_ifvlaninfo_range(struct sk_buff *skb, u16 vid_start, u16 vid_end, u16 flags) { struct bridge_vlan_info vinfo; if ((vid_end - vid_start) > 0) { /* add range to skb */ vinfo.vid = vid_start; vinfo.flags = flags | BRIDGE_VLAN_INFO_RANGE_BEGIN; if (nla_put(skb, IFLA_BRIDGE_VLAN_INFO, sizeof(vinfo), &vinfo)) goto nla_put_failure; vinfo.vid = vid_end; vinfo.flags = flags | BRIDGE_VLAN_INFO_RANGE_END; if (nla_put(skb, IFLA_BRIDGE_VLAN_INFO, sizeof(vinfo), &vinfo)) goto nla_put_failure; } else { vinfo.vid = vid_start; vinfo.flags = flags; if (nla_put(skb, IFLA_BRIDGE_VLAN_INFO, sizeof(vinfo), &vinfo)) goto nla_put_failure; } return 0; nla_put_failure: return -EMSGSIZE; } static int br_fill_ifvlaninfo_compressed(struct sk_buff *skb, struct net_bridge_vlan_group *vg) { struct net_bridge_vlan *v; u16 vid_range_start = 0, vid_range_end = 0, vid_range_flags = 0; u16 flags, pvid; int err = 0; /* Pack IFLA_BRIDGE_VLAN_INFO's for every vlan * and mark vlan info with begin and end flags * if vlaninfo represents a range */ pvid = br_get_pvid(vg); list_for_each_entry_rcu(v, &vg->vlan_list, vlist) { flags = 0; if (!br_vlan_should_use(v)) continue; if (v->vid == pvid) flags |= BRIDGE_VLAN_INFO_PVID; if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED) flags |= BRIDGE_VLAN_INFO_UNTAGGED; if (vid_range_start == 0) { goto initvars; } else if ((v->vid - vid_range_end) == 1 && flags == vid_range_flags) { vid_range_end = v->vid; continue; } else { err = br_fill_ifvlaninfo_range(skb, vid_range_start, vid_range_end, vid_range_flags); if (err) return err; } initvars: vid_range_start = v->vid; vid_range_end = v->vid; vid_range_flags = flags; } if (vid_range_start != 0) { /* Call it once more to send any left over vlans */ err = br_fill_ifvlaninfo_range(skb, vid_range_start, vid_range_end, vid_range_flags); if (err) return err; } return 0; } static int br_fill_ifvlaninfo(struct sk_buff *skb, struct net_bridge_vlan_group *vg) { struct bridge_vlan_info vinfo; struct net_bridge_vlan *v; u16 pvid; pvid = br_get_pvid(vg); list_for_each_entry_rcu(v, &vg->vlan_list, vlist) { if (!br_vlan_should_use(v)) continue; vinfo.vid = v->vid; vinfo.flags = 0; if (v->vid == pvid) vinfo.flags |= BRIDGE_VLAN_INFO_PVID; if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED) vinfo.flags |= BRIDGE_VLAN_INFO_UNTAGGED; if (nla_put(skb, IFLA_BRIDGE_VLAN_INFO, sizeof(vinfo), &vinfo)) goto nla_put_failure; } return 0; nla_put_failure: return -EMSGSIZE; } /* * Create one netlink message for one interface * Contains port and master info as well as carrier and bridge state. */ static int br_fill_ifinfo(struct sk_buff *skb, const struct net_bridge_port *port, u32 pid, u32 seq, int event, unsigned int flags, u32 filter_mask, const struct net_device *dev, bool getlink) { u8 operstate = netif_running(dev) ? READ_ONCE(dev->operstate) : IF_OPER_DOWN; struct nlattr *af = NULL; struct net_bridge *br; struct ifinfomsg *hdr; struct nlmsghdr *nlh; if (port) br = port->br; else br = netdev_priv(dev); br_debug(br, "br_fill_ifinfo event %d port %s master %s\n", event, dev->name, br->dev->name); nlh = nlmsg_put(skb, pid, seq, event, sizeof(*hdr), flags); if (nlh == NULL) return -EMSGSIZE; hdr = nlmsg_data(nlh); hdr->ifi_family = AF_BRIDGE; hdr->__ifi_pad = 0; hdr->ifi_type = dev->type; hdr->ifi_index = dev->ifindex; hdr->ifi_flags = netif_get_flags(dev); hdr->ifi_change = 0; if (nla_put_string(skb, IFLA_IFNAME, dev->name) || nla_put_u32(skb, IFLA_MASTER, br->dev->ifindex) || nla_put_u32(skb, IFLA_MTU, dev->mtu) || nla_put_u8(skb, IFLA_OPERSTATE, operstate) || (dev->addr_len && nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr)) || (dev->ifindex != dev_get_iflink(dev) && nla_put_u32(skb, IFLA_LINK, dev_get_iflink(dev)))) goto nla_put_failure; if (event == RTM_NEWLINK && port) { struct nlattr *nest; nest = nla_nest_start(skb, IFLA_PROTINFO); if (nest == NULL || br_port_fill_attrs(skb, port) < 0) goto nla_put_failure; nla_nest_end(skb, nest); } if (filter_mask & (RTEXT_FILTER_BRVLAN | RTEXT_FILTER_BRVLAN_COMPRESSED | RTEXT_FILTER_MRP | RTEXT_FILTER_CFM_CONFIG | RTEXT_FILTER_CFM_STATUS | RTEXT_FILTER_MST)) { af = nla_nest_start_noflag(skb, IFLA_AF_SPEC); if (!af) goto nla_put_failure; } /* Check if the VID information is requested */ if ((filter_mask & RTEXT_FILTER_BRVLAN) || (filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED)) { struct net_bridge_vlan_group *vg; int err; /* RCU needed because of the VLAN locking rules (rcu || rtnl) */ rcu_read_lock(); if (port) vg = nbp_vlan_group_rcu(port); else vg = br_vlan_group_rcu(br); if (!vg || !vg->num_vlans) { rcu_read_unlock(); goto done; } if (filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED) err = br_fill_ifvlaninfo_compressed(skb, vg); else err = br_fill_ifvlaninfo(skb, vg); if (port && (port->flags & BR_VLAN_TUNNEL)) err = br_fill_vlan_tunnel_info(skb, vg); rcu_read_unlock(); if (err) goto nla_put_failure; } if (filter_mask & RTEXT_FILTER_MRP) { int err; if (!br_mrp_enabled(br) || port) goto done; rcu_read_lock(); err = br_mrp_fill_info(skb, br); rcu_read_unlock(); if (err) goto nla_put_failure; } if (filter_mask & (RTEXT_FILTER_CFM_CONFIG | RTEXT_FILTER_CFM_STATUS)) { struct nlattr *cfm_nest = NULL; int err; if (!br_cfm_created(br) || port) goto done; cfm_nest = nla_nest_start(skb, IFLA_BRIDGE_CFM); if (!cfm_nest) goto nla_put_failure; if (filter_mask & RTEXT_FILTER_CFM_CONFIG) { rcu_read_lock(); err = br_cfm_config_fill_info(skb, br); rcu_read_unlock(); if (err) goto nla_put_failure; } if (filter_mask & RTEXT_FILTER_CFM_STATUS) { rcu_read_lock(); err = br_cfm_status_fill_info(skb, br, getlink); rcu_read_unlock(); if (err) goto nla_put_failure; } nla_nest_end(skb, cfm_nest); } if ((filter_mask & RTEXT_FILTER_MST) && br_opt_get(br, BROPT_MST_ENABLED) && port) { const struct net_bridge_vlan_group *vg = nbp_vlan_group(port); struct nlattr *mst_nest; int err; if (!vg || !vg->num_vlans) goto done; mst_nest = nla_nest_start(skb, IFLA_BRIDGE_MST); if (!mst_nest) goto nla_put_failure; err = br_mst_fill_info(skb, vg); if (err) goto nla_put_failure; nla_nest_end(skb, mst_nest); } done: if (af) { if (nlmsg_get_pos(skb) - (void *)af > nla_attr_size(0)) nla_nest_end(skb, af); else nla_nest_cancel(skb, af); } nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } void br_info_notify(int event, const struct net_bridge *br, const struct net_bridge_port *port, u32 filter) { struct net_device *dev; struct sk_buff *skb; int err = -ENOBUFS; struct net *net; u16 port_no = 0; if (WARN_ON(!port && !br)) return; if (port) { dev = port->dev; br = port->br; port_no = port->port_no; } else { dev = br->dev; } net = dev_net(dev); br_debug(br, "port %u(%s) event %d\n", port_no, dev->name, event); skb = nlmsg_new(br_nlmsg_size(dev, filter), GFP_ATOMIC); if (skb == NULL) goto errout; err = br_fill_ifinfo(skb, port, 0, 0, event, 0, filter, dev, false); if (err < 0) { /* -EMSGSIZE implies BUG in br_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_ATOMIC); return; errout: rtnl_set_sk_err(net, RTNLGRP_LINK, err); } /* Notify listeners of a change in bridge or port information */ void br_ifinfo_notify(int event, const struct net_bridge *br, const struct net_bridge_port *port) { u32 filter = RTEXT_FILTER_BRVLAN_COMPRESSED; br_info_notify(event, br, port, filter); } /* * Dump information about all ports, in response to GETLINK */ int br_getlink(struct sk_buff *skb, u32 pid, u32 seq, struct net_device *dev, u32 filter_mask, int nlflags) { struct net_bridge_port *port = br_port_get_rtnl(dev); if (!port && !(filter_mask & RTEXT_FILTER_BRVLAN) && !(filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED) && !(filter_mask & RTEXT_FILTER_MRP) && !(filter_mask & RTEXT_FILTER_CFM_CONFIG) && !(filter_mask & RTEXT_FILTER_CFM_STATUS)) return 0; return br_fill_ifinfo(skb, port, pid, seq, RTM_NEWLINK, nlflags, filter_mask, dev, true); } static int br_vlan_info(struct net_bridge *br, struct net_bridge_port *p, int cmd, struct bridge_vlan_info *vinfo, bool *changed, struct netlink_ext_ack *extack) { bool curr_change; int err = 0; switch (cmd) { case RTM_SETLINK: if (p) { /* if the MASTER flag is set this will act on the global * per-VLAN entry as well */ err = nbp_vlan_add(p, vinfo->vid, vinfo->flags, &curr_change, extack); } else { vinfo->flags |= BRIDGE_VLAN_INFO_BRENTRY; err = br_vlan_add(br, vinfo->vid, vinfo->flags, &curr_change, extack); } if (curr_change) *changed = true; break; case RTM_DELLINK: if (p) { if (!nbp_vlan_delete(p, vinfo->vid)) *changed = true; if ((vinfo->flags & BRIDGE_VLAN_INFO_MASTER) && !br_vlan_delete(p->br, vinfo->vid)) *changed = true; } else if (!br_vlan_delete(br, vinfo->vid)) { *changed = true; } break; } return err; } int br_process_vlan_info(struct net_bridge *br, struct net_bridge_port *p, int cmd, struct bridge_vlan_info *vinfo_curr, struct bridge_vlan_info **vinfo_last, bool *changed, struct netlink_ext_ack *extack) { int err, rtm_cmd; if (!br_vlan_valid_id(vinfo_curr->vid, extack)) return -EINVAL; /* needed for vlan-only NEWVLAN/DELVLAN notifications */ rtm_cmd = br_afspec_cmd_to_rtm(cmd); if (vinfo_curr->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) { if (!br_vlan_valid_range(vinfo_curr, *vinfo_last, extack)) return -EINVAL; *vinfo_last = vinfo_curr; return 0; } if (*vinfo_last) { struct bridge_vlan_info tmp_vinfo; int v, v_change_start = 0; if (!br_vlan_valid_range(vinfo_curr, *vinfo_last, extack)) return -EINVAL; memcpy(&tmp_vinfo, *vinfo_last, sizeof(struct bridge_vlan_info)); for (v = (*vinfo_last)->vid; v <= vinfo_curr->vid; v++) { bool curr_change = false; tmp_vinfo.vid = v; err = br_vlan_info(br, p, cmd, &tmp_vinfo, &curr_change, extack); if (err) break; if (curr_change) { *changed = curr_change; if (!v_change_start) v_change_start = v; } else { /* nothing to notify yet */ if (!v_change_start) continue; br_vlan_notify(br, p, v_change_start, v - 1, rtm_cmd); v_change_start = 0; } cond_resched(); } /* v_change_start is set only if the last/whole range changed */ if (v_change_start) br_vlan_notify(br, p, v_change_start, v - 1, rtm_cmd); *vinfo_last = NULL; return err; } err = br_vlan_info(br, p, cmd, vinfo_curr, changed, extack); if (*changed) br_vlan_notify(br, p, vinfo_curr->vid, 0, rtm_cmd); return err; } static int br_afspec(struct net_bridge *br, struct net_bridge_port *p, struct nlattr *af_spec, int cmd, bool *changed, struct netlink_ext_ack *extack) { struct bridge_vlan_info *vinfo_curr = NULL; struct bridge_vlan_info *vinfo_last = NULL; struct nlattr *attr; struct vtunnel_info tinfo_last = {}; struct vtunnel_info tinfo_curr = {}; int err = 0, rem; nla_for_each_nested(attr, af_spec, rem) { err = 0; switch (nla_type(attr)) { case IFLA_BRIDGE_VLAN_TUNNEL_INFO: if (!p || !(p->flags & BR_VLAN_TUNNEL)) return -EINVAL; err = br_parse_vlan_tunnel_info(attr, &tinfo_curr); if (err) return err; err = br_process_vlan_tunnel_info(br, p, cmd, &tinfo_curr, &tinfo_last, changed); if (err) return err; break; case IFLA_BRIDGE_VLAN_INFO: if (nla_len(attr) != sizeof(struct bridge_vlan_info)) return -EINVAL; vinfo_curr = nla_data(attr); err = br_process_vlan_info(br, p, cmd, vinfo_curr, &vinfo_last, changed, extack); if (err) return err; break; case IFLA_BRIDGE_MRP: err = br_mrp_parse(br, p, attr, cmd, extack); if (err) return err; break; case IFLA_BRIDGE_CFM: err = br_cfm_parse(br, p, attr, cmd, extack); if (err) return err; break; case IFLA_BRIDGE_MST: if (!p) { NL_SET_ERR_MSG(extack, "MST states can only be set on bridge ports"); return -EINVAL; } if (cmd != RTM_SETLINK) { NL_SET_ERR_MSG(extack, "MST states can only be set through RTM_SETLINK"); return -EINVAL; } err = br_mst_process(p, attr, extack); if (err) return err; break; } } return err; } static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = { [IFLA_BRPORT_UNSPEC] = { .strict_start_type = IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT + 1 }, [IFLA_BRPORT_STATE] = { .type = NLA_U8 }, [IFLA_BRPORT_COST] = { .type = NLA_U32 }, [IFLA_BRPORT_PRIORITY] = { .type = NLA_U16 }, [IFLA_BRPORT_MODE] = { .type = NLA_U8 }, [IFLA_BRPORT_GUARD] = { .type = NLA_U8 }, [IFLA_BRPORT_PROTECT] = { .type = NLA_U8 }, [IFLA_BRPORT_FAST_LEAVE]= { .type = NLA_U8 }, [IFLA_BRPORT_LEARNING] = { .type = NLA_U8 }, [IFLA_BRPORT_UNICAST_FLOOD] = { .type = NLA_U8 }, [IFLA_BRPORT_PROXYARP] = { .type = NLA_U8 }, [IFLA_BRPORT_PROXYARP_WIFI] = { .type = NLA_U8 }, [IFLA_BRPORT_MULTICAST_ROUTER] = { .type = NLA_U8 }, [IFLA_BRPORT_MCAST_TO_UCAST] = { .type = NLA_U8 }, [IFLA_BRPORT_MCAST_FLOOD] = { .type = NLA_U8 }, [IFLA_BRPORT_BCAST_FLOOD] = { .type = NLA_U8 }, [IFLA_BRPORT_VLAN_TUNNEL] = { .type = NLA_U8 }, [IFLA_BRPORT_GROUP_FWD_MASK] = { .type = NLA_U16 }, [IFLA_BRPORT_NEIGH_SUPPRESS] = { .type = NLA_U8 }, [IFLA_BRPORT_ISOLATED] = { .type = NLA_U8 }, [IFLA_BRPORT_LOCKED] = { .type = NLA_U8 }, [IFLA_BRPORT_MAB] = { .type = NLA_U8 }, [IFLA_BRPORT_BACKUP_PORT] = { .type = NLA_U32 }, [IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT] = { .type = NLA_U32 }, [IFLA_BRPORT_MCAST_N_GROUPS] = { .type = NLA_REJECT }, [IFLA_BRPORT_MCAST_MAX_GROUPS] = { .type = NLA_U32 }, [IFLA_BRPORT_NEIGH_VLAN_SUPPRESS] = NLA_POLICY_MAX(NLA_U8, 1), [IFLA_BRPORT_BACKUP_NHID] = { .type = NLA_U32 }, }; /* Change the state of the port and notify spanning tree */ static int br_set_port_state(struct net_bridge_port *p, u8 state) { if (state > BR_STATE_BLOCKING) return -EINVAL; /* if kernel STP is running, don't allow changes */ if (p->br->stp_enabled == BR_KERNEL_STP) return -EBUSY; /* if device is not up, change is not allowed * if link is not present, only allowable state is disabled */ if (!netif_running(p->dev) || (!netif_oper_up(p->dev) && state != BR_STATE_DISABLED)) return -ENETDOWN; br_set_state(p, state); br_port_state_selection(p->br); return 0; } /* Set/clear or port flags based on attribute */ static void br_set_port_flag(struct net_bridge_port *p, struct nlattr *tb[], int attrtype, unsigned long mask) { if (!tb[attrtype]) return; if (nla_get_u8(tb[attrtype])) p->flags |= mask; else p->flags &= ~mask; } /* Process bridge protocol info on port */ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[], struct netlink_ext_ack *extack) { unsigned long old_flags, changed_mask; bool br_vlan_tunnel_old; int err; old_flags = p->flags; br_vlan_tunnel_old = (old_flags & BR_VLAN_TUNNEL) ? true : false; br_set_port_flag(p, tb, IFLA_BRPORT_MODE, BR_HAIRPIN_MODE); br_set_port_flag(p, tb, IFLA_BRPORT_GUARD, BR_BPDU_GUARD); br_set_port_flag(p, tb, IFLA_BRPORT_FAST_LEAVE, BR_MULTICAST_FAST_LEAVE); br_set_port_flag(p, tb, IFLA_BRPORT_PROTECT, BR_ROOT_BLOCK); br_set_port_flag(p, tb, IFLA_BRPORT_LEARNING, BR_LEARNING); br_set_port_flag(p, tb, IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD); br_set_port_flag(p, tb, IFLA_BRPORT_MCAST_FLOOD, BR_MCAST_FLOOD); br_set_port_flag(p, tb, IFLA_BRPORT_MCAST_TO_UCAST, BR_MULTICAST_TO_UNICAST); br_set_port_flag(p, tb, IFLA_BRPORT_BCAST_FLOOD, BR_BCAST_FLOOD); br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP, BR_PROXYARP); br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP_WIFI, BR_PROXYARP_WIFI); br_set_port_flag(p, tb, IFLA_BRPORT_VLAN_TUNNEL, BR_VLAN_TUNNEL); br_set_port_flag(p, tb, IFLA_BRPORT_NEIGH_SUPPRESS, BR_NEIGH_SUPPRESS); br_set_port_flag(p, tb, IFLA_BRPORT_ISOLATED, BR_ISOLATED); br_set_port_flag(p, tb, IFLA_BRPORT_LOCKED, BR_PORT_LOCKED); br_set_port_flag(p, tb, IFLA_BRPORT_MAB, BR_PORT_MAB); br_set_port_flag(p, tb, IFLA_BRPORT_NEIGH_VLAN_SUPPRESS, BR_NEIGH_VLAN_SUPPRESS); if ((p->flags & BR_PORT_MAB) && (!(p->flags & BR_PORT_LOCKED) || !(p->flags & BR_LEARNING))) { NL_SET_ERR_MSG(extack, "Bridge port must be locked and have learning enabled when MAB is enabled"); p->flags = old_flags; return -EINVAL; } else if (!(p->flags & BR_PORT_MAB) && (old_flags & BR_PORT_MAB)) { struct net_bridge_fdb_flush_desc desc = { .flags = BIT(BR_FDB_LOCKED), .flags_mask = BIT(BR_FDB_LOCKED), .port_ifindex = p->dev->ifindex, }; br_fdb_flush(p->br, &desc); } changed_mask = old_flags ^ p->flags; err = br_switchdev_set_port_flag(p, p->flags, changed_mask, extack); if (err) { p->flags = old_flags; return err; } if (br_vlan_tunnel_old && !(p->flags & BR_VLAN_TUNNEL)) nbp_vlan_tunnel_info_flush(p); br_port_flags_change(p, changed_mask); if (tb[IFLA_BRPORT_COST]) { err = br_stp_set_path_cost(p, nla_get_u32(tb[IFLA_BRPORT_COST])); if (err) return err; } if (tb[IFLA_BRPORT_PRIORITY]) { err = br_stp_set_port_priority(p, nla_get_u16(tb[IFLA_BRPORT_PRIORITY])); if (err) return err; } if (tb[IFLA_BRPORT_STATE]) { err = br_set_port_state(p, nla_get_u8(tb[IFLA_BRPORT_STATE])); if (err) return err; } if (tb[IFLA_BRPORT_FLUSH]) br_fdb_delete_by_port(p->br, p, 0, 0); #ifdef CONFIG_BRIDGE_IGMP_SNOOPING if (tb[IFLA_BRPORT_MULTICAST_ROUTER]) { u8 mcast_router = nla_get_u8(tb[IFLA_BRPORT_MULTICAST_ROUTER]); err = br_multicast_set_port_router(&p->multicast_ctx, mcast_router); if (err) return err; } if (tb[IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT]) { u32 hlimit; hlimit = nla_get_u32(tb[IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT]); err = br_multicast_eht_set_hosts_limit(p, hlimit); if (err) return err; } if (tb[IFLA_BRPORT_MCAST_MAX_GROUPS]) { u32 max_groups; max_groups = nla_get_u32(tb[IFLA_BRPORT_MCAST_MAX_GROUPS]); br_multicast_ngroups_set_max(&p->multicast_ctx, max_groups); } #endif if (tb[IFLA_BRPORT_GROUP_FWD_MASK]) { u16 fwd_mask = nla_get_u16(tb[IFLA_BRPORT_GROUP_FWD_MASK]); if (fwd_mask & BR_GROUPFWD_MACPAUSE) return -EINVAL; p->group_fwd_mask = fwd_mask; } if (tb[IFLA_BRPORT_BACKUP_PORT]) { struct net_device *backup_dev = NULL; u32 backup_ifindex; backup_ifindex = nla_get_u32(tb[IFLA_BRPORT_BACKUP_PORT]); if (backup_ifindex) { backup_dev = __dev_get_by_index(dev_net(p->dev), backup_ifindex); if (!backup_dev) return -ENOENT; } err = nbp_backup_change(p, backup_dev); if (err) return err; } if (tb[IFLA_BRPORT_BACKUP_NHID]) { u32 backup_nhid = nla_get_u32(tb[IFLA_BRPORT_BACKUP_NHID]); WRITE_ONCE(p->backup_nhid, backup_nhid); } return 0; } /* Change state and parameters on port. */ int br_setlink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags, struct netlink_ext_ack *extack) { struct net_bridge *br = (struct net_bridge *)netdev_priv(dev); struct nlattr *tb[IFLA_BRPORT_MAX + 1]; struct net_bridge_port *p; struct nlattr *protinfo; struct nlattr *afspec; bool changed = false; int err = 0; protinfo = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_PROTINFO); afspec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC); if (!protinfo && !afspec) return 0; p = br_port_get_rtnl(dev); /* We want to accept dev as bridge itself if the AF_SPEC * is set to see if someone is setting vlan info on the bridge */ if (!p && !afspec) return -EINVAL; if (p && protinfo) { if (protinfo->nla_type & NLA_F_NESTED) { err = nla_parse_nested_deprecated(tb, IFLA_BRPORT_MAX, protinfo, br_port_policy, NULL); if (err) return err; spin_lock_bh(&p->br->lock); err = br_setport(p, tb, extack); spin_unlock_bh(&p->br->lock); } else { /* Binary compatibility with old RSTP */ if (nla_len(protinfo) < sizeof(u8)) return -EINVAL; spin_lock_bh(&p->br->lock); err = br_set_port_state(p, nla_get_u8(protinfo)); spin_unlock_bh(&p->br->lock); } if (err) goto out; changed = true; } if (afspec) err = br_afspec(br, p, afspec, RTM_SETLINK, &changed, extack); if (changed) br_ifinfo_notify(RTM_NEWLINK, br, p); out: return err; } /* Delete port information */ int br_dellink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags) { struct net_bridge *br = (struct net_bridge *)netdev_priv(dev); struct net_bridge_port *p; struct nlattr *afspec; bool changed = false; int err = 0; afspec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC); if (!afspec) return 0; p = br_port_get_rtnl(dev); /* We want to accept dev as bridge itself as well */ if (!p && !netif_is_bridge_master(dev)) return -EINVAL; err = br_afspec(br, p, afspec, RTM_DELLINK, &changed, NULL); if (changed) /* Send RTM_NEWLINK because userspace * expects RTM_NEWLINK for vlan dels */ br_ifinfo_notify(RTM_NEWLINK, br, p); return err; } static int br_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { if (tb[IFLA_ADDRESS]) { if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) return -EINVAL; if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) return -EADDRNOTAVAIL; } if (!data) return 0; #ifdef CONFIG_BRIDGE_VLAN_FILTERING if (data[IFLA_BR_VLAN_PROTOCOL] && !eth_type_vlan(nla_get_be16(data[IFLA_BR_VLAN_PROTOCOL]))) return -EPROTONOSUPPORT; if (data[IFLA_BR_VLAN_DEFAULT_PVID]) { __u16 defpvid = nla_get_u16(data[IFLA_BR_VLAN_DEFAULT_PVID]); if (defpvid >= VLAN_VID_MASK) return -EINVAL; } #endif return 0; } static int br_port_slave_changelink(struct net_device *brdev, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct net_bridge *br = netdev_priv(brdev); int ret; if (!data) return 0; spin_lock_bh(&br->lock); ret = br_setport(br_port_get_rtnl(dev), data, extack); spin_unlock_bh(&br->lock); return ret; } static int br_port_fill_slave_info(struct sk_buff *skb, const struct net_device *brdev, const struct net_device *dev) { return br_port_fill_attrs(skb, br_port_get_rtnl(dev)); } static size_t br_port_get_slave_size(const struct net_device *brdev, const struct net_device *dev) { return br_port_info_size(); } static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = { [IFLA_BR_UNSPEC] = { .strict_start_type = IFLA_BR_FDB_N_LEARNED }, [IFLA_BR_FORWARD_DELAY] = { .type = NLA_U32 }, [IFLA_BR_HELLO_TIME] = { .type = NLA_U32 }, [IFLA_BR_MAX_AGE] = { .type = NLA_U32 }, [IFLA_BR_AGEING_TIME] = { .type = NLA_U32 }, [IFLA_BR_STP_STATE] = { .type = NLA_U32 }, [IFLA_BR_PRIORITY] = { .type = NLA_U16 }, [IFLA_BR_VLAN_FILTERING] = { .type = NLA_U8 }, [IFLA_BR_VLAN_PROTOCOL] = { .type = NLA_U16 }, [IFLA_BR_GROUP_FWD_MASK] = { .type = NLA_U16 }, [IFLA_BR_GROUP_ADDR] = { .type = NLA_BINARY, .len = ETH_ALEN }, [IFLA_BR_MCAST_ROUTER] = { .type = NLA_U8 }, [IFLA_BR_MCAST_SNOOPING] = { .type = NLA_U8 }, [IFLA_BR_MCAST_QUERY_USE_IFADDR] = { .type = NLA_U8 }, [IFLA_BR_MCAST_QUERIER] = { .type = NLA_U8 }, [IFLA_BR_MCAST_HASH_ELASTICITY] = { .type = NLA_U32 }, [IFLA_BR_MCAST_HASH_MAX] = { .type = NLA_U32 }, [IFLA_BR_MCAST_LAST_MEMBER_CNT] = { .type = NLA_U32 }, [IFLA_BR_MCAST_STARTUP_QUERY_CNT] = { .type = NLA_U32 }, [IFLA_BR_MCAST_LAST_MEMBER_INTVL] = { .type = NLA_U64 }, [IFLA_BR_MCAST_MEMBERSHIP_INTVL] = { .type = NLA_U64 }, [IFLA_BR_MCAST_QUERIER_INTVL] = { .type = NLA_U64 }, [IFLA_BR_MCAST_QUERY_INTVL] = { .type = NLA_U64 }, [IFLA_BR_MCAST_QUERY_RESPONSE_INTVL] = { .type = NLA_U64 }, [IFLA_BR_MCAST_STARTUP_QUERY_INTVL] = { .type = NLA_U64 }, [IFLA_BR_NF_CALL_IPTABLES] = { .type = NLA_U8 }, [IFLA_BR_NF_CALL_IP6TABLES] = { .type = NLA_U8 }, [IFLA_BR_NF_CALL_ARPTABLES] = { .type = NLA_U8 }, [IFLA_BR_VLAN_DEFAULT_PVID] = { .type = NLA_U16 }, [IFLA_BR_VLAN_STATS_ENABLED] = { .type = NLA_U8 }, [IFLA_BR_MCAST_STATS_ENABLED] = { .type = NLA_U8 }, [IFLA_BR_MCAST_IGMP_VERSION] = { .type = NLA_U8 }, [IFLA_BR_MCAST_MLD_VERSION] = { .type = NLA_U8 }, [IFLA_BR_VLAN_STATS_PER_PORT] = { .type = NLA_U8 }, [IFLA_BR_MULTI_BOOLOPT] = NLA_POLICY_EXACT_LEN(sizeof(struct br_boolopt_multi)), [IFLA_BR_FDB_N_LEARNED] = { .type = NLA_REJECT }, [IFLA_BR_FDB_MAX_LEARNED] = { .type = NLA_U32 }, [IFLA_BR_STP_MODE] = NLA_POLICY_RANGE(NLA_U32, BR_STP_MODE_AUTO, BR_STP_MODE_MAX), }; static int br_changelink(struct net_device *brdev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct net_bridge *br = netdev_priv(brdev); int err; if (!data) return 0; if (data[IFLA_BR_FORWARD_DELAY]) { err = br_set_forward_delay(br, nla_get_u32(data[IFLA_BR_FORWARD_DELAY])); if (err) return err; } if (data[IFLA_BR_HELLO_TIME]) { err = br_set_hello_time(br, nla_get_u32(data[IFLA_BR_HELLO_TIME])); if (err) return err; } if (data[IFLA_BR_MAX_AGE]) { err = br_set_max_age(br, nla_get_u32(data[IFLA_BR_MAX_AGE])); if (err) return err; } if (data[IFLA_BR_AGEING_TIME]) { err = br_set_ageing_time(br, nla_get_u32(data[IFLA_BR_AGEING_TIME])); if (err) return err; } if (data[IFLA_BR_STP_MODE]) { u32 mode = nla_get_u32(data[IFLA_BR_STP_MODE]); if (mode != br->stp_mode) { bool stp_off = br->stp_enabled == BR_NO_STP || (data[IFLA_BR_STP_STATE] && !nla_get_u32(data[IFLA_BR_STP_STATE])); if (!stp_off) { NL_SET_ERR_MSG_MOD(extack, "Can't change STP mode while STP is enabled"); return -EBUSY; } } br->stp_mode = mode; } if (data[IFLA_BR_STP_STATE]) { u32 stp_enabled = nla_get_u32(data[IFLA_BR_STP_STATE]); err = br_stp_set_enabled(br, stp_enabled, extack); if (err) return err; } if (data[IFLA_BR_PRIORITY]) { u32 priority = nla_get_u16(data[IFLA_BR_PRIORITY]); br_stp_set_bridge_priority(br, priority); } if (data[IFLA_BR_VLAN_FILTERING]) { u8 vlan_filter = nla_get_u8(data[IFLA_BR_VLAN_FILTERING]); err = br_vlan_filter_toggle(br, vlan_filter, extack); if (err) return err; } #ifdef CONFIG_BRIDGE_VLAN_FILTERING if (data[IFLA_BR_VLAN_PROTOCOL]) { __be16 vlan_proto = nla_get_be16(data[IFLA_BR_VLAN_PROTOCOL]); err = __br_vlan_set_proto(br, vlan_proto, extack); if (err) return err; } if (data[IFLA_BR_VLAN_DEFAULT_PVID]) { __u16 defpvid = nla_get_u16(data[IFLA_BR_VLAN_DEFAULT_PVID]); err = __br_vlan_set_default_pvid(br, defpvid, extack); if (err) return err; } if (data[IFLA_BR_VLAN_STATS_ENABLED]) { __u8 vlan_stats = nla_get_u8(data[IFLA_BR_VLAN_STATS_ENABLED]); err = br_vlan_set_stats(br, vlan_stats); if (err) return err; } if (data[IFLA_BR_VLAN_STATS_PER_PORT]) { __u8 per_port = nla_get_u8(data[IFLA_BR_VLAN_STATS_PER_PORT]); err = br_vlan_set_stats_per_port(br, per_port); if (err) return err; } #endif if (data[IFLA_BR_GROUP_FWD_MASK]) { u16 fwd_mask = nla_get_u16(data[IFLA_BR_GROUP_FWD_MASK]); if (fwd_mask & BR_GROUPFWD_RESTRICTED) return -EINVAL; br->group_fwd_mask = fwd_mask; } if (data[IFLA_BR_GROUP_ADDR]) { u8 new_addr[ETH_ALEN]; if (nla_len(data[IFLA_BR_GROUP_ADDR]) != ETH_ALEN) return -EINVAL; memcpy(new_addr, nla_data(data[IFLA_BR_GROUP_ADDR]), ETH_ALEN); if (!is_link_local_ether_addr(new_addr)) return -EINVAL; if (new_addr[5] == 1 || /* 802.3x Pause address */ new_addr[5] == 2 || /* 802.3ad Slow protocols */ new_addr[5] == 3) /* 802.1X PAE address */ return -EINVAL; spin_lock_bh(&br->lock); memcpy(br->group_addr, new_addr, sizeof(br->group_addr)); spin_unlock_bh(&br->lock); br_opt_toggle(br, BROPT_GROUP_ADDR_SET, true); br_recalculate_fwd_mask(br); } if (data[IFLA_BR_FDB_FLUSH]) { struct net_bridge_fdb_flush_desc desc = { .flags_mask = BIT(BR_FDB_STATIC) }; br_fdb_flush(br, &desc); } #ifdef CONFIG_BRIDGE_IGMP_SNOOPING if (data[IFLA_BR_MCAST_ROUTER]) { u8 multicast_router = nla_get_u8(data[IFLA_BR_MCAST_ROUTER]); err = br_multicast_set_router(&br->multicast_ctx, multicast_router); if (err) return err; } if (data[IFLA_BR_MCAST_SNOOPING]) { u8 mcast_snooping = nla_get_u8(data[IFLA_BR_MCAST_SNOOPING]); err = br_multicast_toggle(br, mcast_snooping, extack); if (err) return err; } if (data[IFLA_BR_MCAST_QUERY_USE_IFADDR]) { u8 val; val = nla_get_u8(data[IFLA_BR_MCAST_QUERY_USE_IFADDR]); br_opt_toggle(br, BROPT_MULTICAST_QUERY_USE_IFADDR, !!val); } if (data[IFLA_BR_MCAST_QUERIER]) { u8 mcast_querier = nla_get_u8(data[IFLA_BR_MCAST_QUERIER]); err = br_multicast_set_querier(&br->multicast_ctx, mcast_querier); if (err) return err; } if (data[IFLA_BR_MCAST_HASH_ELASTICITY]) br_warn(br, "the hash_elasticity option has been deprecated and is always %u\n", RHT_ELASTICITY); if (data[IFLA_BR_MCAST_HASH_MAX]) br->hash_max = nla_get_u32(data[IFLA_BR_MCAST_HASH_MAX]); if (data[IFLA_BR_MCAST_LAST_MEMBER_CNT]) { u32 val = nla_get_u32(data[IFLA_BR_MCAST_LAST_MEMBER_CNT]); br->multicast_ctx.multicast_last_member_count = val; } if (data[IFLA_BR_MCAST_STARTUP_QUERY_CNT]) { u32 val = nla_get_u32(data[IFLA_BR_MCAST_STARTUP_QUERY_CNT]); br->multicast_ctx.multicast_startup_query_count = val; } if (data[IFLA_BR_MCAST_LAST_MEMBER_INTVL]) { u64 val = nla_get_u64(data[IFLA_BR_MCAST_LAST_MEMBER_INTVL]); br->multicast_ctx.multicast_last_member_interval = clock_t_to_jiffies(val); } if (data[IFLA_BR_MCAST_MEMBERSHIP_INTVL]) { u64 val = nla_get_u64(data[IFLA_BR_MCAST_MEMBERSHIP_INTVL]); br->multicast_ctx.multicast_membership_interval = clock_t_to_jiffies(val); } if (data[IFLA_BR_MCAST_QUERIER_INTVL]) { u64 val = nla_get_u64(data[IFLA_BR_MCAST_QUERIER_INTVL]); br->multicast_ctx.multicast_querier_interval = clock_t_to_jiffies(val); } if (data[IFLA_BR_MCAST_QUERY_INTVL]) { u64 val = nla_get_u64(data[IFLA_BR_MCAST_QUERY_INTVL]); br_multicast_set_query_intvl(&br->multicast_ctx, val); } if (data[IFLA_BR_MCAST_QUERY_RESPONSE_INTVL]) { u64 val = nla_get_u64(data[IFLA_BR_MCAST_QUERY_RESPONSE_INTVL]); br->multicast_ctx.multicast_query_response_interval = clock_t_to_jiffies(val); } if (data[IFLA_BR_MCAST_STARTUP_QUERY_INTVL]) { u64 val = nla_get_u64(data[IFLA_BR_MCAST_STARTUP_QUERY_INTVL]); br_multicast_set_startup_query_intvl(&br->multicast_ctx, val); } if (data[IFLA_BR_MCAST_STATS_ENABLED]) { __u8 mcast_stats; mcast_stats = nla_get_u8(data[IFLA_BR_MCAST_STATS_ENABLED]); br_opt_toggle(br, BROPT_MULTICAST_STATS_ENABLED, !!mcast_stats); } if (data[IFLA_BR_MCAST_IGMP_VERSION]) { __u8 igmp_version; igmp_version = nla_get_u8(data[IFLA_BR_MCAST_IGMP_VERSION]); err = br_multicast_set_igmp_version(&br->multicast_ctx, igmp_version); if (err) return err; } #if IS_ENABLED(CONFIG_IPV6) if (data[IFLA_BR_MCAST_MLD_VERSION]) { __u8 mld_version; mld_version = nla_get_u8(data[IFLA_BR_MCAST_MLD_VERSION]); err = br_multicast_set_mld_version(&br->multicast_ctx, mld_version); if (err) return err; } #endif #endif #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) if (data[IFLA_BR_NF_CALL_IPTABLES]) { u8 val = nla_get_u8(data[IFLA_BR_NF_CALL_IPTABLES]); br_opt_toggle(br, BROPT_NF_CALL_IPTABLES, !!val); } if (data[IFLA_BR_NF_CALL_IP6TABLES]) { u8 val = nla_get_u8(data[IFLA_BR_NF_CALL_IP6TABLES]); br_opt_toggle(br, BROPT_NF_CALL_IP6TABLES, !!val); } if (data[IFLA_BR_NF_CALL_ARPTABLES]) { u8 val = nla_get_u8(data[IFLA_BR_NF_CALL_ARPTABLES]); br_opt_toggle(br, BROPT_NF_CALL_ARPTABLES, !!val); } #endif if (data[IFLA_BR_MULTI_BOOLOPT]) { struct br_boolopt_multi *bm; bm = nla_data(data[IFLA_BR_MULTI_BOOLOPT]); err = br_boolopt_multi_toggle(br, bm, extack); if (err) return err; } if (data[IFLA_BR_FDB_MAX_LEARNED]) { u32 val = nla_get_u32(data[IFLA_BR_FDB_MAX_LEARNED]); WRITE_ONCE(br->fdb_max_learned, val); } return 0; } static int br_dev_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct net_bridge *br = netdev_priv(dev); struct nlattr **data = params->data; struct nlattr **tb = params->tb; int err; err = register_netdevice(dev); if (err) return err; if (tb[IFLA_ADDRESS]) { spin_lock_bh(&br->lock); br_stp_change_bridge_id(br, nla_data(tb[IFLA_ADDRESS])); spin_unlock_bh(&br->lock); } err = br_changelink(dev, tb, data, extack); if (err) br_dev_delete(dev, NULL); return err; } static size_t br_get_size(const struct net_device *brdev) { return nla_total_size(sizeof(u32)) + /* IFLA_BR_FORWARD_DELAY */ nla_total_size(sizeof(u32)) + /* IFLA_BR_HELLO_TIME */ nla_total_size(sizeof(u32)) + /* IFLA_BR_MAX_AGE */ nla_total_size(sizeof(u32)) + /* IFLA_BR_AGEING_TIME */ nla_total_size(sizeof(u32)) + /* IFLA_BR_STP_STATE */ nla_total_size(sizeof(u16)) + /* IFLA_BR_PRIORITY */ nla_total_size(sizeof(u8)) + /* IFLA_BR_VLAN_FILTERING */ #ifdef CONFIG_BRIDGE_VLAN_FILTERING nla_total_size(sizeof(__be16)) + /* IFLA_BR_VLAN_PROTOCOL */ nla_total_size(sizeof(u16)) + /* IFLA_BR_VLAN_DEFAULT_PVID */ nla_total_size(sizeof(u8)) + /* IFLA_BR_VLAN_STATS_ENABLED */ nla_total_size(sizeof(u8)) + /* IFLA_BR_VLAN_STATS_PER_PORT */ #endif nla_total_size(sizeof(u16)) + /* IFLA_BR_GROUP_FWD_MASK */ nla_total_size(sizeof(struct ifla_bridge_id)) + /* IFLA_BR_ROOT_ID */ nla_total_size(sizeof(struct ifla_bridge_id)) + /* IFLA_BR_BRIDGE_ID */ nla_total_size(sizeof(u16)) + /* IFLA_BR_ROOT_PORT */ nla_total_size(sizeof(u32)) + /* IFLA_BR_ROOT_PATH_COST */ nla_total_size(sizeof(u8)) + /* IFLA_BR_TOPOLOGY_CHANGE */ nla_total_size(sizeof(u8)) + /* IFLA_BR_TOPOLOGY_CHANGE_DETECTED */ nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_HELLO_TIMER */ nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_TCN_TIMER */ nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_TOPOLOGY_CHANGE_TIMER */ nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_GC_TIMER */ nla_total_size(ETH_ALEN) + /* IFLA_BR_GROUP_ADDR */ nla_total_size(sizeof(u32)) + /* IFLA_BR_FDB_N_LEARNED */ nla_total_size(sizeof(u32)) + /* IFLA_BR_FDB_MAX_LEARNED */ #ifdef CONFIG_BRIDGE_IGMP_SNOOPING nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_ROUTER */ nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_SNOOPING */ nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_QUERY_USE_IFADDR */ nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_QUERIER */ nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_STATS_ENABLED */ nla_total_size(sizeof(u32)) + /* IFLA_BR_MCAST_HASH_ELASTICITY */ nla_total_size(sizeof(u32)) + /* IFLA_BR_MCAST_HASH_MAX */ nla_total_size(sizeof(u32)) + /* IFLA_BR_MCAST_LAST_MEMBER_CNT */ nla_total_size(sizeof(u32)) + /* IFLA_BR_MCAST_STARTUP_QUERY_CNT */ nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_LAST_MEMBER_INTVL */ nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_MEMBERSHIP_INTVL */ nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_QUERIER_INTVL */ nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_QUERY_INTVL */ nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_QUERY_RESPONSE_INTVL */ nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_STARTUP_QUERY_INTVL */ nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_IGMP_VERSION */ nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_MLD_VERSION */ br_multicast_querier_state_size() + /* IFLA_BR_MCAST_QUERIER_STATE */ #endif #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) nla_total_size(sizeof(u8)) + /* IFLA_BR_NF_CALL_IPTABLES */ nla_total_size(sizeof(u8)) + /* IFLA_BR_NF_CALL_IP6TABLES */ nla_total_size(sizeof(u8)) + /* IFLA_BR_NF_CALL_ARPTABLES */ #endif nla_total_size(sizeof(struct br_boolopt_multi)) + /* IFLA_BR_MULTI_BOOLOPT */ nla_total_size(sizeof(u32)) + /* IFLA_BR_STP_MODE */ 0; } static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) { struct net_bridge *br = netdev_priv(brdev); u32 forward_delay = jiffies_to_clock_t(br->forward_delay); u32 hello_time = jiffies_to_clock_t(br->hello_time); u32 age_time = jiffies_to_clock_t(br->max_age); u32 ageing_time = jiffies_to_clock_t(br->ageing_time); u32 stp_enabled = br->stp_enabled; u16 priority = (br->bridge_id.prio[0] << 8) | br->bridge_id.prio[1]; u8 vlan_enabled = br_vlan_enabled(br->dev); struct br_boolopt_multi bm; u64 clockval; clockval = br_timer_value(&br->hello_timer); if (nla_put_u64_64bit(skb, IFLA_BR_HELLO_TIMER, clockval, IFLA_BR_PAD)) return -EMSGSIZE; clockval = br_timer_value(&br->tcn_timer); if (nla_put_u64_64bit(skb, IFLA_BR_TCN_TIMER, clockval, IFLA_BR_PAD)) return -EMSGSIZE; clockval = br_timer_value(&br->topology_change_timer); if (nla_put_u64_64bit(skb, IFLA_BR_TOPOLOGY_CHANGE_TIMER, clockval, IFLA_BR_PAD)) return -EMSGSIZE; clockval = br_timer_value(&br->gc_work.timer); if (nla_put_u64_64bit(skb, IFLA_BR_GC_TIMER, clockval, IFLA_BR_PAD)) return -EMSGSIZE; br_boolopt_multi_get(br, &bm); if (nla_put_u32(skb, IFLA_BR_FORWARD_DELAY, forward_delay) || nla_put_u32(skb, IFLA_BR_HELLO_TIME, hello_time) || nla_put_u32(skb, IFLA_BR_MAX_AGE, age_time) || nla_put_u32(skb, IFLA_BR_AGEING_TIME, ageing_time) || nla_put_u32(skb, IFLA_BR_STP_STATE, stp_enabled) || nla_put_u16(skb, IFLA_BR_PRIORITY, priority) || nla_put_u8(skb, IFLA_BR_VLAN_FILTERING, vlan_enabled) || nla_put_u16(skb, IFLA_BR_GROUP_FWD_MASK, br->group_fwd_mask) || nla_put(skb, IFLA_BR_BRIDGE_ID, sizeof(struct ifla_bridge_id), &br->bridge_id) || nla_put(skb, IFLA_BR_ROOT_ID, sizeof(struct ifla_bridge_id), &br->designated_root) || nla_put_u16(skb, IFLA_BR_ROOT_PORT, br->root_port) || nla_put_u32(skb, IFLA_BR_ROOT_PATH_COST, br->root_path_cost) || nla_put_u8(skb, IFLA_BR_TOPOLOGY_CHANGE, br->topology_change) || nla_put_u8(skb, IFLA_BR_TOPOLOGY_CHANGE_DETECTED, br->topology_change_detected) || nla_put(skb, IFLA_BR_GROUP_ADDR, ETH_ALEN, br->group_addr) || nla_put(skb, IFLA_BR_MULTI_BOOLOPT, sizeof(bm), &bm) || nla_put_u32(skb, IFLA_BR_FDB_N_LEARNED, atomic_read(&br->fdb_n_learned)) || nla_put_u32(skb, IFLA_BR_FDB_MAX_LEARNED, br->fdb_max_learned) || nla_put_u32(skb, IFLA_BR_STP_MODE, br->stp_mode)) return -EMSGSIZE; #ifdef CONFIG_BRIDGE_VLAN_FILTERING if (nla_put_be16(skb, IFLA_BR_VLAN_PROTOCOL, br->vlan_proto) || nla_put_u16(skb, IFLA_BR_VLAN_DEFAULT_PVID, br->default_pvid) || nla_put_u8(skb, IFLA_BR_VLAN_STATS_ENABLED, br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) || nla_put_u8(skb, IFLA_BR_VLAN_STATS_PER_PORT, br_opt_get(br, BROPT_VLAN_STATS_PER_PORT))) return -EMSGSIZE; #endif #ifdef CONFIG_BRIDGE_IGMP_SNOOPING if (nla_put_u8(skb, IFLA_BR_MCAST_ROUTER, br->multicast_ctx.multicast_router) || nla_put_u8(skb, IFLA_BR_MCAST_SNOOPING, br_opt_get(br, BROPT_MULTICAST_ENABLED)) || nla_put_u8(skb, IFLA_BR_MCAST_QUERY_USE_IFADDR, br_opt_get(br, BROPT_MULTICAST_QUERY_USE_IFADDR)) || nla_put_u8(skb, IFLA_BR_MCAST_QUERIER, br->multicast_ctx.multicast_querier) || nla_put_u8(skb, IFLA_BR_MCAST_STATS_ENABLED, br_opt_get(br, BROPT_MULTICAST_STATS_ENABLED)) || nla_put_u32(skb, IFLA_BR_MCAST_HASH_ELASTICITY, RHT_ELASTICITY) || nla_put_u32(skb, IFLA_BR_MCAST_HASH_MAX, br->hash_max) || nla_put_u32(skb, IFLA_BR_MCAST_LAST_MEMBER_CNT, br->multicast_ctx.multicast_last_member_count) || nla_put_u32(skb, IFLA_BR_MCAST_STARTUP_QUERY_CNT, br->multicast_ctx.multicast_startup_query_count) || nla_put_u8(skb, IFLA_BR_MCAST_IGMP_VERSION, br->multicast_ctx.multicast_igmp_version) || br_multicast_dump_querier_state(skb, &br->multicast_ctx, IFLA_BR_MCAST_QUERIER_STATE)) return -EMSGSIZE; #if IS_ENABLED(CONFIG_IPV6) if (nla_put_u8(skb, IFLA_BR_MCAST_MLD_VERSION, br->multicast_ctx.multicast_mld_version)) return -EMSGSIZE; #endif clockval = jiffies_to_clock_t(br->multicast_ctx.multicast_last_member_interval); if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_LAST_MEMBER_INTVL, clockval, IFLA_BR_PAD)) return -EMSGSIZE; clockval = jiffies_to_clock_t(br->multicast_ctx.multicast_membership_interval); if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_MEMBERSHIP_INTVL, clockval, IFLA_BR_PAD)) return -EMSGSIZE; clockval = jiffies_to_clock_t(br->multicast_ctx.multicast_querier_interval); if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_QUERIER_INTVL, clockval, IFLA_BR_PAD)) return -EMSGSIZE; clockval = jiffies_to_clock_t(br->multicast_ctx.multicast_query_interval); if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_QUERY_INTVL, clockval, IFLA_BR_PAD)) return -EMSGSIZE; clockval = jiffies_to_clock_t(br->multicast_ctx.multicast_query_response_interval); if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_QUERY_RESPONSE_INTVL, clockval, IFLA_BR_PAD)) return -EMSGSIZE; clockval = jiffies_to_clock_t(br->multicast_ctx.multicast_startup_query_interval); if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_STARTUP_QUERY_INTVL, clockval, IFLA_BR_PAD)) return -EMSGSIZE; #endif #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) if (nla_put_u8(skb, IFLA_BR_NF_CALL_IPTABLES, br_opt_get(br, BROPT_NF_CALL_IPTABLES) ? 1 : 0) || nla_put_u8(skb, IFLA_BR_NF_CALL_IP6TABLES, br_opt_get(br, BROPT_NF_CALL_IP6TABLES) ? 1 : 0) || nla_put_u8(skb, IFLA_BR_NF_CALL_ARPTABLES, br_opt_get(br, BROPT_NF_CALL_ARPTABLES) ? 1 : 0)) return -EMSGSIZE; #endif return 0; } static size_t br_get_linkxstats_size(const struct net_device *dev, int attr) { struct net_bridge_port *p = NULL; struct net_bridge_vlan_group *vg; struct net_bridge_vlan *v; struct net_bridge *br; int numvls = 0; switch (attr) { case IFLA_STATS_LINK_XSTATS: br = netdev_priv(dev); vg = br_vlan_group(br); break; case IFLA_STATS_LINK_XSTATS_SLAVE: p = br_port_get_rtnl(dev); if (!p) return 0; vg = nbp_vlan_group(p); break; default: return 0; } if (vg) { /* we need to count all, even placeholder entries */ list_for_each_entry(v, &vg->vlan_list, vlist) numvls++; } return numvls * nla_total_size(sizeof(struct bridge_vlan_xstats)) + nla_total_size_64bit(sizeof(struct br_mcast_stats)) + (p ? nla_total_size_64bit(sizeof(p->stp_xstats)) : 0) + nla_total_size(0); } static int br_fill_linkxstats(struct sk_buff *skb, const struct net_device *dev, int *prividx, int attr) { struct nlattr *nla __maybe_unused; struct net_bridge_port *p = NULL; struct net_bridge_vlan_group *vg; struct net_bridge_vlan *v; struct net_bridge *br; struct nlattr *nest; int vl_idx = 0; switch (attr) { case IFLA_STATS_LINK_XSTATS: br = netdev_priv(dev); vg = br_vlan_group(br); break; case IFLA_STATS_LINK_XSTATS_SLAVE: p = br_port_get_rtnl(dev); if (!p) return 0; br = p->br; vg = nbp_vlan_group(p); break; default: return -EINVAL; } nest = nla_nest_start_noflag(skb, LINK_XSTATS_TYPE_BRIDGE); if (!nest) return -EMSGSIZE; if (vg) { u16 pvid; pvid = br_get_pvid(vg); list_for_each_entry(v, &vg->vlan_list, vlist) { struct bridge_vlan_xstats vxi; struct pcpu_sw_netstats stats; if (++vl_idx < *prividx) continue; memset(&vxi, 0, sizeof(vxi)); vxi.vid = v->vid; vxi.flags = v->flags; if (v->vid == pvid) vxi.flags |= BRIDGE_VLAN_INFO_PVID; br_vlan_get_stats(v, &stats); vxi.rx_bytes = u64_stats_read(&stats.rx_bytes); vxi.rx_packets = u64_stats_read(&stats.rx_packets); vxi.tx_bytes = u64_stats_read(&stats.tx_bytes); vxi.tx_packets = u64_stats_read(&stats.tx_packets); if (nla_put(skb, BRIDGE_XSTATS_VLAN, sizeof(vxi), &vxi)) goto nla_put_failure; } } #ifdef CONFIG_BRIDGE_IGMP_SNOOPING if (++vl_idx >= *prividx) { nla = nla_reserve_64bit(skb, BRIDGE_XSTATS_MCAST, sizeof(struct br_mcast_stats), BRIDGE_XSTATS_PAD); if (!nla) goto nla_put_failure; br_multicast_get_stats(br, p, nla_data(nla)); } #endif if (p) { nla = nla_reserve_64bit(skb, BRIDGE_XSTATS_STP, sizeof(p->stp_xstats), BRIDGE_XSTATS_PAD); if (!nla) goto nla_put_failure; spin_lock_bh(&br->lock); memcpy(nla_data(nla), &p->stp_xstats, sizeof(p->stp_xstats)); spin_unlock_bh(&br->lock); } nla_nest_end(skb, nest); *prividx = 0; return 0; nla_put_failure: nla_nest_end(skb, nest); *prividx = vl_idx; return -EMSGSIZE; } static struct rtnl_af_ops br_af_ops __read_mostly = { .family = AF_BRIDGE, .get_link_af_size = br_get_link_af_size_filtered, }; struct rtnl_link_ops br_link_ops __read_mostly = { .kind = "bridge", .priv_size = sizeof(struct net_bridge), .setup = br_dev_setup, .maxtype = IFLA_BR_MAX, .policy = br_policy, .validate = br_validate, .newlink = br_dev_newlink, .changelink = br_changelink, .dellink = br_dev_delete, .get_size = br_get_size, .fill_info = br_fill_info, .fill_linkxstats = br_fill_linkxstats, .get_linkxstats_size = br_get_linkxstats_size, .slave_maxtype = IFLA_BRPORT_MAX, .slave_policy = br_port_policy, .slave_changelink = br_port_slave_changelink, .get_slave_size = br_port_get_slave_size, .fill_slave_info = br_port_fill_slave_info, }; int __init br_netlink_init(void) { int err; err = br_vlan_rtnl_init(); if (err) goto out; err = rtnl_af_register(&br_af_ops); if (err) goto out_vlan; err = rtnl_link_register(&br_link_ops); if (err) goto out_af; return 0; out_af: rtnl_af_unregister(&br_af_ops); out_vlan: br_vlan_rtnl_uninit(); out: return err; } void br_netlink_fini(void) { br_vlan_rtnl_uninit(); rtnl_af_unregister(&br_af_ops); rtnl_link_unregister(&br_link_ops); }
3 3 3 3 3 3 3 3 3 3 3 3 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2016 Thomas Gleixner. * Copyright (C) 2016-2017 Christoph Hellwig. */ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/cpu.h> #include <linux/sort.h> #include <linux/group_cpus.h> #ifdef CONFIG_SMP static void grp_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk, unsigned int cpus_per_grp) { const struct cpumask *siblmsk; int cpu, sibl; for ( ; cpus_per_grp > 0; ) { cpu = cpumask_first(nmsk); /* Should not happen, but I'm too lazy to think about it */ if (cpu >= nr_cpu_ids) return; cpumask_clear_cpu(cpu, nmsk); cpumask_set_cpu(cpu, irqmsk); cpus_per_grp--; /* If the cpu has siblings, use them first */ siblmsk = topology_sibling_cpumask(cpu); for (sibl = -1; cpus_per_grp > 0; ) { sibl = cpumask_next(sibl, siblmsk); if (sibl >= nr_cpu_ids) break; if (!cpumask_test_and_clear_cpu(sibl, nmsk)) continue; cpumask_set_cpu(sibl, irqmsk); cpus_per_grp--; } } } static cpumask_var_t *alloc_node_to_cpumask(void) { cpumask_var_t *masks; int node; masks = kzalloc_objs(cpumask_var_t, nr_node_ids); if (!masks) return NULL; for (node = 0; node < nr_node_ids; node++) { if (!zalloc_cpumask_var(&masks[node], GFP_KERNEL)) goto out_unwind; } return masks; out_unwind: while (--node >= 0) free_cpumask_var(masks[node]); kfree(masks); return NULL; } static void free_node_to_cpumask(cpumask_var_t *masks) { int node; for (node = 0; node < nr_node_ids; node++) free_cpumask_var(masks[node]); kfree(masks); } static void build_node_to_cpumask(cpumask_var_t *masks) { int cpu; for_each_possible_cpu(cpu) cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]); } static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask, const struct cpumask *mask, nodemask_t *nodemsk) { int n, nodes = 0; /* Calculate the number of nodes in the supplied affinity mask */ for_each_node(n) { if (cpumask_intersects(mask, node_to_cpumask[n])) { node_set(n, *nodemsk); nodes++; } } return nodes; } struct node_groups { unsigned id; union { unsigned ngroups; unsigned ncpus; }; }; static int ncpus_cmp_func(const void *l, const void *r) { const struct node_groups *ln = l; const struct node_groups *rn = r; return ln->ncpus - rn->ncpus; } static void alloc_groups_to_nodes(unsigned int numgrps, unsigned int numcpus, struct node_groups *node_groups, unsigned int num_nodes) { unsigned int n, remaining_ncpus = numcpus; unsigned int ngroups, ncpus; sort(node_groups, num_nodes, sizeof(node_groups[0]), ncpus_cmp_func, NULL); /* * Allocate groups for each node according to the ratio of this * node's nr_cpus to remaining un-assigned ncpus. 'numgrps' is * bigger than number of active numa nodes. Always start the * allocation from the node with minimized nr_cpus. * * This way guarantees that each active node gets allocated at * least one group, and the theory is simple: over-allocation * is only done when this node is assigned by one group, so * other nodes will be allocated >= 1 groups, since 'numgrps' is * bigger than number of numa nodes. * * One perfect invariant is that number of allocated groups for * each node is <= CPU count of this node: * * 1) suppose there are two nodes: A and B * ncpu(X) is CPU count of node X * grps(X) is the group count allocated to node X via this * algorithm * * ncpu(A) <= ncpu(B) * ncpu(A) + ncpu(B) = N * grps(A) + grps(B) = G * * grps(A) = max(1, round_down(G * ncpu(A) / N)) * grps(B) = G - grps(A) * * both N and G are integer, and 2 <= G <= N, suppose * G = N - delta, and 0 <= delta <= N - 2 * * 2) obviously grps(A) <= ncpu(A) because: * * if grps(A) is 1, then grps(A) <= ncpu(A) given * ncpu(A) >= 1 * * otherwise, * grps(A) <= G * ncpu(A) / N <= ncpu(A), given G <= N * * 3) prove how grps(B) <= ncpu(B): * * if round_down(G * ncpu(A) / N) == 0, vecs(B) won't be * over-allocated, so grps(B) <= ncpu(B), * * otherwise: * * grps(A) = * round_down(G * ncpu(A) / N) = * round_down((N - delta) * ncpu(A) / N) = * round_down((N * ncpu(A) - delta * ncpu(A)) / N) >= * round_down((N * ncpu(A) - delta * N) / N) = * cpu(A) - delta * * then: * * grps(A) - G >= ncpu(A) - delta - G * => * G - grps(A) <= G + delta - ncpu(A) * => * grps(B) <= N - ncpu(A) * => * grps(B) <= cpu(B) * * For nodes >= 3, it can be thought as one node and another big * node given that is exactly what this algorithm is implemented, * and we always re-calculate 'remaining_ncpus' & 'numgrps', and * finally for each node X: grps(X) <= ncpu(X). * */ for (n = 0; n < num_nodes; n++) { if (node_groups[n].ncpus == UINT_MAX) continue; WARN_ON_ONCE(numgrps == 0); ncpus = node_groups[n].ncpus; ngroups = max_t(unsigned, 1, numgrps * ncpus / remaining_ncpus); WARN_ON_ONCE(ngroups > ncpus); node_groups[n].ngroups = ngroups; remaining_ncpus -= ncpus; numgrps -= ngroups; } } /* * Allocate group number for each node, so that for each node: * * 1) the allocated number is >= 1 * * 2) the allocated number is <= active CPU number of this node * * The actual allocated total groups may be less than @numgrps when * active total CPU number is less than @numgrps. * * Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]' * for each node. */ static void alloc_nodes_groups(unsigned int numgrps, cpumask_var_t *node_to_cpumask, const struct cpumask *cpu_mask, const nodemask_t nodemsk, struct cpumask *nmsk, struct node_groups *node_groups) { unsigned int n, numcpus = 0; for (n = 0; n < nr_node_ids; n++) { node_groups[n].id = n; node_groups[n].ncpus = UINT_MAX; } for_each_node_mask(n, nodemsk) { unsigned int ncpus; cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]); ncpus = cpumask_weight(nmsk); if (!ncpus) continue; numcpus += ncpus; node_groups[n].ncpus = ncpus; } numgrps = min_t(unsigned int, numcpus, numgrps); alloc_groups_to_nodes(numgrps, numcpus, node_groups, nr_node_ids); } static void assign_cpus_to_groups(unsigned int ncpus, struct cpumask *nmsk, struct node_groups *nv, struct cpumask *masks, unsigned int *curgrp, unsigned int last_grp) { unsigned int v, cpus_per_grp, extra_grps; /* Account for rounding errors */ extra_grps = ncpus - nv->ngroups * (ncpus / nv->ngroups); /* Spread allocated groups on CPUs of the current node */ for (v = 0; v < nv->ngroups; v++, *curgrp += 1) { cpus_per_grp = ncpus / nv->ngroups; /* Account for extra groups to compensate rounding errors */ if (extra_grps) { cpus_per_grp++; --extra_grps; } /* * wrapping has to be considered given 'startgrp' * may start anywhere */ if (*curgrp >= last_grp) *curgrp = 0; grp_spread_init_one(&masks[*curgrp], nmsk, cpus_per_grp); } } static int alloc_cluster_groups(unsigned int ncpus, unsigned int ngroups, struct cpumask *node_cpumask, cpumask_var_t msk, const struct cpumask ***clusters_ptr, struct node_groups **cluster_groups_ptr) { unsigned int ncluster = 0; unsigned int cpu, nc, n; const struct cpumask *cluster_mask; const struct cpumask **clusters; struct node_groups *cluster_groups; cpumask_copy(msk, node_cpumask); /* Probe how many clusters in this node. */ while (1) { cpu = cpumask_first(msk); if (cpu >= nr_cpu_ids) break; cluster_mask = topology_cluster_cpumask(cpu); if (!cpumask_weight(cluster_mask)) goto no_cluster; /* Clean out CPUs on the same cluster. */ cpumask_andnot(msk, msk, cluster_mask); ncluster++; } /* If ngroups < ncluster, cross cluster is inevitable, skip. */ if (ncluster == 0 || ncluster > ngroups) goto no_cluster; /* Allocate memory based on cluster number. */ clusters = kzalloc_objs(*clusters, ncluster); if (!clusters) goto no_cluster; cluster_groups = kzalloc_objs(struct node_groups, ncluster); if (!cluster_groups) goto fail_cluster_groups; /* Filling cluster info for later process. */ cpumask_copy(msk, node_cpumask); for (n = 0; n < ncluster; n++) { cpu = cpumask_first(msk); cluster_mask = topology_cluster_cpumask(cpu); nc = cpumask_weight_and(cluster_mask, node_cpumask); clusters[n] = cluster_mask; cluster_groups[n].id = n; cluster_groups[n].ncpus = nc; cpumask_andnot(msk, msk, cluster_mask); } alloc_groups_to_nodes(ngroups, ncpus, cluster_groups, ncluster); *clusters_ptr = clusters; *cluster_groups_ptr = cluster_groups; return ncluster; fail_cluster_groups: kfree(clusters); no_cluster: return 0; } /* * Try group CPUs evenly for cluster locality within a NUMA node. * * Return: true if success, false otherwise. */ static bool __try_group_cluster_cpus(unsigned int ncpus, unsigned int ngroups, struct cpumask *node_cpumask, struct cpumask *masks, unsigned int *curgrp, unsigned int last_grp) { struct node_groups *cluster_groups; const struct cpumask **clusters; unsigned int ncluster; bool ret = false; cpumask_var_t nmsk; unsigned int i, nc; if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) goto fail_nmsk_alloc; ncluster = alloc_cluster_groups(ncpus, ngroups, node_cpumask, nmsk, &clusters, &cluster_groups); if (ncluster == 0) goto fail_no_clusters; for (i = 0; i < ncluster; i++) { struct node_groups *nv = &cluster_groups[i]; /* Get the cpus on this cluster. */ cpumask_and(nmsk, node_cpumask, clusters[nv->id]); nc = cpumask_weight(nmsk); if (!nc) continue; WARN_ON_ONCE(nv->ngroups > nc); assign_cpus_to_groups(nc, nmsk, nv, masks, curgrp, last_grp); } ret = true; kfree(cluster_groups); kfree(clusters); fail_no_clusters: free_cpumask_var(nmsk); fail_nmsk_alloc: return ret; } static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps, cpumask_var_t *node_to_cpumask, const struct cpumask *cpu_mask, struct cpumask *nmsk, struct cpumask *masks) { unsigned int i, n, nodes, done = 0; unsigned int last_grp = numgrps; unsigned int curgrp = startgrp; nodemask_t nodemsk = NODE_MASK_NONE; struct node_groups *node_groups; if (cpumask_empty(cpu_mask)) return 0; nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk); /* * If the number of nodes in the mask is greater than or equal the * number of groups we just spread the groups across the nodes. */ if (numgrps <= nodes) { for_each_node_mask(n, nodemsk) { /* Ensure that only CPUs which are in both masks are set */ cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]); cpumask_or(&masks[curgrp], &masks[curgrp], nmsk); if (++curgrp == last_grp) curgrp = 0; } return numgrps; } node_groups = kzalloc_objs(struct node_groups, nr_node_ids); if (!node_groups) return -ENOMEM; /* allocate group number for each node */ alloc_nodes_groups(numgrps, node_to_cpumask, cpu_mask, nodemsk, nmsk, node_groups); for (i = 0; i < nr_node_ids; i++) { unsigned int ncpus; struct node_groups *nv = &node_groups[i]; if (nv->ngroups == UINT_MAX) continue; /* Get the cpus on this node which are in the mask */ cpumask_and(nmsk, cpu_mask, node_to_cpumask[nv->id]); ncpus = cpumask_weight(nmsk); if (!ncpus) continue; WARN_ON_ONCE(nv->ngroups > ncpus); if (__try_group_cluster_cpus(ncpus, nv->ngroups, nmsk, masks, &curgrp, last_grp)) { done += nv->ngroups; continue; } assign_cpus_to_groups(ncpus, nmsk, nv, masks, &curgrp, last_grp); done += nv->ngroups; } kfree(node_groups); return done; } /** * group_cpus_evenly - Group all CPUs evenly per NUMA/CPU locality * @numgrps: number of groups * @nummasks: number of initialized cpumasks * * Return: cpumask array if successful, NULL otherwise. And each element * includes CPUs assigned to this group. nummasks contains the number * of initialized masks which can be less than numgrps. * * Try to put close CPUs from viewpoint of CPU and NUMA locality into * same group, and run two-stage grouping: * 1) allocate present CPUs on these groups evenly first * 2) allocate other possible CPUs on these groups evenly * * We guarantee in the resulted grouping that all CPUs are covered, and * no same CPU is assigned to multiple groups */ struct cpumask *group_cpus_evenly(unsigned int numgrps, unsigned int *nummasks) { unsigned int curgrp = 0, nr_present = 0, nr_others = 0; cpumask_var_t *node_to_cpumask; cpumask_var_t nmsk, npresmsk; int ret = -ENOMEM; struct cpumask *masks = NULL; if (numgrps == 0) return NULL; if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) return NULL; if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL)) goto fail_nmsk; node_to_cpumask = alloc_node_to_cpumask(); if (!node_to_cpumask) goto fail_npresmsk; masks = kzalloc_objs(*masks, numgrps); if (!masks) goto fail_node_to_cpumask; build_node_to_cpumask(node_to_cpumask); /* * Make a local cache of 'cpu_present_mask', so the two stages * spread can observe consistent 'cpu_present_mask' without holding * cpu hotplug lock, then we can reduce deadlock risk with cpu * hotplug code. * * Here CPU hotplug may happen when reading `cpu_present_mask`, and * we can live with the case because it only affects that hotplug * CPU is handled in the 1st or 2nd stage, and either way is correct * from API user viewpoint since 2-stage spread is sort of * optimization. */ cpumask_copy(npresmsk, data_race(cpu_present_mask)); /* grouping present CPUs first */ ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask, npresmsk, nmsk, masks); if (ret < 0) goto fail_node_to_cpumask; nr_present = ret; /* * Allocate non present CPUs starting from the next group to be * handled. If the grouping of present CPUs already exhausted the * group space, assign the non present CPUs to the already * allocated out groups. */ if (nr_present >= numgrps) curgrp = 0; else curgrp = nr_present; cpumask_andnot(npresmsk, cpu_possible_mask, npresmsk); ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask, npresmsk, nmsk, masks); if (ret >= 0) nr_others = ret; fail_node_to_cpumask: free_node_to_cpumask(node_to_cpumask); fail_npresmsk: free_cpumask_var(npresmsk); fail_nmsk: free_cpumask_var(nmsk); if (ret < 0) { kfree(masks); return NULL; } *nummasks = min(nr_present + nr_others, numgrps); return masks; } #else /* CONFIG_SMP */ struct cpumask *group_cpus_evenly(unsigned int numgrps, unsigned int *nummasks) { struct cpumask *masks; if (numgrps == 0) return NULL; masks = kzalloc_objs(*masks, numgrps); if (!masks) return NULL; /* assign all CPUs(cpu 0) to the 1st group only */ cpumask_copy(&masks[0], cpu_possible_mask); *nummasks = 1; return masks; } #endif /* CONFIG_SMP */ EXPORT_SYMBOL_GPL(group_cpus_evenly);
6 6 7 7 7 1 1 4 1 1 5 3 3 1 2 1 1 1 2 6 6 6 6 6 6 6 4 4 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652 7653 7654 7655 7656 7657 7658 7659 7660 7661 7662 7663 7664 7665 7666 7667 7668 7669 7670 7671 7672 7673 7674 7675 7676 7677 7678 7679 7680 7681 7682 7683 7684 7685 7686 7687 7688 7689 7690 7691 7692 7693 7694 7695 7696 7697 7698 7699 7700 7701 7702 7703 7704 7705 7706 7707 7708 7709 7710 7711 7712 7713 7714 7715 7716 7717 7718 7719 7720 7721 7722 7723 7724 7725 7726 7727 7728 7729 7730 7731 7732 7733 7734 7735 7736 7737 7738 7739 7740 7741 7742 7743 7744 7745 7746 7747 7748 7749 7750 7751 7752 7753 7754 7755 7756 7757 7758 7759 7760 7761 7762 7763 7764 7765 7766 7767 7768 7769 7770 7771 7772 7773 7774 7775 7776 7777 7778 7779 7780 7781 7782 7783 7784 7785 7786 7787 7788 7789 7790 7791 7792 7793 7794 7795 7796 7797 7798 7799 7800 7801 7802 7803 7804 7805 7806 7807 7808 7809 7810 7811 7812 7813 7814 7815 7816 7817 7818 7819 7820 7821 7822 7823 7824 7825 7826 7827 7828 7829 7830 7831 7832 7833 7834 7835 7836 7837 7838 7839 7840 7841 7842 7843 7844 7845 7846 7847 7848 7849 7850 7851 7852 7853 7854 7855 7856 7857 7858 7859 7860 7861 7862 7863 7864 7865 7866 7867 7868 7869 7870 7871 7872 7873 7874 7875 7876 7877 7878 7879 7880 7881 7882 7883 7884 7885 7886 7887 7888 7889 7890 7891 7892 7893 7894 7895 7896 7897 7898 7899 7900 7901 7902 7903 7904 7905 7906 7907 7908 7909 7910 7911 7912 7913 7914 7915 7916 7917 7918 7919 7920 7921 7922 7923 7924 7925 7926 7927 7928 7929 7930 7931 7932 7933 7934 7935 7936 7937 7938 7939 7940 7941 7942 7943 7944 7945 7946 7947 7948 7949 7950 7951 7952 7953 7954 7955 7956 7957 7958 7959 7960 7961 7962 7963 7964 7965 7966 7967 7968 7969 7970 7971 7972 7973 7974 7975 7976 7977 7978 7979 7980 7981 7982 7983 7984 7985 7986 7987 7988 7989 7990 7991 7992 7993 7994 7995 7996 7997 7998 7999 8000 8001 8002 8003 8004 8005 8006 8007 8008 8009 8010 8011 8012 8013 8014 8015 8016 8017 8018 8019 8020 8021 8022 8023 8024 8025 8026 8027 8028 8029 8030 8031 8032 8033 8034 8035 8036 8037 8038 8039 8040 8041 8042 8043 8044 8045 8046 8047 8048 8049 8050 8051 8052 8053 8054 8055 8056 8057 8058 8059 8060 8061 8062 8063 8064 8065 8066 8067 8068 8069 8070 8071 8072 8073 8074 8075 8076 8077 8078 8079 8080 8081 8082 8083 8084 8085 8086 8087 8088 8089 8090 8091 8092 8093 8094 8095 8096 8097 8098 8099 8100 8101 8102 8103 8104 8105 8106 8107 8108 8109 8110 8111 8112 8113 8114 8115 8116 8117 8118 8119 8120 8121 8122 8123 8124 8125 8126 8127 8128 8129 8130 8131 8132 8133 8134 8135 8136 8137 8138 8139 8140 8141 8142 8143 8144 8145 8146 8147 8148 8149 8150 8151 8152 8153 8154 8155 8156 8157 8158 8159 8160 8161 8162 8163 8164 8165 8166 8167 8168 8169 8170 8171 8172 8173 8174 8175 8176 8177 8178 8179 8180 8181 8182 8183 8184 8185 8186 8187 8188 8189 8190 8191 8192 8193 8194 8195 8196 8197 8198 8199 8200 8201 8202 8203 8204 8205 8206 8207 8208 8209 8210 8211 8212 8213 8214 8215 8216 8217 8218 8219 8220 8221 8222 8223 8224 8225 8226 8227 8228 8229 8230 8231 8232 8233 8234 8235 8236 8237 8238 8239 8240 8241 8242 8243 8244 8245 8246 8247 8248 8249 8250 8251 8252 8253 8254 8255 8256 8257 8258 8259 8260 8261 8262 8263 8264 8265 8266 8267 8268 8269 8270 8271 8272 8273 8274 8275 8276 8277 8278 8279 8280 8281 8282 8283 8284 8285 8286 8287 8288 8289 8290 8291 8292 8293 8294 8295 8296 8297 8298 8299 8300 8301 8302 8303 8304 8305 8306 8307 8308 8309 8310 8311 8312 8313 8314 8315 8316 8317 8318 8319 8320 8321 8322 8323 8324 8325 8326 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2012 Alexander Block. All rights reserved. */ #include <linux/bsearch.h> #include <linux/falloc.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/sort.h> #include <linux/mount.h> #include <linux/xattr.h> #include <linux/posix_acl_xattr.h> #include <linux/radix-tree.h> #include <linux/vmalloc.h> #include <linux/string.h> #include <linux/compat.h> #include <linux/crc32c.h> #include <linux/fsverity.h> #include "send.h" #include "ctree.h" #include "backref.h" #include "locking.h" #include "disk-io.h" #include "btrfs_inode.h" #include "transaction.h" #include "compression.h" #include "print-tree.h" #include "accessors.h" #include "dir-item.h" #include "file-item.h" #include "ioctl.h" #include "verity.h" #include "lru_cache.h" /* * Maximum number of references an extent can have in order for us to attempt to * issue clone operations instead of write operations. This currently exists to * avoid hitting limitations of the backreference walking code (taking a lot of * time and using too much memory for extents with large number of references). */ #define SEND_MAX_EXTENT_REFS 1024 /* * A fs_path is a helper to dynamically build path names with unknown size. * It reallocates the internal buffer on demand. * It allows fast adding of path elements on the right side (normal path) and * fast adding to the left side (reversed path). A reversed path can also be * unreversed if needed. * * The definition of struct fs_path relies on -fms-extensions to allow * including a tagged struct as an anonymous member. */ struct __fs_path { char *start; char *end; char *buf; unsigned short buf_len:15; unsigned short reversed:1; }; static_assert(sizeof(struct __fs_path) < 256); struct fs_path { struct __fs_path; /* * Average path length does not exceed 200 bytes, we'll have * better packing in the slab and higher chance to satisfy * an allocation later during send. */ char inline_buf[256 - sizeof(struct __fs_path)]; }; #define FS_PATH_INLINE_SIZE \ sizeof_field(struct fs_path, inline_buf) /* reused for each extent */ struct clone_root { struct btrfs_root *root; u64 ino; u64 offset; u64 num_bytes; bool found_ref; }; #define SEND_MAX_NAME_CACHE_SIZE 256 /* * Limit the root_ids array of struct backref_cache_entry to 17 elements. * This makes the size of a cache entry to be exactly 192 bytes on x86_64, which * can be satisfied from the kmalloc-192 slab, without wasting any space. * The most common case is to have a single root for cloning, which corresponds * to the send root. Having the user specify more than 16 clone roots is not * common, and in such rare cases we simply don't use caching if the number of * cloning roots that lead down to a leaf is more than 17. */ #define SEND_MAX_BACKREF_CACHE_ROOTS 17 /* * Max number of entries in the cache. * With SEND_MAX_BACKREF_CACHE_ROOTS as 17, the size in bytes, excluding * maple tree's internal nodes, is 24K. */ #define SEND_MAX_BACKREF_CACHE_SIZE 128 /* * A backref cache entry maps a leaf to a list of IDs of roots from which the * leaf is accessible and we can use for clone operations. * With SEND_MAX_BACKREF_CACHE_ROOTS as 12, each cache entry is 128 bytes (on * x86_64). */ struct backref_cache_entry { struct btrfs_lru_cache_entry entry; u64 root_ids[SEND_MAX_BACKREF_CACHE_ROOTS]; /* Number of valid elements in the root_ids array. */ int num_roots; }; /* See the comment at lru_cache.h about struct btrfs_lru_cache_entry. */ static_assert(offsetof(struct backref_cache_entry, entry) == 0); /* * Max number of entries in the cache that stores directories that were already * created. The cache uses raw struct btrfs_lru_cache_entry entries, so it uses * at most 4096 bytes - sizeof(struct btrfs_lru_cache_entry) is 48 bytes, but * the kmalloc-64 slab is used, so we get 4096 bytes (64 bytes * 64). */ #define SEND_MAX_DIR_CREATED_CACHE_SIZE 64 /* * Max number of entries in the cache that stores directories that were already * created. The cache uses raw struct btrfs_lru_cache_entry entries, so it uses * at most 4096 bytes - sizeof(struct btrfs_lru_cache_entry) is 48 bytes, but * the kmalloc-64 slab is used, so we get 4096 bytes (64 bytes * 64). */ #define SEND_MAX_DIR_UTIMES_CACHE_SIZE 64 struct send_ctx { struct file *send_filp; loff_t send_off; char *send_buf; u32 send_size; u32 send_max_size; /* * Whether BTRFS_SEND_A_DATA attribute was already added to current * command (since protocol v2, data must be the last attribute). */ bool put_data; struct page **send_buf_pages; u64 flags; /* 'flags' member of btrfs_ioctl_send_args is u64 */ /* Protocol version compatibility requested */ u32 proto; struct btrfs_root *send_root; struct btrfs_root *parent_root; struct clone_root *clone_roots; int clone_roots_cnt; /* current state of the compare_tree call */ struct btrfs_path *left_path; struct btrfs_path *right_path; struct btrfs_key *cmp_key; /* * Keep track of the generation of the last transaction that was used * for relocating a block group. This is periodically checked in order * to detect if a relocation happened since the last check, so that we * don't operate on stale extent buffers for nodes (level >= 1) or on * stale disk_bytenr values of file extent items. */ u64 last_reloc_trans; /* * infos of the currently processed inode. In case of deleted inodes, * these are the values from the deleted inode. */ u64 cur_ino; u64 cur_inode_gen; u64 cur_inode_size; u64 cur_inode_mode; u64 cur_inode_rdev; u64 cur_inode_last_extent; u64 cur_inode_next_write_offset; bool cur_inode_new; bool cur_inode_new_gen; bool cur_inode_deleted; bool ignore_cur_inode; bool cur_inode_needs_verity; void *verity_descriptor; u64 send_progress; struct list_head new_refs; struct list_head deleted_refs; struct btrfs_lru_cache name_cache; /* * The inode we are currently processing. It's not NULL only when we * need to issue write commands for data extents from this inode. */ struct inode *cur_inode; struct file_ra_state ra; u64 page_cache_clear_start; bool clean_page_cache; /* * We process inodes by their increasing order, so if before an * incremental send we reverse the parent/child relationship of * directories such that a directory with a lower inode number was * the parent of a directory with a higher inode number, and the one * becoming the new parent got renamed too, we can't rename/move the * directory with lower inode number when we finish processing it - we * must process the directory with higher inode number first, then * rename/move it and then rename/move the directory with lower inode * number. Example follows. * * Tree state when the first send was performed: * * . * |-- a (ino 257) * |-- b (ino 258) * | * | * |-- c (ino 259) * | |-- d (ino 260) * | * |-- c2 (ino 261) * * Tree state when the second (incremental) send is performed: * * . * |-- a (ino 257) * |-- b (ino 258) * |-- c2 (ino 261) * |-- d2 (ino 260) * |-- cc (ino 259) * * The sequence of steps that lead to the second state was: * * mv /a/b/c/d /a/b/c2/d2 * mv /a/b/c /a/b/c2/d2/cc * * "c" has lower inode number, but we can't move it (2nd mv operation) * before we move "d", which has higher inode number. * * So we just memorize which move/rename operations must be performed * later when their respective parent is processed and moved/renamed. */ /* Indexed by parent directory inode number. */ struct rb_root pending_dir_moves; /* * Reverse index, indexed by the inode number of a directory that * is waiting for the move/rename of its immediate parent before its * own move/rename can be performed. */ struct rb_root waiting_dir_moves; /* * A directory that is going to be rm'ed might have a child directory * which is in the pending directory moves index above. In this case, * the directory can only be removed after the move/rename of its child * is performed. Example: * * Parent snapshot: * * . (ino 256) * |-- a/ (ino 257) * |-- b/ (ino 258) * |-- c/ (ino 259) * | |-- x/ (ino 260) * | * |-- y/ (ino 261) * * Send snapshot: * * . (ino 256) * |-- a/ (ino 257) * |-- b/ (ino 258) * |-- YY/ (ino 261) * |-- x/ (ino 260) * * Sequence of steps that lead to the send snapshot: * rm -f /a/b/c/foo.txt * mv /a/b/y /a/b/YY * mv /a/b/c/x /a/b/YY * rmdir /a/b/c * * When the child is processed, its move/rename is delayed until its * parent is processed (as explained above), but all other operations * like update utimes, chown, chgrp, etc, are performed and the paths * that it uses for those operations must use the orphanized name of * its parent (the directory we're going to rm later), so we need to * memorize that name. * * Indexed by the inode number of the directory to be deleted. */ struct rb_root orphan_dirs; struct rb_root rbtree_new_refs; struct rb_root rbtree_deleted_refs; struct btrfs_lru_cache backref_cache; u64 backref_cache_last_reloc_trans; struct btrfs_lru_cache dir_created_cache; struct btrfs_lru_cache dir_utimes_cache; struct fs_path cur_inode_path; }; struct pending_dir_move { struct rb_node node; struct list_head list; u64 parent_ino; u64 ino; u64 gen; struct list_head update_refs; }; struct waiting_dir_move { struct rb_node node; u64 ino; /* * There might be some directory that could not be removed because it * was waiting for this directory inode to be moved first. Therefore * after this directory is moved, we can try to rmdir the ino rmdir_ino. */ u64 rmdir_ino; u64 rmdir_gen; bool orphanized; }; struct orphan_dir_info { struct rb_node node; u64 ino; u64 gen; u64 last_dir_index_offset; u64 dir_high_seq_ino; }; struct name_cache_entry { /* * The key in the entry is an inode number, and the generation matches * the inode's generation. */ struct btrfs_lru_cache_entry entry; u64 parent_ino; u64 parent_gen; int ret; int need_later_update; /* Name length without NUL terminator. */ int name_len; /* Not NUL terminated. */ char name[] __counted_by(name_len) __nonstring; }; /* See the comment at lru_cache.h about struct btrfs_lru_cache_entry. */ static_assert(offsetof(struct name_cache_entry, entry) == 0); #define ADVANCE 1 #define ADVANCE_ONLY_NEXT -1 enum btrfs_compare_tree_result { BTRFS_COMPARE_TREE_NEW, BTRFS_COMPARE_TREE_DELETED, BTRFS_COMPARE_TREE_CHANGED, BTRFS_COMPARE_TREE_SAME, }; __cold static void inconsistent_snapshot_error(struct send_ctx *sctx, enum btrfs_compare_tree_result result, const char *what) { const char *result_string; switch (result) { case BTRFS_COMPARE_TREE_NEW: result_string = "new"; break; case BTRFS_COMPARE_TREE_DELETED: result_string = "deleted"; break; case BTRFS_COMPARE_TREE_CHANGED: result_string = "updated"; break; case BTRFS_COMPARE_TREE_SAME: DEBUG_WARN("no change between trees"); result_string = "unchanged"; break; default: DEBUG_WARN("unexpected comparison result %d", result); result_string = "unexpected"; } btrfs_err(sctx->send_root->fs_info, "Send: inconsistent snapshot, found %s %s for inode %llu without updated inode item, send root is %llu, parent root is %llu", result_string, what, sctx->cmp_key->objectid, btrfs_root_id(sctx->send_root), (sctx->parent_root ? btrfs_root_id(sctx->parent_root) : 0)); } __maybe_unused static bool proto_cmd_ok(const struct send_ctx *sctx, int cmd) { switch (sctx->proto) { case 1: return cmd <= BTRFS_SEND_C_MAX_V1; case 2: return cmd <= BTRFS_SEND_C_MAX_V2; case 3: return cmd <= BTRFS_SEND_C_MAX_V3; default: return false; } } static int is_waiting_for_move(struct send_ctx *sctx, u64 ino); static struct waiting_dir_move * get_waiting_dir_move(struct send_ctx *sctx, u64 ino); static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino, u64 gen); static int need_send_hole(struct send_ctx *sctx) { return (sctx->parent_root && !sctx->cur_inode_new && !sctx->cur_inode_new_gen && !sctx->cur_inode_deleted && S_ISREG(sctx->cur_inode_mode)); } static void fs_path_reset(struct fs_path *p) { if (p->reversed) p->start = p->buf + p->buf_len - 1; else p->start = p->buf; p->end = p->start; *p->start = 0; } static void init_path(struct fs_path *p) { p->reversed = 0; p->buf = p->inline_buf; p->buf_len = FS_PATH_INLINE_SIZE; fs_path_reset(p); } static struct fs_path *fs_path_alloc(void) { struct fs_path *p; p = kmalloc_obj(*p); if (!p) return NULL; init_path(p); return p; } static struct fs_path *fs_path_alloc_reversed(void) { struct fs_path *p; p = fs_path_alloc(); if (!p) return NULL; p->reversed = 1; fs_path_reset(p); return p; } static void fs_path_free(struct fs_path *p) { if (!p) return; if (p->buf != p->inline_buf) kfree(p->buf); kfree(p); } static inline int fs_path_len(const struct fs_path *p) { return p->end - p->start; } static int fs_path_ensure_buf(struct fs_path *p, int len) { char *tmp_buf; int path_len; int old_buf_len; len++; if (p->buf_len >= len) return 0; if (WARN_ON(len > PATH_MAX)) return -ENAMETOOLONG; path_len = fs_path_len(p); old_buf_len = p->buf_len; /* * Allocate to the next largest kmalloc bucket size, to let * the fast path happen most of the time. */ len = kmalloc_size_roundup(len); /* * First time the inline_buf does not suffice */ if (p->buf == p->inline_buf) { tmp_buf = kmalloc(len, GFP_KERNEL); if (tmp_buf) memcpy(tmp_buf, p->buf, old_buf_len); } else { tmp_buf = krealloc(p->buf, len, GFP_KERNEL); } if (!tmp_buf) return -ENOMEM; p->buf = tmp_buf; p->buf_len = len; if (p->reversed) { tmp_buf = p->buf + old_buf_len - path_len - 1; p->end = p->buf + p->buf_len - 1; p->start = p->end - path_len; memmove(p->start, tmp_buf, path_len + 1); } else { p->start = p->buf; p->end = p->start + path_len; } return 0; } static int fs_path_prepare_for_add(struct fs_path *p, int name_len, char **prepared) { int ret; int new_len; new_len = fs_path_len(p) + name_len; if (p->start != p->end) new_len++; ret = fs_path_ensure_buf(p, new_len); if (ret < 0) return ret; if (p->reversed) { if (p->start != p->end) *--p->start = '/'; p->start -= name_len; *prepared = p->start; } else { if (p->start != p->end) *p->end++ = '/'; *prepared = p->end; p->end += name_len; *p->end = 0; } return 0; } static int fs_path_add(struct fs_path *p, const char *name, int name_len) { int ret; char *prepared; ret = fs_path_prepare_for_add(p, name_len, &prepared); if (ret < 0) return ret; memcpy(prepared, name, name_len); return 0; } static inline int fs_path_add_path(struct fs_path *p, const struct fs_path *p2) { return fs_path_add(p, p2->start, fs_path_len(p2)); } static int fs_path_add_from_extent_buffer(struct fs_path *p, struct extent_buffer *eb, unsigned long off, int len) { int ret; char *prepared; ret = fs_path_prepare_for_add(p, len, &prepared); if (ret < 0) return ret; read_extent_buffer(eb, prepared, off, len); return 0; } static int fs_path_copy(struct fs_path *p, struct fs_path *from) { p->reversed = from->reversed; fs_path_reset(p); return fs_path_add_path(p, from); } static void fs_path_unreverse(struct fs_path *p) { char *tmp; int len; if (!p->reversed) return; tmp = p->start; len = fs_path_len(p); p->start = p->buf; p->end = p->start + len; memmove(p->start, tmp, len + 1); p->reversed = 0; } static inline bool is_current_inode_path(const struct send_ctx *sctx, const struct fs_path *path) { const struct fs_path *cur = &sctx->cur_inode_path; return (strncmp(path->start, cur->start, fs_path_len(cur)) == 0); } static struct btrfs_path *alloc_path_for_send(void) { struct btrfs_path *path; path = btrfs_alloc_path(); if (!path) return NULL; path->search_commit_root = true; path->skip_locking = true; path->need_commit_sem = true; return path; } static int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off) { int ret; u32 pos = 0; while (pos < len) { ret = kernel_write(filp, buf + pos, len - pos, off); if (ret < 0) return ret; if (unlikely(ret == 0)) return -EIO; pos += ret; } return 0; } static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len) { struct btrfs_tlv_header *hdr; int total_len = sizeof(*hdr) + len; int left = sctx->send_max_size - sctx->send_size; if (WARN_ON_ONCE(sctx->put_data)) return -EINVAL; if (unlikely(left < total_len)) return -EOVERFLOW; hdr = (struct btrfs_tlv_header *) (sctx->send_buf + sctx->send_size); put_unaligned_le16(attr, &hdr->tlv_type); put_unaligned_le16(len, &hdr->tlv_len); memcpy(hdr + 1, data, len); sctx->send_size += total_len; return 0; } #define TLV_PUT_DEFINE_INT(bits) \ static int tlv_put_u##bits(struct send_ctx *sctx, \ u##bits attr, u##bits value) \ { \ __le##bits __tmp = cpu_to_le##bits(value); \ return tlv_put(sctx, attr, &__tmp, sizeof(__tmp)); \ } TLV_PUT_DEFINE_INT(8) TLV_PUT_DEFINE_INT(32) TLV_PUT_DEFINE_INT(64) static int tlv_put_string(struct send_ctx *sctx, u16 attr, const char *str, int len) { if (len == -1) len = strlen(str); return tlv_put(sctx, attr, str, len); } static int tlv_put_uuid(struct send_ctx *sctx, u16 attr, const u8 *uuid) { return tlv_put(sctx, attr, uuid, BTRFS_UUID_SIZE); } static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr, struct extent_buffer *eb, struct btrfs_timespec *ts) { struct btrfs_timespec bts; read_extent_buffer(eb, &bts, (unsigned long)ts, sizeof(bts)); return tlv_put(sctx, attr, &bts, sizeof(bts)); } #define TLV_PUT(sctx, attrtype, data, attrlen) \ do { \ ret = tlv_put(sctx, attrtype, data, attrlen); \ if (ret < 0) \ goto tlv_put_failure; \ } while (0) #define TLV_PUT_INT(sctx, attrtype, bits, value) \ do { \ ret = tlv_put_u##bits(sctx, attrtype, value); \ if (ret < 0) \ goto tlv_put_failure; \ } while (0) #define TLV_PUT_U8(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 8, data) #define TLV_PUT_U16(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 16, data) #define TLV_PUT_U32(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 32, data) #define TLV_PUT_U64(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 64, data) #define TLV_PUT_STRING(sctx, attrtype, str, len) \ do { \ ret = tlv_put_string(sctx, attrtype, str, len); \ if (ret < 0) \ goto tlv_put_failure; \ } while (0) #define TLV_PUT_PATH(sctx, attrtype, p) \ do { \ ret = tlv_put_string(sctx, attrtype, p->start, \ fs_path_len((p))); \ if (ret < 0) \ goto tlv_put_failure; \ } while(0) #define TLV_PUT_UUID(sctx, attrtype, uuid) \ do { \ ret = tlv_put_uuid(sctx, attrtype, uuid); \ if (ret < 0) \ goto tlv_put_failure; \ } while (0) #define TLV_PUT_BTRFS_TIMESPEC(sctx, attrtype, eb, ts) \ do { \ ret = tlv_put_btrfs_timespec(sctx, attrtype, eb, ts); \ if (ret < 0) \ goto tlv_put_failure; \ } while (0) static int send_header(struct send_ctx *sctx) { struct btrfs_stream_header hdr; strscpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC); hdr.version = cpu_to_le32(sctx->proto); return write_buf(sctx->send_filp, &hdr, sizeof(hdr), &sctx->send_off); } /* * For each command/item we want to send to userspace, we call this function. */ static int begin_cmd(struct send_ctx *sctx, int cmd) { struct btrfs_cmd_header *hdr; if (WARN_ON(!sctx->send_buf)) return -EINVAL; if (unlikely(sctx->send_size != 0)) { btrfs_err(sctx->send_root->fs_info, "send: command header buffer not empty cmd %d offset %llu", cmd, sctx->send_off); return -EINVAL; } sctx->send_size += sizeof(*hdr); hdr = (struct btrfs_cmd_header *)sctx->send_buf; put_unaligned_le16(cmd, &hdr->cmd); return 0; } static int send_cmd(struct send_ctx *sctx) { int ret; struct btrfs_cmd_header *hdr; u32 crc; hdr = (struct btrfs_cmd_header *)sctx->send_buf; put_unaligned_le32(sctx->send_size - sizeof(*hdr), &hdr->len); put_unaligned_le32(0, &hdr->crc); crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size); put_unaligned_le32(crc, &hdr->crc); ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size, &sctx->send_off); sctx->send_size = 0; sctx->put_data = false; return ret; } /* * Sends a move instruction to user space */ static int send_rename(struct send_ctx *sctx, struct fs_path *from, struct fs_path *to) { int ret; ret = begin_cmd(sctx, BTRFS_SEND_C_RENAME); if (ret < 0) return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, from); TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_TO, to); ret = send_cmd(sctx); tlv_put_failure: return ret; } /* * Sends a link instruction to user space */ static int send_link(struct send_ctx *sctx, struct fs_path *path, struct fs_path *lnk) { int ret; ret = begin_cmd(sctx, BTRFS_SEND_C_LINK); if (ret < 0) return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, lnk); ret = send_cmd(sctx); tlv_put_failure: return ret; } /* * Sends an unlink instruction to user space */ static int send_unlink(struct send_ctx *sctx, struct fs_path *path) { int ret; ret = begin_cmd(sctx, BTRFS_SEND_C_UNLINK); if (ret < 0) return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); ret = send_cmd(sctx); tlv_put_failure: return ret; } /* * Sends a rmdir instruction to user space */ static int send_rmdir(struct send_ctx *sctx, struct fs_path *path) { int ret; ret = begin_cmd(sctx, BTRFS_SEND_C_RMDIR); if (ret < 0) return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); ret = send_cmd(sctx); tlv_put_failure: return ret; } struct btrfs_inode_info { u64 size; u64 gen; u64 mode; u64 uid; u64 gid; u64 rdev; u64 fileattr; u64 nlink; }; /* * Helper function to retrieve some fields from an inode item. */ static int get_inode_info(struct btrfs_root *root, u64 ino, struct btrfs_inode_info *info) { int ret; BTRFS_PATH_AUTO_FREE(path); struct btrfs_inode_item *ii; struct btrfs_key key; path = alloc_path_for_send(); if (!path) return -ENOMEM; key.objectid = ino; key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret) { if (ret > 0) ret = -ENOENT; return ret; } if (!info) return 0; ii = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_item); info->size = btrfs_inode_size(path->nodes[0], ii); info->gen = btrfs_inode_generation(path->nodes[0], ii); info->mode = btrfs_inode_mode(path->nodes[0], ii); info->uid = btrfs_inode_uid(path->nodes[0], ii); info->gid = btrfs_inode_gid(path->nodes[0], ii); info->rdev = btrfs_inode_rdev(path->nodes[0], ii); info->nlink = btrfs_inode_nlink(path->nodes[0], ii); /* * Transfer the unchanged u64 value of btrfs_inode_item::flags, that's * otherwise logically split to 32/32 parts. */ info->fileattr = btrfs_inode_flags(path->nodes[0], ii); return 0; } static int get_inode_gen(struct btrfs_root *root, u64 ino, u64 *gen) { int ret; struct btrfs_inode_info info = { 0 }; ASSERT(gen); ret = get_inode_info(root, ino, &info); *gen = info.gen; return ret; } typedef int (*iterate_inode_ref_t)(u64 dir, struct fs_path *p, void *ctx); /* * Helper function to iterate the entries in ONE btrfs_inode_ref or * btrfs_inode_extref. * The iterate callback may return a non zero value to stop iteration. This can * be a negative value for error codes or 1 to simply stop it. * * path must point to the INODE_REF or INODE_EXTREF when called. */ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *found_key, bool resolve, iterate_inode_ref_t iterate, void *ctx) { struct extent_buffer *eb = path->nodes[0]; struct btrfs_inode_ref *iref; struct btrfs_inode_extref *extref; BTRFS_PATH_AUTO_FREE(tmp_path); struct fs_path *p; u32 cur = 0; u32 total; int slot = path->slots[0]; u32 name_len; char *start; int ret = 0; u64 dir; unsigned long name_off; unsigned long elem_size; unsigned long ptr; p = fs_path_alloc_reversed(); if (!p) return -ENOMEM; tmp_path = alloc_path_for_send(); if (!tmp_path) { fs_path_free(p); return -ENOMEM; } if (found_key->type == BTRFS_INODE_REF_KEY) { ptr = (unsigned long)btrfs_item_ptr(eb, slot, struct btrfs_inode_ref); total = btrfs_item_size(eb, slot); elem_size = sizeof(*iref); } else { ptr = btrfs_item_ptr_offset(eb, slot); total = btrfs_item_size(eb, slot); elem_size = sizeof(*extref); } while (cur < total) { fs_path_reset(p); if (found_key->type == BTRFS_INODE_REF_KEY) { iref = (struct btrfs_inode_ref *)(ptr + cur); name_len = btrfs_inode_ref_name_len(eb, iref); name_off = (unsigned long)(iref + 1); dir = found_key->offset; } else { extref = (struct btrfs_inode_extref *)(ptr + cur); name_len = btrfs_inode_extref_name_len(eb, extref); name_off = (unsigned long)&extref->name; dir = btrfs_inode_extref_parent(eb, extref); } if (resolve) { start = btrfs_ref_to_path(root, tmp_path, name_len, name_off, eb, dir, p->buf, p->buf_len); if (IS_ERR(start)) { ret = PTR_ERR(start); goto out; } if (start < p->buf) { /* overflow , try again with larger buffer */ ret = fs_path_ensure_buf(p, p->buf_len + p->buf - start); if (ret < 0) goto out; start = btrfs_ref_to_path(root, tmp_path, name_len, name_off, eb, dir, p->buf, p->buf_len); if (IS_ERR(start)) { ret = PTR_ERR(start); goto out; } if (unlikely(start < p->buf)) { btrfs_err(root->fs_info, "send: path ref buffer underflow for key " BTRFS_KEY_FMT, BTRFS_KEY_FMT_VALUE(found_key)); ret = -EINVAL; goto out; } } p->start = start; } else { ret = fs_path_add_from_extent_buffer(p, eb, name_off, name_len); if (ret < 0) goto out; } cur += elem_size + name_len; ret = iterate(dir, p, ctx); if (ret) goto out; } out: fs_path_free(p); return ret; } typedef int (*iterate_dir_item_t)(int num, struct btrfs_key *di_key, const char *name, int name_len, const char *data, int data_len, void *ctx); /* * Helper function to iterate the entries in ONE btrfs_dir_item. * The iterate callback may return a non zero value to stop iteration. This can * be a negative value for error codes or 1 to simply stop it. * * path must point to the dir item when called. */ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, iterate_dir_item_t iterate, void *ctx) { int ret = 0; struct extent_buffer *eb; struct btrfs_dir_item *di; struct btrfs_key di_key; char *buf = NULL; int buf_len; u32 name_len; u32 data_len; u32 cur; u32 len; u32 total; int slot; int num; /* * Start with a small buffer (1 page). If later we end up needing more * space, which can happen for xattrs on a fs with a leaf size greater * than the page size, attempt to increase the buffer. Typically xattr * values are small. */ buf_len = PATH_MAX; buf = kmalloc(buf_len, GFP_KERNEL); if (!buf) { ret = -ENOMEM; goto out; } eb = path->nodes[0]; slot = path->slots[0]; di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); cur = 0; len = 0; total = btrfs_item_size(eb, slot); num = 0; while (cur < total) { name_len = btrfs_dir_name_len(eb, di); data_len = btrfs_dir_data_len(eb, di); btrfs_dir_item_key_to_cpu(eb, di, &di_key); if (btrfs_dir_ftype(eb, di) == BTRFS_FT_XATTR) { if (unlikely(name_len > XATTR_NAME_MAX)) { ret = -ENAMETOOLONG; goto out; } if (unlikely(name_len + data_len > BTRFS_MAX_XATTR_SIZE(root->fs_info))) { ret = -E2BIG; goto out; } } else { /* * Path too long */ if (unlikely(name_len + data_len > PATH_MAX)) { ret = -ENAMETOOLONG; goto out; } } if (name_len + data_len > buf_len) { buf_len = name_len + data_len; if (is_vmalloc_addr(buf)) { vfree(buf); buf = NULL; } else { char *tmp = krealloc(buf, buf_len, GFP_KERNEL | __GFP_NOWARN); if (!tmp) kfree(buf); buf = tmp; } if (!buf) { buf = kvmalloc(buf_len, GFP_KERNEL); if (!buf) { ret = -ENOMEM; goto out; } } } read_extent_buffer(eb, buf, (unsigned long)(di + 1), name_len + data_len); len = sizeof(*di) + name_len + data_len; di = (struct btrfs_dir_item *)((char *)di + len); cur += len; ret = iterate(num, &di_key, buf, name_len, buf + name_len, data_len, ctx); if (ret < 0) goto out; if (ret) { ret = 0; goto out; } num++; } out: kvfree(buf); return ret; } static int __copy_first_ref(u64 dir, struct fs_path *p, void *ctx) { int ret; struct fs_path *pt = ctx; ret = fs_path_copy(pt, p); if (ret < 0) return ret; /* we want the first only */ return 1; } /* * Retrieve the first path of an inode. If an inode has more then one * ref/hardlink, this is ignored. */ static int get_inode_path(struct btrfs_root *root, u64 ino, struct fs_path *path) { int ret; struct btrfs_key key, found_key; BTRFS_PATH_AUTO_FREE(p); p = alloc_path_for_send(); if (!p) return -ENOMEM; fs_path_reset(path); key.objectid = ino; key.type = BTRFS_INODE_REF_KEY; key.offset = 0; ret = btrfs_search_slot_for_read(root, &key, p, 1, 0); if (ret < 0) return ret; if (ret) return 1; btrfs_item_key_to_cpu(p->nodes[0], &found_key, p->slots[0]); if (found_key.objectid != ino || (found_key.type != BTRFS_INODE_REF_KEY && found_key.type != BTRFS_INODE_EXTREF_KEY)) return -ENOENT; ret = iterate_inode_ref(root, p, &found_key, true, __copy_first_ref, path); if (ret < 0) return ret; return 0; } struct backref_ctx { struct send_ctx *sctx; /* number of total found references */ u64 found; /* * used for clones found in send_root. clones found behind cur_objectid * and cur_offset are not considered as allowed clones. */ u64 cur_objectid; u64 cur_offset; /* may be truncated in case it's the last extent in a file */ u64 extent_len; /* The bytenr the file extent item we are processing refers to. */ u64 bytenr; /* The owner (root id) of the data backref for the current extent. */ u64 backref_owner; /* The offset of the data backref for the current extent. */ u64 backref_offset; }; static int __clone_root_cmp_bsearch(const void *key, const void *elt) { u64 root = (u64)(uintptr_t)key; const struct clone_root *cr = elt; if (root < btrfs_root_id(cr->root)) return -1; if (root > btrfs_root_id(cr->root)) return 1; return 0; } static int __clone_root_cmp_sort(const void *e1, const void *e2) { const struct clone_root *cr1 = e1; const struct clone_root *cr2 = e2; if (btrfs_root_id(cr1->root) < btrfs_root_id(cr2->root)) return -1; if (btrfs_root_id(cr1->root) > btrfs_root_id(cr2->root)) return 1; return 0; } /* * Called for every backref that is found for the current extent. * Results are collected in sctx->clone_roots->ino/offset. */ static int iterate_backrefs(u64 ino, u64 offset, u64 num_bytes, u64 root_id, void *ctx_) { struct backref_ctx *bctx = ctx_; struct clone_root *clone_root; /* First check if the root is in the list of accepted clone sources */ clone_root = bsearch((void *)(uintptr_t)root_id, bctx->sctx->clone_roots, bctx->sctx->clone_roots_cnt, sizeof(struct clone_root), __clone_root_cmp_bsearch); if (!clone_root) return 0; /* This is our own reference, bail out as we can't clone from it. */ if (clone_root->root == bctx->sctx->send_root && ino == bctx->cur_objectid && offset == bctx->cur_offset) return 0; /* * Make sure we don't consider clones from send_root that are * behind the current inode/offset. */ if (clone_root->root == bctx->sctx->send_root) { /* * If the source inode was not yet processed we can't issue a * clone operation, as the source extent does not exist yet at * the destination of the stream. */ if (ino > bctx->cur_objectid) return 0; /* * We clone from the inode currently being sent as long as the * source extent is already processed, otherwise we could try * to clone from an extent that does not exist yet at the * destination of the stream. */ if (ino == bctx->cur_objectid && offset + bctx->extent_len > bctx->sctx->cur_inode_next_write_offset) return 0; } bctx->found++; clone_root->found_ref = true; /* * If the given backref refers to a file extent item with a larger * number of bytes than what we found before, use the new one so that * we clone more optimally and end up doing less writes and getting * less exclusive, non-shared extents at the destination. */ if (num_bytes > clone_root->num_bytes) { clone_root->ino = ino; clone_root->offset = offset; clone_root->num_bytes = num_bytes; /* * Found a perfect candidate, so there's no need to continue * backref walking. */ if (num_bytes >= bctx->extent_len) return BTRFS_ITERATE_EXTENT_INODES_STOP; } return 0; } static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx, const u64 **root_ids_ret, int *root_count_ret) { struct backref_ctx *bctx = ctx; struct send_ctx *sctx = bctx->sctx; struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; const u64 key = leaf_bytenr >> fs_info->nodesize_bits; struct btrfs_lru_cache_entry *raw_entry; struct backref_cache_entry *entry; if (sctx->backref_cache.size == 0) return false; /* * If relocation happened since we first filled the cache, then we must * empty the cache and can not use it, because even though we operate on * read-only roots, their leaves and nodes may have been reallocated and * now be used for different nodes/leaves of the same tree or some other * tree. * * We are called from iterate_extent_inodes() while either holding a * transaction handle or holding fs_info->commit_root_sem, so no need * to take any lock here. */ if (fs_info->last_reloc_trans > sctx->backref_cache_last_reloc_trans) { btrfs_lru_cache_clear(&sctx->backref_cache); return false; } raw_entry = btrfs_lru_cache_lookup(&sctx->backref_cache, key, 0); if (!raw_entry) return false; entry = container_of(raw_entry, struct backref_cache_entry, entry); *root_ids_ret = entry->root_ids; *root_count_ret = entry->num_roots; return true; } static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids, void *ctx) { struct backref_ctx *bctx = ctx; struct send_ctx *sctx = bctx->sctx; struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; struct backref_cache_entry *new_entry; struct ulist_iterator uiter; struct ulist_node *node; int ret; /* * We're called while holding a transaction handle or while holding * fs_info->commit_root_sem (at iterate_extent_inodes()), so must do a * NOFS allocation. */ new_entry = kmalloc_obj(struct backref_cache_entry, GFP_NOFS); /* No worries, cache is optional. */ if (!new_entry) return; new_entry->entry.key = leaf_bytenr >> fs_info->nodesize_bits; new_entry->entry.gen = 0; new_entry->num_roots = 0; ULIST_ITER_INIT(&uiter); while ((node = ulist_next(root_ids, &uiter)) != NULL) { const u64 root_id = node->val; struct clone_root *root; root = bsearch((void *)(uintptr_t)root_id, sctx->clone_roots, sctx->clone_roots_cnt, sizeof(struct clone_root), __clone_root_cmp_bsearch); if (!root) continue; /* Too many roots, just exit, no worries as caching is optional. */ if (new_entry->num_roots >= SEND_MAX_BACKREF_CACHE_ROOTS) { kfree(new_entry); return; } new_entry->root_ids[new_entry->num_roots] = root_id; new_entry->num_roots++; } /* * We may have not added any roots to the new cache entry, which means * none of the roots is part of the list of roots from which we are * allowed to clone. Cache the new entry as it's still useful to avoid * backref walking to determine which roots have a path to the leaf. * * Also use GFP_NOFS because we're called while holding a transaction * handle or while holding fs_info->commit_root_sem. */ ret = btrfs_lru_cache_store(&sctx->backref_cache, &new_entry->entry, GFP_NOFS); ASSERT(ret == 0 || ret == -ENOMEM); if (ret) { /* Caching is optional, no worries. */ kfree(new_entry); return; } /* * We are called from iterate_extent_inodes() while either holding a * transaction handle or holding fs_info->commit_root_sem, so no need * to take any lock here. */ if (sctx->backref_cache.size == 1) sctx->backref_cache_last_reloc_trans = fs_info->last_reloc_trans; } static int check_extent_item(u64 bytenr, const struct btrfs_extent_item *ei, const struct extent_buffer *leaf, void *ctx) { const u64 refs = btrfs_extent_refs(leaf, ei); const struct backref_ctx *bctx = ctx; const struct send_ctx *sctx = bctx->sctx; if (bytenr == bctx->bytenr) { const u64 flags = btrfs_extent_flags(leaf, ei); if (WARN_ON(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) return -EUCLEAN; /* * If we have only one reference and only the send root as a * clone source - meaning no clone roots were given in the * struct btrfs_ioctl_send_args passed to the send ioctl - then * it's our reference and there's no point in doing backref * walking which is expensive, so exit early. */ if (refs == 1 && sctx->clone_roots_cnt == 1) return -ENOENT; } /* * Backreference walking (iterate_extent_inodes() below) is currently * too expensive when an extent has a large number of references, both * in time spent and used memory. So for now just fallback to write * operations instead of clone operations when an extent has more than * a certain amount of references. */ if (refs > SEND_MAX_EXTENT_REFS) return -ENOENT; return 0; } static bool skip_self_data_ref(u64 root, u64 ino, u64 offset, void *ctx) { const struct backref_ctx *bctx = ctx; if (ino == bctx->cur_objectid && root == bctx->backref_owner && offset == bctx->backref_offset) return true; return false; } /* * Given an inode, offset and extent item, it finds a good clone for a clone * instruction. Returns -ENOENT when none could be found. The function makes * sure that the returned clone is usable at the point where sending is at the * moment. This means, that no clones are accepted which lie behind the current * inode+offset. * * path must point to the extent item when called. */ static int find_extent_clone(struct send_ctx *sctx, struct btrfs_path *path, u64 ino, u64 data_offset, u64 ino_size, struct clone_root **found) { struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; int ret; int extent_type; u64 disk_byte; u64 num_bytes; struct btrfs_file_extent_item *fi; struct extent_buffer *eb = path->nodes[0]; struct backref_ctx backref_ctx = { 0 }; struct btrfs_backref_walk_ctx backref_walk_ctx = { 0 }; struct clone_root *cur_clone_root; int compressed; u32 i; /* * With fallocate we can get prealloc extents beyond the inode's i_size, * so we don't do anything here because clone operations can not clone * to a range beyond i_size without increasing the i_size of the * destination inode. */ if (data_offset >= ino_size) return 0; fi = btrfs_item_ptr(eb, path->slots[0], struct btrfs_file_extent_item); extent_type = btrfs_file_extent_type(eb, fi); if (extent_type == BTRFS_FILE_EXTENT_INLINE) return -ENOENT; disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); if (disk_byte == 0) return -ENOENT; compressed = btrfs_file_extent_compression(eb, fi); num_bytes = btrfs_file_extent_num_bytes(eb, fi); /* * Setup the clone roots. */ for (i = 0; i < sctx->clone_roots_cnt; i++) { cur_clone_root = sctx->clone_roots + i; cur_clone_root->ino = (u64)-1; cur_clone_root->offset = 0; cur_clone_root->num_bytes = 0; cur_clone_root->found_ref = false; } backref_ctx.sctx = sctx; backref_ctx.cur_objectid = ino; backref_ctx.cur_offset = data_offset; backref_ctx.bytenr = disk_byte; /* * Use the header owner and not the send root's id, because in case of a * snapshot we can have shared subtrees. */ backref_ctx.backref_owner = btrfs_header_owner(eb); backref_ctx.backref_offset = data_offset - btrfs_file_extent_offset(eb, fi); /* * The last extent of a file may be too large due to page alignment. * We need to adjust extent_len in this case so that the checks in * iterate_backrefs() work. */ if (data_offset + num_bytes >= ino_size) backref_ctx.extent_len = ino_size - data_offset; else backref_ctx.extent_len = num_bytes; /* * Now collect all backrefs. */ backref_walk_ctx.bytenr = disk_byte; if (compressed == BTRFS_COMPRESS_NONE) backref_walk_ctx.extent_item_pos = btrfs_file_extent_offset(eb, fi); backref_walk_ctx.fs_info = fs_info; backref_walk_ctx.cache_lookup = lookup_backref_cache; backref_walk_ctx.cache_store = store_backref_cache; backref_walk_ctx.indirect_ref_iterator = iterate_backrefs; backref_walk_ctx.check_extent_item = check_extent_item; backref_walk_ctx.user_ctx = &backref_ctx; /* * If have a single clone root, then it's the send root and we can tell * the backref walking code to skip our own backref and not resolve it, * since we can not use it for cloning - the source and destination * ranges can't overlap and in case the leaf is shared through a subtree * due to snapshots, we can't use those other roots since they are not * in the list of clone roots. */ if (sctx->clone_roots_cnt == 1) backref_walk_ctx.skip_data_ref = skip_self_data_ref; ret = iterate_extent_inodes(&backref_walk_ctx, true, iterate_backrefs, &backref_ctx); if (ret < 0) return ret; down_read(&fs_info->commit_root_sem); if (fs_info->last_reloc_trans > sctx->last_reloc_trans) { /* * A transaction commit for a transaction in which block group * relocation was done just happened. * The disk_bytenr of the file extent item we processed is * possibly stale, referring to the extent's location before * relocation. So act as if we haven't found any clone sources * and fallback to write commands, which will read the correct * data from the new extent location. Otherwise we will fail * below because we haven't found our own back reference or we * could be getting incorrect sources in case the old extent * was already reallocated after the relocation. */ up_read(&fs_info->commit_root_sem); return -ENOENT; } up_read(&fs_info->commit_root_sem); if (!backref_ctx.found) return -ENOENT; cur_clone_root = NULL; for (i = 0; i < sctx->clone_roots_cnt; i++) { struct clone_root *clone_root = &sctx->clone_roots[i]; if (!clone_root->found_ref) continue; /* * Choose the root from which we can clone more bytes, to * minimize write operations and therefore have more extent * sharing at the destination (the same as in the source). */ if (!cur_clone_root || clone_root->num_bytes > cur_clone_root->num_bytes) { cur_clone_root = clone_root; /* * We found an optimal clone candidate (any inode from * any root is fine), so we're done. */ if (clone_root->num_bytes >= backref_ctx.extent_len) break; } } if (cur_clone_root) { *found = cur_clone_root; ret = 0; } else { ret = -ENOENT; } return ret; } static int read_symlink(struct btrfs_root *root, u64 ino, struct fs_path *dest) { int ret; BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_file_extent_item *ei; u8 type; u8 compression; unsigned long off; int len; path = alloc_path_for_send(); if (!path) return -ENOMEM; key.objectid = ino; key.type = BTRFS_EXTENT_DATA_KEY; key.offset = 0; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) return ret; if (unlikely(ret)) { /* * An empty symlink inode. Can happen in rare error paths when * creating a symlink (transaction committed before the inode * eviction handler removed the symlink inode items and a crash * happened in between or the subvol was snapshotted in between). * Print an informative message to dmesg/syslog so that the user * can delete the symlink. */ btrfs_err(root->fs_info, "Found empty symlink inode %llu at root %llu", ino, btrfs_root_id(root)); return -EIO; } ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_file_extent_item); type = btrfs_file_extent_type(path->nodes[0], ei); if (unlikely(type != BTRFS_FILE_EXTENT_INLINE)) { ret = -EUCLEAN; btrfs_crit(root->fs_info, "send: found symlink extent that is not inline, ino %llu root %llu extent type %d", ino, btrfs_root_id(root), type); return ret; } compression = btrfs_file_extent_compression(path->nodes[0], ei); if (unlikely(compression != BTRFS_COMPRESS_NONE)) { ret = -EUCLEAN; btrfs_crit(root->fs_info, "send: found symlink extent with compression, ino %llu root %llu compression type %d", ino, btrfs_root_id(root), compression); return ret; } off = btrfs_file_extent_inline_start(ei); len = btrfs_file_extent_ram_bytes(path->nodes[0], ei); return fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len); } /* * Helper function to generate a file name that is unique in the root of * send_root and parent_root. This is used to generate names for orphan inodes. */ static int gen_unique_name(struct send_ctx *sctx, u64 ino, u64 gen, struct fs_path *dest) { BTRFS_PATH_AUTO_FREE(path); struct btrfs_dir_item *di; char tmp[64]; int len; u64 idx = 0; path = alloc_path_for_send(); if (!path) return -ENOMEM; while (1) { struct fscrypt_str tmp_name; len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu", ino, gen, idx); ASSERT(len < sizeof(tmp)); tmp_name.name = tmp; tmp_name.len = len; di = btrfs_lookup_dir_item(NULL, sctx->send_root, path, BTRFS_FIRST_FREE_OBJECTID, &tmp_name, 0); btrfs_release_path(path); if (IS_ERR(di)) return PTR_ERR(di); if (di) { /* not unique, try again */ idx++; continue; } if (!sctx->parent_root) { /* unique */ break; } di = btrfs_lookup_dir_item(NULL, sctx->parent_root, path, BTRFS_FIRST_FREE_OBJECTID, &tmp_name, 0); btrfs_release_path(path); if (IS_ERR(di)) return PTR_ERR(di); if (di) { /* not unique, try again */ idx++; continue; } /* unique */ break; } return fs_path_add(dest, tmp, len); } enum inode_state { inode_state_no_change, inode_state_will_create, inode_state_did_create, inode_state_will_delete, inode_state_did_delete, }; static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen, u64 *send_gen, u64 *parent_gen) { int ret; int left_ret; int right_ret; u64 left_gen; u64 right_gen = 0; struct btrfs_inode_info info; ret = get_inode_info(sctx->send_root, ino, &info); if (ret < 0 && ret != -ENOENT) return ret; left_ret = (info.nlink == 0) ? -ENOENT : ret; left_gen = info.gen; if (send_gen) *send_gen = ((left_ret == -ENOENT) ? 0 : info.gen); if (!sctx->parent_root) { right_ret = -ENOENT; } else { ret = get_inode_info(sctx->parent_root, ino, &info); if (ret < 0 && ret != -ENOENT) return ret; right_ret = (info.nlink == 0) ? -ENOENT : ret; right_gen = info.gen; if (parent_gen) *parent_gen = ((right_ret == -ENOENT) ? 0 : info.gen); } if (!left_ret && !right_ret) { if (left_gen == gen && right_gen == gen) { ret = inode_state_no_change; } else if (left_gen == gen) { if (ino < sctx->send_progress) ret = inode_state_did_create; else ret = inode_state_will_create; } else if (right_gen == gen) { if (ino < sctx->send_progress) ret = inode_state_did_delete; else ret = inode_state_will_delete; } else { ret = -ENOENT; } } else if (!left_ret) { if (left_gen == gen) { if (ino < sctx->send_progress) ret = inode_state_did_create; else ret = inode_state_will_create; } else { ret = -ENOENT; } } else if (!right_ret) { if (right_gen == gen) { if (ino < sctx->send_progress) ret = inode_state_did_delete; else ret = inode_state_will_delete; } else { ret = -ENOENT; } } else { ret = -ENOENT; } return ret; } static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen, u64 *send_gen, u64 *parent_gen) { int ret; if (ino == BTRFS_FIRST_FREE_OBJECTID) return 1; ret = get_cur_inode_state(sctx, ino, gen, send_gen, parent_gen); if (ret < 0) return ret; if (ret == inode_state_no_change || ret == inode_state_did_create || ret == inode_state_will_delete) return 1; return 0; } /* * Helper function to lookup a dir item in a dir. */ static int lookup_dir_item_inode(struct btrfs_root *root, u64 dir, const char *name, int name_len, u64 *found_inode) { int ret = 0; struct btrfs_dir_item *di; struct btrfs_key key; BTRFS_PATH_AUTO_FREE(path); struct fscrypt_str name_str = FSTR_INIT((char *)name, name_len); path = alloc_path_for_send(); if (!path) return -ENOMEM; di = btrfs_lookup_dir_item(NULL, root, path, dir, &name_str, 0); if (IS_ERR_OR_NULL(di)) return di ? PTR_ERR(di) : -ENOENT; btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); if (key.type == BTRFS_ROOT_ITEM_KEY) return -ENOENT; *found_inode = key.objectid; return ret; } /* * Looks up the first btrfs_inode_ref of a given ino. It returns the parent dir, * generation of the parent dir and the name of the dir entry. */ static int get_first_ref(struct btrfs_root *root, u64 ino, u64 *dir, u64 *dir_gen, struct fs_path *name) { int ret; struct btrfs_key key; struct btrfs_key found_key; BTRFS_PATH_AUTO_FREE(path); int len; u64 parent_dir; path = alloc_path_for_send(); if (!path) return -ENOMEM; key.objectid = ino; key.type = BTRFS_INODE_REF_KEY; key.offset = 0; ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); if (ret < 0) return ret; if (!ret) btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); if (ret || found_key.objectid != ino || (found_key.type != BTRFS_INODE_REF_KEY && found_key.type != BTRFS_INODE_EXTREF_KEY)) return -ENOENT; if (found_key.type == BTRFS_INODE_REF_KEY) { struct btrfs_inode_ref *iref; iref = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_ref); len = btrfs_inode_ref_name_len(path->nodes[0], iref); ret = fs_path_add_from_extent_buffer(name, path->nodes[0], (unsigned long)(iref + 1), len); parent_dir = found_key.offset; } else { struct btrfs_inode_extref *extref; extref = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_extref); len = btrfs_inode_extref_name_len(path->nodes[0], extref); ret = fs_path_add_from_extent_buffer(name, path->nodes[0], (unsigned long)&extref->name, len); parent_dir = btrfs_inode_extref_parent(path->nodes[0], extref); } if (ret < 0) return ret; btrfs_release_path(path); if (dir_gen) { ret = get_inode_gen(root, parent_dir, dir_gen); if (ret < 0) return ret; } *dir = parent_dir; return ret; } static int is_first_ref(struct btrfs_root *root, u64 ino, u64 dir, const char *name, int name_len) { int ret; struct fs_path *tmp_name; u64 tmp_dir; tmp_name = fs_path_alloc(); if (!tmp_name) return -ENOMEM; ret = get_first_ref(root, ino, &tmp_dir, NULL, tmp_name); if (ret < 0) goto out; if (dir != tmp_dir || name_len != fs_path_len(tmp_name)) { ret = 0; goto out; } ret = !memcmp(tmp_name->start, name, name_len); out: fs_path_free(tmp_name); return ret; } /* * Used by process_recorded_refs to determine if a new ref would overwrite an * already existing ref. In case it detects an overwrite, it returns the * inode/gen in who_ino/who_gen. * When an overwrite is detected, process_recorded_refs does proper orphanizing * to make sure later references to the overwritten inode are possible. * Orphanizing is however only required for the first ref of an inode. * process_recorded_refs does an additional is_first_ref check to see if * orphanizing is really required. */ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, const char *name, int name_len, u64 *who_ino, u64 *who_gen, u64 *who_mode) { int ret; u64 parent_root_dir_gen; u64 other_inode = 0; struct btrfs_inode_info info; if (!sctx->parent_root) return 0; ret = is_inode_existent(sctx, dir, dir_gen, NULL, &parent_root_dir_gen); if (ret <= 0) return 0; /* * If we have a parent root we need to verify that the parent dir was * not deleted and then re-created, if it was then we have no overwrite * and we can just unlink this entry. * * @parent_root_dir_gen was set to 0 if the inode does not exist in the * parent root. */ if (sctx->parent_root && dir != BTRFS_FIRST_FREE_OBJECTID && parent_root_dir_gen != dir_gen) return 0; ret = lookup_dir_item_inode(sctx->parent_root, dir, name, name_len, &other_inode); if (ret == -ENOENT) return 0; else if (ret < 0) return ret; /* * Check if the overwritten ref was already processed. If yes, the ref * was already unlinked/moved, so we can safely assume that we will not * overwrite anything at this point in time. */ if (other_inode > sctx->send_progress || is_waiting_for_move(sctx, other_inode)) { ret = get_inode_info(sctx->parent_root, other_inode, &info); if (ret < 0) return ret; *who_ino = other_inode; *who_gen = info.gen; *who_mode = info.mode; return 1; } return 0; } /* * Checks if the ref was overwritten by an already processed inode. This is * used by __get_cur_name_and_parent to find out if the ref was orphanized and * thus the orphan name needs be used. * process_recorded_refs also uses it to avoid unlinking of refs that were * overwritten. */ static int did_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, u64 ino, u64 ino_gen, const char *name, int name_len) { int ret; u64 ow_inode; u64 ow_gen = 0; u64 send_root_dir_gen; if (!sctx->parent_root) return 0; ret = is_inode_existent(sctx, dir, dir_gen, &send_root_dir_gen, NULL); if (ret <= 0) return ret; /* * @send_root_dir_gen was set to 0 if the inode does not exist in the * send root. */ if (dir != BTRFS_FIRST_FREE_OBJECTID && send_root_dir_gen != dir_gen) return 0; /* check if the ref was overwritten by another ref */ ret = lookup_dir_item_inode(sctx->send_root, dir, name, name_len, &ow_inode); if (ret == -ENOENT) { /* was never and will never be overwritten */ return 0; } else if (ret < 0) { return ret; } if (ow_inode == ino) { ret = get_inode_gen(sctx->send_root, ow_inode, &ow_gen); if (ret < 0) return ret; /* It's the same inode, so no overwrite happened. */ if (ow_gen == ino_gen) return 0; } /* * We know that it is or will be overwritten. Check this now. * The current inode being processed might have been the one that caused * inode 'ino' to be orphanized, therefore check if ow_inode matches * the current inode being processed. */ if (ow_inode < sctx->send_progress) return 1; if (ino != sctx->cur_ino && ow_inode == sctx->cur_ino) { if (ow_gen == 0) { ret = get_inode_gen(sctx->send_root, ow_inode, &ow_gen); if (ret < 0) return ret; } if (ow_gen == sctx->cur_inode_gen) return 1; } return 0; } /* * Same as did_overwrite_ref, but also checks if it is the first ref of an inode * that got overwritten. This is used by process_recorded_refs to determine * if it has to use the path as returned by get_cur_path or the orphan name. */ static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen) { int ret = 0; struct fs_path *name = NULL; u64 dir; u64 dir_gen; if (!sctx->parent_root) goto out; name = fs_path_alloc(); if (!name) return -ENOMEM; ret = get_first_ref(sctx->parent_root, ino, &dir, &dir_gen, name); if (ret < 0) goto out; ret = did_overwrite_ref(sctx, dir, dir_gen, ino, gen, name->start, fs_path_len(name)); out: fs_path_free(name); return ret; } static inline struct name_cache_entry *name_cache_search(struct send_ctx *sctx, u64 ino, u64 gen) { struct btrfs_lru_cache_entry *entry; entry = btrfs_lru_cache_lookup(&sctx->name_cache, ino, gen); if (!entry) return NULL; return container_of(entry, struct name_cache_entry, entry); } /* * Used by get_cur_path for each ref up to the root. * Returns 0 if it succeeded. * Returns 1 if the inode is not existent or got overwritten. In that case, the * name is an orphan name. This instructs get_cur_path to stop iterating. If 1 * is returned, parent_ino/parent_gen are not guaranteed to be valid. * Returns <0 in case of error. */ static int __get_cur_name_and_parent(struct send_ctx *sctx, u64 ino, u64 gen, u64 *parent_ino, u64 *parent_gen, struct fs_path *dest) { int ret; int nce_ret; struct name_cache_entry *nce; /* * First check if we already did a call to this function with the same * ino/gen. If yes, check if the cache entry is still up-to-date. If yes * return the cached result. */ nce = name_cache_search(sctx, ino, gen); if (nce) { if (ino < sctx->send_progress && nce->need_later_update) { btrfs_lru_cache_remove(&sctx->name_cache, &nce->entry); nce = NULL; } else { *parent_ino = nce->parent_ino; *parent_gen = nce->parent_gen; ret = fs_path_add(dest, nce->name, nce->name_len); if (ret < 0) return ret; return nce->ret; } } /* * If the inode is not existent yet, add the orphan name and return 1. * This should only happen for the parent dir that we determine in * record_new_ref_if_needed(). */ ret = is_inode_existent(sctx, ino, gen, NULL, NULL); if (ret < 0) return ret; if (!ret) { ret = gen_unique_name(sctx, ino, gen, dest); if (ret < 0) return ret; ret = 1; goto out_cache; } /* * Depending on whether the inode was already processed or not, use * send_root or parent_root for ref lookup. */ if (ino < sctx->send_progress) ret = get_first_ref(sctx->send_root, ino, parent_ino, parent_gen, dest); else ret = get_first_ref(sctx->parent_root, ino, parent_ino, parent_gen, dest); if (ret < 0) return ret; /* * Check if the ref was overwritten by an inode's ref that was processed * earlier. If yes, treat as orphan and return 1. */ ret = did_overwrite_ref(sctx, *parent_ino, *parent_gen, ino, gen, dest->start, fs_path_len(dest)); if (ret < 0) return ret; if (ret) { fs_path_reset(dest); ret = gen_unique_name(sctx, ino, gen, dest); if (ret < 0) return ret; ret = 1; } out_cache: /* * Store the result of the lookup in the name cache. */ nce = kmalloc(sizeof(*nce) + fs_path_len(dest), GFP_KERNEL); if (!nce) return -ENOMEM; nce->entry.key = ino; nce->entry.gen = gen; nce->parent_ino = *parent_ino; nce->parent_gen = *parent_gen; nce->name_len = fs_path_len(dest); nce->ret = ret; memcpy(nce->name, dest->start, nce->name_len); if (ino < sctx->send_progress) nce->need_later_update = 0; else nce->need_later_update = 1; nce_ret = btrfs_lru_cache_store(&sctx->name_cache, &nce->entry, GFP_KERNEL); if (nce_ret < 0) { kfree(nce); return nce_ret; } return ret; } /* * Magic happens here. This function returns the first ref to an inode as it * would look like while receiving the stream at this point in time. * We walk the path up to the root. For every inode in between, we check if it * was already processed/sent. If yes, we continue with the parent as found * in send_root. If not, we continue with the parent as found in parent_root. * If we encounter an inode that was deleted at this point in time, we use the * inodes "orphan" name instead of the real name and stop. Same with new inodes * that were not created yet and overwritten inodes/refs. * * When do we have orphan inodes: * 1. When an inode is freshly created and thus no valid refs are available yet * 2. When a directory lost all it's refs (deleted) but still has dir items * inside which were not processed yet (pending for move/delete). If anyone * tried to get the path to the dir items, it would get a path inside that * orphan directory. * 3. When an inode is moved around or gets new links, it may overwrite the ref * of an unprocessed inode. If in that case the first ref would be * overwritten, the overwritten inode gets "orphanized". Later when we * process this overwritten inode, it is restored at a new place by moving * the orphan inode. * * sctx->send_progress tells this function at which point in time receiving * would be. */ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen, struct fs_path *dest) { int ret = 0; struct fs_path *name = NULL; u64 parent_inode = 0; u64 parent_gen = 0; int stop = 0; const bool is_cur_inode = (ino == sctx->cur_ino && gen == sctx->cur_inode_gen); if (is_cur_inode && fs_path_len(&sctx->cur_inode_path) > 0) { if (dest != &sctx->cur_inode_path) return fs_path_copy(dest, &sctx->cur_inode_path); return 0; } name = fs_path_alloc(); if (!name) { ret = -ENOMEM; goto out; } dest->reversed = 1; fs_path_reset(dest); while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) { struct waiting_dir_move *wdm; fs_path_reset(name); if (is_waiting_for_rm(sctx, ino, gen)) { ret = gen_unique_name(sctx, ino, gen, name); if (ret < 0) goto out; ret = fs_path_add_path(dest, name); break; } wdm = get_waiting_dir_move(sctx, ino); if (wdm && wdm->orphanized) { ret = gen_unique_name(sctx, ino, gen, name); stop = 1; } else if (wdm) { ret = get_first_ref(sctx->parent_root, ino, &parent_inode, &parent_gen, name); } else { ret = __get_cur_name_and_parent(sctx, ino, gen, &parent_inode, &parent_gen, name); if (ret) stop = 1; } if (ret < 0) goto out; ret = fs_path_add_path(dest, name); if (ret < 0) goto out; ino = parent_inode; gen = parent_gen; } out: fs_path_free(name); if (!ret) { fs_path_unreverse(dest); if (is_cur_inode && dest != &sctx->cur_inode_path) ret = fs_path_copy(&sctx->cur_inode_path, dest); } return ret; } /* * Sends a BTRFS_SEND_C_SUBVOL command/item to userspace */ static int send_subvol_begin(struct send_ctx *sctx) { int ret; struct btrfs_root *send_root = sctx->send_root; struct btrfs_root *parent_root = sctx->parent_root; BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_root_ref *ref; struct extent_buffer *leaf; char AUTO_KFREE(name); int namelen; path = btrfs_alloc_path(); if (!path) return -ENOMEM; name = kmalloc(BTRFS_PATH_NAME_MAX, GFP_KERNEL); if (!name) return -ENOMEM; key.objectid = btrfs_root_id(send_root); key.type = BTRFS_ROOT_BACKREF_KEY; key.offset = 0; ret = btrfs_search_slot_for_read(send_root->fs_info->tree_root, &key, path, 1, 0); if (ret < 0) return ret; if (ret) return -ENOENT; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); if (key.type != BTRFS_ROOT_BACKREF_KEY || key.objectid != btrfs_root_id(send_root)) { return -ENOENT; } ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); namelen = btrfs_root_ref_name_len(leaf, ref); read_extent_buffer(leaf, name, (unsigned long)(ref + 1), namelen); btrfs_release_path(path); if (parent_root) { ret = begin_cmd(sctx, BTRFS_SEND_C_SNAPSHOT); if (ret < 0) return ret; } else { ret = begin_cmd(sctx, BTRFS_SEND_C_SUBVOL); if (ret < 0) return ret; } TLV_PUT_STRING(sctx, BTRFS_SEND_A_PATH, name, namelen); if (!btrfs_is_empty_uuid(sctx->send_root->root_item.received_uuid)) TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID, sctx->send_root->root_item.received_uuid); else TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID, sctx->send_root->root_item.uuid); TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID, btrfs_root_ctransid(&sctx->send_root->root_item)); if (parent_root) { if (!btrfs_is_empty_uuid(parent_root->root_item.received_uuid)) TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, parent_root->root_item.received_uuid); else TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, parent_root->root_item.uuid); TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID, btrfs_root_ctransid(&sctx->parent_root->root_item)); } ret = send_cmd(sctx); tlv_put_failure: return ret; } static struct fs_path *get_cur_inode_path(struct send_ctx *sctx) { if (fs_path_len(&sctx->cur_inode_path) == 0) { int ret; ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, &sctx->cur_inode_path); if (ret < 0) return ERR_PTR(ret); } return &sctx->cur_inode_path; } static struct fs_path *get_path_for_command(struct send_ctx *sctx, u64 ino, u64 gen) { struct fs_path *path; int ret; if (ino == sctx->cur_ino && gen == sctx->cur_inode_gen) return get_cur_inode_path(sctx); path = fs_path_alloc(); if (!path) return ERR_PTR(-ENOMEM); ret = get_cur_path(sctx, ino, gen, path); if (ret < 0) { fs_path_free(path); return ERR_PTR(ret); } return path; } static void free_path_for_command(const struct send_ctx *sctx, struct fs_path *path) { if (path != &sctx->cur_inode_path) fs_path_free(path); } static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size) { int ret = 0; struct fs_path *p; p = get_path_for_command(sctx, ino, gen); if (IS_ERR(p)) return PTR_ERR(p); ret = begin_cmd(sctx, BTRFS_SEND_C_TRUNCATE); if (ret < 0) goto out; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, size); ret = send_cmd(sctx); tlv_put_failure: out: free_path_for_command(sctx, p); return ret; } static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode) { int ret = 0; struct fs_path *p; p = get_path_for_command(sctx, ino, gen); if (IS_ERR(p)) return PTR_ERR(p); ret = begin_cmd(sctx, BTRFS_SEND_C_CHMOD); if (ret < 0) goto out; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode & 07777); ret = send_cmd(sctx); tlv_put_failure: out: free_path_for_command(sctx, p); return ret; } static int send_fileattr(struct send_ctx *sctx, u64 ino, u64 gen, u64 fileattr) { int ret = 0; struct fs_path *p; if (sctx->proto < 2) return 0; p = get_path_for_command(sctx, ino, gen); if (IS_ERR(p)) return PTR_ERR(p); ret = begin_cmd(sctx, BTRFS_SEND_C_FILEATTR); if (ret < 0) goto out; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_FILEATTR, fileattr); ret = send_cmd(sctx); tlv_put_failure: out: free_path_for_command(sctx, p); return ret; } static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid) { int ret = 0; struct fs_path *p; p = get_path_for_command(sctx, ino, gen); if (IS_ERR(p)) return PTR_ERR(p); ret = begin_cmd(sctx, BTRFS_SEND_C_CHOWN); if (ret < 0) goto out; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_UID, uid); TLV_PUT_U64(sctx, BTRFS_SEND_A_GID, gid); ret = send_cmd(sctx); tlv_put_failure: out: free_path_for_command(sctx, p); return ret; } static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen) { int ret = 0; struct fs_path *p = NULL; struct btrfs_inode_item *ii; BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *eb; struct btrfs_key key; int slot; p = get_path_for_command(sctx, ino, gen); if (IS_ERR(p)) return PTR_ERR(p); path = alloc_path_for_send(); if (!path) { ret = -ENOMEM; goto out; } key.objectid = ino; key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0); if (ret > 0) ret = -ENOENT; if (ret < 0) goto out; eb = path->nodes[0]; slot = path->slots[0]; ii = btrfs_item_ptr(eb, slot, struct btrfs_inode_item); ret = begin_cmd(sctx, BTRFS_SEND_C_UTIMES); if (ret < 0) goto out; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb, &ii->atime); TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb, &ii->mtime); TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb, &ii->ctime); if (sctx->proto >= 2) TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_OTIME, eb, &ii->otime); ret = send_cmd(sctx); tlv_put_failure: out: free_path_for_command(sctx, p); return ret; } /* * If the cache is full, we can't remove entries from it and do a call to * send_utimes() for each respective inode, because we might be finishing * processing an inode that is a directory and it just got renamed, and existing * entries in the cache may refer to inodes that have the directory in their * full path - in which case we would generate outdated paths (pre-rename) * for the inodes that the cache entries point to. Instead of pruning the * cache when inserting, do it after we finish processing each inode at * finish_inode_if_needed(). */ static int cache_dir_utimes(struct send_ctx *sctx, u64 dir, u64 gen) { struct btrfs_lru_cache_entry *entry; int ret; entry = btrfs_lru_cache_lookup(&sctx->dir_utimes_cache, dir, gen); if (entry != NULL) return 0; /* Caching is optional, don't fail if we can't allocate memory. */ entry = kmalloc_obj(*entry); if (!entry) return send_utimes(sctx, dir, gen); entry->key = dir; entry->gen = gen; ret = btrfs_lru_cache_store(&sctx->dir_utimes_cache, entry, GFP_KERNEL); ASSERT(ret != -EEXIST); if (ret) { kfree(entry); return send_utimes(sctx, dir, gen); } return 0; } static int trim_dir_utimes_cache(struct send_ctx *sctx) { while (sctx->dir_utimes_cache.size > SEND_MAX_DIR_UTIMES_CACHE_SIZE) { struct btrfs_lru_cache_entry *lru; int ret; lru = btrfs_lru_cache_lru_entry(&sctx->dir_utimes_cache); ASSERT(lru != NULL); ret = send_utimes(sctx, lru->key, lru->gen); if (ret) return ret; btrfs_lru_cache_remove(&sctx->dir_utimes_cache, lru); } return 0; } /* * Sends a BTRFS_SEND_C_MKXXX or SYMLINK command to user space. We don't have * a valid path yet because we did not process the refs yet. So, the inode * is created as orphan. */ static int send_create_inode(struct send_ctx *sctx, u64 ino) { int ret = 0; struct fs_path *p; int cmd; struct btrfs_inode_info info; u64 gen; u64 mode; u64 rdev; p = fs_path_alloc(); if (!p) return -ENOMEM; if (ino != sctx->cur_ino) { ret = get_inode_info(sctx->send_root, ino, &info); if (ret < 0) goto out; gen = info.gen; mode = info.mode; rdev = info.rdev; } else { gen = sctx->cur_inode_gen; mode = sctx->cur_inode_mode; rdev = sctx->cur_inode_rdev; } if (S_ISREG(mode)) { cmd = BTRFS_SEND_C_MKFILE; } else if (S_ISDIR(mode)) { cmd = BTRFS_SEND_C_MKDIR; } else if (S_ISLNK(mode)) { cmd = BTRFS_SEND_C_SYMLINK; } else if (S_ISCHR(mode) || S_ISBLK(mode)) { cmd = BTRFS_SEND_C_MKNOD; } else if (S_ISFIFO(mode)) { cmd = BTRFS_SEND_C_MKFIFO; } else if (S_ISSOCK(mode)) { cmd = BTRFS_SEND_C_MKSOCK; } else { btrfs_warn(sctx->send_root->fs_info, "unexpected inode type %o", (int)(mode & S_IFMT)); ret = -EOPNOTSUPP; goto out; } ret = begin_cmd(sctx, cmd); if (ret < 0) goto out; ret = gen_unique_name(sctx, ino, gen, p); if (ret < 0) goto out; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_INO, ino); if (S_ISLNK(mode)) { fs_path_reset(p); ret = read_symlink(sctx->send_root, ino, p); if (ret < 0) goto out; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p); } else if (S_ISCHR(mode) || S_ISBLK(mode) || S_ISFIFO(mode) || S_ISSOCK(mode)) { TLV_PUT_U64(sctx, BTRFS_SEND_A_RDEV, new_encode_dev(rdev)); TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode); } ret = send_cmd(sctx); if (ret < 0) goto out; tlv_put_failure: out: fs_path_free(p); return ret; } static void cache_dir_created(struct send_ctx *sctx, u64 dir) { struct btrfs_lru_cache_entry *entry; int ret; /* Caching is optional, ignore any failures. */ entry = kmalloc_obj(*entry); if (!entry) return; entry->key = dir; entry->gen = 0; ret = btrfs_lru_cache_store(&sctx->dir_created_cache, entry, GFP_KERNEL); if (ret < 0) kfree(entry); } /* * We need some special handling for inodes that get processed before the parent * directory got created. See process_recorded_refs for details. * This function does the check if we already created the dir out of order. */ static int did_create_dir(struct send_ctx *sctx, u64 dir) { int ret = 0; int iter_ret = 0; BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_key found_key; struct btrfs_key di_key; struct btrfs_dir_item *di; if (btrfs_lru_cache_lookup(&sctx->dir_created_cache, dir, 0)) return 1; path = alloc_path_for_send(); if (!path) return -ENOMEM; key.objectid = dir; key.type = BTRFS_DIR_INDEX_KEY; key.offset = 0; btrfs_for_each_slot(sctx->send_root, &key, &found_key, path, iter_ret) { struct extent_buffer *eb = path->nodes[0]; if (found_key.objectid != key.objectid || found_key.type != key.type) { ret = 0; break; } di = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dir_item); btrfs_dir_item_key_to_cpu(eb, di, &di_key); if (di_key.type != BTRFS_ROOT_ITEM_KEY && di_key.objectid < sctx->send_progress) { ret = 1; cache_dir_created(sctx, dir); break; } } /* Catch error found during iteration */ if (iter_ret < 0) ret = iter_ret; return ret; } /* * Only creates the inode if it is: * 1. Not a directory * 2. Or a directory which was not created already due to out of order * directories. See did_create_dir and process_recorded_refs for details. */ static int send_create_inode_if_needed(struct send_ctx *sctx) { int ret; if (S_ISDIR(sctx->cur_inode_mode)) { ret = did_create_dir(sctx, sctx->cur_ino); if (ret < 0) return ret; else if (ret > 0) return 0; } ret = send_create_inode(sctx, sctx->cur_ino); if (ret == 0 && S_ISDIR(sctx->cur_inode_mode)) cache_dir_created(sctx, sctx->cur_ino); return ret; } struct recorded_ref { struct list_head list; char *name; struct fs_path *full_path; u64 dir; u64 dir_gen; int name_len; struct rb_node node; struct rb_root *root; }; static struct recorded_ref *recorded_ref_alloc(void) { struct recorded_ref *ref; ref = kzalloc_obj(*ref); if (!ref) return NULL; RB_CLEAR_NODE(&ref->node); INIT_LIST_HEAD(&ref->list); return ref; } static void recorded_ref_free(struct recorded_ref *ref) { if (!ref) return; if (!RB_EMPTY_NODE(&ref->node)) rb_erase(&ref->node, ref->root); list_del(&ref->list); fs_path_free(ref->full_path); kfree(ref); } static void set_ref_path(struct recorded_ref *ref, struct fs_path *path) { ref->full_path = path; ref->name = (char *)kbasename(ref->full_path->start); ref->name_len = ref->full_path->end - ref->name; } static int dup_ref(struct recorded_ref *ref, struct list_head *list) { struct recorded_ref *new; new = recorded_ref_alloc(); if (!new) return -ENOMEM; new->dir = ref->dir; new->dir_gen = ref->dir_gen; list_add_tail(&new->list, list); return 0; } static void __free_recorded_refs(struct list_head *head) { struct recorded_ref *cur; while (!list_empty(head)) { cur = list_first_entry(head, struct recorded_ref, list); recorded_ref_free(cur); } } static void free_recorded_refs(struct send_ctx *sctx) { __free_recorded_refs(&sctx->new_refs); __free_recorded_refs(&sctx->deleted_refs); } /* * Renames/moves a file/dir to its orphan name. Used when the first * ref of an unprocessed inode gets overwritten and for all non empty * directories. */ static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen, struct fs_path *path) { int ret; struct fs_path *orphan; orphan = fs_path_alloc(); if (!orphan) return -ENOMEM; ret = gen_unique_name(sctx, ino, gen, orphan); if (ret < 0) goto out; ret = send_rename(sctx, path, orphan); if (ret < 0) goto out; if (ino == sctx->cur_ino && gen == sctx->cur_inode_gen) ret = fs_path_copy(&sctx->cur_inode_path, orphan); out: fs_path_free(orphan); return ret; } static struct orphan_dir_info *add_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino, u64 dir_gen) { struct rb_node **p = &sctx->orphan_dirs.rb_node; struct rb_node *parent = NULL; struct orphan_dir_info *entry, *odi; while (*p) { parent = *p; entry = rb_entry(parent, struct orphan_dir_info, node); if (dir_ino < entry->ino) p = &(*p)->rb_left; else if (dir_ino > entry->ino) p = &(*p)->rb_right; else if (dir_gen < entry->gen) p = &(*p)->rb_left; else if (dir_gen > entry->gen) p = &(*p)->rb_right; else return entry; } odi = kmalloc_obj(*odi); if (!odi) return ERR_PTR(-ENOMEM); odi->ino = dir_ino; odi->gen = dir_gen; odi->last_dir_index_offset = 0; odi->dir_high_seq_ino = 0; rb_link_node(&odi->node, parent, p); rb_insert_color(&odi->node, &sctx->orphan_dirs); return odi; } static struct orphan_dir_info *get_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino, u64 gen) { struct rb_node *n = sctx->orphan_dirs.rb_node; struct orphan_dir_info *entry; while (n) { entry = rb_entry(n, struct orphan_dir_info, node); if (dir_ino < entry->ino) n = n->rb_left; else if (dir_ino > entry->ino) n = n->rb_right; else if (gen < entry->gen) n = n->rb_left; else if (gen > entry->gen) n = n->rb_right; else return entry; } return NULL; } static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino, u64 gen) { struct orphan_dir_info *odi = get_orphan_dir_info(sctx, dir_ino, gen); return odi != NULL; } static void free_orphan_dir_info(struct send_ctx *sctx, struct orphan_dir_info *odi) { if (!odi) return; rb_erase(&odi->node, &sctx->orphan_dirs); kfree(odi); } /* * Returns 1 if a directory can be removed at this point in time. * We check this by iterating all dir items and checking if the inode behind * the dir item was already processed. */ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen) { int ret = 0; int iter_ret = 0; struct btrfs_root *root = sctx->parent_root; struct btrfs_path *path; struct btrfs_key key; struct btrfs_key found_key; struct btrfs_key loc; struct btrfs_dir_item *di; struct orphan_dir_info *odi = NULL; u64 dir_high_seq_ino = 0; u64 last_dir_index_offset = 0; /* * Don't try to rmdir the top/root subvolume dir. */ if (dir == BTRFS_FIRST_FREE_OBJECTID) return 0; odi = get_orphan_dir_info(sctx, dir, dir_gen); if (odi && sctx->cur_ino < odi->dir_high_seq_ino) return 0; path = alloc_path_for_send(); if (!path) return -ENOMEM; if (!odi) { /* * Find the inode number associated with the last dir index * entry. This is very likely the inode with the highest number * of all inodes that have an entry in the directory. We can * then use it to avoid future calls to can_rmdir(), when * processing inodes with a lower number, from having to search * the parent root b+tree for dir index keys. */ key.objectid = dir; key.type = BTRFS_DIR_INDEX_KEY; key.offset = (u64)-1; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) { goto out; } else if (ret > 0) { /* Can't happen, the root is never empty. */ ASSERT(path->slots[0] > 0); if (WARN_ON(path->slots[0] == 0)) { ret = -EUCLEAN; goto out; } path->slots[0]--; } btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); if (key.objectid != dir || key.type != BTRFS_DIR_INDEX_KEY) { /* No index keys, dir can be removed. */ ret = 1; goto out; } di = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_dir_item); btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc); dir_high_seq_ino = loc.objectid; if (sctx->cur_ino < dir_high_seq_ino) { ret = 0; goto out; } btrfs_release_path(path); } key.objectid = dir; key.type = BTRFS_DIR_INDEX_KEY; key.offset = (odi ? odi->last_dir_index_offset : 0); btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { struct waiting_dir_move *dm; if (found_key.objectid != key.objectid || found_key.type != key.type) break; di = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_dir_item); btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc); dir_high_seq_ino = max(dir_high_seq_ino, loc.objectid); last_dir_index_offset = found_key.offset; dm = get_waiting_dir_move(sctx, loc.objectid); if (dm) { dm->rmdir_ino = dir; dm->rmdir_gen = dir_gen; ret = 0; goto out; } if (loc.objectid > sctx->cur_ino) { ret = 0; goto out; } } if (iter_ret < 0) { ret = iter_ret; goto out; } free_orphan_dir_info(sctx, odi); ret = 1; out: btrfs_free_path(path); if (ret) return ret; if (!odi) { odi = add_orphan_dir_info(sctx, dir, dir_gen); if (IS_ERR(odi)) return PTR_ERR(odi); odi->gen = dir_gen; } odi->last_dir_index_offset = last_dir_index_offset; odi->dir_high_seq_ino = max(odi->dir_high_seq_ino, dir_high_seq_ino); return 0; } static int is_waiting_for_move(struct send_ctx *sctx, u64 ino) { struct waiting_dir_move *entry = get_waiting_dir_move(sctx, ino); return entry != NULL; } static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino, bool orphanized) { struct rb_node **p = &sctx->waiting_dir_moves.rb_node; struct rb_node *parent = NULL; struct waiting_dir_move *entry, *dm; dm = kmalloc_obj(*dm); if (!dm) return -ENOMEM; dm->ino = ino; dm->rmdir_ino = 0; dm->rmdir_gen = 0; dm->orphanized = orphanized; while (*p) { parent = *p; entry = rb_entry(parent, struct waiting_dir_move, node); if (ino < entry->ino) { p = &(*p)->rb_left; } else if (ino > entry->ino) { p = &(*p)->rb_right; } else { kfree(dm); return -EEXIST; } } rb_link_node(&dm->node, parent, p); rb_insert_color(&dm->node, &sctx->waiting_dir_moves); return 0; } static struct waiting_dir_move * get_waiting_dir_move(struct send_ctx *sctx, u64 ino) { struct rb_node *n = sctx->waiting_dir_moves.rb_node; struct waiting_dir_move *entry; while (n) { entry = rb_entry(n, struct waiting_dir_move, node); if (ino < entry->ino) n = n->rb_left; else if (ino > entry->ino) n = n->rb_right; else return entry; } return NULL; } static void free_waiting_dir_move(struct send_ctx *sctx, struct waiting_dir_move *dm) { if (!dm) return; rb_erase(&dm->node, &sctx->waiting_dir_moves); kfree(dm); } static int add_pending_dir_move(struct send_ctx *sctx, u64 ino, u64 ino_gen, u64 parent_ino, struct list_head *new_refs, struct list_head *deleted_refs, const bool is_orphan) { struct rb_node **p = &sctx->pending_dir_moves.rb_node; struct rb_node *parent = NULL; struct pending_dir_move *entry = NULL, *pm; struct recorded_ref *cur; int exists = 0; int ret; pm = kmalloc_obj(*pm); if (!pm) return -ENOMEM; pm->parent_ino = parent_ino; pm->ino = ino; pm->gen = ino_gen; INIT_LIST_HEAD(&pm->list); INIT_LIST_HEAD(&pm->update_refs); RB_CLEAR_NODE(&pm->node); while (*p) { parent = *p; entry = rb_entry(parent, struct pending_dir_move, node); if (parent_ino < entry->parent_ino) { p = &(*p)->rb_left; } else if (parent_ino > entry->parent_ino) { p = &(*p)->rb_right; } else { exists = 1; break; } } list_for_each_entry(cur, deleted_refs, list) { ret = dup_ref(cur, &pm->update_refs); if (ret < 0) goto out; } list_for_each_entry(cur, new_refs, list) { ret = dup_ref(cur, &pm->update_refs); if (ret < 0) goto out; } ret = add_waiting_dir_move(sctx, pm->ino, is_orphan); if (ret) goto out; if (exists) { list_add_tail(&pm->list, &entry->list); } else { rb_link_node(&pm->node, parent, p); rb_insert_color(&pm->node, &sctx->pending_dir_moves); } ret = 0; out: if (ret) { __free_recorded_refs(&pm->update_refs); kfree(pm); } return ret; } static struct pending_dir_move *get_pending_dir_moves(struct send_ctx *sctx, u64 parent_ino) { struct rb_node *n = sctx->pending_dir_moves.rb_node; struct pending_dir_move *entry; while (n) { entry = rb_entry(n, struct pending_dir_move, node); if (parent_ino < entry->parent_ino) n = n->rb_left; else if (parent_ino > entry->parent_ino) n = n->rb_right; else return entry; } return NULL; } static int path_loop(struct send_ctx *sctx, struct fs_path *name, u64 ino, u64 gen, u64 *ancestor_ino) { int ret = 0; u64 parent_inode = 0; u64 parent_gen = 0; u64 start_ino = ino; *ancestor_ino = 0; while (ino != BTRFS_FIRST_FREE_OBJECTID) { fs_path_reset(name); if (is_waiting_for_rm(sctx, ino, gen)) break; if (is_waiting_for_move(sctx, ino)) { if (*ancestor_ino == 0) *ancestor_ino = ino; ret = get_first_ref(sctx->parent_root, ino, &parent_inode, &parent_gen, name); } else { ret = __get_cur_name_and_parent(sctx, ino, gen, &parent_inode, &parent_gen, name); if (ret > 0) { ret = 0; break; } } if (ret < 0) break; if (parent_inode == start_ino) { ret = 1; if (*ancestor_ino == 0) *ancestor_ino = ino; break; } ino = parent_inode; gen = parent_gen; } return ret; } static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) { struct fs_path *from_path = NULL; struct fs_path *to_path = NULL; struct fs_path *name = NULL; u64 orig_progress = sctx->send_progress; struct recorded_ref *cur; u64 parent_ino, parent_gen; struct waiting_dir_move *dm = NULL; u64 rmdir_ino = 0; u64 rmdir_gen; u64 ancestor; bool is_orphan; int ret; name = fs_path_alloc(); from_path = fs_path_alloc(); if (!name || !from_path) { ret = -ENOMEM; goto out; } dm = get_waiting_dir_move(sctx, pm->ino); ASSERT(dm); rmdir_ino = dm->rmdir_ino; rmdir_gen = dm->rmdir_gen; is_orphan = dm->orphanized; free_waiting_dir_move(sctx, dm); if (is_orphan) { ret = gen_unique_name(sctx, pm->ino, pm->gen, from_path); } else { ret = get_first_ref(sctx->parent_root, pm->ino, &parent_ino, &parent_gen, name); if (ret < 0) goto out; ret = get_cur_path(sctx, parent_ino, parent_gen, from_path); if (ret < 0) goto out; ret = fs_path_add_path(from_path, name); } if (ret < 0) goto out; sctx->send_progress = sctx->cur_ino + 1; ret = path_loop(sctx, name, pm->ino, pm->gen, &ancestor); if (ret < 0) goto out; if (ret) { LIST_HEAD(deleted_refs); ASSERT(ancestor > BTRFS_FIRST_FREE_OBJECTID); ret = add_pending_dir_move(sctx, pm->ino, pm->gen, ancestor, &pm->update_refs, &deleted_refs, is_orphan); if (ret < 0) goto out; if (rmdir_ino) { dm = get_waiting_dir_move(sctx, pm->ino); ASSERT(dm); dm->rmdir_ino = rmdir_ino; dm->rmdir_gen = rmdir_gen; } goto out; } fs_path_reset(name); to_path = name; name = NULL; ret = get_cur_path(sctx, pm->ino, pm->gen, to_path); if (ret < 0) goto out; ret = send_rename(sctx, from_path, to_path); if (ret < 0) goto out; if (rmdir_ino) { struct orphan_dir_info *odi; u64 gen; odi = get_orphan_dir_info(sctx, rmdir_ino, rmdir_gen); if (!odi) { /* already deleted */ goto finish; } gen = odi->gen; ret = can_rmdir(sctx, rmdir_ino, gen); if (ret < 0) goto out; if (!ret) goto finish; name = fs_path_alloc(); if (!name) { ret = -ENOMEM; goto out; } ret = get_cur_path(sctx, rmdir_ino, gen, name); if (ret < 0) goto out; ret = send_rmdir(sctx, name); if (ret < 0) goto out; } finish: ret = cache_dir_utimes(sctx, pm->ino, pm->gen); if (ret < 0) goto out; /* * After rename/move, need to update the utimes of both new parent(s) * and old parent(s). */ list_for_each_entry(cur, &pm->update_refs, list) { /* * The parent inode might have been deleted in the send snapshot */ ret = get_inode_info(sctx->send_root, cur->dir, NULL); if (ret == -ENOENT) { ret = 0; continue; } if (ret < 0) goto out; ret = cache_dir_utimes(sctx, cur->dir, cur->dir_gen); if (ret < 0) goto out; } out: fs_path_free(name); fs_path_free(from_path); fs_path_free(to_path); sctx->send_progress = orig_progress; return ret; } static void free_pending_move(struct send_ctx *sctx, struct pending_dir_move *m) { if (!list_empty(&m->list)) list_del(&m->list); if (!RB_EMPTY_NODE(&m->node)) rb_erase(&m->node, &sctx->pending_dir_moves); __free_recorded_refs(&m->update_refs); kfree(m); } static void tail_append_pending_moves(struct send_ctx *sctx, struct pending_dir_move *moves, struct list_head *stack) { if (list_empty(&moves->list)) { list_add_tail(&moves->list, stack); } else { LIST_HEAD(list); list_splice_init(&moves->list, &list); list_add_tail(&moves->list, stack); list_splice_tail(&list, stack); } if (!RB_EMPTY_NODE(&moves->node)) { rb_erase(&moves->node, &sctx->pending_dir_moves); RB_CLEAR_NODE(&moves->node); } } static int apply_children_dir_moves(struct send_ctx *sctx) { struct pending_dir_move *pm; LIST_HEAD(stack); u64 parent_ino = sctx->cur_ino; int ret = 0; pm = get_pending_dir_moves(sctx, parent_ino); if (!pm) return 0; tail_append_pending_moves(sctx, pm, &stack); while (!list_empty(&stack)) { pm = list_first_entry(&stack, struct pending_dir_move, list); parent_ino = pm->ino; ret = apply_dir_move(sctx, pm); free_pending_move(sctx, pm); if (ret) goto out; pm = get_pending_dir_moves(sctx, parent_ino); if (pm) tail_append_pending_moves(sctx, pm, &stack); } return 0; out: while (!list_empty(&stack)) { pm = list_first_entry(&stack, struct pending_dir_move, list); free_pending_move(sctx, pm); } return ret; } /* * We might need to delay a directory rename even when no ancestor directory * (in the send root) with a higher inode number than ours (sctx->cur_ino) was * renamed. This happens when we rename a directory to the old name (the name * in the parent root) of some other unrelated directory that got its rename * delayed due to some ancestor with higher number that got renamed. * * Example: * * Parent snapshot: * . (ino 256) * |---- a/ (ino 257) * | |---- file (ino 260) * | * |---- b/ (ino 258) * |---- c/ (ino 259) * * Send snapshot: * . (ino 256) * |---- a/ (ino 258) * |---- x/ (ino 259) * |---- y/ (ino 257) * |----- file (ino 260) * * Here we can not rename 258 from 'b' to 'a' without the rename of inode 257 * from 'a' to 'x/y' happening first, which in turn depends on the rename of * inode 259 from 'c' to 'x'. So the order of rename commands the send stream * must issue is: * * 1 - rename 259 from 'c' to 'x' * 2 - rename 257 from 'a' to 'x/y' * 3 - rename 258 from 'b' to 'a' * * Returns 1 if the rename of sctx->cur_ino needs to be delayed, 0 if it can * be done right away and < 0 on error. */ static int wait_for_dest_dir_move(struct send_ctx *sctx, struct recorded_ref *parent_ref, const bool is_orphan) { BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_key di_key; struct btrfs_dir_item *di; u64 left_gen; u64 right_gen; int ret = 0; struct waiting_dir_move *wdm; if (RB_EMPTY_ROOT(&sctx->waiting_dir_moves)) return 0; path = alloc_path_for_send(); if (!path) return -ENOMEM; key.objectid = parent_ref->dir; key.type = BTRFS_DIR_ITEM_KEY; key.offset = btrfs_name_hash(parent_ref->name, parent_ref->name_len); ret = btrfs_search_slot(NULL, sctx->parent_root, &key, path, 0, 0); if (ret < 0) return ret; if (ret > 0) return 0; di = btrfs_match_dir_item_name(path, parent_ref->name, parent_ref->name_len); if (!di) return 0; /* * di_key.objectid has the number of the inode that has a dentry in the * parent directory with the same name that sctx->cur_ino is being * renamed to. We need to check if that inode is in the send root as * well and if it is currently marked as an inode with a pending rename, * if it is, we need to delay the rename of sctx->cur_ino as well, so * that it happens after that other inode is renamed. */ btrfs_dir_item_key_to_cpu(path->nodes[0], di, &di_key); if (di_key.type != BTRFS_INODE_ITEM_KEY) return 0; ret = get_inode_gen(sctx->parent_root, di_key.objectid, &left_gen); if (ret < 0) return ret; ret = get_inode_gen(sctx->send_root, di_key.objectid, &right_gen); if (ret < 0) { if (ret == -ENOENT) ret = 0; return ret; } /* Different inode, no need to delay the rename of sctx->cur_ino */ if (right_gen != left_gen) return 0; wdm = get_waiting_dir_move(sctx, di_key.objectid); if (wdm && !wdm->orphanized) { ret = add_pending_dir_move(sctx, sctx->cur_ino, sctx->cur_inode_gen, di_key.objectid, &sctx->new_refs, &sctx->deleted_refs, is_orphan); if (!ret) ret = 1; } return ret; } /* * Check if inode ino2, or any of its ancestors, is inode ino1. * Return 1 if true, 0 if false and < 0 on error. */ static int check_ino_in_path(struct btrfs_root *root, const u64 ino1, const u64 ino1_gen, const u64 ino2, const u64 ino2_gen, struct fs_path *fs_path) { u64 ino = ino2; if (ino1 == ino2) return ino1_gen == ino2_gen; while (ino > BTRFS_FIRST_FREE_OBJECTID) { u64 parent; u64 parent_gen; int ret; fs_path_reset(fs_path); ret = get_first_ref(root, ino, &parent, &parent_gen, fs_path); if (ret < 0) return ret; if (parent == ino1) return parent_gen == ino1_gen; ino = parent; } return 0; } /* * Check if inode ino1 is an ancestor of inode ino2 in the given root for any * possible path (in case ino2 is not a directory and has multiple hard links). * Return 1 if true, 0 if false and < 0 on error. */ static int is_ancestor(struct btrfs_root *root, const u64 ino1, const u64 ino1_gen, const u64 ino2, struct fs_path *fs_path) { bool free_fs_path = false; int ret = 0; int iter_ret = 0; BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; if (!fs_path) { fs_path = fs_path_alloc(); if (!fs_path) return -ENOMEM; free_fs_path = true; } path = alloc_path_for_send(); if (!path) { ret = -ENOMEM; goto out; } key.objectid = ino2; key.type = BTRFS_INODE_REF_KEY; key.offset = 0; btrfs_for_each_slot(root, &key, &key, path, iter_ret) { struct extent_buffer *leaf = path->nodes[0]; int slot = path->slots[0]; u32 cur_offset = 0; u32 item_size; if (key.objectid != ino2) break; if (key.type != BTRFS_INODE_REF_KEY && key.type != BTRFS_INODE_EXTREF_KEY) break; item_size = btrfs_item_size(leaf, slot); while (cur_offset < item_size) { u64 parent; u64 parent_gen; if (key.type == BTRFS_INODE_EXTREF_KEY) { unsigned long ptr; struct btrfs_inode_extref *extref; ptr = btrfs_item_ptr_offset(leaf, slot); extref = (struct btrfs_inode_extref *) (ptr + cur_offset); parent = btrfs_inode_extref_parent(leaf, extref); cur_offset += sizeof(*extref); cur_offset += btrfs_inode_extref_name_len(leaf, extref); } else { parent = key.offset; cur_offset = item_size; } ret = get_inode_gen(root, parent, &parent_gen); if (ret < 0) goto out; ret = check_ino_in_path(root, ino1, ino1_gen, parent, parent_gen, fs_path); if (ret) goto out; } } ret = 0; if (iter_ret < 0) ret = iter_ret; out: if (free_fs_path) fs_path_free(fs_path); return ret; } static int wait_for_parent_move(struct send_ctx *sctx, struct recorded_ref *parent_ref, const bool is_orphan) { int ret = 0; u64 ino = parent_ref->dir; u64 ino_gen = parent_ref->dir_gen; u64 parent_ino_before, parent_ino_after; struct fs_path *path_before = NULL; struct fs_path *path_after = NULL; int len1, len2; path_after = fs_path_alloc(); path_before = fs_path_alloc(); if (!path_after || !path_before) { ret = -ENOMEM; goto out; } /* * Our current directory inode may not yet be renamed/moved because some * ancestor (immediate or not) has to be renamed/moved first. So find if * such ancestor exists and make sure our own rename/move happens after * that ancestor is processed to avoid path build infinite loops (done * at get_cur_path()). */ while (ino > BTRFS_FIRST_FREE_OBJECTID) { u64 parent_ino_after_gen; if (is_waiting_for_move(sctx, ino)) { /* * If the current inode is an ancestor of ino in the * parent root, we need to delay the rename of the * current inode, otherwise don't delayed the rename * because we can end up with a circular dependency * of renames, resulting in some directories never * getting the respective rename operations issued in * the send stream or getting into infinite path build * loops. */ ret = is_ancestor(sctx->parent_root, sctx->cur_ino, sctx->cur_inode_gen, ino, path_before); if (ret) break; } fs_path_reset(path_before); fs_path_reset(path_after); ret = get_first_ref(sctx->send_root, ino, &parent_ino_after, &parent_ino_after_gen, path_after); if (ret < 0) goto out; ret = get_first_ref(sctx->parent_root, ino, &parent_ino_before, NULL, path_before); if (ret < 0 && ret != -ENOENT) { goto out; } else if (ret == -ENOENT) { ret = 0; break; } len1 = fs_path_len(path_before); len2 = fs_path_len(path_after); if (ino > sctx->cur_ino && (parent_ino_before != parent_ino_after || len1 != len2 || memcmp(path_before->start, path_after->start, len1))) { u64 parent_ino_gen; ret = get_inode_gen(sctx->parent_root, ino, &parent_ino_gen); if (ret < 0) goto out; if (ino_gen == parent_ino_gen) { ret = 1; break; } } ino = parent_ino_after; ino_gen = parent_ino_after_gen; } out: fs_path_free(path_before); fs_path_free(path_after); if (ret == 1) { ret = add_pending_dir_move(sctx, sctx->cur_ino, sctx->cur_inode_gen, ino, &sctx->new_refs, &sctx->deleted_refs, is_orphan); if (!ret) ret = 1; } return ret; } static int update_ref_path(struct send_ctx *sctx, struct recorded_ref *ref) { int ret; struct fs_path *new_path; /* * Our reference's name member points to its full_path member string, so * we use here a new path. */ new_path = fs_path_alloc(); if (!new_path) return -ENOMEM; ret = get_cur_path(sctx, ref->dir, ref->dir_gen, new_path); if (ret < 0) { fs_path_free(new_path); return ret; } ret = fs_path_add(new_path, ref->name, ref->name_len); if (ret < 0) { fs_path_free(new_path); return ret; } fs_path_free(ref->full_path); set_ref_path(ref, new_path); return 0; } /* * When processing the new references for an inode we may orphanize an existing * directory inode because its old name conflicts with one of the new references * of the current inode. Later, when processing another new reference of our * inode, we might need to orphanize another inode, but the path we have in the * reference reflects the pre-orphanization name of the directory we previously * orphanized. For example: * * parent snapshot looks like: * * . (ino 256) * |----- f1 (ino 257) * |----- f2 (ino 258) * |----- d1/ (ino 259) * |----- d2/ (ino 260) * * send snapshot looks like: * * . (ino 256) * |----- d1 (ino 258) * |----- f2/ (ino 259) * |----- f2_link/ (ino 260) * | |----- f1 (ino 257) * | * |----- d2 (ino 258) * * When processing inode 257 we compute the name for inode 259 as "d1", and we * cache it in the name cache. Later when we start processing inode 258, when * collecting all its new references we set a full path of "d1/d2" for its new * reference with name "d2". When we start processing the new references we * start by processing the new reference with name "d1", and this results in * orphanizing inode 259, since its old reference causes a conflict. Then we * move on the next new reference, with name "d2", and we find out we must * orphanize inode 260, as its old reference conflicts with ours - but for the * orphanization we use a source path corresponding to the path we stored in the * new reference, which is "d1/d2" and not "o259-6-0/d2" - this makes the * receiver fail since the path component "d1/" no longer exists, it was renamed * to "o259-6-0/" when processing the previous new reference. So in this case we * must recompute the path in the new reference and use it for the new * orphanization operation. */ static int refresh_ref_path(struct send_ctx *sctx, struct recorded_ref *ref) { char AUTO_KFREE(name); int ret; name = kmemdup(ref->name, ref->name_len, GFP_KERNEL); if (!name) return -ENOMEM; fs_path_reset(ref->full_path); ret = get_cur_path(sctx, ref->dir, ref->dir_gen, ref->full_path); if (ret < 0) return ret; ret = fs_path_add(ref->full_path, name, ref->name_len); if (ret < 0) return ret; /* Update the reference's base name pointer. */ set_ref_path(ref, ref->full_path); return 0; } static int rbtree_check_dir_ref_comp(const void *k, const struct rb_node *node) { const struct recorded_ref *data = k; const struct recorded_ref *ref = rb_entry(node, struct recorded_ref, node); if (data->dir > ref->dir) return 1; if (data->dir < ref->dir) return -1; if (data->dir_gen > ref->dir_gen) return 1; if (data->dir_gen < ref->dir_gen) return -1; return 0; } static bool rbtree_check_dir_ref_less(struct rb_node *node, const struct rb_node *parent) { const struct recorded_ref *entry = rb_entry(node, struct recorded_ref, node); return rbtree_check_dir_ref_comp(entry, parent) < 0; } static int record_check_dir_ref_in_tree(struct rb_root *root, struct recorded_ref *ref, struct list_head *list) { struct recorded_ref *tmp_ref; int ret; if (rb_find(ref, root, rbtree_check_dir_ref_comp)) return 0; ret = dup_ref(ref, list); if (ret < 0) return ret; tmp_ref = list_last_entry(list, struct recorded_ref, list); rb_add(&tmp_ref->node, root, rbtree_check_dir_ref_less); tmp_ref->root = root; return 0; } static int rename_current_inode(struct send_ctx *sctx, struct fs_path *current_path, struct fs_path *new_path) { int ret; ret = send_rename(sctx, current_path, new_path); if (ret < 0) return ret; ret = fs_path_copy(&sctx->cur_inode_path, new_path); if (ret < 0) return ret; return fs_path_copy(current_path, new_path); } /* * This does all the move/link/unlink/rmdir magic. */ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) { struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; int ret = 0; struct recorded_ref *cur; struct recorded_ref *cur2; LIST_HEAD(check_dirs); struct rb_root rbtree_check_dirs = RB_ROOT; struct fs_path *valid_path = NULL; u64 ow_inode = 0; u64 ow_gen; u64 ow_mode; bool did_overwrite = false; bool is_orphan = false; bool can_rename = true; bool orphanized_dir = false; bool orphanized_ancestor = false; /* * This should never happen as the root dir always has the same ref * which is always '..' */ if (unlikely(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID)) { btrfs_err(fs_info, "send: unexpected inode %llu in process_recorded_refs()", sctx->cur_ino); ret = -EINVAL; goto out; } valid_path = fs_path_alloc(); if (!valid_path) { ret = -ENOMEM; goto out; } /* * First, check if the first ref of the current inode was overwritten * before. If yes, we know that the current inode was already orphanized * and thus use the orphan name. If not, we can use get_cur_path to * get the path of the first ref as it would like while receiving at * this point in time. * New inodes are always orphan at the beginning, so force to use the * orphan name in this case. * The first ref is stored in valid_path and will be updated if it * gets moved around. */ if (!sctx->cur_inode_new) { ret = did_overwrite_first_ref(sctx, sctx->cur_ino, sctx->cur_inode_gen); if (ret < 0) goto out; if (ret) did_overwrite = true; } if (sctx->cur_inode_new || did_overwrite) { ret = gen_unique_name(sctx, sctx->cur_ino, sctx->cur_inode_gen, valid_path); if (ret < 0) goto out; is_orphan = true; } else { ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, valid_path); if (ret < 0) goto out; } /* * Before doing any rename and link operations, do a first pass on the * new references to orphanize any unprocessed inodes that may have a * reference that conflicts with one of the new references of the current * inode. This needs to happen first because a new reference may conflict * with the old reference of a parent directory, so we must make sure * that the path used for link and rename commands don't use an * orphanized name when an ancestor was not yet orphanized. * * Example: * * Parent snapshot: * * . (ino 256) * |----- testdir/ (ino 259) * | |----- a (ino 257) * | * |----- b (ino 258) * * Send snapshot: * * . (ino 256) * |----- testdir_2/ (ino 259) * | |----- a (ino 260) * | * |----- testdir (ino 257) * |----- b (ino 257) * |----- b2 (ino 258) * * Processing the new reference for inode 257 with name "b" may happen * before processing the new reference with name "testdir". If so, we * must make sure that by the time we send a link command to create the * hard link "b", inode 259 was already orphanized, since the generated * path in "valid_path" already contains the orphanized name for 259. * We are processing inode 257, so only later when processing 259 we do * the rename operation to change its temporary (orphanized) name to * "testdir_2". */ list_for_each_entry(cur, &sctx->new_refs, list) { ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen, NULL, NULL); if (ret < 0) goto out; if (ret == inode_state_will_create) continue; /* * Check if this new ref would overwrite the first ref of another * unprocessed inode. If yes, orphanize the overwritten inode. * If we find an overwritten ref that is not the first ref, * simply unlink it. */ ret = will_overwrite_ref(sctx, cur->dir, cur->dir_gen, cur->name, cur->name_len, &ow_inode, &ow_gen, &ow_mode); if (ret < 0) goto out; if (ret) { ret = is_first_ref(sctx->parent_root, ow_inode, cur->dir, cur->name, cur->name_len); if (ret < 0) goto out; if (ret) { struct name_cache_entry *nce; struct waiting_dir_move *wdm; if (orphanized_dir) { ret = refresh_ref_path(sctx, cur); if (ret < 0) goto out; } ret = orphanize_inode(sctx, ow_inode, ow_gen, cur->full_path); if (ret < 0) goto out; if (S_ISDIR(ow_mode)) orphanized_dir = true; /* * If ow_inode has its rename operation delayed * make sure that its orphanized name is used in * the source path when performing its rename * operation. */ wdm = get_waiting_dir_move(sctx, ow_inode); if (wdm) wdm->orphanized = true; /* * Make sure we clear our orphanized inode's * name from the name cache. This is because the * inode ow_inode might be an ancestor of some * other inode that will be orphanized as well * later and has an inode number greater than * sctx->send_progress. We need to prevent * future name lookups from using the old name * and get instead the orphan name. */ nce = name_cache_search(sctx, ow_inode, ow_gen); if (nce) btrfs_lru_cache_remove(&sctx->name_cache, &nce->entry); /* * ow_inode might currently be an ancestor of * cur_ino, therefore compute valid_path (the * current path of cur_ino) again because it * might contain the pre-orphanization name of * ow_inode, which is no longer valid. */ ret = is_ancestor(sctx->parent_root, ow_inode, ow_gen, sctx->cur_ino, NULL); if (ret > 0) { orphanized_ancestor = true; fs_path_reset(valid_path); fs_path_reset(&sctx->cur_inode_path); ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, valid_path); } if (ret < 0) goto out; } else { /* * If we previously orphanized a directory that * collided with a new reference that we already * processed, recompute the current path because * that directory may be part of the path. */ if (orphanized_dir) { ret = refresh_ref_path(sctx, cur); if (ret < 0) goto out; } ret = send_unlink(sctx, cur->full_path); if (ret < 0) goto out; } } } list_for_each_entry(cur, &sctx->new_refs, list) { /* * We may have refs where the parent directory does not exist * yet. This happens if the parent directories inum is higher * than the current inum. To handle this case, we create the * parent directory out of order. But we need to check if this * did already happen before due to other refs in the same dir. */ ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen, NULL, NULL); if (ret < 0) goto out; if (ret == inode_state_will_create) { ret = 0; /* * First check if any of the current inodes refs did * already create the dir. */ list_for_each_entry(cur2, &sctx->new_refs, list) { if (cur == cur2) break; if (cur2->dir == cur->dir) { ret = 1; break; } } /* * If that did not happen, check if a previous inode * did already create the dir. */ if (!ret) ret = did_create_dir(sctx, cur->dir); if (ret < 0) goto out; if (!ret) { ret = send_create_inode(sctx, cur->dir); if (ret < 0) goto out; cache_dir_created(sctx, cur->dir); } } if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root) { ret = wait_for_dest_dir_move(sctx, cur, is_orphan); if (ret < 0) goto out; if (ret == 1) { can_rename = false; *pending_move = 1; } } if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root && can_rename) { ret = wait_for_parent_move(sctx, cur, is_orphan); if (ret < 0) goto out; if (ret == 1) { can_rename = false; *pending_move = 1; } } /* * link/move the ref to the new place. If we have an orphan * inode, move it and update valid_path. If not, link or move * it depending on the inode mode. */ if (is_orphan && can_rename) { ret = rename_current_inode(sctx, valid_path, cur->full_path); if (ret < 0) goto out; is_orphan = false; } else if (can_rename) { if (S_ISDIR(sctx->cur_inode_mode)) { /* * Dirs can't be linked, so move it. For moved * dirs, we always have one new and one deleted * ref. The deleted ref is ignored later. */ ret = rename_current_inode(sctx, valid_path, cur->full_path); if (ret < 0) goto out; } else { /* * We might have previously orphanized an inode * which is an ancestor of our current inode, * so our reference's full path, which was * computed before any such orphanizations, must * be updated. */ if (orphanized_dir) { ret = update_ref_path(sctx, cur); if (ret < 0) goto out; } ret = send_link(sctx, cur->full_path, valid_path); if (ret < 0) goto out; } } ret = record_check_dir_ref_in_tree(&rbtree_check_dirs, cur, &check_dirs); if (ret < 0) goto out; } if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_deleted) { /* * Check if we can already rmdir the directory. If not, * orphanize it. For every dir item inside that gets deleted * later, we do this check again and rmdir it then if possible. * See the use of check_dirs for more details. */ ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_inode_gen); if (ret < 0) goto out; if (ret) { ret = send_rmdir(sctx, valid_path); if (ret < 0) goto out; } else if (!is_orphan) { ret = orphanize_inode(sctx, sctx->cur_ino, sctx->cur_inode_gen, valid_path); if (ret < 0) goto out; is_orphan = true; } list_for_each_entry(cur, &sctx->deleted_refs, list) { ret = record_check_dir_ref_in_tree(&rbtree_check_dirs, cur, &check_dirs); if (ret < 0) goto out; } } else if (S_ISDIR(sctx->cur_inode_mode) && !list_empty(&sctx->deleted_refs)) { /* * We have a moved dir. Add the old parent to check_dirs */ cur = list_first_entry(&sctx->deleted_refs, struct recorded_ref, list); ret = record_check_dir_ref_in_tree(&rbtree_check_dirs, cur, &check_dirs); if (ret < 0) goto out; } else if (!S_ISDIR(sctx->cur_inode_mode)) { /* * We have a non dir inode. Go through all deleted refs and * unlink them if they were not already overwritten by other * inodes. */ list_for_each_entry(cur, &sctx->deleted_refs, list) { ret = did_overwrite_ref(sctx, cur->dir, cur->dir_gen, sctx->cur_ino, sctx->cur_inode_gen, cur->name, cur->name_len); if (ret < 0) goto out; if (!ret) { /* * If we orphanized any ancestor before, we need * to recompute the full path for deleted names, * since any such path was computed before we * processed any references and orphanized any * ancestor inode. */ if (orphanized_ancestor) { ret = update_ref_path(sctx, cur); if (ret < 0) goto out; } ret = send_unlink(sctx, cur->full_path); if (ret < 0) goto out; if (is_current_inode_path(sctx, cur->full_path)) fs_path_reset(&sctx->cur_inode_path); } ret = record_check_dir_ref_in_tree(&rbtree_check_dirs, cur, &check_dirs); if (ret < 0) goto out; } /* * If the inode is still orphan, unlink the orphan. This may * happen when a previous inode did overwrite the first ref * of this inode and no new refs were added for the current * inode. Unlinking does not mean that the inode is deleted in * all cases. There may still be links to this inode in other * places. */ if (is_orphan) { ret = send_unlink(sctx, valid_path); if (ret < 0) goto out; } } /* * We did collect all parent dirs where cur_inode was once located. We * now go through all these dirs and check if they are pending for * deletion and if it's finally possible to perform the rmdir now. * We also update the inode stats of the parent dirs here. */ list_for_each_entry(cur, &check_dirs, list) { /* * In case we had refs into dirs that were not processed yet, * we don't need to do the utime and rmdir logic for these dirs. * The dir will be processed later. */ if (cur->dir > sctx->cur_ino) continue; ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen, NULL, NULL); if (ret < 0) goto out; if (ret == inode_state_did_create || ret == inode_state_no_change) { ret = cache_dir_utimes(sctx, cur->dir, cur->dir_gen); if (ret < 0) goto out; } else if (ret == inode_state_did_delete) { ret = can_rmdir(sctx, cur->dir, cur->dir_gen); if (ret < 0) goto out; if (ret) { ret = get_cur_path(sctx, cur->dir, cur->dir_gen, valid_path); if (ret < 0) goto out; ret = send_rmdir(sctx, valid_path); if (ret < 0) goto out; } } } ret = 0; out: __free_recorded_refs(&check_dirs); free_recorded_refs(sctx); fs_path_free(valid_path); return ret; } static int rbtree_ref_comp(const void *k, const struct rb_node *node) { const struct recorded_ref *data = k; const struct recorded_ref *ref = rb_entry(node, struct recorded_ref, node); if (data->dir > ref->dir) return 1; if (data->dir < ref->dir) return -1; if (data->dir_gen > ref->dir_gen) return 1; if (data->dir_gen < ref->dir_gen) return -1; if (data->name_len > ref->name_len) return 1; if (data->name_len < ref->name_len) return -1; return strcmp(data->name, ref->name); } static bool rbtree_ref_less(struct rb_node *node, const struct rb_node *parent) { const struct recorded_ref *entry = rb_entry(node, struct recorded_ref, node); return rbtree_ref_comp(entry, parent) < 0; } static int record_ref_in_tree(struct rb_root *root, struct list_head *refs, struct fs_path *name, u64 dir, u64 dir_gen, struct send_ctx *sctx) { int ret = 0; struct fs_path *path = NULL; struct recorded_ref *ref = NULL; path = fs_path_alloc(); if (!path) { ret = -ENOMEM; goto out; } ref = recorded_ref_alloc(); if (!ref) { ret = -ENOMEM; goto out; } ret = get_cur_path(sctx, dir, dir_gen, path); if (ret < 0) goto out; ret = fs_path_add_path(path, name); if (ret < 0) goto out; ref->dir = dir; ref->dir_gen = dir_gen; set_ref_path(ref, path); list_add_tail(&ref->list, refs); rb_add(&ref->node, root, rbtree_ref_less); ref->root = root; out: if (ret) { if (path && (!ref || !ref->full_path)) fs_path_free(path); recorded_ref_free(ref); } return ret; } static int record_new_ref_if_needed(u64 dir, struct fs_path *name, void *ctx) { int ret; struct send_ctx *sctx = ctx; struct rb_node *node = NULL; struct recorded_ref data; struct recorded_ref *ref; u64 dir_gen; ret = get_inode_gen(sctx->send_root, dir, &dir_gen); if (ret < 0) return ret; data.dir = dir; data.dir_gen = dir_gen; set_ref_path(&data, name); node = rb_find(&data, &sctx->rbtree_deleted_refs, rbtree_ref_comp); if (node) { ref = rb_entry(node, struct recorded_ref, node); recorded_ref_free(ref); } else { ret = record_ref_in_tree(&sctx->rbtree_new_refs, &sctx->new_refs, name, dir, dir_gen, sctx); } return ret; } static int record_deleted_ref_if_needed(u64 dir, struct fs_path *name, void *ctx) { int ret; struct send_ctx *sctx = ctx; struct rb_node *node = NULL; struct recorded_ref data; struct recorded_ref *ref; u64 dir_gen; ret = get_inode_gen(sctx->parent_root, dir, &dir_gen); if (ret < 0) return ret; data.dir = dir; data.dir_gen = dir_gen; set_ref_path(&data, name); node = rb_find(&data, &sctx->rbtree_new_refs, rbtree_ref_comp); if (node) { ref = rb_entry(node, struct recorded_ref, node); recorded_ref_free(ref); } else { ret = record_ref_in_tree(&sctx->rbtree_deleted_refs, &sctx->deleted_refs, name, dir, dir_gen, sctx); } return ret; } static int record_new_ref(struct send_ctx *sctx) { int ret; ret = iterate_inode_ref(sctx->send_root, sctx->left_path, sctx->cmp_key, false, record_new_ref_if_needed, sctx); if (ret < 0) return ret; return 0; } static int record_deleted_ref(struct send_ctx *sctx) { int ret; ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, sctx->cmp_key, false, record_deleted_ref_if_needed, sctx); if (ret < 0) return ret; return 0; } static int record_changed_ref(struct send_ctx *sctx) { int ret; ret = iterate_inode_ref(sctx->send_root, sctx->left_path, sctx->cmp_key, false, record_new_ref_if_needed, sctx); if (ret < 0) return ret; ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, sctx->cmp_key, false, record_deleted_ref_if_needed, sctx); if (ret < 0) return ret; return 0; } /* * Record and process all refs at once. Needed when an inode changes the * generation number, which means that it was deleted and recreated. */ static int process_all_refs(struct send_ctx *sctx, enum btrfs_compare_tree_result cmd) { int ret = 0; int iter_ret = 0; struct btrfs_root *root; BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_key found_key; iterate_inode_ref_t cb; int pending_move = 0; path = alloc_path_for_send(); if (!path) return -ENOMEM; if (cmd == BTRFS_COMPARE_TREE_NEW) { root = sctx->send_root; cb = record_new_ref_if_needed; } else if (cmd == BTRFS_COMPARE_TREE_DELETED) { root = sctx->parent_root; cb = record_deleted_ref_if_needed; } else { btrfs_err(sctx->send_root->fs_info, "Wrong command %d in process_all_refs", cmd); return -EINVAL; } key.objectid = sctx->cmp_key->objectid; key.type = BTRFS_INODE_REF_KEY; key.offset = 0; btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { if (found_key.objectid != key.objectid || (found_key.type != BTRFS_INODE_REF_KEY && found_key.type != BTRFS_INODE_EXTREF_KEY)) break; ret = iterate_inode_ref(root, path, &found_key, false, cb, sctx); if (ret < 0) return ret; } /* Catch error found during iteration */ if (iter_ret < 0) return iter_ret; btrfs_release_path(path); /* * We don't actually care about pending_move as we are simply * re-creating this inode and will be rename'ing it into place once we * rename the parent directory. */ return process_recorded_refs(sctx, &pending_move); } static int send_set_xattr(struct send_ctx *sctx, const char *name, int name_len, const char *data, int data_len) { struct fs_path *path; int ret; path = get_cur_inode_path(sctx); if (IS_ERR(path)) return PTR_ERR(path); ret = begin_cmd(sctx, BTRFS_SEND_C_SET_XATTR); if (ret < 0) return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len); TLV_PUT(sctx, BTRFS_SEND_A_XATTR_DATA, data, data_len); ret = send_cmd(sctx); tlv_put_failure: return ret; } static int send_remove_xattr(struct send_ctx *sctx, struct fs_path *path, const char *name, int name_len) { int ret; ret = begin_cmd(sctx, BTRFS_SEND_C_REMOVE_XATTR); if (ret < 0) return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len); ret = send_cmd(sctx); tlv_put_failure: return ret; } static int __process_new_xattr(int num, struct btrfs_key *di_key, const char *name, int name_len, const char *data, int data_len, void *ctx) { struct send_ctx *sctx = ctx; struct posix_acl_xattr_header dummy_acl; /* Capabilities are emitted by finish_inode_if_needed */ if (!strncmp(name, XATTR_NAME_CAPS, name_len)) return 0; /* * This hack is needed because empty acls are stored as zero byte * data in xattrs. Problem with that is, that receiving these zero byte * acls will fail later. To fix this, we send a dummy acl list that * only contains the version number and no entries. */ if (!strncmp(name, XATTR_NAME_POSIX_ACL_ACCESS, name_len) || !strncmp(name, XATTR_NAME_POSIX_ACL_DEFAULT, name_len)) { if (data_len == 0) { dummy_acl.a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION); data = (char *)&dummy_acl; data_len = sizeof(dummy_acl); } } return send_set_xattr(sctx, name, name_len, data, data_len); } static int __process_deleted_xattr(int num, struct btrfs_key *di_key, const char *name, int name_len, const char *data, int data_len, void *ctx) { struct send_ctx *sctx = ctx; struct fs_path *p; p = get_cur_inode_path(sctx); if (IS_ERR(p)) return PTR_ERR(p); return send_remove_xattr(sctx, p, name, name_len); } static int process_new_xattr(struct send_ctx *sctx) { return iterate_dir_item(sctx->send_root, sctx->left_path, __process_new_xattr, sctx); } static int process_deleted_xattr(struct send_ctx *sctx) { return iterate_dir_item(sctx->parent_root, sctx->right_path, __process_deleted_xattr, sctx); } struct find_xattr_ctx { const char *name; int name_len; int found_idx; char *found_data; int found_data_len; bool copy_data; }; static int __find_xattr(int num, struct btrfs_key *di_key, const char *name, int name_len, const char *data, int data_len, void *vctx) { struct find_xattr_ctx *ctx = vctx; if (name_len == ctx->name_len && strncmp(name, ctx->name, name_len) == 0) { ctx->found_idx = num; ctx->found_data_len = data_len; if (ctx->copy_data) { ctx->found_data = kmemdup(data, data_len, GFP_KERNEL); if (!ctx->found_data) return -ENOMEM; } return 1; } return 0; } static int find_xattr(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *key, const char *name, int name_len, char **data, int *data_len) { int ret; struct find_xattr_ctx ctx; ctx.name = name; ctx.name_len = name_len; ctx.found_idx = -1; ctx.found_data = NULL; ctx.found_data_len = 0; ctx.copy_data = (data != NULL); ret = iterate_dir_item(root, path, __find_xattr, &ctx); if (ret < 0) return ret; if (ctx.found_idx == -1) return -ENOENT; if (data) { *data = ctx.found_data; *data_len = ctx.found_data_len; } else { ASSERT(ctx.found_data == NULL); } return ctx.found_idx; } static int __process_changed_new_xattr(int num, struct btrfs_key *di_key, const char *name, int name_len, const char *data, int data_len, void *ctx) { int ret; struct send_ctx *sctx = ctx; char AUTO_KFREE(found_data); int found_data_len = 0; ret = find_xattr(sctx->parent_root, sctx->right_path, sctx->cmp_key, name, name_len, &found_data, &found_data_len); if (ret == -ENOENT) { ret = __process_new_xattr(num, di_key, name, name_len, data, data_len, ctx); } else if (ret >= 0) { if (data_len != found_data_len || memcmp(data, found_data, data_len)) { ret = __process_new_xattr(num, di_key, name, name_len, data, data_len, ctx); } else { ret = 0; } } return ret; } static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key, const char *name, int name_len, const char *data, int data_len, void *ctx) { int ret; struct send_ctx *sctx = ctx; ret = find_xattr(sctx->send_root, sctx->left_path, sctx->cmp_key, name, name_len, NULL, NULL); if (ret == -ENOENT) ret = __process_deleted_xattr(num, di_key, name, name_len, data, data_len, ctx); else if (ret >= 0) ret = 0; return ret; } static int process_changed_xattr(struct send_ctx *sctx) { int ret; ret = iterate_dir_item(sctx->send_root, sctx->left_path, __process_changed_new_xattr, sctx); if (ret < 0) return ret; return iterate_dir_item(sctx->parent_root, sctx->right_path, __process_changed_deleted_xattr, sctx); } static int process_all_new_xattrs(struct send_ctx *sctx) { int ret = 0; int iter_ret = 0; struct btrfs_root *root; BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_key found_key; path = alloc_path_for_send(); if (!path) return -ENOMEM; root = sctx->send_root; key.objectid = sctx->cmp_key->objectid; key.type = BTRFS_XATTR_ITEM_KEY; key.offset = 0; btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { if (found_key.objectid != key.objectid || found_key.type != key.type) { ret = 0; break; } ret = iterate_dir_item(root, path, __process_new_xattr, sctx); if (ret < 0) break; } /* Catch error found during iteration */ if (iter_ret < 0) ret = iter_ret; return ret; } static int send_verity(struct send_ctx *sctx, struct fs_path *path, struct fsverity_descriptor *desc) { int ret; ret = begin_cmd(sctx, BTRFS_SEND_C_ENABLE_VERITY); if (ret < 0) return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); TLV_PUT_U8(sctx, BTRFS_SEND_A_VERITY_ALGORITHM, le8_to_cpu(desc->hash_algorithm)); TLV_PUT_U32(sctx, BTRFS_SEND_A_VERITY_BLOCK_SIZE, 1U << le8_to_cpu(desc->log_blocksize)); TLV_PUT(sctx, BTRFS_SEND_A_VERITY_SALT_DATA, desc->salt, le8_to_cpu(desc->salt_size)); TLV_PUT(sctx, BTRFS_SEND_A_VERITY_SIG_DATA, desc->signature, le32_to_cpu(desc->sig_size)); ret = send_cmd(sctx); tlv_put_failure: return ret; } static int process_verity(struct send_ctx *sctx) { int ret = 0; struct btrfs_inode *inode; struct fs_path *p; inode = btrfs_iget(sctx->cur_ino, sctx->send_root); if (IS_ERR(inode)) return PTR_ERR(inode); ret = btrfs_get_verity_descriptor(&inode->vfs_inode, NULL, 0); if (ret < 0) goto iput; if (unlikely(ret > FS_VERITY_MAX_DESCRIPTOR_SIZE)) { ret = -EMSGSIZE; goto iput; } if (!sctx->verity_descriptor) { sctx->verity_descriptor = kvmalloc(FS_VERITY_MAX_DESCRIPTOR_SIZE, GFP_KERNEL); if (!sctx->verity_descriptor) { ret = -ENOMEM; goto iput; } } ret = btrfs_get_verity_descriptor(&inode->vfs_inode, sctx->verity_descriptor, ret); if (ret < 0) goto iput; p = get_cur_inode_path(sctx); if (IS_ERR(p)) { ret = PTR_ERR(p); goto iput; } ret = send_verity(sctx, p, sctx->verity_descriptor); iput: iput(&inode->vfs_inode); return ret; } static inline u64 max_send_read_size(const struct send_ctx *sctx) { return sctx->send_max_size - SZ_16K; } static int put_data_header(struct send_ctx *sctx, u32 len) { if (WARN_ON_ONCE(sctx->put_data)) return -EINVAL; sctx->put_data = true; if (sctx->proto >= 2) { /* * Since v2, the data attribute header doesn't include a length, * it is implicitly to the end of the command. */ if (unlikely(sctx->send_max_size - sctx->send_size < sizeof(__le16) + len)) return -EOVERFLOW; put_unaligned_le16(BTRFS_SEND_A_DATA, sctx->send_buf + sctx->send_size); sctx->send_size += sizeof(__le16); } else { struct btrfs_tlv_header *hdr; if (unlikely(sctx->send_max_size - sctx->send_size < sizeof(*hdr) + len)) return -EOVERFLOW; hdr = (struct btrfs_tlv_header *)(sctx->send_buf + sctx->send_size); put_unaligned_le16(BTRFS_SEND_A_DATA, &hdr->tlv_type); put_unaligned_le16(len, &hdr->tlv_len); sctx->send_size += sizeof(*hdr); } return 0; } static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) { struct btrfs_root *root = sctx->send_root; struct btrfs_fs_info *fs_info = root->fs_info; u64 cur = offset; const u64 end = offset + len; const pgoff_t last_index = ((end - 1) >> PAGE_SHIFT); struct address_space *mapping = sctx->cur_inode->i_mapping; int ret; ret = put_data_header(sctx, len); if (ret) return ret; while (cur < end) { pgoff_t index = (cur >> PAGE_SHIFT); unsigned int cur_len; unsigned int pg_offset; struct folio *folio; folio = filemap_lock_folio(mapping, index); if (IS_ERR(folio)) { page_cache_sync_readahead(mapping, &sctx->ra, NULL, index, last_index + 1 - index); folio = filemap_grab_folio(mapping, index); if (IS_ERR(folio)) { ret = PTR_ERR(folio); break; } } pg_offset = offset_in_folio(folio, cur); cur_len = min_t(unsigned int, end - cur, folio_size(folio) - pg_offset); if (folio_test_readahead(folio)) page_cache_async_readahead(mapping, &sctx->ra, NULL, folio, last_index + 1 - index); if (!folio_test_uptodate(folio)) { btrfs_read_folio(NULL, folio); folio_lock(folio); if (unlikely(!folio_test_uptodate(folio))) { folio_unlock(folio); btrfs_err(fs_info, "send: IO error at offset %llu for inode %llu root %llu", folio_pos(folio), sctx->cur_ino, btrfs_root_id(sctx->send_root)); folio_put(folio); ret = -EIO; break; } if (folio->mapping != mapping) { folio_unlock(folio); folio_put(folio); continue; } } memcpy_from_folio(sctx->send_buf + sctx->send_size, folio, pg_offset, cur_len); folio_unlock(folio); folio_put(folio); cur += cur_len; sctx->send_size += cur_len; } return ret; } /* * Read some bytes from the current inode/file and send a write command to * user space. */ static int send_write(struct send_ctx *sctx, u64 offset, u32 len) { int ret = 0; struct fs_path *p; p = get_cur_inode_path(sctx); if (IS_ERR(p)) return PTR_ERR(p); ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE); if (ret < 0) return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); ret = put_file_data(sctx, offset, len); if (ret < 0) return ret; ret = send_cmd(sctx); tlv_put_failure: return ret; } /* * Send a clone command to user space. */ static int send_clone(struct send_ctx *sctx, u64 offset, u32 len, struct clone_root *clone_root) { int ret = 0; struct fs_path *p; struct fs_path *cur_inode_path; u64 gen; cur_inode_path = get_cur_inode_path(sctx); if (IS_ERR(cur_inode_path)) return PTR_ERR(cur_inode_path); p = fs_path_alloc(); if (!p) return -ENOMEM; ret = begin_cmd(sctx, BTRFS_SEND_C_CLONE); if (ret < 0) goto out; TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_LEN, len); TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, cur_inode_path); if (clone_root->root == sctx->send_root) { ret = get_inode_gen(sctx->send_root, clone_root->ino, &gen); if (ret < 0) goto out; ret = get_cur_path(sctx, clone_root->ino, gen, p); } else { ret = get_inode_path(clone_root->root, clone_root->ino, p); } if (ret < 0) goto out; /* * If the parent we're using has a received_uuid set then use that as * our clone source as that is what we will look for when doing a * receive. * * This covers the case that we create a snapshot off of a received * subvolume and then use that as the parent and try to receive on a * different host. */ if (!btrfs_is_empty_uuid(clone_root->root->root_item.received_uuid)) TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, clone_root->root->root_item.received_uuid); else TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, clone_root->root->root_item.uuid); TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID, btrfs_root_ctransid(&clone_root->root->root_item)); TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_OFFSET, clone_root->offset); ret = send_cmd(sctx); tlv_put_failure: out: fs_path_free(p); return ret; } /* * Send an update extent command to user space. */ static int send_update_extent(struct send_ctx *sctx, u64 offset, u32 len) { int ret = 0; struct fs_path *p; p = get_cur_inode_path(sctx); if (IS_ERR(p)) return PTR_ERR(p); ret = begin_cmd(sctx, BTRFS_SEND_C_UPDATE_EXTENT); if (ret < 0) return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, len); ret = send_cmd(sctx); tlv_put_failure: return ret; } static int send_fallocate(struct send_ctx *sctx, u32 mode, u64 offset, u64 len) { struct fs_path *path; int ret; path = get_cur_inode_path(sctx); if (IS_ERR(path)) return PTR_ERR(path); ret = begin_cmd(sctx, BTRFS_SEND_C_FALLOCATE); if (ret < 0) return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); TLV_PUT_U32(sctx, BTRFS_SEND_A_FALLOCATE_MODE, mode); TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, len); ret = send_cmd(sctx); tlv_put_failure: return ret; } static int send_hole(struct send_ctx *sctx, u64 end) { struct fs_path *p = NULL; u64 read_size = max_send_read_size(sctx); u64 offset = sctx->cur_inode_last_extent; int ret = 0; /* * Starting with send stream v2 we have fallocate and can use it to * punch holes instead of sending writes full of zeroes. */ if (proto_cmd_ok(sctx, BTRFS_SEND_C_FALLOCATE)) return send_fallocate(sctx, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, end - offset); /* * A hole that starts at EOF or beyond it. Since we do not yet support * fallocate (for extent preallocation and hole punching), sending a * write of zeroes starting at EOF or beyond would later require issuing * a truncate operation which would undo the write and achieve nothing. */ if (offset >= sctx->cur_inode_size) return 0; /* * Don't go beyond the inode's i_size due to prealloc extents that start * after the i_size. */ end = min_t(u64, end, sctx->cur_inode_size); if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) return send_update_extent(sctx, offset, end - offset); p = get_cur_inode_path(sctx); if (IS_ERR(p)) return PTR_ERR(p); while (offset < end) { u64 len = min(end - offset, read_size); ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE); if (ret < 0) break; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); ret = put_data_header(sctx, len); if (ret < 0) break; memset(sctx->send_buf + sctx->send_size, 0, len); sctx->send_size += len; ret = send_cmd(sctx); if (ret < 0) break; offset += len; } sctx->cur_inode_next_write_offset = offset; tlv_put_failure: return ret; } static int send_encoded_inline_extent(struct send_ctx *sctx, struct btrfs_path *path, u64 offset, u64 len) { struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; struct fs_path *fspath; struct extent_buffer *leaf = path->nodes[0]; struct btrfs_key key; struct btrfs_file_extent_item *ei; u64 ram_bytes; size_t inline_size; int ret; fspath = get_cur_inode_path(sctx); if (IS_ERR(fspath)) return PTR_ERR(fspath); ret = begin_cmd(sctx, BTRFS_SEND_C_ENCODED_WRITE); if (ret < 0) return ret; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); ram_bytes = btrfs_file_extent_ram_bytes(leaf, ei); inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]); TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, fspath); TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_FILE_LEN, min(key.offset + ram_bytes - offset, len)); TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_LEN, ram_bytes); TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_OFFSET, offset - key.offset); ret = btrfs_encoded_io_compression_from_extent(fs_info, btrfs_file_extent_compression(leaf, ei)); if (ret < 0) return ret; TLV_PUT_U32(sctx, BTRFS_SEND_A_COMPRESSION, ret); ret = put_data_header(sctx, inline_size); if (ret < 0) return ret; read_extent_buffer(leaf, sctx->send_buf + sctx->send_size, btrfs_file_extent_inline_start(ei), inline_size); sctx->send_size += inline_size; ret = send_cmd(sctx); tlv_put_failure: return ret; } static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, u64 offset, u64 len) { struct btrfs_root *root = sctx->send_root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_inode *inode; struct fs_path *fspath; struct extent_buffer *leaf = path->nodes[0]; struct btrfs_key key; struct btrfs_file_extent_item *ei; u64 disk_bytenr, disk_num_bytes; u32 data_offset; struct btrfs_cmd_header *hdr; u32 crc; int ret; inode = btrfs_iget(sctx->cur_ino, root); if (IS_ERR(inode)) return PTR_ERR(inode); fspath = get_cur_inode_path(sctx); if (IS_ERR(fspath)) { ret = PTR_ERR(fspath); goto out; } ret = begin_cmd(sctx, BTRFS_SEND_C_ENCODED_WRITE); if (ret < 0) goto out; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, ei); TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, fspath); TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_FILE_LEN, min(key.offset + btrfs_file_extent_num_bytes(leaf, ei) - offset, len)); TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_LEN, btrfs_file_extent_ram_bytes(leaf, ei)); TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_OFFSET, offset - key.offset + btrfs_file_extent_offset(leaf, ei)); ret = btrfs_encoded_io_compression_from_extent(fs_info, btrfs_file_extent_compression(leaf, ei)); if (ret < 0) goto out; TLV_PUT_U32(sctx, BTRFS_SEND_A_COMPRESSION, ret); TLV_PUT_U32(sctx, BTRFS_SEND_A_ENCRYPTION, 0); ret = put_data_header(sctx, disk_num_bytes); if (ret < 0) goto out; /* * We want to do I/O directly into the send buffer, so get the next page * boundary in the send buffer. This means that there may be a gap * between the beginning of the command and the file data. */ data_offset = PAGE_ALIGN(sctx->send_size); if (unlikely(data_offset > sctx->send_max_size || sctx->send_max_size - data_offset < disk_num_bytes)) { ret = -EOVERFLOW; goto out; } /* * Note that send_buf is a mapping of send_buf_pages, so this is really * reading into send_buf. */ ret = btrfs_encoded_read_regular_fill_pages(inode, disk_bytenr, disk_num_bytes, sctx->send_buf_pages + (data_offset >> PAGE_SHIFT), NULL); if (ret) goto out; hdr = (struct btrfs_cmd_header *)sctx->send_buf; hdr->len = cpu_to_le32(sctx->send_size + disk_num_bytes - sizeof(*hdr)); hdr->crc = 0; crc = crc32c(0, sctx->send_buf, sctx->send_size); crc = crc32c(crc, sctx->send_buf + data_offset, disk_num_bytes); hdr->crc = cpu_to_le32(crc); ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size, &sctx->send_off); if (!ret) { ret = write_buf(sctx->send_filp, sctx->send_buf + data_offset, disk_num_bytes, &sctx->send_off); } sctx->send_size = 0; sctx->put_data = false; tlv_put_failure: out: iput(&inode->vfs_inode); return ret; } static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path, const u64 offset, const u64 len) { const u64 end = offset + len; struct extent_buffer *leaf = path->nodes[0]; struct btrfs_file_extent_item *ei; u64 read_size = max_send_read_size(sctx); u64 sent = 0; if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) return send_update_extent(sctx, offset, len); ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); if ((sctx->flags & BTRFS_SEND_FLAG_COMPRESSED) && btrfs_file_extent_compression(leaf, ei) != BTRFS_COMPRESS_NONE) { bool is_inline = (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE); /* * Send the compressed extent unless the compressed data is * larger than the decompressed data. This can happen if we're * not sending the entire extent, either because it has been * partially overwritten/truncated or because this is a part of * the extent that we couldn't clone in clone_range(). */ if (is_inline && btrfs_file_extent_inline_item_len(leaf, path->slots[0]) <= len) { return send_encoded_inline_extent(sctx, path, offset, len); } else if (!is_inline && btrfs_file_extent_disk_num_bytes(leaf, ei) <= len) { return send_encoded_extent(sctx, path, offset, len); } } if (sctx->cur_inode == NULL) { struct btrfs_inode *btrfs_inode; struct btrfs_root *root = sctx->send_root; btrfs_inode = btrfs_iget(sctx->cur_ino, root); if (IS_ERR(btrfs_inode)) return PTR_ERR(btrfs_inode); sctx->cur_inode = &btrfs_inode->vfs_inode; memset(&sctx->ra, 0, sizeof(struct file_ra_state)); file_ra_state_init(&sctx->ra, sctx->cur_inode->i_mapping); /* * It's very likely there are no pages from this inode in the page * cache, so after reading extents and sending their data, we clean * the page cache to avoid trashing the page cache (adding pressure * to the page cache and forcing eviction of other data more useful * for applications). * * We decide if we should clean the page cache simply by checking * if the inode's mapping nrpages is 0 when we first open it, and * not by using something like filemap_range_has_page() before * reading an extent because when we ask the readahead code to * read a given file range, it may (and almost always does) read * pages from beyond that range (see the documentation for * page_cache_sync_readahead()), so it would not be reliable, * because after reading the first extent future calls to * filemap_range_has_page() would return true because the readahead * on the previous extent resulted in reading pages of the current * extent as well. */ sctx->clean_page_cache = (sctx->cur_inode->i_mapping->nrpages == 0); sctx->page_cache_clear_start = round_down(offset, PAGE_SIZE); } while (sent < len) { u64 size = min(len - sent, read_size); int ret; ret = send_write(sctx, offset + sent, size); if (ret < 0) return ret; sent += size; } if (sctx->clean_page_cache && PAGE_ALIGNED(end)) { /* * Always operate only on ranges that are a multiple of the page * size. This is not only to prevent zeroing parts of a page in * the case of subpage sector size, but also to guarantee we evict * pages, as passing a range that is smaller than page size does * not evict the respective page (only zeroes part of its content). * * Always start from the end offset of the last range cleared. * This is because the readahead code may (and very often does) * reads pages beyond the range we request for readahead. So if * we have an extent layout like this: * * [ extent A ] [ extent B ] [ extent C ] * * When we ask page_cache_sync_readahead() to read extent A, it * may also trigger reads for pages of extent B. If we are doing * an incremental send and extent B has not changed between the * parent and send snapshots, some or all of its pages may end * up being read and placed in the page cache. So when truncating * the page cache we always start from the end offset of the * previously processed extent up to the end of the current * extent. */ truncate_inode_pages_range(&sctx->cur_inode->i_data, sctx->page_cache_clear_start, end - 1); sctx->page_cache_clear_start = end; } return 0; } /* * Search for a capability xattr related to sctx->cur_ino. If the capability is * found, call send_set_xattr function to emit it. * * Return 0 if there isn't a capability, or when the capability was emitted * successfully, or < 0 if an error occurred. */ static int send_capabilities(struct send_ctx *sctx) { BTRFS_PATH_AUTO_FREE(path); struct btrfs_dir_item *di; struct extent_buffer *leaf; unsigned long data_ptr; char AUTO_KFREE(buf); int buf_len; int ret = 0; path = alloc_path_for_send(); if (!path) return -ENOMEM; di = btrfs_lookup_xattr(NULL, sctx->send_root, path, sctx->cur_ino, XATTR_NAME_CAPS, strlen(XATTR_NAME_CAPS), 0); if (!di) { /* There is no xattr for this inode */ return 0; } else if (IS_ERR(di)) { return PTR_ERR(di); } leaf = path->nodes[0]; buf_len = btrfs_dir_data_len(leaf, di); buf = kmalloc(buf_len, GFP_KERNEL); if (!buf) return -ENOMEM; data_ptr = (unsigned long)(di + 1) + btrfs_dir_name_len(leaf, di); read_extent_buffer(leaf, buf, data_ptr, buf_len); ret = send_set_xattr(sctx, XATTR_NAME_CAPS, strlen(XATTR_NAME_CAPS), buf, buf_len); return ret; } static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path, struct clone_root *clone_root, const u64 disk_byte, u64 data_offset, u64 offset, u64 len) { BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; int ret; struct btrfs_inode_info info; u64 clone_src_i_size = 0; /* * Prevent cloning from a zero offset with a length matching the sector * size because in some scenarios this will make the receiver fail. * * For example, if in the source filesystem the extent at offset 0 * has a length of sectorsize and it was written using direct IO, then * it can never be an inline extent (even if compression is enabled). * Then this extent can be cloned in the original filesystem to a non * zero file offset, but it may not be possible to clone in the * destination filesystem because it can be inlined due to compression * on the destination filesystem (as the receiver's write operations are * always done using buffered IO). The same happens when the original * filesystem does not have compression enabled but the destination * filesystem has. */ if (clone_root->offset == 0 && len == sctx->send_root->fs_info->sectorsize) return send_extent_data(sctx, dst_path, offset, len); path = alloc_path_for_send(); if (!path) return -ENOMEM; /* * There are inodes that have extents that lie behind its i_size. Don't * accept clones from these extents. */ ret = get_inode_info(clone_root->root, clone_root->ino, &info); btrfs_release_path(path); if (ret < 0) return ret; clone_src_i_size = info.size; /* * We can't send a clone operation for the entire range if we find * extent items in the respective range in the source file that * refer to different extents or if we find holes. * So check for that and do a mix of clone and regular write/copy * operations if needed. * * Example: * * mkfs.btrfs -f /dev/sda * mount /dev/sda /mnt * xfs_io -f -c "pwrite -S 0xaa 0K 100K" /mnt/foo * cp --reflink=always /mnt/foo /mnt/bar * xfs_io -c "pwrite -S 0xbb 50K 50K" /mnt/foo * btrfs subvolume snapshot -r /mnt /mnt/snap * * If when we send the snapshot and we are processing file bar (which * has a higher inode number than foo) we blindly send a clone operation * for the [0, 100K[ range from foo to bar, the receiver ends up getting * a file bar that matches the content of file foo - iow, doesn't match * the content from bar in the original filesystem. */ key.objectid = clone_root->ino; key.type = BTRFS_EXTENT_DATA_KEY; key.offset = clone_root->offset; ret = btrfs_search_slot(NULL, clone_root->root, &key, path, 0, 0); if (ret < 0) return ret; if (ret > 0 && path->slots[0] > 0) { btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1); if (key.objectid == clone_root->ino && key.type == BTRFS_EXTENT_DATA_KEY) path->slots[0]--; } while (true) { struct extent_buffer *leaf = path->nodes[0]; int slot = path->slots[0]; struct btrfs_file_extent_item *ei; u8 type; u64 ext_len; u64 clone_len; u64 clone_data_offset; bool crossed_src_i_size = false; if (slot >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(clone_root->root, path); if (ret < 0) return ret; else if (ret > 0) break; continue; } btrfs_item_key_to_cpu(leaf, &key, slot); /* * We might have an implicit trailing hole (NO_HOLES feature * enabled). We deal with it after leaving this loop. */ if (key.objectid != clone_root->ino || key.type != BTRFS_EXTENT_DATA_KEY) break; ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); type = btrfs_file_extent_type(leaf, ei); if (type == BTRFS_FILE_EXTENT_INLINE) { ext_len = btrfs_file_extent_ram_bytes(leaf, ei); ext_len = PAGE_ALIGN(ext_len); } else { ext_len = btrfs_file_extent_num_bytes(leaf, ei); } if (key.offset + ext_len <= clone_root->offset) goto next; if (key.offset > clone_root->offset) { /* Implicit hole, NO_HOLES feature enabled. */ u64 hole_len = key.offset - clone_root->offset; if (hole_len > len) hole_len = len; ret = send_extent_data(sctx, dst_path, offset, hole_len); if (ret < 0) return ret; len -= hole_len; if (len == 0) break; offset += hole_len; clone_root->offset += hole_len; data_offset += hole_len; } if (key.offset >= clone_root->offset + len) break; if (key.offset >= clone_src_i_size) break; if (key.offset + ext_len > clone_src_i_size) { ext_len = clone_src_i_size - key.offset; crossed_src_i_size = true; } clone_data_offset = btrfs_file_extent_offset(leaf, ei); if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte) { clone_root->offset = key.offset; if (clone_data_offset < data_offset && clone_data_offset + ext_len > data_offset) { u64 extent_offset; extent_offset = data_offset - clone_data_offset; ext_len -= extent_offset; clone_data_offset += extent_offset; clone_root->offset += extent_offset; } } clone_len = min_t(u64, ext_len, len); if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte && clone_data_offset == data_offset) { const u64 src_end = clone_root->offset + clone_len; const u64 sectorsize = SZ_64K; /* * We can't clone the last block, when its size is not * sector size aligned, into the middle of a file. If we * do so, the receiver will get a failure (-EINVAL) when * trying to clone or will silently corrupt the data in * the destination file if it's on a kernel without the * fix introduced by commit ac765f83f1397646 * ("Btrfs: fix data corruption due to cloning of eof * block). * * So issue a clone of the aligned down range plus a * regular write for the eof block, if we hit that case. * * Also, we use the maximum possible sector size, 64K, * because we don't know what's the sector size of the * filesystem that receives the stream, so we have to * assume the largest possible sector size. */ if (src_end == clone_src_i_size && !IS_ALIGNED(src_end, sectorsize) && offset + clone_len < sctx->cur_inode_size) { u64 slen; slen = ALIGN_DOWN(src_end - clone_root->offset, sectorsize); if (slen > 0) { ret = send_clone(sctx, offset, slen, clone_root); if (ret < 0) return ret; } ret = send_extent_data(sctx, dst_path, offset + slen, clone_len - slen); } else { ret = send_clone(sctx, offset, clone_len, clone_root); } } else if (crossed_src_i_size && clone_len < len) { /* * If we are at i_size of the clone source inode and we * can not clone from it, terminate the loop. This is * to avoid sending two write operations, one with a * length matching clone_len and the final one after * this loop with a length of len - clone_len. * * When using encoded writes (BTRFS_SEND_FLAG_COMPRESSED * was passed to the send ioctl), this helps avoid * sending an encoded write for an offset that is not * sector size aligned, in case the i_size of the source * inode is not sector size aligned. That will make the * receiver fallback to decompression of the data and * writing it using regular buffered IO, therefore while * not incorrect, it's not optimal due decompression and * possible re-compression at the receiver. */ break; } else { ret = send_extent_data(sctx, dst_path, offset, clone_len); } if (ret < 0) return ret; len -= clone_len; if (len == 0) break; offset += clone_len; clone_root->offset += clone_len; /* * If we are cloning from the file we are currently processing, * and using the send root as the clone root, we must stop once * the current clone offset reaches the current eof of the file * at the receiver, otherwise we would issue an invalid clone * operation (source range going beyond eof) and cause the * receiver to fail. So if we reach the current eof, bail out * and fallback to a regular write. */ if (clone_root->root == sctx->send_root && clone_root->ino == sctx->cur_ino && clone_root->offset >= sctx->cur_inode_next_write_offset) break; data_offset += clone_len; next: path->slots[0]++; } if (len > 0) ret = send_extent_data(sctx, dst_path, offset, len); else ret = 0; return ret; } static int send_write_or_clone(struct send_ctx *sctx, struct btrfs_path *path, struct btrfs_key *key, struct clone_root *clone_root) { int ret = 0; u64 offset = key->offset; u64 end; u64 bs = sctx->send_root->fs_info->sectorsize; struct btrfs_file_extent_item *ei; u64 disk_byte; u64 data_offset; u64 num_bytes; struct btrfs_inode_info info = { 0 }; end = min_t(u64, btrfs_file_extent_end(path), sctx->cur_inode_size); if (offset >= end) return 0; num_bytes = end - offset; if (!clone_root) goto write_data; if (IS_ALIGNED(end, bs)) goto clone_data; /* * If the extent end is not aligned, we can clone if the extent ends at * the i_size of the inode and the clone range ends at the i_size of the * source inode, otherwise the clone operation fails with -EINVAL. */ if (end != sctx->cur_inode_size) goto write_data; ret = get_inode_info(clone_root->root, clone_root->ino, &info); if (ret < 0) return ret; if (clone_root->offset + num_bytes == info.size) { /* * The final size of our file matches the end offset, but it may * be that its current size is larger, so we have to truncate it * to any value between the start offset of the range and the * final i_size, otherwise the clone operation is invalid * because it's unaligned and it ends before the current EOF. * We do this truncate to the final i_size when we finish * processing the inode, but it's too late by then. And here we * truncate to the start offset of the range because it's always * sector size aligned while if it were the final i_size it * would result in dirtying part of a page, filling part of a * page with zeroes and then having the clone operation at the * receiver trigger IO and wait for it due to the dirty page. */ if (sctx->parent_root != NULL) { ret = send_truncate(sctx, sctx->cur_ino, sctx->cur_inode_gen, offset); if (ret < 0) return ret; } goto clone_data; } write_data: ret = send_extent_data(sctx, path, offset, num_bytes); sctx->cur_inode_next_write_offset = end; return ret; clone_data: ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_file_extent_item); disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei); data_offset = btrfs_file_extent_offset(path->nodes[0], ei); ret = clone_range(sctx, path, clone_root, disk_byte, data_offset, offset, num_bytes); sctx->cur_inode_next_write_offset = end; return ret; } static int is_extent_unchanged(struct send_ctx *sctx, struct btrfs_path *left_path, struct btrfs_key *ekey) { int ret = 0; struct btrfs_key key; BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *eb; int slot; struct btrfs_key found_key; struct btrfs_file_extent_item *ei; u64 left_disknr; u64 right_disknr; u64 left_offset; u64 right_offset; u64 left_offset_fixed; u64 left_len; u64 right_len; u64 left_gen; u64 right_gen; u8 left_type; u8 right_type; path = alloc_path_for_send(); if (!path) return -ENOMEM; eb = left_path->nodes[0]; slot = left_path->slots[0]; ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); left_type = btrfs_file_extent_type(eb, ei); if (left_type != BTRFS_FILE_EXTENT_REG) return 0; left_disknr = btrfs_file_extent_disk_bytenr(eb, ei); left_len = btrfs_file_extent_num_bytes(eb, ei); left_offset = btrfs_file_extent_offset(eb, ei); left_gen = btrfs_file_extent_generation(eb, ei); /* * Following comments will refer to these graphics. L is the left * extents which we are checking at the moment. 1-8 are the right * extents that we iterate. * * |-----L-----| * |-1-|-2a-|-3-|-4-|-5-|-6-| * * |-----L-----| * |--1--|-2b-|...(same as above) * * Alternative situation. Happens on files where extents got split. * |-----L-----| * |-----------7-----------|-6-| * * Alternative situation. Happens on files which got larger. * |-----L-----| * |-8-| * Nothing follows after 8. */ key.objectid = ekey->objectid; key.type = BTRFS_EXTENT_DATA_KEY; key.offset = ekey->offset; ret = btrfs_search_slot_for_read(sctx->parent_root, &key, path, 0, 0); if (ret < 0) return ret; if (ret) return 0; /* * Handle special case where the right side has no extents at all. */ eb = path->nodes[0]; slot = path->slots[0]; btrfs_item_key_to_cpu(eb, &found_key, slot); if (found_key.objectid != key.objectid || found_key.type != key.type) /* If we're a hole then just pretend nothing changed */ return (left_disknr ? 0 : 1); /* * We're now on 2a, 2b or 7. */ key = found_key; while (key.offset < ekey->offset + left_len) { ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); right_type = btrfs_file_extent_type(eb, ei); if (right_type != BTRFS_FILE_EXTENT_REG && right_type != BTRFS_FILE_EXTENT_INLINE) return 0; if (right_type == BTRFS_FILE_EXTENT_INLINE) { right_len = btrfs_file_extent_ram_bytes(eb, ei); right_len = PAGE_ALIGN(right_len); } else { right_len = btrfs_file_extent_num_bytes(eb, ei); } /* * Are we at extent 8? If yes, we know the extent is changed. * This may only happen on the first iteration. */ if (found_key.offset + right_len <= ekey->offset) /* If we're a hole just pretend nothing changed */ return (left_disknr ? 0 : 1); /* * We just wanted to see if when we have an inline extent, what * follows it is a regular extent (wanted to check the above * condition for inline extents too). This should normally not * happen but it's possible for example when we have an inline * compressed extent representing data with a size matching * the page size (currently the same as sector size). */ if (right_type == BTRFS_FILE_EXTENT_INLINE) return 0; right_disknr = btrfs_file_extent_disk_bytenr(eb, ei); right_offset = btrfs_file_extent_offset(eb, ei); right_gen = btrfs_file_extent_generation(eb, ei); left_offset_fixed = left_offset; if (key.offset < ekey->offset) { /* Fix the right offset for 2a and 7. */ right_offset += ekey->offset - key.offset; } else { /* Fix the left offset for all behind 2a and 2b */ left_offset_fixed += key.offset - ekey->offset; } /* * Check if we have the same extent. */ if (left_disknr != right_disknr || left_offset_fixed != right_offset || left_gen != right_gen) return 0; /* * Go to the next extent. */ ret = btrfs_next_item(sctx->parent_root, path); if (ret < 0) return ret; if (!ret) { eb = path->nodes[0]; slot = path->slots[0]; btrfs_item_key_to_cpu(eb, &found_key, slot); } if (ret || found_key.objectid != key.objectid || found_key.type != key.type) { key.offset += right_len; break; } if (found_key.offset != key.offset + right_len) return 0; key = found_key; } /* * We're now behind the left extent (treat as unchanged) or at the end * of the right side (treat as changed). */ if (key.offset >= ekey->offset + left_len) ret = 1; else ret = 0; return ret; } static int get_last_extent(struct send_ctx *sctx, u64 offset) { BTRFS_PATH_AUTO_FREE(path); struct btrfs_root *root = sctx->send_root; struct btrfs_key key; int ret; path = alloc_path_for_send(); if (!path) return -ENOMEM; sctx->cur_inode_last_extent = 0; key.objectid = sctx->cur_ino; key.type = BTRFS_EXTENT_DATA_KEY; key.offset = offset; ret = btrfs_search_slot_for_read(root, &key, path, 0, 1); if (ret < 0) return ret; ret = 0; btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); if (key.objectid != sctx->cur_ino || key.type != BTRFS_EXTENT_DATA_KEY) return ret; sctx->cur_inode_last_extent = btrfs_file_extent_end(path); return ret; } static int range_is_hole_in_parent(struct send_ctx *sctx, const u64 start, const u64 end) { BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_root *root = sctx->parent_root; u64 search_start = start; int ret; path = alloc_path_for_send(); if (!path) return -ENOMEM; key.objectid = sctx->cur_ino; key.type = BTRFS_EXTENT_DATA_KEY; key.offset = search_start; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) return ret; if (ret > 0 && path->slots[0] > 0) path->slots[0]--; while (search_start < end) { struct extent_buffer *leaf = path->nodes[0]; int slot = path->slots[0]; struct btrfs_file_extent_item *fi; u64 extent_end; if (slot >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); if (ret < 0) return ret; if (ret > 0) break; continue; } btrfs_item_key_to_cpu(leaf, &key, slot); if (key.objectid < sctx->cur_ino || key.type < BTRFS_EXTENT_DATA_KEY) goto next; if (key.objectid > sctx->cur_ino || key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end) break; fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); extent_end = btrfs_file_extent_end(path); if (extent_end <= start) goto next; if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) return 0; if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0) { search_start = extent_end; goto next; } return 0; next: path->slots[0]++; } return 1; } static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path, struct btrfs_key *key) { int ret = 0; if (sctx->cur_ino != key->objectid || !need_send_hole(sctx)) return 0; /* * Get last extent's end offset (exclusive) if we haven't determined it * yet (we're processing the first file extent item that is new), or if * we're at the first slot of a leaf and the last extent's end is less * than the current extent's offset, because we might have skipped * entire leaves that contained only file extent items for our current * inode. These leaves have a generation number smaller (older) than the * one in the current leaf and the leaf our last extent came from, and * are located between these 2 leaves. */ if ((sctx->cur_inode_last_extent == (u64)-1) || (path->slots[0] == 0 && sctx->cur_inode_last_extent < key->offset)) { ret = get_last_extent(sctx, key->offset - 1); if (ret) return ret; } if (sctx->cur_inode_last_extent < key->offset) { ret = range_is_hole_in_parent(sctx, sctx->cur_inode_last_extent, key->offset); if (ret < 0) return ret; else if (ret == 0) ret = send_hole(sctx, key->offset); else ret = 0; } sctx->cur_inode_last_extent = btrfs_file_extent_end(path); return ret; } static int process_extent(struct send_ctx *sctx, struct btrfs_path *path, struct btrfs_key *key) { struct clone_root *found_clone = NULL; int ret = 0; if (S_ISLNK(sctx->cur_inode_mode)) return 0; if (sctx->parent_root && !sctx->cur_inode_new) { ret = is_extent_unchanged(sctx, path, key); if (ret < 0) return ret; if (ret) goto out_hole; } else { struct btrfs_file_extent_item *ei; u8 type; ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_file_extent_item); type = btrfs_file_extent_type(path->nodes[0], ei); if (type == BTRFS_FILE_EXTENT_PREALLOC || type == BTRFS_FILE_EXTENT_REG) { /* * The send spec does not have a prealloc command yet, * so just leave a hole for prealloc'ed extents until * we have enough commands queued up to justify rev'ing * the send spec. */ if (type == BTRFS_FILE_EXTENT_PREALLOC) return 0; /* Have a hole, just skip it. */ if (btrfs_file_extent_disk_bytenr(path->nodes[0], ei) == 0) return 0; } } ret = find_extent_clone(sctx, path, key->objectid, key->offset, sctx->cur_inode_size, &found_clone); if (ret != -ENOENT && ret < 0) return ret; ret = send_write_or_clone(sctx, path, key, found_clone); if (ret) return ret; out_hole: return maybe_send_hole(sctx, path, key); } static int process_all_extents(struct send_ctx *sctx) { int ret = 0; int iter_ret = 0; struct btrfs_root *root; BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_key found_key; root = sctx->send_root; path = alloc_path_for_send(); if (!path) return -ENOMEM; key.objectid = sctx->cmp_key->objectid; key.type = BTRFS_EXTENT_DATA_KEY; key.offset = 0; btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { if (found_key.objectid != key.objectid || found_key.type != key.type) { ret = 0; break; } ret = process_extent(sctx, path, &found_key); if (ret < 0) break; } /* Catch error found during iteration */ if (iter_ret < 0) ret = iter_ret; return ret; } static int process_recorded_refs_if_needed(struct send_ctx *sctx, bool at_end, int *pending_move, int *refs_processed) { int ret; if (sctx->cur_ino == 0) return 0; if (!at_end && sctx->cur_ino == sctx->cmp_key->objectid && sctx->cmp_key->type <= BTRFS_INODE_EXTREF_KEY) return 0; if (list_empty(&sctx->new_refs) && list_empty(&sctx->deleted_refs)) return 0; ret = process_recorded_refs(sctx, pending_move); if (ret < 0) return ret; *refs_processed = 1; return 0; } static int finish_inode_if_needed(struct send_ctx *sctx, bool at_end) { int ret = 0; struct btrfs_inode_info info; u64 left_mode; u64 left_uid; u64 left_gid; u64 left_fileattr; u64 right_mode; u64 right_uid; u64 right_gid; u64 right_fileattr; int need_chmod = 0; int need_chown = 0; bool need_fileattr = false; int need_truncate = 1; int pending_move = 0; int refs_processed = 0; if (sctx->ignore_cur_inode) return 0; ret = process_recorded_refs_if_needed(sctx, at_end, &pending_move, &refs_processed); if (ret < 0) goto out; /* * We have processed the refs and thus need to advance send_progress. * Now, calls to get_cur_xxx will take the updated refs of the current * inode into account. * * On the other hand, if our current inode is a directory and couldn't * be moved/renamed because its parent was renamed/moved too and it has * a higher inode number, we can only move/rename our current inode * after we moved/renamed its parent. Therefore in this case operate on * the old path (pre move/rename) of our current inode, and the * move/rename will be performed later. */ if (refs_processed && !pending_move) sctx->send_progress = sctx->cur_ino + 1; if (sctx->cur_ino == 0 || sctx->cur_inode_deleted) goto out; if (!at_end && sctx->cmp_key->objectid == sctx->cur_ino) goto out; ret = get_inode_info(sctx->send_root, sctx->cur_ino, &info); if (ret < 0) goto out; left_mode = info.mode; left_uid = info.uid; left_gid = info.gid; left_fileattr = info.fileattr; if (!sctx->parent_root || sctx->cur_inode_new) { need_chown = 1; if (!S_ISLNK(sctx->cur_inode_mode)) need_chmod = 1; if (sctx->cur_inode_next_write_offset == sctx->cur_inode_size) need_truncate = 0; } else { u64 old_size; ret = get_inode_info(sctx->parent_root, sctx->cur_ino, &info); if (ret < 0) goto out; old_size = info.size; right_mode = info.mode; right_uid = info.uid; right_gid = info.gid; right_fileattr = info.fileattr; if (left_uid != right_uid || left_gid != right_gid) need_chown = 1; if (!S_ISLNK(sctx->cur_inode_mode) && left_mode != right_mode) need_chmod = 1; if (!S_ISLNK(sctx->cur_inode_mode) && left_fileattr != right_fileattr) need_fileattr = true; if ((old_size == sctx->cur_inode_size) || (sctx->cur_inode_size > old_size && sctx->cur_inode_next_write_offset == sctx->cur_inode_size)) need_truncate = 0; } if (S_ISREG(sctx->cur_inode_mode)) { if (need_send_hole(sctx)) { if (sctx->cur_inode_last_extent == (u64)-1 || sctx->cur_inode_last_extent < sctx->cur_inode_size) { ret = get_last_extent(sctx, (u64)-1); if (ret) goto out; } if (sctx->cur_inode_last_extent < sctx->cur_inode_size) { ret = range_is_hole_in_parent(sctx, sctx->cur_inode_last_extent, sctx->cur_inode_size); if (ret < 0) { goto out; } else if (ret == 0) { ret = send_hole(sctx, sctx->cur_inode_size); if (ret < 0) goto out; } else { /* Range is already a hole, skip. */ ret = 0; } } } if (need_truncate) { ret = send_truncate(sctx, sctx->cur_ino, sctx->cur_inode_gen, sctx->cur_inode_size); if (ret < 0) goto out; } } if (need_chown) { ret = send_chown(sctx, sctx->cur_ino, sctx->cur_inode_gen, left_uid, left_gid); if (ret < 0) goto out; } if (need_chmod) { ret = send_chmod(sctx, sctx->cur_ino, sctx->cur_inode_gen, left_mode); if (ret < 0) goto out; } if (need_fileattr) { ret = send_fileattr(sctx, sctx->cur_ino, sctx->cur_inode_gen, left_fileattr); if (ret < 0) goto out; } if (proto_cmd_ok(sctx, BTRFS_SEND_C_ENABLE_VERITY) && sctx->cur_inode_needs_verity) { ret = process_verity(sctx); if (ret < 0) goto out; } ret = send_capabilities(sctx); if (ret < 0) goto out; /* * If other directory inodes depended on our current directory * inode's move/rename, now do their move/rename operations. */ if (!is_waiting_for_move(sctx, sctx->cur_ino)) { ret = apply_children_dir_moves(sctx); if (ret) goto out; /* * Need to send that every time, no matter if it actually * changed between the two trees as we have done changes to * the inode before. If our inode is a directory and it's * waiting to be moved/renamed, we will send its utimes when * it's moved/renamed, therefore we don't need to do it here. */ sctx->send_progress = sctx->cur_ino + 1; /* * If the current inode is a non-empty directory, delay issuing * the utimes command for it, as it's very likely we have inodes * with an higher number inside it. We want to issue the utimes * command only after adding all dentries to it. */ if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_size > 0) ret = cache_dir_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen); else ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen); if (ret < 0) goto out; } out: if (!ret) ret = trim_dir_utimes_cache(sctx); return ret; } static void close_current_inode(struct send_ctx *sctx) { u64 i_size; if (sctx->cur_inode == NULL) return; i_size = i_size_read(sctx->cur_inode); /* * If we are doing an incremental send, we may have extents between the * last processed extent and the i_size that have not been processed * because they haven't changed but we may have read some of their pages * through readahead, see the comments at send_extent_data(). */ if (sctx->clean_page_cache && sctx->page_cache_clear_start < i_size) truncate_inode_pages_range(&sctx->cur_inode->i_data, sctx->page_cache_clear_start, round_up(i_size, PAGE_SIZE) - 1); iput(sctx->cur_inode); sctx->cur_inode = NULL; } static int changed_inode(struct send_ctx *sctx, enum btrfs_compare_tree_result result) { int ret; struct btrfs_key *key = sctx->cmp_key; struct btrfs_inode_item *left_ii = NULL; struct btrfs_inode_item *right_ii = NULL; u64 left_gen = 0; u64 right_gen = 0; close_current_inode(sctx); sctx->cur_ino = key->objectid; sctx->cur_inode_new_gen = false; sctx->cur_inode_last_extent = (u64)-1; sctx->cur_inode_next_write_offset = 0; sctx->ignore_cur_inode = false; fs_path_reset(&sctx->cur_inode_path); /* * Set send_progress to current inode. This will tell all get_cur_xxx * functions that the current inode's refs are not updated yet. Later, * when process_recorded_refs is finished, it is set to cur_ino + 1. */ sctx->send_progress = sctx->cur_ino; if (result == BTRFS_COMPARE_TREE_NEW || result == BTRFS_COMPARE_TREE_CHANGED) { left_ii = btrfs_item_ptr(sctx->left_path->nodes[0], sctx->left_path->slots[0], struct btrfs_inode_item); left_gen = btrfs_inode_generation(sctx->left_path->nodes[0], left_ii); } else { right_ii = btrfs_item_ptr(sctx->right_path->nodes[0], sctx->right_path->slots[0], struct btrfs_inode_item); right_gen = btrfs_inode_generation(sctx->right_path->nodes[0], right_ii); } if (result == BTRFS_COMPARE_TREE_CHANGED) { right_ii = btrfs_item_ptr(sctx->right_path->nodes[0], sctx->right_path->slots[0], struct btrfs_inode_item); right_gen = btrfs_inode_generation(sctx->right_path->nodes[0], right_ii); /* * The cur_ino = root dir case is special here. We can't treat * the inode as deleted+reused because it would generate a * stream that tries to delete/mkdir the root dir. */ if (left_gen != right_gen && sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) sctx->cur_inode_new_gen = true; } /* * Normally we do not find inodes with a link count of zero (orphans) * because the most common case is to create a snapshot and use it * for a send operation. However other less common use cases involve * using a subvolume and send it after turning it to RO mode just * after deleting all hard links of a file while holding an open * file descriptor against it or turning a RO snapshot into RW mode, * keep an open file descriptor against a file, delete it and then * turn the snapshot back to RO mode before using it for a send * operation. The former is what the receiver operation does. * Therefore, if we want to send these snapshots soon after they're * received, we need to handle orphan inodes as well. Moreover, orphans * can appear not only in the send snapshot but also in the parent * snapshot. Here are several cases: * * Case 1: BTRFS_COMPARE_TREE_NEW * | send snapshot | action * -------------------------------- * nlink | 0 | ignore * * Case 2: BTRFS_COMPARE_TREE_DELETED * | parent snapshot | action * ---------------------------------- * nlink | 0 | as usual * Note: No unlinks will be sent because there're no paths for it. * * Case 3: BTRFS_COMPARE_TREE_CHANGED * | | parent snapshot | send snapshot | action * ----------------------------------------------------------------------- * subcase 1 | nlink | 0 | 0 | ignore * subcase 2 | nlink | >0 | 0 | new_gen(deletion) * subcase 3 | nlink | 0 | >0 | new_gen(creation) * */ if (result == BTRFS_COMPARE_TREE_NEW) { if (btrfs_inode_nlink(sctx->left_path->nodes[0], left_ii) == 0) { sctx->ignore_cur_inode = true; return 0; } sctx->cur_inode_gen = left_gen; sctx->cur_inode_new = true; sctx->cur_inode_deleted = false; sctx->cur_inode_size = btrfs_inode_size( sctx->left_path->nodes[0], left_ii); sctx->cur_inode_mode = btrfs_inode_mode( sctx->left_path->nodes[0], left_ii); sctx->cur_inode_rdev = btrfs_inode_rdev( sctx->left_path->nodes[0], left_ii); if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) ret = send_create_inode_if_needed(sctx); } else if (result == BTRFS_COMPARE_TREE_DELETED) { sctx->cur_inode_gen = right_gen; sctx->cur_inode_new = false; sctx->cur_inode_deleted = true; sctx->cur_inode_size = btrfs_inode_size( sctx->right_path->nodes[0], right_ii); sctx->cur_inode_mode = btrfs_inode_mode( sctx->right_path->nodes[0], right_ii); } else if (result == BTRFS_COMPARE_TREE_CHANGED) { u32 new_nlinks, old_nlinks; new_nlinks = btrfs_inode_nlink(sctx->left_path->nodes[0], left_ii); old_nlinks = btrfs_inode_nlink(sctx->right_path->nodes[0], right_ii); if (new_nlinks == 0 && old_nlinks == 0) { sctx->ignore_cur_inode = true; return 0; } else if (new_nlinks == 0 || old_nlinks == 0) { sctx->cur_inode_new_gen = 1; } /* * We need to do some special handling in case the inode was * reported as changed with a changed generation number. This * means that the original inode was deleted and new inode * reused the same inum. So we have to treat the old inode as * deleted and the new one as new. */ if (sctx->cur_inode_new_gen) { /* * First, process the inode as if it was deleted. */ if (old_nlinks > 0) { sctx->cur_inode_gen = right_gen; sctx->cur_inode_new = false; sctx->cur_inode_deleted = true; sctx->cur_inode_size = btrfs_inode_size( sctx->right_path->nodes[0], right_ii); sctx->cur_inode_mode = btrfs_inode_mode( sctx->right_path->nodes[0], right_ii); ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_DELETED); if (ret < 0) return ret; } /* * Now process the inode as if it was new. */ if (new_nlinks > 0) { sctx->cur_inode_gen = left_gen; sctx->cur_inode_new = true; sctx->cur_inode_deleted = false; sctx->cur_inode_size = btrfs_inode_size( sctx->left_path->nodes[0], left_ii); sctx->cur_inode_mode = btrfs_inode_mode( sctx->left_path->nodes[0], left_ii); sctx->cur_inode_rdev = btrfs_inode_rdev( sctx->left_path->nodes[0], left_ii); ret = send_create_inode_if_needed(sctx); if (ret < 0) return ret; ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW); if (ret < 0) return ret; /* * Advance send_progress now as we did not get * into process_recorded_refs_if_needed in the * new_gen case. */ sctx->send_progress = sctx->cur_ino + 1; /* * Now process all extents and xattrs of the * inode as if they were all new. */ ret = process_all_extents(sctx); if (ret < 0) return ret; ret = process_all_new_xattrs(sctx); if (ret < 0) return ret; } } else { sctx->cur_inode_gen = left_gen; sctx->cur_inode_new = false; sctx->cur_inode_new_gen = false; sctx->cur_inode_deleted = false; sctx->cur_inode_size = btrfs_inode_size( sctx->left_path->nodes[0], left_ii); sctx->cur_inode_mode = btrfs_inode_mode( sctx->left_path->nodes[0], left_ii); } } return 0; } /* * We have to process new refs before deleted refs, but compare_trees gives us * the new and deleted refs mixed. To fix this, we record the new/deleted refs * first and later process them in process_recorded_refs. * For the cur_inode_new_gen case, we skip recording completely because * changed_inode did already initiate processing of refs. The reason for this is * that in this case, compare_tree actually compares the refs of 2 different * inodes. To fix this, process_all_refs is used in changed_inode to handle all * refs of the right tree as deleted and all refs of the left tree as new. */ static int changed_ref(struct send_ctx *sctx, enum btrfs_compare_tree_result result) { int ret = 0; if (unlikely(sctx->cur_ino != sctx->cmp_key->objectid)) { inconsistent_snapshot_error(sctx, result, "reference"); return -EIO; } if (!sctx->cur_inode_new_gen && sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) { if (result == BTRFS_COMPARE_TREE_NEW) ret = record_new_ref(sctx); else if (result == BTRFS_COMPARE_TREE_DELETED) ret = record_deleted_ref(sctx); else if (result == BTRFS_COMPARE_TREE_CHANGED) ret = record_changed_ref(sctx); } return ret; } /* * Process new/deleted/changed xattrs. We skip processing in the * cur_inode_new_gen case because changed_inode did already initiate processing * of xattrs. The reason is the same as in changed_ref */ static int changed_xattr(struct send_ctx *sctx, enum btrfs_compare_tree_result result) { int ret = 0; if (unlikely(sctx->cur_ino != sctx->cmp_key->objectid)) { inconsistent_snapshot_error(sctx, result, "xattr"); return -EIO; } if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) { if (result == BTRFS_COMPARE_TREE_NEW) ret = process_new_xattr(sctx); else if (result == BTRFS_COMPARE_TREE_DELETED) ret = process_deleted_xattr(sctx); else if (result == BTRFS_COMPARE_TREE_CHANGED) ret = process_changed_xattr(sctx); } return ret; } /* * Process new/deleted/changed extents. We skip processing in the * cur_inode_new_gen case because changed_inode did already initiate processing * of extents. The reason is the same as in changed_ref */ static int changed_extent(struct send_ctx *sctx, enum btrfs_compare_tree_result result) { int ret = 0; /* * We have found an extent item that changed without the inode item * having changed. This can happen either after relocation (where the * disk_bytenr of an extent item is replaced at * relocation.c:replace_file_extents()) or after deduplication into a * file in both the parent and send snapshots (where an extent item can * get modified or replaced with a new one). Note that deduplication * updates the inode item, but it only changes the iversion (sequence * field in the inode item) of the inode, so if a file is deduplicated * the same amount of times in both the parent and send snapshots, its * iversion becomes the same in both snapshots, whence the inode item is * the same on both snapshots. */ if (sctx->cur_ino != sctx->cmp_key->objectid) return 0; if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) { if (result != BTRFS_COMPARE_TREE_DELETED) ret = process_extent(sctx, sctx->left_path, sctx->cmp_key); } return ret; } static int changed_verity(struct send_ctx *sctx, enum btrfs_compare_tree_result result) { if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) { if (result == BTRFS_COMPARE_TREE_NEW) sctx->cur_inode_needs_verity = true; } return 0; } static int dir_changed(struct send_ctx *sctx, u64 dir) { u64 orig_gen, new_gen; int ret; ret = get_inode_gen(sctx->send_root, dir, &new_gen); if (ret) return ret; ret = get_inode_gen(sctx->parent_root, dir, &orig_gen); if (ret) return ret; return (orig_gen != new_gen) ? 1 : 0; } static int compare_refs(struct send_ctx *sctx, struct btrfs_path *path, struct btrfs_key *key) { struct btrfs_inode_extref *extref; struct extent_buffer *leaf; u64 dirid = 0, last_dirid = 0; unsigned long ptr; u32 item_size; u32 cur_offset = 0; int ref_name_len; /* Easy case, just check this one dirid */ if (key->type == BTRFS_INODE_REF_KEY) { dirid = key->offset; return dir_changed(sctx, dirid); } leaf = path->nodes[0]; item_size = btrfs_item_size(leaf, path->slots[0]); ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); while (cur_offset < item_size) { int ret; extref = (struct btrfs_inode_extref *)(ptr + cur_offset); dirid = btrfs_inode_extref_parent(leaf, extref); ref_name_len = btrfs_inode_extref_name_len(leaf, extref); cur_offset += ref_name_len + sizeof(*extref); if (dirid == last_dirid) continue; ret = dir_changed(sctx, dirid); if (ret) return ret; last_dirid = dirid; } return 0; } /* * Updates compare related fields in sctx and simply forwards to the actual * changed_xxx functions. */ static int changed_cb(struct btrfs_path *left_path, struct btrfs_path *right_path, struct btrfs_key *key, enum btrfs_compare_tree_result result, struct send_ctx *sctx) { int ret; /* * We can not hold the commit root semaphore here. This is because in * the case of sending and receiving to the same filesystem, using a * pipe, could result in a deadlock: * * 1) The task running send blocks on the pipe because it's full; * * 2) The task running receive, which is the only consumer of the pipe, * is waiting for a transaction commit (for example due to a space * reservation when doing a write or triggering a transaction commit * when creating a subvolume); * * 3) The transaction is waiting to write lock the commit root semaphore, * but can not acquire it since it's being held at 1). * * Down this call chain we write to the pipe through kernel_write(). * The same type of problem can also happen when sending to a file that * is stored in the same filesystem - when reserving space for a write * into the file, we can trigger a transaction commit. * * Our caller has supplied us with clones of leaves from the send and * parent roots, so we're safe here from a concurrent relocation and * further reallocation of metadata extents while we are here. Below we * also assert that the leaves are clones. */ lockdep_assert_not_held(&sctx->send_root->fs_info->commit_root_sem); /* * We always have a send root, so left_path is never NULL. We will not * have a leaf when we have reached the end of the send root but have * not yet reached the end of the parent root. */ if (left_path->nodes[0]) ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED, &left_path->nodes[0]->bflags)); /* * When doing a full send we don't have a parent root, so right_path is * NULL. When doing an incremental send, we may have reached the end of * the parent root already, so we don't have a leaf at right_path. */ if (right_path && right_path->nodes[0]) ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED, &right_path->nodes[0]->bflags)); if (result == BTRFS_COMPARE_TREE_SAME) { if (key->type == BTRFS_INODE_REF_KEY || key->type == BTRFS_INODE_EXTREF_KEY) { ret = compare_refs(sctx, left_path, key); if (!ret) return 0; if (ret < 0) return ret; } else if (key->type == BTRFS_EXTENT_DATA_KEY) { return maybe_send_hole(sctx, left_path, key); } else { return 0; } result = BTRFS_COMPARE_TREE_CHANGED; } sctx->left_path = left_path; sctx->right_path = right_path; sctx->cmp_key = key; ret = finish_inode_if_needed(sctx, false); if (ret < 0) return ret; /* Ignore non-FS objects */ if (key->objectid == BTRFS_FREE_INO_OBJECTID || key->objectid == BTRFS_FREE_SPACE_OBJECTID) return 0; if (key->type == BTRFS_INODE_ITEM_KEY) { ret = changed_inode(sctx, result); } else if (!sctx->ignore_cur_inode) { if (key->type == BTRFS_INODE_REF_KEY || key->type == BTRFS_INODE_EXTREF_KEY) ret = changed_ref(sctx, result); else if (key->type == BTRFS_XATTR_ITEM_KEY) ret = changed_xattr(sctx, result); else if (key->type == BTRFS_EXTENT_DATA_KEY) ret = changed_extent(sctx, result); else if (key->type == BTRFS_VERITY_DESC_ITEM_KEY && key->offset == 0) ret = changed_verity(sctx, result); } return ret; } static int search_key_again(const struct send_ctx *sctx, struct btrfs_root *root, struct btrfs_path *path, const struct btrfs_key *key) { int ret; if (!path->need_commit_sem) lockdep_assert_held_read(&root->fs_info->commit_root_sem); /* * Roots used for send operations are readonly and no one can add, * update or remove keys from them, so we should be able to find our * key again. The only exception is deduplication, which can operate on * readonly roots and add, update or remove keys to/from them - but at * the moment we don't allow it to run in parallel with send. */ ret = btrfs_search_slot(NULL, root, key, path, 0, 0); ASSERT(ret <= 0); if (unlikely(ret > 0)) { btrfs_print_tree(path->nodes[path->lowest_level], false); btrfs_err(root->fs_info, "send: key " BTRFS_KEY_FMT" not found in %s root %llu, lowest_level %d, slot %d", BTRFS_KEY_FMT_VALUE(key), (root == sctx->parent_root ? "parent" : "send"), btrfs_root_id(root), path->lowest_level, path->slots[path->lowest_level]); return -EUCLEAN; } return ret; } static int full_send_tree(struct send_ctx *sctx) { int ret; struct btrfs_root *send_root = sctx->send_root; struct btrfs_key key; struct btrfs_fs_info *fs_info = send_root->fs_info; BTRFS_PATH_AUTO_FREE(path); path = alloc_path_for_send(); if (!path) return -ENOMEM; path->reada = READA_FORWARD_ALWAYS; key.objectid = BTRFS_FIRST_FREE_OBJECTID; key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; down_read(&fs_info->commit_root_sem); sctx->last_reloc_trans = fs_info->last_reloc_trans; up_read(&fs_info->commit_root_sem); ret = btrfs_search_slot_for_read(send_root, &key, path, 1, 0); if (ret < 0) return ret; if (ret) goto out_finish; while (1) { btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); ret = changed_cb(path, NULL, &key, BTRFS_COMPARE_TREE_NEW, sctx); if (ret < 0) return ret; down_read(&fs_info->commit_root_sem); if (fs_info->last_reloc_trans > sctx->last_reloc_trans) { sctx->last_reloc_trans = fs_info->last_reloc_trans; up_read(&fs_info->commit_root_sem); /* * A transaction used for relocating a block group was * committed or is about to finish its commit. Release * our path (leaf) and restart the search, so that we * avoid operating on any file extent items that are * stale, with a disk_bytenr that reflects a pre * relocation value. This way we avoid as much as * possible to fallback to regular writes when checking * if we can clone file ranges. */ btrfs_release_path(path); ret = search_key_again(sctx, send_root, path, &key); if (ret < 0) return ret; } else { up_read(&fs_info->commit_root_sem); } ret = btrfs_next_item(send_root, path); if (ret < 0) return ret; if (ret) { ret = 0; break; } } out_finish: return finish_inode_if_needed(sctx, true); } static int replace_node_with_clone(struct btrfs_path *path, int level) { struct extent_buffer *clone; clone = btrfs_clone_extent_buffer(path->nodes[level]); if (!clone) return -ENOMEM; free_extent_buffer(path->nodes[level]); path->nodes[level] = clone; return 0; } static int tree_move_down(struct btrfs_path *path, int *level, u64 reada_min_gen) { struct extent_buffer *eb; struct extent_buffer *parent = path->nodes[*level]; int slot = path->slots[*level]; const int nritems = btrfs_header_nritems(parent); u64 reada_max; u64 reada_done = 0; lockdep_assert_held_read(&parent->fs_info->commit_root_sem); ASSERT(*level != 0); eb = btrfs_read_node_slot(parent, slot); if (IS_ERR(eb)) return PTR_ERR(eb); /* * Trigger readahead for the next leaves we will process, so that it is * very likely that when we need them they are already in memory and we * will not block on disk IO. For nodes we only do readahead for one, * since the time window between processing nodes is typically larger. */ reada_max = (*level == 1 ? SZ_128K : eb->fs_info->nodesize); for (slot++; slot < nritems && reada_done < reada_max; slot++) { if (btrfs_node_ptr_generation(parent, slot) > reada_min_gen) { btrfs_readahead_node_child(parent, slot); reada_done += eb->fs_info->nodesize; } } path->nodes[*level - 1] = eb; path->slots[*level - 1] = 0; (*level)--; if (*level == 0) return replace_node_with_clone(path, 0); return 0; } static int tree_move_next_or_upnext(struct btrfs_path *path, int *level, int root_level) { int ret = 0; int nritems; nritems = btrfs_header_nritems(path->nodes[*level]); path->slots[*level]++; while (path->slots[*level] >= nritems) { if (*level == root_level) { path->slots[*level] = nritems - 1; return -1; } /* move upnext */ path->slots[*level] = 0; free_extent_buffer(path->nodes[*level]); path->nodes[*level] = NULL; (*level)++; path->slots[*level]++; nritems = btrfs_header_nritems(path->nodes[*level]); ret = 1; } return ret; } /* * Returns 1 if it had to move up and next. 0 is returned if it moved only next * or down. */ static int tree_advance(struct btrfs_path *path, int *level, int root_level, int allow_down, struct btrfs_key *key, u64 reada_min_gen) { int ret; if (*level == 0 || !allow_down) { ret = tree_move_next_or_upnext(path, level, root_level); } else { ret = tree_move_down(path, level, reada_min_gen); } /* * Even if we have reached the end of a tree, ret is -1, update the key * anyway, so that in case we need to restart due to a block group * relocation, we can assert that the last key of the root node still * exists in the tree. */ if (*level == 0) btrfs_item_key_to_cpu(path->nodes[*level], key, path->slots[*level]); else btrfs_node_key_to_cpu(path->nodes[*level], key, path->slots[*level]); return ret; } static int tree_compare_item(struct btrfs_path *left_path, struct btrfs_path *right_path, char *tmp_buf) { int cmp; int len1, len2; unsigned long off1, off2; len1 = btrfs_item_size(left_path->nodes[0], left_path->slots[0]); len2 = btrfs_item_size(right_path->nodes[0], right_path->slots[0]); if (len1 != len2) return 1; off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]); off2 = btrfs_item_ptr_offset(right_path->nodes[0], right_path->slots[0]); read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1); cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1); if (cmp) return 1; return 0; } /* * A transaction used for relocating a block group was committed or is about to * finish its commit. Release our paths and restart the search, so that we are * not using stale extent buffers: * * 1) For levels > 0, we are only holding references of extent buffers, without * any locks on them, which does not prevent them from having been relocated * and reallocated after the last time we released the commit root semaphore. * The exception are the root nodes, for which we always have a clone, see * the comment at btrfs_compare_trees(); * * 2) For leaves, level 0, we are holding copies (clones) of extent buffers, so * we are safe from the concurrent relocation and reallocation. However they * can have file extent items with a pre relocation disk_bytenr value, so we * restart the start from the current commit roots and clone the new leaves so * that we get the post relocation disk_bytenr values. Not doing so, could * make us clone the wrong data in case there are new extents using the old * disk_bytenr that happen to be shared. */ static int restart_after_relocation(struct btrfs_path *left_path, struct btrfs_path *right_path, const struct btrfs_key *left_key, const struct btrfs_key *right_key, int left_level, int right_level, const struct send_ctx *sctx) { int root_level; int ret; lockdep_assert_held_read(&sctx->send_root->fs_info->commit_root_sem); btrfs_release_path(left_path); btrfs_release_path(right_path); /* * Since keys can not be added or removed to/from our roots because they * are readonly and we do not allow deduplication to run in parallel * (which can add, remove or change keys), the layout of the trees should * not change. */ left_path->lowest_level = left_level; ret = search_key_again(sctx, sctx->send_root, left_path, left_key); if (ret < 0) return ret; right_path->lowest_level = right_level; ret = search_key_again(sctx, sctx->parent_root, right_path, right_key); if (ret < 0) return ret; /* * If the lowest level nodes are leaves, clone them so that they can be * safely used by changed_cb() while not under the protection of the * commit root semaphore, even if relocation and reallocation happens in * parallel. */ if (left_level == 0) { ret = replace_node_with_clone(left_path, 0); if (ret < 0) return ret; } if (right_level == 0) { ret = replace_node_with_clone(right_path, 0); if (ret < 0) return ret; } /* * Now clone the root nodes (unless they happen to be the leaves we have * already cloned). This is to protect against concurrent snapshotting of * the send and parent roots (see the comment at btrfs_compare_trees()). */ root_level = btrfs_header_level(sctx->send_root->commit_root); if (root_level > 0) { ret = replace_node_with_clone(left_path, root_level); if (ret < 0) return ret; } root_level = btrfs_header_level(sctx->parent_root->commit_root); if (root_level > 0) { ret = replace_node_with_clone(right_path, root_level); if (ret < 0) return ret; } return 0; } /* * This function compares two trees and calls the provided callback for * every changed/new/deleted item it finds. * If shared tree blocks are encountered, whole subtrees are skipped, making * the compare pretty fast on snapshotted subvolumes. * * This currently works on commit roots only. As commit roots are read only, * we don't do any locking. The commit roots are protected with transactions. * Transactions are ended and rejoined when a commit is tried in between. * * This function checks for modifications done to the trees while comparing. * If it detects a change, it aborts immediately. */ static int btrfs_compare_trees(struct btrfs_root *left_root, struct btrfs_root *right_root, struct send_ctx *sctx) { struct btrfs_fs_info *fs_info = left_root->fs_info; int ret; int cmp; BTRFS_PATH_AUTO_FREE(left_path); BTRFS_PATH_AUTO_FREE(right_path); struct btrfs_key left_key; struct btrfs_key right_key; char *tmp_buf = NULL; int left_root_level; int right_root_level; int left_level; int right_level; int left_end_reached = 0; int right_end_reached = 0; int advance_left = 0; int advance_right = 0; u64 left_blockptr; u64 right_blockptr; u64 left_gen; u64 right_gen; u64 reada_min_gen; left_path = btrfs_alloc_path(); if (!left_path) { ret = -ENOMEM; goto out; } right_path = btrfs_alloc_path(); if (!right_path) { ret = -ENOMEM; goto out; } tmp_buf = kvmalloc(fs_info->nodesize, GFP_KERNEL); if (!tmp_buf) { ret = -ENOMEM; goto out; } left_path->search_commit_root = true; left_path->skip_locking = true; right_path->search_commit_root = true; right_path->skip_locking = true; /* * Strategy: Go to the first items of both trees. Then do * * If both trees are at level 0 * Compare keys of current items * If left < right treat left item as new, advance left tree * and repeat * If left > right treat right item as deleted, advance right tree * and repeat * If left == right do deep compare of items, treat as changed if * needed, advance both trees and repeat * If both trees are at the same level but not at level 0 * Compare keys of current nodes/leafs * If left < right advance left tree and repeat * If left > right advance right tree and repeat * If left == right compare blockptrs of the next nodes/leafs * If they match advance both trees but stay at the same level * and repeat * If they don't match advance both trees while allowing to go * deeper and repeat * If tree levels are different * Advance the tree that needs it and repeat * * Advancing a tree means: * If we are at level 0, try to go to the next slot. If that's not * possible, go one level up and repeat. Stop when we found a level * where we could go to the next slot. We may at this point be on a * node or a leaf. * * If we are not at level 0 and not on shared tree blocks, go one * level deeper. * * If we are not at level 0 and on shared tree blocks, go one slot to * the right if possible or go up and right. */ down_read(&fs_info->commit_root_sem); left_level = btrfs_header_level(left_root->commit_root); left_root_level = left_level; /* * We clone the root node of the send and parent roots to prevent races * with snapshot creation of these roots. Snapshot creation COWs the * root node of a tree, so after the transaction is committed the old * extent can be reallocated while this send operation is still ongoing. * So we clone them, under the commit root semaphore, to be race free. */ left_path->nodes[left_level] = btrfs_clone_extent_buffer(left_root->commit_root); if (!left_path->nodes[left_level]) { ret = -ENOMEM; goto out_unlock; } right_level = btrfs_header_level(right_root->commit_root); right_root_level = right_level; right_path->nodes[right_level] = btrfs_clone_extent_buffer(right_root->commit_root); if (!right_path->nodes[right_level]) { ret = -ENOMEM; goto out_unlock; } /* * Our right root is the parent root, while the left root is the "send" * root. We know that all new nodes/leaves in the left root must have * a generation greater than the right root's generation, so we trigger * readahead for those nodes and leaves of the left root, as we know we * will need to read them at some point. */ reada_min_gen = btrfs_header_generation(right_root->commit_root); if (left_level == 0) btrfs_item_key_to_cpu(left_path->nodes[left_level], &left_key, left_path->slots[left_level]); else btrfs_node_key_to_cpu(left_path->nodes[left_level], &left_key, left_path->slots[left_level]); if (right_level == 0) btrfs_item_key_to_cpu(right_path->nodes[right_level], &right_key, right_path->slots[right_level]); else btrfs_node_key_to_cpu(right_path->nodes[right_level], &right_key, right_path->slots[right_level]); sctx->last_reloc_trans = fs_info->last_reloc_trans; while (1) { if (need_resched() || rwsem_is_contended(&fs_info->commit_root_sem)) { up_read(&fs_info->commit_root_sem); cond_resched(); down_read(&fs_info->commit_root_sem); } if (fs_info->last_reloc_trans > sctx->last_reloc_trans) { ret = restart_after_relocation(left_path, right_path, &left_key, &right_key, left_level, right_level, sctx); if (ret < 0) goto out_unlock; sctx->last_reloc_trans = fs_info->last_reloc_trans; } if (advance_left && !left_end_reached) { ret = tree_advance(left_path, &left_level, left_root_level, advance_left != ADVANCE_ONLY_NEXT, &left_key, reada_min_gen); if (ret == -1) left_end_reached = ADVANCE; else if (ret < 0) goto out_unlock; advance_left = 0; } if (advance_right && !right_end_reached) { ret = tree_advance(right_path, &right_level, right_root_level, advance_right != ADVANCE_ONLY_NEXT, &right_key, reada_min_gen); if (ret == -1) right_end_reached = ADVANCE; else if (ret < 0) goto out_unlock; advance_right = 0; } if (left_end_reached && right_end_reached) { ret = 0; goto out_unlock; } else if (left_end_reached) { if (right_level == 0) { up_read(&fs_info->commit_root_sem); ret = changed_cb(left_path, right_path, &right_key, BTRFS_COMPARE_TREE_DELETED, sctx); if (ret < 0) goto out; down_read(&fs_info->commit_root_sem); } advance_right = ADVANCE; continue; } else if (right_end_reached) { if (left_level == 0) { up_read(&fs_info->commit_root_sem); ret = changed_cb(left_path, right_path, &left_key, BTRFS_COMPARE_TREE_NEW, sctx); if (ret < 0) goto out; down_read(&fs_info->commit_root_sem); } advance_left = ADVANCE; continue; } if (left_level == 0 && right_level == 0) { up_read(&fs_info->commit_root_sem); cmp = btrfs_comp_cpu_keys(&left_key, &right_key); if (cmp < 0) { ret = changed_cb(left_path, right_path, &left_key, BTRFS_COMPARE_TREE_NEW, sctx); advance_left = ADVANCE; } else if (cmp > 0) { ret = changed_cb(left_path, right_path, &right_key, BTRFS_COMPARE_TREE_DELETED, sctx); advance_right = ADVANCE; } else { enum btrfs_compare_tree_result result; WARN_ON(!extent_buffer_uptodate(left_path->nodes[0])); ret = tree_compare_item(left_path, right_path, tmp_buf); if (ret) result = BTRFS_COMPARE_TREE_CHANGED; else result = BTRFS_COMPARE_TREE_SAME; ret = changed_cb(left_path, right_path, &left_key, result, sctx); advance_left = ADVANCE; advance_right = ADVANCE; } if (ret < 0) goto out; down_read(&fs_info->commit_root_sem); } else if (left_level == right_level) { cmp = btrfs_comp_cpu_keys(&left_key, &right_key); if (cmp < 0) { advance_left = ADVANCE; } else if (cmp > 0) { advance_right = ADVANCE; } else { left_blockptr = btrfs_node_blockptr( left_path->nodes[left_level], left_path->slots[left_level]); right_blockptr = btrfs_node_blockptr( right_path->nodes[right_level], right_path->slots[right_level]); left_gen = btrfs_node_ptr_generation( left_path->nodes[left_level], left_path->slots[left_level]); right_gen = btrfs_node_ptr_generation( right_path->nodes[right_level], right_path->slots[right_level]); if (left_blockptr == right_blockptr && left_gen == right_gen) { /* * As we're on a shared block, don't * allow to go deeper. */ advance_left = ADVANCE_ONLY_NEXT; advance_right = ADVANCE_ONLY_NEXT; } else { advance_left = ADVANCE; advance_right = ADVANCE; } } } else if (left_level < right_level) { advance_right = ADVANCE; } else { advance_left = ADVANCE; } } out_unlock: up_read(&fs_info->commit_root_sem); out: kvfree(tmp_buf); return ret; } static int send_subvol(struct send_ctx *sctx) { int ret; if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_STREAM_HEADER)) { ret = send_header(sctx); if (ret < 0) goto out; } ret = send_subvol_begin(sctx); if (ret < 0) goto out; if (sctx->parent_root) { ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root, sctx); if (ret < 0) goto out; ret = finish_inode_if_needed(sctx, true); if (ret < 0) goto out; } else { ret = full_send_tree(sctx); if (ret < 0) goto out; } out: free_recorded_refs(sctx); return ret; } /* * If orphan cleanup did remove any orphans from a root, it means the tree * was modified and therefore the commit root is not the same as the current * root anymore. This is a problem, because send uses the commit root and * therefore can see inode items that don't exist in the current root anymore, * and for example make calls to btrfs_iget, which will do tree lookups based * on the current root and not on the commit root. Those lookups will fail, * returning a -ESTALE error, and making send fail with that error. So make * sure a send does not see any orphans we have just removed, and that it will * see the same inodes regardless of whether a transaction commit happened * before it started (meaning that the commit root will be the same as the * current root) or not. */ static int ensure_commit_roots_uptodate(struct send_ctx *sctx) { struct btrfs_root *root = sctx->parent_root; if (root && root->node != root->commit_root) return btrfs_commit_current_transaction(root); for (int i = 0; i < sctx->clone_roots_cnt; i++) { root = sctx->clone_roots[i].root; if (root->node != root->commit_root) return btrfs_commit_current_transaction(root); } return 0; } /* * Make sure any existing delalloc is flushed for any root used by a send * operation so that we do not miss any data and we do not race with writeback * finishing and changing a tree while send is using the tree. This could * happen if a subvolume is in RW mode, has delalloc, is turned to RO mode and * a send operation then uses the subvolume. * After flushing delalloc ensure_commit_roots_uptodate() must be called. */ static int flush_delalloc_roots(struct send_ctx *sctx) { struct btrfs_root *root = sctx->parent_root; int ret; int i; if (root) { ret = btrfs_start_delalloc_snapshot(root, false); if (ret) return ret; btrfs_wait_ordered_extents(root, U64_MAX, NULL); } for (i = 0; i < sctx->clone_roots_cnt; i++) { root = sctx->clone_roots[i].root; ret = btrfs_start_delalloc_snapshot(root, false); if (ret) return ret; btrfs_wait_ordered_extents(root, U64_MAX, NULL); } return 0; } static void btrfs_root_dec_send_in_progress(struct btrfs_root* root) { spin_lock(&root->root_item_lock); root->send_in_progress--; /* * Not much left to do, we don't know why it's unbalanced and * can't blindly reset it to 0. */ if (root->send_in_progress < 0) btrfs_err(root->fs_info, "send_in_progress unbalanced %d root %llu", root->send_in_progress, btrfs_root_id(root)); spin_unlock(&root->root_item_lock); } static void dedupe_in_progress_warn(const struct btrfs_root *root) { btrfs_warn_rl(root->fs_info, "cannot use root %llu for send while deduplications on it are in progress (%d in progress)", btrfs_root_id(root), root->dedupe_in_progress); } long btrfs_ioctl_send(struct btrfs_root *send_root, const struct btrfs_ioctl_send_args *arg) { int ret = 0; struct btrfs_fs_info *fs_info = send_root->fs_info; struct btrfs_root *clone_root; struct send_ctx *sctx = NULL; u32 i; u64 *clone_sources_tmp = NULL; int clone_sources_to_rollback = 0; size_t alloc_size; int sort_clone_roots = 0; struct btrfs_lru_cache_entry *entry; struct btrfs_lru_cache_entry *tmp; if (!capable(CAP_SYS_ADMIN)) return -EPERM; /* * The subvolume must remain read-only during send, protect against * making it RW. This also protects against deletion. */ spin_lock(&send_root->root_item_lock); /* * Unlikely but possible, if the subvolume is marked for deletion but * is slow to remove the directory entry, send can still be started. */ if (btrfs_root_dead(send_root)) { spin_unlock(&send_root->root_item_lock); return -EPERM; } /* Userspace tools do the checks and warn the user if it's not RO. */ if (!btrfs_root_readonly(send_root)) { spin_unlock(&send_root->root_item_lock); return -EPERM; } if (send_root->dedupe_in_progress) { dedupe_in_progress_warn(send_root); spin_unlock(&send_root->root_item_lock); return -EAGAIN; } send_root->send_in_progress++; spin_unlock(&send_root->root_item_lock); /* * Check that we don't overflow at later allocations, we request * clone_sources_count + 1 items, and compare to unsigned long inside * access_ok. Also set an upper limit for allocation size so this can't * easily exhaust memory. Max number of clone sources is about 200K. */ if (arg->clone_sources_count > SZ_8M / sizeof(struct clone_root)) { ret = -EINVAL; goto out; } if (arg->flags & ~BTRFS_SEND_FLAG_MASK) { ret = -EOPNOTSUPP; goto out; } sctx = kzalloc_obj(struct send_ctx); if (!sctx) { ret = -ENOMEM; goto out; } init_path(&sctx->cur_inode_path); INIT_LIST_HEAD(&sctx->new_refs); INIT_LIST_HEAD(&sctx->deleted_refs); btrfs_lru_cache_init(&sctx->name_cache, SEND_MAX_NAME_CACHE_SIZE); btrfs_lru_cache_init(&sctx->backref_cache, SEND_MAX_BACKREF_CACHE_SIZE); btrfs_lru_cache_init(&sctx->dir_created_cache, SEND_MAX_DIR_CREATED_CACHE_SIZE); /* * This cache is periodically trimmed to a fixed size elsewhere, see * cache_dir_utimes() and trim_dir_utimes_cache(). */ btrfs_lru_cache_init(&sctx->dir_utimes_cache, 0); sctx->pending_dir_moves = RB_ROOT; sctx->waiting_dir_moves = RB_ROOT; sctx->orphan_dirs = RB_ROOT; sctx->rbtree_new_refs = RB_ROOT; sctx->rbtree_deleted_refs = RB_ROOT; sctx->flags = arg->flags; if (arg->flags & BTRFS_SEND_FLAG_VERSION) { if (arg->version > BTRFS_SEND_STREAM_VERSION) { ret = -EPROTO; goto out; } /* Zero means "use the highest version" */ sctx->proto = arg->version ?: BTRFS_SEND_STREAM_VERSION; } else { sctx->proto = 1; } if ((arg->flags & BTRFS_SEND_FLAG_COMPRESSED) && sctx->proto < 2) { ret = -EINVAL; goto out; } sctx->send_filp = fget(arg->send_fd); if (!sctx->send_filp || !(sctx->send_filp->f_mode & FMODE_WRITE)) { ret = -EBADF; goto out; } sctx->send_root = send_root; sctx->clone_roots_cnt = arg->clone_sources_count; if (sctx->proto >= 2) { u32 send_buf_num_pages; sctx->send_max_size = BTRFS_SEND_BUF_SIZE_V2; sctx->send_buf = vmalloc(sctx->send_max_size); if (!sctx->send_buf) { ret = -ENOMEM; goto out; } send_buf_num_pages = sctx->send_max_size >> PAGE_SHIFT; sctx->send_buf_pages = kzalloc_objs(*sctx->send_buf_pages, send_buf_num_pages); if (!sctx->send_buf_pages) { ret = -ENOMEM; goto out; } for (i = 0; i < send_buf_num_pages; i++) { sctx->send_buf_pages[i] = vmalloc_to_page(sctx->send_buf + (i << PAGE_SHIFT)); } } else { sctx->send_max_size = BTRFS_SEND_BUF_SIZE_V1; sctx->send_buf = kvmalloc(sctx->send_max_size, GFP_KERNEL); } if (!sctx->send_buf) { ret = -ENOMEM; goto out; } sctx->clone_roots = kvzalloc_objs(*sctx->clone_roots, arg->clone_sources_count + 1); if (!sctx->clone_roots) { ret = -ENOMEM; goto out; } alloc_size = array_size(sizeof(*arg->clone_sources), arg->clone_sources_count); if (arg->clone_sources_count) { clone_sources_tmp = kvmalloc(alloc_size, GFP_KERNEL); if (!clone_sources_tmp) { ret = -ENOMEM; goto out; } ret = copy_from_user(clone_sources_tmp, arg->clone_sources, alloc_size); if (ret) { ret = -EFAULT; goto out; } for (i = 0; i < arg->clone_sources_count; i++) { clone_root = btrfs_get_fs_root(fs_info, clone_sources_tmp[i], true); if (IS_ERR(clone_root)) { ret = PTR_ERR(clone_root); goto out; } spin_lock(&clone_root->root_item_lock); if (!btrfs_root_readonly(clone_root) || btrfs_root_dead(clone_root)) { spin_unlock(&clone_root->root_item_lock); btrfs_put_root(clone_root); ret = -EPERM; goto out; } if (clone_root->dedupe_in_progress) { dedupe_in_progress_warn(clone_root); spin_unlock(&clone_root->root_item_lock); btrfs_put_root(clone_root); ret = -EAGAIN; goto out; } clone_root->send_in_progress++; spin_unlock(&clone_root->root_item_lock); sctx->clone_roots[i].root = clone_root; clone_sources_to_rollback = i + 1; } kvfree(clone_sources_tmp); clone_sources_tmp = NULL; } if (arg->parent_root) { sctx->parent_root = btrfs_get_fs_root(fs_info, arg->parent_root, true); if (IS_ERR(sctx->parent_root)) { ret = PTR_ERR(sctx->parent_root); goto out; } spin_lock(&sctx->parent_root->root_item_lock); sctx->parent_root->send_in_progress++; if (!btrfs_root_readonly(sctx->parent_root) || btrfs_root_dead(sctx->parent_root)) { spin_unlock(&sctx->parent_root->root_item_lock); ret = -EPERM; goto out; } if (sctx->parent_root->dedupe_in_progress) { dedupe_in_progress_warn(sctx->parent_root); spin_unlock(&sctx->parent_root->root_item_lock); ret = -EAGAIN; goto out; } spin_unlock(&sctx->parent_root->root_item_lock); } /* * Clones from send_root are allowed, but only if the clone source * is behind the current send position. This is checked while searching * for possible clone sources. */ sctx->clone_roots[sctx->clone_roots_cnt++].root = btrfs_grab_root(sctx->send_root); /* We do a bsearch later */ sort(sctx->clone_roots, sctx->clone_roots_cnt, sizeof(*sctx->clone_roots), __clone_root_cmp_sort, NULL); sort_clone_roots = 1; ret = flush_delalloc_roots(sctx); if (ret) goto out; ret = ensure_commit_roots_uptodate(sctx); if (ret) goto out; ret = send_subvol(sctx); if (ret < 0) goto out; btrfs_lru_cache_for_each_entry_safe(&sctx->dir_utimes_cache, entry, tmp) { ret = send_utimes(sctx, entry->key, entry->gen); if (ret < 0) goto out; btrfs_lru_cache_remove(&sctx->dir_utimes_cache, entry); } if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_END_CMD)) { ret = begin_cmd(sctx, BTRFS_SEND_C_END); if (ret < 0) goto out; ret = send_cmd(sctx); if (ret < 0) goto out; } out: WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->pending_dir_moves)); while (sctx && !RB_EMPTY_ROOT(&sctx->pending_dir_moves)) { struct rb_node *n; struct pending_dir_move *pm; n = rb_first(&sctx->pending_dir_moves); pm = rb_entry(n, struct pending_dir_move, node); while (!list_empty(&pm->list)) { struct pending_dir_move *pm2; pm2 = list_first_entry(&pm->list, struct pending_dir_move, list); free_pending_move(sctx, pm2); } free_pending_move(sctx, pm); } WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves)); while (sctx && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves)) { struct rb_node *n; struct waiting_dir_move *dm; n = rb_first(&sctx->waiting_dir_moves); dm = rb_entry(n, struct waiting_dir_move, node); rb_erase(&dm->node, &sctx->waiting_dir_moves); kfree(dm); } WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->orphan_dirs)); while (sctx && !RB_EMPTY_ROOT(&sctx->orphan_dirs)) { struct rb_node *n; struct orphan_dir_info *odi; n = rb_first(&sctx->orphan_dirs); odi = rb_entry(n, struct orphan_dir_info, node); free_orphan_dir_info(sctx, odi); } if (sort_clone_roots) { for (i = 0; i < sctx->clone_roots_cnt; i++) { btrfs_root_dec_send_in_progress( sctx->clone_roots[i].root); btrfs_put_root(sctx->clone_roots[i].root); } } else { for (i = 0; sctx && i < clone_sources_to_rollback; i++) { btrfs_root_dec_send_in_progress( sctx->clone_roots[i].root); btrfs_put_root(sctx->clone_roots[i].root); } btrfs_root_dec_send_in_progress(send_root); } if (sctx && !IS_ERR_OR_NULL(sctx->parent_root)) { btrfs_root_dec_send_in_progress(sctx->parent_root); btrfs_put_root(sctx->parent_root); } kvfree(clone_sources_tmp); if (sctx) { if (sctx->send_filp) fput(sctx->send_filp); kvfree(sctx->clone_roots); kfree(sctx->send_buf_pages); kvfree(sctx->send_buf); kvfree(sctx->verity_descriptor); close_current_inode(sctx); btrfs_lru_cache_clear(&sctx->name_cache); btrfs_lru_cache_clear(&sctx->backref_cache); btrfs_lru_cache_clear(&sctx->dir_created_cache); btrfs_lru_cache_clear(&sctx->dir_utimes_cache); if (sctx->cur_inode_path.buf != sctx->cur_inode_path.inline_buf) kfree(sctx->cur_inode_path.buf); kfree(sctx); } return ret; }
3 3 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 // SPDX-License-Identifier: GPL-2.0 /* * io_uring opcode handling table */ #include <linux/kernel.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/io_uring.h> #include <linux/io_uring/cmd.h> #include "io_uring.h" #include "opdef.h" #include "refs.h" #include "tctx.h" #include "sqpoll.h" #include "fdinfo.h" #include "kbuf.h" #include "rsrc.h" #include "xattr.h" #include "nop.h" #include "fs.h" #include "splice.h" #include "sync.h" #include "advise.h" #include "openclose.h" #include "uring_cmd.h" #include "epoll.h" #include "statx.h" #include "net.h" #include "msg_ring.h" #include "timeout.h" #include "poll.h" #include "cancel.h" #include "rw.h" #include "waitid.h" #include "futex.h" #include "truncate.h" #include "zcrx.h" static int io_no_issue(struct io_kiocb *req, unsigned int issue_flags) { WARN_ON_ONCE(1); return -ECANCELED; } static __maybe_unused int io_eopnotsupp_prep(struct io_kiocb *kiocb, const struct io_uring_sqe *sqe) { return -EOPNOTSUPP; } const struct io_issue_def io_issue_defs[] = { [IORING_OP_NOP] = { .audit_skip = 1, .iopoll = 1, .prep = io_nop_prep, .issue = io_nop, }, [IORING_OP_READV] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollin = 1, .buffer_select = 1, .plug = 1, .audit_skip = 1, .ioprio = 1, .iopoll = 1, .vectored = 1, .async_size = sizeof(struct io_async_rw), .prep = io_prep_readv, .issue = io_read, }, [IORING_OP_WRITEV] = { .needs_file = 1, .hash_reg_file = 1, .unbound_nonreg_file = 1, .pollout = 1, .plug = 1, .audit_skip = 1, .ioprio = 1, .iopoll = 1, .vectored = 1, .async_size = sizeof(struct io_async_rw), .prep = io_prep_writev, .issue = io_write, }, [IORING_OP_FSYNC] = { .needs_file = 1, .audit_skip = 1, .prep = io_fsync_prep, .issue = io_fsync, }, [IORING_OP_READ_FIXED] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollin = 1, .plug = 1, .audit_skip = 1, .ioprio = 1, .iopoll = 1, .async_size = sizeof(struct io_async_rw), .prep = io_prep_read_fixed, .issue = io_read_fixed, }, [IORING_OP_WRITE_FIXED] = { .needs_file = 1, .hash_reg_file = 1, .unbound_nonreg_file = 1, .pollout = 1, .plug = 1, .audit_skip = 1, .ioprio = 1, .iopoll = 1, .async_size = sizeof(struct io_async_rw), .prep = io_prep_write_fixed, .issue = io_write_fixed, }, [IORING_OP_POLL_ADD] = { .needs_file = 1, .unbound_nonreg_file = 1, .audit_skip = 1, .prep = io_poll_add_prep, .issue = io_poll_add, }, [IORING_OP_POLL_REMOVE] = { .audit_skip = 1, .prep = io_poll_remove_prep, .issue = io_poll_remove, }, [IORING_OP_SYNC_FILE_RANGE] = { .needs_file = 1, .audit_skip = 1, .prep = io_sfr_prep, .issue = io_sync_file_range, }, [IORING_OP_SENDMSG] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, .ioprio = 1, #if defined(CONFIG_NET) .async_size = sizeof(struct io_async_msghdr), .prep = io_sendmsg_prep, .issue = io_sendmsg, #else .prep = io_eopnotsupp_prep, #endif }, [IORING_OP_RECVMSG] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollin = 1, .buffer_select = 1, .ioprio = 1, #if defined(CONFIG_NET) .async_size = sizeof(struct io_async_msghdr), .prep = io_recvmsg_prep, .issue = io_recvmsg, #else .prep = io_eopnotsupp_prep, #endif }, [IORING_OP_TIMEOUT] = { .audit_skip = 1, .async_size = sizeof(struct io_timeout_data), .prep = io_timeout_prep, .issue = io_timeout, }, [IORING_OP_TIMEOUT_REMOVE] = { /* used by timeout updates' prep() */ .audit_skip = 1, .prep = io_timeout_remove_prep, .issue = io_timeout_remove, }, [IORING_OP_ACCEPT] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollin = 1, .poll_exclusive = 1, .ioprio = 1, /* used for flags */ #if defined(CONFIG_NET) .prep = io_accept_prep, .issue = io_accept, #else .prep = io_eopnotsupp_prep, #endif }, [IORING_OP_ASYNC_CANCEL] = { .audit_skip = 1, .prep = io_async_cancel_prep, .issue = io_async_cancel, }, [IORING_OP_LINK_TIMEOUT] = { .audit_skip = 1, .async_size = sizeof(struct io_timeout_data), .prep = io_link_timeout_prep, .issue = io_no_issue, }, [IORING_OP_CONNECT] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, #if defined(CONFIG_NET) .async_size = sizeof(struct io_async_msghdr), .prep = io_connect_prep, .issue = io_connect, #else .prep = io_eopnotsupp_prep, #endif }, [IORING_OP_FALLOCATE] = { .needs_file = 1, .hash_reg_file = 1, .prep = io_fallocate_prep, .issue = io_fallocate, }, [IORING_OP_OPENAT] = { .filter_pdu_size = sizeof_field(struct io_uring_bpf_ctx, open), .prep = io_openat_prep, .issue = io_openat, .filter_populate = io_openat_bpf_populate, }, [IORING_OP_CLOSE] = { .prep = io_close_prep, .issue = io_close, }, [IORING_OP_FILES_UPDATE] = { .audit_skip = 1, .iopoll = 1, .prep = io_files_update_prep, .issue = io_files_update, }, [IORING_OP_STATX] = { .audit_skip = 1, .prep = io_statx_prep, .issue = io_statx, }, [IORING_OP_READ] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollin = 1, .buffer_select = 1, .plug = 1, .audit_skip = 1, .ioprio = 1, .iopoll = 1, .async_size = sizeof(struct io_async_rw), .prep = io_prep_read, .issue = io_read, }, [IORING_OP_WRITE] = { .needs_file = 1, .hash_reg_file = 1, .unbound_nonreg_file = 1, .pollout = 1, .plug = 1, .audit_skip = 1, .ioprio = 1, .iopoll = 1, .async_size = sizeof(struct io_async_rw), .prep = io_prep_write, .issue = io_write, }, [IORING_OP_FADVISE] = { .needs_file = 1, .audit_skip = 1, .prep = io_fadvise_prep, .issue = io_fadvise, }, [IORING_OP_MADVISE] = { .audit_skip = 1, .prep = io_madvise_prep, .issue = io_madvise, }, [IORING_OP_SEND] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, .audit_skip = 1, .ioprio = 1, .buffer_select = 1, #if defined(CONFIG_NET) .async_size = sizeof(struct io_async_msghdr), .prep = io_sendmsg_prep, .issue = io_send, #else .prep = io_eopnotsupp_prep, #endif }, [IORING_OP_RECV] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollin = 1, .buffer_select = 1, .audit_skip = 1, .ioprio = 1, #if defined(CONFIG_NET) .async_size = sizeof(struct io_async_msghdr), .prep = io_recvmsg_prep, .issue = io_recv, #else .prep = io_eopnotsupp_prep, #endif }, [IORING_OP_OPENAT2] = { .filter_pdu_size = sizeof_field(struct io_uring_bpf_ctx, open), .prep = io_openat2_prep, .issue = io_openat2, .filter_populate = io_openat_bpf_populate, }, [IORING_OP_EPOLL_CTL] = { .unbound_nonreg_file = 1, .audit_skip = 1, #if defined(CONFIG_EPOLL) .prep = io_epoll_ctl_prep, .issue = io_epoll_ctl, #else .prep = io_eopnotsupp_prep, #endif }, [IORING_OP_SPLICE] = { .needs_file = 1, .hash_reg_file = 1, .unbound_nonreg_file = 1, .audit_skip = 1, .prep = io_splice_prep, .issue = io_splice, }, [IORING_OP_PROVIDE_BUFFERS] = { .audit_skip = 1, .iopoll = 1, .prep = io_provide_buffers_prep, .issue = io_manage_buffers_legacy, }, [IORING_OP_REMOVE_BUFFERS] = { .audit_skip = 1, .iopoll = 1, .prep = io_remove_buffers_prep, .issue = io_manage_buffers_legacy, }, [IORING_OP_TEE] = { .needs_file = 1, .hash_reg_file = 1, .unbound_nonreg_file = 1, .audit_skip = 1, .prep = io_tee_prep, .issue = io_tee, }, [IORING_OP_SHUTDOWN] = { .needs_file = 1, #if defined(CONFIG_NET) .prep = io_shutdown_prep, .issue = io_shutdown, #else .prep = io_eopnotsupp_prep, #endif }, [IORING_OP_RENAMEAT] = { .prep = io_renameat_prep, .issue = io_renameat, }, [IORING_OP_UNLINKAT] = { .prep = io_unlinkat_prep, .issue = io_unlinkat, }, [IORING_OP_MKDIRAT] = { .prep = io_mkdirat_prep, .issue = io_mkdirat, }, [IORING_OP_SYMLINKAT] = { .prep = io_symlinkat_prep, .issue = io_symlinkat, }, [IORING_OP_LINKAT] = { .prep = io_linkat_prep, .issue = io_linkat, }, [IORING_OP_MSG_RING] = { .needs_file = 1, .iopoll = 1, .prep = io_msg_ring_prep, .issue = io_msg_ring, }, [IORING_OP_FSETXATTR] = { .needs_file = 1, .prep = io_fsetxattr_prep, .issue = io_fsetxattr, }, [IORING_OP_SETXATTR] = { .prep = io_setxattr_prep, .issue = io_setxattr, }, [IORING_OP_FGETXATTR] = { .needs_file = 1, .prep = io_fgetxattr_prep, .issue = io_fgetxattr, }, [IORING_OP_GETXATTR] = { .prep = io_getxattr_prep, .issue = io_getxattr, }, [IORING_OP_SOCKET] = { .audit_skip = 1, #if defined(CONFIG_NET) .filter_pdu_size = sizeof_field(struct io_uring_bpf_ctx, socket), .prep = io_socket_prep, .issue = io_socket, .filter_populate = io_socket_bpf_populate, #else .prep = io_eopnotsupp_prep, #endif }, [IORING_OP_URING_CMD] = { .buffer_select = 1, .needs_file = 1, .plug = 1, .iopoll = 1, .async_size = sizeof(struct io_async_cmd), .prep = io_uring_cmd_prep, .issue = io_uring_cmd, }, [IORING_OP_SEND_ZC] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, .audit_skip = 1, .ioprio = 1, #if defined(CONFIG_NET) .async_size = sizeof(struct io_async_msghdr), .prep = io_send_zc_prep, .issue = io_sendmsg_zc, #else .prep = io_eopnotsupp_prep, #endif }, [IORING_OP_SENDMSG_ZC] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, .ioprio = 1, #if defined(CONFIG_NET) .async_size = sizeof(struct io_async_msghdr), .prep = io_send_zc_prep, .issue = io_sendmsg_zc, #else .prep = io_eopnotsupp_prep, #endif }, [IORING_OP_READ_MULTISHOT] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollin = 1, .buffer_select = 1, .audit_skip = 1, .async_size = sizeof(struct io_async_rw), .prep = io_read_mshot_prep, .issue = io_read_mshot, }, [IORING_OP_WAITID] = { .async_size = sizeof(struct io_waitid_async), .prep = io_waitid_prep, .issue = io_waitid, }, [IORING_OP_FUTEX_WAIT] = { #if defined(CONFIG_FUTEX) .prep = io_futex_prep, .issue = io_futex_wait, #else .prep = io_eopnotsupp_prep, #endif }, [IORING_OP_FUTEX_WAKE] = { #if defined(CONFIG_FUTEX) .prep = io_futex_prep, .issue = io_futex_wake, #else .prep = io_eopnotsupp_prep, #endif }, [IORING_OP_FUTEX_WAITV] = { #if defined(CONFIG_FUTEX) .prep = io_futexv_prep, .issue = io_futexv_wait, #else .prep = io_eopnotsupp_prep, #endif }, [IORING_OP_FIXED_FD_INSTALL] = { .needs_file = 1, .prep = io_install_fixed_fd_prep, .issue = io_install_fixed_fd, }, [IORING_OP_FTRUNCATE] = { .needs_file = 1, .hash_reg_file = 1, .prep = io_ftruncate_prep, .issue = io_ftruncate, }, [IORING_OP_BIND] = { #if defined(CONFIG_NET) .needs_file = 1, .prep = io_bind_prep, .issue = io_bind, .async_size = sizeof(struct io_async_msghdr), #else .prep = io_eopnotsupp_prep, #endif }, [IORING_OP_LISTEN] = { #if defined(CONFIG_NET) .needs_file = 1, .prep = io_listen_prep, .issue = io_listen, .async_size = sizeof(struct io_async_msghdr), #else .prep = io_eopnotsupp_prep, #endif }, [IORING_OP_RECV_ZC] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollin = 1, .ioprio = 1, #if defined(CONFIG_NET) .prep = io_recvzc_prep, .issue = io_recvzc, #else .prep = io_eopnotsupp_prep, #endif }, [IORING_OP_EPOLL_WAIT] = { .needs_file = 1, .audit_skip = 1, .pollin = 1, #if defined(CONFIG_EPOLL) .prep = io_epoll_wait_prep, .issue = io_epoll_wait, #else .prep = io_eopnotsupp_prep, #endif }, [IORING_OP_READV_FIXED] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollin = 1, .plug = 1, .audit_skip = 1, .ioprio = 1, .iopoll = 1, .vectored = 1, .async_size = sizeof(struct io_async_rw), .prep = io_prep_readv_fixed, .issue = io_read, }, [IORING_OP_WRITEV_FIXED] = { .needs_file = 1, .hash_reg_file = 1, .unbound_nonreg_file = 1, .pollout = 1, .plug = 1, .audit_skip = 1, .ioprio = 1, .iopoll = 1, .vectored = 1, .async_size = sizeof(struct io_async_rw), .prep = io_prep_writev_fixed, .issue = io_write, }, [IORING_OP_PIPE] = { .prep = io_pipe_prep, .issue = io_pipe, }, [IORING_OP_NOP128] = { .audit_skip = 1, .iopoll = 1, .is_128 = 1, .prep = io_nop_prep, .issue = io_nop, }, [IORING_OP_URING_CMD128] = { .buffer_select = 1, .needs_file = 1, .plug = 1, .iopoll = 1, .is_128 = 1, .async_size = sizeof(struct io_async_cmd), .prep = io_uring_cmd_prep, .issue = io_uring_cmd, }, }; const struct io_cold_def io_cold_defs[] = { [IORING_OP_NOP] = { .name = "NOP", }, [IORING_OP_READV] = { .name = "READV", .cleanup = io_readv_writev_cleanup, .fail = io_rw_fail, }, [IORING_OP_WRITEV] = { .name = "WRITEV", .cleanup = io_readv_writev_cleanup, .fail = io_rw_fail, }, [IORING_OP_FSYNC] = { .name = "FSYNC", }, [IORING_OP_READ_FIXED] = { .name = "READ_FIXED", .cleanup = io_readv_writev_cleanup, .fail = io_rw_fail, }, [IORING_OP_WRITE_FIXED] = { .name = "WRITE_FIXED", .cleanup = io_readv_writev_cleanup, .fail = io_rw_fail, }, [IORING_OP_POLL_ADD] = { .name = "POLL_ADD", }, [IORING_OP_POLL_REMOVE] = { .name = "POLL_REMOVE", }, [IORING_OP_SYNC_FILE_RANGE] = { .name = "SYNC_FILE_RANGE", }, [IORING_OP_SENDMSG] = { .name = "SENDMSG", #if defined(CONFIG_NET) .cleanup = io_sendmsg_recvmsg_cleanup, .fail = io_sendrecv_fail, #endif }, [IORING_OP_RECVMSG] = { .name = "RECVMSG", #if defined(CONFIG_NET) .cleanup = io_sendmsg_recvmsg_cleanup, .fail = io_sendrecv_fail, #endif }, [IORING_OP_TIMEOUT] = { .name = "TIMEOUT", }, [IORING_OP_TIMEOUT_REMOVE] = { .name = "TIMEOUT_REMOVE", }, [IORING_OP_ACCEPT] = { .name = "ACCEPT", }, [IORING_OP_ASYNC_CANCEL] = { .name = "ASYNC_CANCEL", }, [IORING_OP_LINK_TIMEOUT] = { .name = "LINK_TIMEOUT", }, [IORING_OP_CONNECT] = { .name = "CONNECT", }, [IORING_OP_FALLOCATE] = { .name = "FALLOCATE", }, [IORING_OP_OPENAT] = { .name = "OPENAT", .cleanup = io_open_cleanup, }, [IORING_OP_CLOSE] = { .name = "CLOSE", }, [IORING_OP_FILES_UPDATE] = { .name = "FILES_UPDATE", }, [IORING_OP_STATX] = { .name = "STATX", .cleanup = io_statx_cleanup, }, [IORING_OP_READ] = { .name = "READ", .cleanup = io_readv_writev_cleanup, .fail = io_rw_fail, }, [IORING_OP_WRITE] = { .name = "WRITE", .cleanup = io_readv_writev_cleanup, .fail = io_rw_fail, }, [IORING_OP_FADVISE] = { .name = "FADVISE", }, [IORING_OP_MADVISE] = { .name = "MADVISE", }, [IORING_OP_SEND] = { .name = "SEND", #if defined(CONFIG_NET) .cleanup = io_sendmsg_recvmsg_cleanup, .fail = io_sendrecv_fail, #endif }, [IORING_OP_RECV] = { .name = "RECV", #if defined(CONFIG_NET) .cleanup = io_sendmsg_recvmsg_cleanup, .fail = io_sendrecv_fail, #endif }, [IORING_OP_OPENAT2] = { .name = "OPENAT2", .cleanup = io_open_cleanup, }, [IORING_OP_EPOLL_CTL] = { .name = "EPOLL", }, [IORING_OP_SPLICE] = { .name = "SPLICE", .cleanup = io_splice_cleanup, }, [IORING_OP_PROVIDE_BUFFERS] = { .name = "PROVIDE_BUFFERS", }, [IORING_OP_REMOVE_BUFFERS] = { .name = "REMOVE_BUFFERS", }, [IORING_OP_TEE] = { .name = "TEE", .cleanup = io_splice_cleanup, }, [IORING_OP_SHUTDOWN] = { .name = "SHUTDOWN", }, [IORING_OP_RENAMEAT] = { .name = "RENAMEAT", .cleanup = io_renameat_cleanup, }, [IORING_OP_UNLINKAT] = { .name = "UNLINKAT", .cleanup = io_unlinkat_cleanup, }, [IORING_OP_MKDIRAT] = { .name = "MKDIRAT", .cleanup = io_mkdirat_cleanup, }, [IORING_OP_SYMLINKAT] = { .name = "SYMLINKAT", .cleanup = io_link_cleanup, }, [IORING_OP_LINKAT] = { .name = "LINKAT", .cleanup = io_link_cleanup, }, [IORING_OP_MSG_RING] = { .name = "MSG_RING", .cleanup = io_msg_ring_cleanup, }, [IORING_OP_FSETXATTR] = { .name = "FSETXATTR", .cleanup = io_xattr_cleanup, }, [IORING_OP_SETXATTR] = { .name = "SETXATTR", .cleanup = io_xattr_cleanup, }, [IORING_OP_FGETXATTR] = { .name = "FGETXATTR", .cleanup = io_xattr_cleanup, }, [IORING_OP_GETXATTR] = { .name = "GETXATTR", .cleanup = io_xattr_cleanup, }, [IORING_OP_SOCKET] = { .name = "SOCKET", }, [IORING_OP_URING_CMD] = { .name = "URING_CMD", .sqe_copy = io_uring_cmd_sqe_copy, .cleanup = io_uring_cmd_cleanup, }, [IORING_OP_SEND_ZC] = { .name = "SEND_ZC", #if defined(CONFIG_NET) .cleanup = io_send_zc_cleanup, .fail = io_sendrecv_fail, #endif }, [IORING_OP_SENDMSG_ZC] = { .name = "SENDMSG_ZC", #if defined(CONFIG_NET) .cleanup = io_send_zc_cleanup, .fail = io_sendrecv_fail, #endif }, [IORING_OP_READ_MULTISHOT] = { .name = "READ_MULTISHOT", .cleanup = io_readv_writev_cleanup, }, [IORING_OP_WAITID] = { .name = "WAITID", }, [IORING_OP_FUTEX_WAIT] = { .name = "FUTEX_WAIT", }, [IORING_OP_FUTEX_WAKE] = { .name = "FUTEX_WAKE", }, [IORING_OP_FUTEX_WAITV] = { .name = "FUTEX_WAITV", }, [IORING_OP_FIXED_FD_INSTALL] = { .name = "FIXED_FD_INSTALL", }, [IORING_OP_FTRUNCATE] = { .name = "FTRUNCATE", }, [IORING_OP_BIND] = { .name = "BIND", }, [IORING_OP_LISTEN] = { .name = "LISTEN", }, [IORING_OP_RECV_ZC] = { .name = "RECV_ZC", }, [IORING_OP_EPOLL_WAIT] = { .name = "EPOLL_WAIT", }, [IORING_OP_READV_FIXED] = { .name = "READV_FIXED", .cleanup = io_readv_writev_cleanup, .fail = io_rw_fail, }, [IORING_OP_WRITEV_FIXED] = { .name = "WRITEV_FIXED", .cleanup = io_readv_writev_cleanup, .fail = io_rw_fail, }, [IORING_OP_PIPE] = { .name = "PIPE", }, [IORING_OP_NOP128] = { .name = "NOP128", }, [IORING_OP_URING_CMD128] = { .name = "URING_CMD128", .sqe_copy = io_uring_cmd_sqe_copy, .cleanup = io_uring_cmd_cleanup, }, }; const char *io_uring_get_opcode(u8 opcode) { if (opcode < IORING_OP_LAST) return io_cold_defs[opcode].name; return "INVALID"; } bool io_uring_op_supported(u8 opcode) { if (opcode < IORING_OP_LAST && io_issue_defs[opcode].prep != io_eopnotsupp_prep) return true; return false; } void __init io_uring_optable_init(void) { int i; BUILD_BUG_ON(ARRAY_SIZE(io_cold_defs) != IORING_OP_LAST); BUILD_BUG_ON(ARRAY_SIZE(io_issue_defs) != IORING_OP_LAST); for (i = 0; i < ARRAY_SIZE(io_issue_defs); i++) { BUG_ON(!io_issue_defs[i].prep); if (io_issue_defs[i].prep != io_eopnotsupp_prep) BUG_ON(!io_issue_defs[i].issue); WARN_ON_ONCE(!io_cold_defs[i].name); } }
31 31 29 28 29 29 4 2 2 1 56 56 56 56 48 48 1 1 7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 // SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/proc/net.c * * Copyright (C) 2007 * * Author: Eric Biederman <ebiederm@xmission.com> * * proc net directory handling functions */ #include <linux/errno.h> #include <linux/time.h> #include <linux/proc_fs.h> #include <linux/stat.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/sched.h> #include <linux/sched/task.h> #include <linux/module.h> #include <linux/bitops.h> #include <linux/mount.h> #include <linux/nsproxy.h> #include <linux/uidgid.h> #include <net/net_namespace.h> #include <linux/seq_file.h> #include "internal.h" static inline struct net *PDE_NET(struct proc_dir_entry *pde) { return pde->parent->data; } static struct net *get_proc_net(const struct inode *inode) { return maybe_get_net(PDE_NET(PDE(inode))); } static int seq_open_net(struct inode *inode, struct file *file) { unsigned int state_size = PDE(inode)->state_size; struct seq_net_private *p; struct net *net; WARN_ON_ONCE(state_size < sizeof(*p)); if (file->f_mode & FMODE_WRITE && !PDE(inode)->write) return -EACCES; net = get_proc_net(inode); if (!net) return -ENXIO; p = __seq_open_private(file, PDE(inode)->seq_ops, state_size); if (!p) { put_net(net); return -ENOMEM; } #ifdef CONFIG_NET_NS p->net = net; netns_tracker_alloc(net, &p->ns_tracker, GFP_KERNEL); #endif return 0; } static void seq_file_net_put_net(struct seq_file *seq) { #ifdef CONFIG_NET_NS struct seq_net_private *priv = seq->private; put_net_track(priv->net, &priv->ns_tracker); #else put_net(&init_net); #endif } static int seq_release_net(struct inode *ino, struct file *f) { struct seq_file *seq = f->private_data; seq_file_net_put_net(seq); seq_release_private(ino, f); return 0; } static const struct proc_ops proc_net_seq_ops = { .proc_open = seq_open_net, .proc_read = seq_read, .proc_write = proc_simple_write, .proc_lseek = seq_lseek, .proc_release = seq_release_net, }; int bpf_iter_init_seq_net(void *priv_data, struct bpf_iter_aux_info *aux) { #ifdef CONFIG_NET_NS struct seq_net_private *p = priv_data; p->net = get_net_track(current->nsproxy->net_ns, &p->ns_tracker, GFP_KERNEL); #endif return 0; } void bpf_iter_fini_seq_net(void *priv_data) { #ifdef CONFIG_NET_NS struct seq_net_private *p = priv_data; put_net_track(p->net, &p->ns_tracker); #endif } struct proc_dir_entry *proc_create_net_data(const char *name, umode_t mode, struct proc_dir_entry *parent, const struct seq_operations *ops, unsigned int state_size, void *data) { struct proc_dir_entry *p; p = proc_create_reg(name, mode, &parent, data); if (!p) return NULL; pde_force_lookup(p); p->proc_ops = &proc_net_seq_ops; p->seq_ops = ops; p->state_size = state_size; return proc_register(parent, p); } EXPORT_SYMBOL_GPL(proc_create_net_data); /** * proc_create_net_data_write - Create a writable net_ns-specific proc file * @name: The name of the file. * @mode: The file's access mode. * @parent: The parent directory in which to create. * @ops: The seq_file ops with which to read the file. * @write: The write method with which to 'modify' the file. * @state_size: The size of the per-file private state to allocate. * @data: Data for retrieval by pde_data(). * * Create a network namespaced proc file in the @parent directory with the * specified @name and @mode that allows reading of a file that displays a * series of elements and also provides for the file accepting writes that have * some arbitrary effect. * * The functions in the @ops table are used to iterate over items to be * presented and extract the readable content using the seq_file interface. * * The @write function is called with the data copied into a kernel space * scratch buffer and has a NUL appended for convenience. The buffer may be * modified by the @write function. @write should return 0 on success. * * The @data value is accessible from the @show and @write functions by calling * pde_data() on the file inode. The network namespace must be accessed by * calling seq_file_net() on the seq_file struct. */ struct proc_dir_entry *proc_create_net_data_write(const char *name, umode_t mode, struct proc_dir_entry *parent, const struct seq_operations *ops, proc_write_t write, unsigned int state_size, void *data) { struct proc_dir_entry *p; p = proc_create_reg(name, mode, &parent, data); if (!p) return NULL; pde_force_lookup(p); p->proc_ops = &proc_net_seq_ops; p->seq_ops = ops; p->state_size = state_size; p->write = write; return proc_register(parent, p); } EXPORT_SYMBOL_GPL(proc_create_net_data_write); static int single_open_net(struct inode *inode, struct file *file) { struct proc_dir_entry *de = PDE(inode); struct net *net; int err; net = get_proc_net(inode); if (!net) return -ENXIO; err = single_open(file, de->single_show, net); if (err) put_net(net); return err; } static int single_release_net(struct inode *ino, struct file *f) { struct seq_file *seq = f->private_data; put_net(seq->private); return single_release(ino, f); } static const struct proc_ops proc_net_single_ops = { .proc_open = single_open_net, .proc_read = seq_read, .proc_write = proc_simple_write, .proc_lseek = seq_lseek, .proc_release = single_release_net, }; struct proc_dir_entry *proc_create_net_single(const char *name, umode_t mode, struct proc_dir_entry *parent, int (*show)(struct seq_file *, void *), void *data) { struct proc_dir_entry *p; p = proc_create_reg(name, mode, &parent, data); if (!p) return NULL; pde_force_lookup(p); p->proc_ops = &proc_net_single_ops; p->single_show = show; return proc_register(parent, p); } EXPORT_SYMBOL_GPL(proc_create_net_single); /** * proc_create_net_single_write - Create a writable net_ns-specific proc file * @name: The name of the file. * @mode: The file's access mode. * @parent: The parent directory in which to create. * @show: The seqfile show method with which to read the file. * @write: The write method with which to 'modify' the file. * @data: Data for retrieval by pde_data(). * * Create a network-namespaced proc file in the @parent directory with the * specified @name and @mode that allows reading of a file that displays a * single element rather than a series and also provides for the file accepting * writes that have some arbitrary effect. * * The @show function is called to extract the readable content via the * seq_file interface. * * The @write function is called with the data copied into a kernel space * scratch buffer and has a NUL appended for convenience. The buffer may be * modified by the @write function. @write should return 0 on success. * * The @data value is accessible from the @show and @write functions by calling * pde_data() on the file inode. The network namespace must be accessed by * calling seq_file_single_net() on the seq_file struct. */ struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mode, struct proc_dir_entry *parent, int (*show)(struct seq_file *, void *), proc_write_t write, void *data) { struct proc_dir_entry *p; p = proc_create_reg(name, mode, &parent, data); if (!p) return NULL; pde_force_lookup(p); p->proc_ops = &proc_net_single_ops; p->single_show = show; p->write = write; return proc_register(parent, p); } EXPORT_SYMBOL_GPL(proc_create_net_single_write); static struct net *get_proc_task_net(struct inode *dir) { struct task_struct *task; struct nsproxy *ns; struct net *net = NULL; rcu_read_lock(); task = pid_task(proc_pid(dir), PIDTYPE_PID); if (task != NULL) { task_lock(task); ns = task->nsproxy; if (ns != NULL) net = get_net(ns->net_ns); task_unlock(task); } rcu_read_unlock(); return net; } static struct dentry *proc_tgid_net_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct dentry *de; struct net *net; de = ERR_PTR(-ENOENT); net = get_proc_task_net(dir); if (net != NULL) { de = proc_lookup_de(dir, dentry, net->proc_net); put_net(net); } return de; } static int proc_tgid_net_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct net *net; net = get_proc_task_net(inode); generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); if (net != NULL) { stat->nlink = net->proc_net->nlink; put_net(net); } return 0; } const struct inode_operations proc_net_inode_operations = { .lookup = proc_tgid_net_lookup, .getattr = proc_tgid_net_getattr, .setattr = proc_nochmod_setattr, }; static int proc_tgid_net_readdir(struct file *file, struct dir_context *ctx) { int ret; struct net *net; ret = -EINVAL; net = get_proc_task_net(file_inode(file)); if (net != NULL) { ret = proc_readdir_de(file, ctx, net->proc_net); put_net(net); } return ret; } const struct file_operations proc_net_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, .iterate_shared = proc_tgid_net_readdir, }; static __net_init int proc_net_ns_init(struct net *net) { struct proc_dir_entry *netd, *net_statd; kuid_t uid; kgid_t gid; int err; /* * This PDE acts only as an anchor for /proc/${pid}/net hierarchy. * Corresponding inode (PDE(inode) == net->proc_net) is never * instantiated therefore blanket zeroing is fine. * net->proc_net_stat inode is instantiated normally. */ err = -ENOMEM; netd = kmem_cache_zalloc(proc_dir_entry_cache, GFP_KERNEL); if (!netd) goto out; netd->subdir = RB_ROOT; netd->data = net; netd->nlink = 2; netd->namelen = 3; netd->parent = &proc_root; netd->name = netd->inline_name; memcpy(netd->name, "net", 4); uid = make_kuid(net->user_ns, 0); if (!uid_valid(uid)) uid = netd->uid; gid = make_kgid(net->user_ns, 0); if (!gid_valid(gid)) gid = netd->gid; proc_set_user(netd, uid, gid); /* Seed dentry revalidation for /proc/${pid}/net */ pde_force_lookup(netd); err = -EEXIST; net_statd = proc_net_mkdir(net, "stat", netd); if (!net_statd) goto free_net; net->proc_net = netd; net->proc_net_stat = net_statd; return 0; free_net: pde_free(netd); out: return err; } static __net_exit void proc_net_ns_exit(struct net *net) { remove_proc_entry("stat", net->proc_net); pde_free(net->proc_net); } static struct pernet_operations __net_initdata proc_net_ns_ops = { .init = proc_net_ns_init, .exit = proc_net_ns_exit, }; int __init proc_net_init(void) { proc_symlink("net", NULL, "self/net"); return register_pernet_subsys(&proc_net_ns_ops); }
54 38 53 28 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 /* * linux/fs/nls/nls_koi8-u.c * * Charset koi8-u translation tables. * The Unicode to charset table has only exact mappings. */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/nls.h> #include <linux/errno.h> static const wchar_t charset2uni[256] = { /* 0x00*/ 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, /* 0x10*/ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, /* 0x20*/ 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, /* 0x30*/ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, /* 0x40*/ 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, /* 0x50*/ 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, /* 0x60*/ 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, /* 0x70*/ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f, /* 0x80*/ 0x2500, 0x2502, 0x250c, 0x2510, 0x2514, 0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590, /* 0x90*/ 0x2591, 0x2592, 0x2593, 0x2320, 0x25a0, 0x2219, 0x221a, 0x2248, 0x2264, 0x2265, 0x00a0, 0x2321, 0x00b0, 0x00b2, 0x00b7, 0x00f7, /* 0xa0*/ 0x2550, 0x2551, 0x2552, 0x0451, 0x0454, 0x2554, 0x0456, 0x0457, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x0491, 0x255d, 0x255e, /* 0xb0*/ 0x255f, 0x2560, 0x2561, 0x0401, 0x0404, 0x2563, 0x0406, 0x0407, 0x2566, 0x2567, 0x2568, 0x2569, 0x256a, 0x0490, 0x256c, 0x00a9, /* 0xc0*/ 0x044e, 0x0430, 0x0431, 0x0446, 0x0434, 0x0435, 0x0444, 0x0433, 0x0445, 0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e, /* 0xd0*/ 0x043f, 0x044f, 0x0440, 0x0441, 0x0442, 0x0443, 0x0436, 0x0432, 0x044c, 0x044b, 0x0437, 0x0448, 0x044d, 0x0449, 0x0447, 0x044a, /* 0xe0*/ 0x042e, 0x0410, 0x0411, 0x0426, 0x0414, 0x0415, 0x0424, 0x0413, 0x0425, 0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, /* 0xf0*/ 0x041f, 0x042f, 0x0420, 0x0421, 0x0422, 0x0423, 0x0416, 0x0412, 0x042c, 0x042b, 0x0417, 0x0428, 0x042d, 0x0429, 0x0427, 0x042a, }; static const unsigned char page00[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x9a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0xbf, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x9c, 0x00, 0x9d, 0x00, 0x00, 0x00, 0x00, 0x9e, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9f, /* 0xf0-0xf7 */ }; static const unsigned char page04[256] = { 0x00, 0xb3, 0x00, 0x00, 0xb4, 0x00, 0xb6, 0xb7, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0xe1, 0xe2, 0xf7, 0xe7, 0xe4, 0xe5, 0xf6, 0xfa, /* 0x10-0x17 */ 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, /* 0x18-0x1f */ 0xf2, 0xf3, 0xf4, 0xf5, 0xe6, 0xe8, 0xe3, 0xfe, /* 0x20-0x27 */ 0xfb, 0xfd, 0xff, 0xf9, 0xf8, 0xfc, 0xe0, 0xf1, /* 0x28-0x2f */ 0xc1, 0xc2, 0xd7, 0xc7, 0xc4, 0xc5, 0xd6, 0xda, /* 0x30-0x37 */ 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0, /* 0x38-0x3f */ 0xd2, 0xd3, 0xd4, 0xd5, 0xc6, 0xc8, 0xc3, 0xde, /* 0x40-0x47 */ 0xdb, 0xdd, 0xdf, 0xd9, 0xd8, 0xdc, 0xc0, 0xd1, /* 0x48-0x4f */ 0x00, 0xa3, 0x00, 0x00, 0xa4, 0x00, 0xa6, 0xa7, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0xbd, 0xad, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ }; static const unsigned char page22[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x95, 0x96, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x97, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x98, 0x99, 0x00, 0x00, /* 0x60-0x67 */ }; static const unsigned char page23[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x93, 0x9b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ }; static const unsigned char page25[256] = { 0x80, 0x00, 0x81, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x82, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x83, 0x00, 0x00, 0x00, 0x84, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x85, 0x00, 0x00, 0x00, 0x86, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x87, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x88, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x89, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x8a, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0xa0, 0xa1, 0xa2, 0x00, 0xa5, 0x00, 0x00, 0xa8, /* 0x50-0x57 */ 0xa9, 0xaa, 0xab, 0xac, 0x00, 0xae, 0xaf, 0xb0, /* 0x58-0x5f */ 0xb1, 0xb2, 0x00, 0xb5, 0x00, 0x00, 0xb8, 0xb9, /* 0x60-0x67 */ 0xba, 0xbb, 0xbc, 0x00, 0xbe, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x8b, 0x00, 0x00, 0x00, 0x8c, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x8d, 0x00, 0x00, 0x00, 0x8e, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x8f, 0x90, 0x91, 0x92, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x94, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ }; static const unsigned char *const page_uni2charset[256] = { page00, NULL, NULL, NULL, page04, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, page22, page23, NULL, page25, NULL, NULL, }; static const unsigned char charset2lower[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xa3, 0xa4, 0xb5, 0xa6, 0xa7, /* 0xb0-0xb7 */ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xad, 0xbe, 0xbf, /* 0xb8-0xbf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xf0-0xf7 */ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xf8-0xff */ }; static const unsigned char charset2upper[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ 0xa0, 0xa1, 0xa2, 0xb3, 0xb4, 0xa5, 0xb6, 0xb7, /* 0xa0-0xa7 */ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xbd, 0xae, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xd0-0xd7 */ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xd8-0xdf */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ }; static int uni2char(wchar_t uni, unsigned char *out, int boundlen) { const unsigned char *uni2charset; unsigned char cl = uni & 0x00ff; unsigned char ch = (uni & 0xff00) >> 8; if (boundlen <= 0) return -ENAMETOOLONG; uni2charset = page_uni2charset[ch]; if (uni2charset && uni2charset[cl]) out[0] = uni2charset[cl]; else return -EINVAL; return 1; } static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) { *uni = charset2uni[*rawstring]; if (*uni == 0x0000) return -EINVAL; return 1; } static struct nls_table table = { .charset = "koi8-u", .uni2char = uni2char, .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, }; static int __init init_nls_koi8_u(void) { return register_nls(&table); } static void __exit exit_nls_koi8_u(void) { unregister_nls(&table); } module_init(init_nls_koi8_u) module_exit(exit_nls_koi8_u) MODULE_DESCRIPTION("NLS KOI8-U (Ukrainian)"); MODULE_LICENSE("Dual BSD/GPL");
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 /* SPDX-License-Identifier: GPL-1.0+ */ /* * Bond several ethernet interfaces into a Cisco, running 'Etherchannel'. * * Portions are (c) Copyright 1995 Simon "Guru Aleph-Null" Janes * NCM: Network and Communications Management, Inc. * * BUT, I'm the one who modified it for ethernet, so: * (c) Copyright 1999, Thomas Davis, tadavis@lbl.gov * */ #ifndef _NET_BONDING_H #define _NET_BONDING_H #include <linux/timer.h> #include <linux/proc_fs.h> #include <linux/if_bonding.h> #include <linux/cpumask.h> #include <linux/in6.h> #include <linux/netpoll.h> #include <linux/inetdevice.h> #include <linux/etherdevice.h> #include <linux/reciprocal_div.h> #include <linux/if_link.h> #include <net/bond_3ad.h> #include <net/bond_alb.h> #include <net/bond_options.h> #include <net/ipv6.h> #include <net/addrconf.h> #define BOND_MAX_ARP_TARGETS 16 #define BOND_MAX_NS_TARGETS BOND_MAX_ARP_TARGETS #define BOND_DEFAULT_MIIMON 100 #ifndef __long_aligned #define __long_aligned __attribute__((aligned((sizeof(long))))) #endif #define slave_info(bond_dev, slave_dev, fmt, ...) \ netdev_info(bond_dev, "(slave %s): " fmt, (slave_dev)->name, ##__VA_ARGS__) #define slave_warn(bond_dev, slave_dev, fmt, ...) \ netdev_warn(bond_dev, "(slave %s): " fmt, (slave_dev)->name, ##__VA_ARGS__) #define slave_dbg(bond_dev, slave_dev, fmt, ...) \ netdev_dbg(bond_dev, "(slave %s): " fmt, (slave_dev)->name, ##__VA_ARGS__) #define slave_err(bond_dev, slave_dev, fmt, ...) \ netdev_err(bond_dev, "(slave %s): " fmt, (slave_dev)->name, ##__VA_ARGS__) #define BOND_MODE(bond) ((bond)->params.mode) /* slave list primitives */ #define bond_slave_list(bond) (&(bond)->dev->adj_list.lower) #define bond_has_slaves(bond) !list_empty(bond_slave_list(bond)) /* IMPORTANT: bond_first/last_slave can return NULL in case of an empty list */ #define bond_first_slave(bond) \ (bond_has_slaves(bond) ? \ netdev_adjacent_get_private(bond_slave_list(bond)->next) : \ NULL) #define bond_last_slave(bond) \ (bond_has_slaves(bond) ? \ netdev_adjacent_get_private(bond_slave_list(bond)->prev) : \ NULL) /* Caller must have rcu_read_lock */ #define bond_first_slave_rcu(bond) \ netdev_lower_get_first_private_rcu(bond->dev) /** * bond_for_each_slave - iterate over all slaves * @bond: the bond holding this list * @pos: current slave * @iter: list_head * iterator * * Caller must hold RTNL */ #define bond_for_each_slave(bond, pos, iter) \ netdev_for_each_lower_private((bond)->dev, pos, iter) /* Caller must have rcu_read_lock */ #define bond_for_each_slave_rcu(bond, pos, iter) \ netdev_for_each_lower_private_rcu((bond)->dev, pos, iter) #define BOND_XFRM_FEATURES (NETIF_F_HW_ESP | NETIF_F_HW_ESP_TX_CSUM | \ NETIF_F_GSO_ESP) #ifdef CONFIG_NET_POLL_CONTROLLER DECLARE_STATIC_KEY_FALSE(netpoll_block_tx); static inline void block_netpoll_tx(void) { static_branch_inc(&netpoll_block_tx); } static inline void unblock_netpoll_tx(void) { static_branch_dec(&netpoll_block_tx); } static inline int is_netpoll_tx_blocked(struct net_device *dev) { if (static_branch_unlikely(&netpoll_block_tx)) return netpoll_tx_running(dev); return 0; } #else #define block_netpoll_tx() #define unblock_netpoll_tx() #define is_netpoll_tx_blocked(dev) (0) #endif DECLARE_STATIC_KEY_FALSE(bond_bcast_neigh_enabled); struct bond_params { int mode; int xmit_policy; int miimon; u8 num_peer_notif; u8 missed_max; int arp_interval; int arp_validate; int arp_all_targets; int fail_over_mac; int updelay; int downdelay; int peer_notif_delay; int lacp_active; int lacp_fast; unsigned int min_links; int ad_select; char primary[IFNAMSIZ]; int primary_reselect; __be32 arp_targets[BOND_MAX_ARP_TARGETS]; int tx_queues; int all_slaves_active; int resend_igmp; int lp_interval; int packets_per_slave; int tlb_dynamic_lb; struct reciprocal_value reciprocal_packets_per_slave; u16 ad_actor_sys_prio; u16 ad_user_port_key; #if IS_ENABLED(CONFIG_IPV6) struct in6_addr ns_targets[BOND_MAX_NS_TARGETS]; #endif int coupled_control; int broadcast_neighbor; /* 2 bytes of padding : see ether_addr_equal_64bits() */ u8 ad_actor_system[ETH_ALEN + 2]; }; struct slave { struct net_device *dev; /* first - useful for panic debug */ struct bonding *bond; /* our master */ int delay; /* all 4 in jiffies */ unsigned long last_link_up; unsigned long last_tx; unsigned long last_rx; unsigned long target_last_arp_rx[BOND_MAX_ARP_TARGETS]; s8 link; /* one of BOND_LINK_XXXX */ s8 link_new_state; /* one of BOND_LINK_XXXX */ u8 backup:1, /* indicates backup slave. Value corresponds with BOND_STATE_ACTIVE and BOND_STATE_BACKUP */ inactive:1, /* indicates inactive slave */ rx_disabled:1, /* indicates whether slave's Rx is disabled */ should_notify:1, /* indicates whether the state changed */ should_notify_link:1; /* indicates whether the link changed */ u8 duplex; u32 original_mtu; u32 link_failure_count; u32 speed; u16 queue_id; u8 perm_hwaddr[MAX_ADDR_LEN]; int prio; struct ad_slave_info *ad_info; struct tlb_slave_info tlb_info; #ifdef CONFIG_NET_POLL_CONTROLLER struct netpoll *np; #endif struct delayed_work notify_work; struct kobject kobj; struct rtnl_link_stats64 slave_stats; }; static inline struct slave *to_slave(struct kobject *kobj) { return container_of(kobj, struct slave, kobj); } struct bond_up_slave { unsigned int count; struct rcu_head rcu; struct slave *arr[]; }; /* * Link pseudo-state only used internally by monitors */ #define BOND_LINK_NOCHANGE -1 struct bond_ipsec { struct list_head list; struct xfrm_state *xs; }; /* * Here are the locking policies for the two bonding locks: * Get rcu_read_lock when reading or RTNL when writing slave list. */ struct bonding { struct net_device *dev; /* first - useful for panic debug */ struct slave __rcu *curr_active_slave; struct slave __rcu *current_arp_slave; struct slave __rcu *primary_slave; struct bond_up_slave __rcu *usable_slaves; struct bond_up_slave __rcu *all_slaves; bool force_primary; bool notifier_ctx; s32 slave_cnt; /* never change this value outside the attach/detach wrappers */ int (*recv_probe)(const struct sk_buff *, struct bonding *, struct slave *); /* mode_lock is used for mode-specific locking needs, currently used by: * 3ad mode (4) - protect against running bond_3ad_unbind_slave() and * bond_3ad_state_machine_handler() concurrently and also * the access to the state machine shared variables. * TLB mode (5) - to sync the use and modifications of its hash table * ALB mode (6) - to sync the use and modifications of its hash table */ spinlock_t mode_lock; spinlock_t stats_lock; u32 send_peer_notif; u8 igmp_retrans; #ifdef CONFIG_PROC_FS struct proc_dir_entry *proc_entry; char proc_file_name[IFNAMSIZ]; #endif /* CONFIG_PROC_FS */ struct list_head bond_list; u32 __percpu *rr_tx_counter; struct ad_bond_info ad_info; struct alb_bond_info alb_info; struct bond_params params; struct workqueue_struct *wq; struct delayed_work mii_work; struct delayed_work arp_work; struct delayed_work alb_work; struct delayed_work ad_work; struct delayed_work mcast_work; struct delayed_work slave_arr_work; struct delayed_work peer_notify_work; #ifdef CONFIG_DEBUG_FS /* debugging support via debugfs */ struct dentry *debug_dir; #endif /* CONFIG_DEBUG_FS */ struct rtnl_link_stats64 bond_stats; #ifdef CONFIG_XFRM_OFFLOAD struct list_head ipsec_list; /* protecting ipsec_list */ struct mutex ipsec_lock; #endif /* CONFIG_XFRM_OFFLOAD */ struct bpf_prog *xdp_prog; }; #define bond_slave_get_rcu(dev) \ ((struct slave *) rcu_dereference(dev->rx_handler_data)) #define bond_slave_get_rtnl(dev) \ ((struct slave *) rtnl_dereference(dev->rx_handler_data)) void bond_queue_slave_event(struct slave *slave); void bond_lower_state_changed(struct slave *slave); struct bond_vlan_tag { __be16 vlan_proto; unsigned short vlan_id; }; /* * Returns NULL if the net_device does not belong to any of the bond's slaves * * Caller must hold bond lock for read */ static inline struct slave *bond_get_slave_by_dev(struct bonding *bond, struct net_device *slave_dev) { return netdev_lower_dev_get_private(bond->dev, slave_dev); } static inline struct bonding *bond_get_bond_by_slave(struct slave *slave) { return slave->bond; } static inline bool bond_should_override_tx_queue(struct bonding *bond) { return BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP || BOND_MODE(bond) == BOND_MODE_ROUNDROBIN; } static inline bool bond_is_lb(const struct bonding *bond) { return BOND_MODE(bond) == BOND_MODE_TLB || BOND_MODE(bond) == BOND_MODE_ALB; } static inline bool bond_needs_speed_duplex(const struct bonding *bond) { return BOND_MODE(bond) == BOND_MODE_8023AD || bond_is_lb(bond); } static inline bool bond_is_nondyn_tlb(const struct bonding *bond) { return (bond_is_lb(bond) && bond->params.tlb_dynamic_lb == 0); } static inline bool bond_mode_can_use_xmit_hash(const struct bonding *bond) { return (BOND_MODE(bond) == BOND_MODE_8023AD || BOND_MODE(bond) == BOND_MODE_XOR || BOND_MODE(bond) == BOND_MODE_TLB || BOND_MODE(bond) == BOND_MODE_ALB); } static inline bool bond_mode_uses_xmit_hash(const struct bonding *bond) { return (BOND_MODE(bond) == BOND_MODE_8023AD || BOND_MODE(bond) == BOND_MODE_XOR || bond_is_nondyn_tlb(bond)); } static inline bool bond_mode_uses_arp(int mode) { return mode != BOND_MODE_8023AD && mode != BOND_MODE_TLB && mode != BOND_MODE_ALB; } static inline bool bond_mode_uses_primary(int mode) { return mode == BOND_MODE_ACTIVEBACKUP || mode == BOND_MODE_TLB || mode == BOND_MODE_ALB; } static inline bool bond_uses_primary(struct bonding *bond) { return bond_mode_uses_primary(BOND_MODE(bond)); } static inline struct net_device *bond_option_active_slave_get_rcu(struct bonding *bond) { struct slave *slave = rcu_dereference_rtnl(bond->curr_active_slave); return bond_uses_primary(bond) && slave ? slave->dev : NULL; } static inline bool bond_slave_is_up(struct slave *slave) { return netif_running(slave->dev) && netif_carrier_ok(slave->dev); } static inline void bond_set_active_slave(struct slave *slave) { if (slave->backup) { slave->backup = 0; bond_queue_slave_event(slave); bond_lower_state_changed(slave); } } static inline void bond_set_backup_slave(struct slave *slave) { if (!slave->backup) { slave->backup = 1; bond_queue_slave_event(slave); bond_lower_state_changed(slave); } } static inline void bond_set_slave_state(struct slave *slave, int slave_state, bool notify) { if (slave->backup == slave_state) return; slave->backup = slave_state; if (notify) { bond_lower_state_changed(slave); bond_queue_slave_event(slave); slave->should_notify = 0; } else { if (slave->should_notify) slave->should_notify = 0; else slave->should_notify = 1; } } static inline void bond_slave_state_change(struct bonding *bond) { struct list_head *iter; struct slave *tmp; bond_for_each_slave(bond, tmp, iter) { if (tmp->link == BOND_LINK_UP) bond_set_active_slave(tmp); else if (tmp->link == BOND_LINK_DOWN) bond_set_backup_slave(tmp); } } static inline void bond_slave_state_notify(struct bonding *bond) { struct list_head *iter; struct slave *tmp; bond_for_each_slave(bond, tmp, iter) { if (tmp->should_notify) { bond_lower_state_changed(tmp); tmp->should_notify = 0; } } } static inline int bond_slave_state(struct slave *slave) { return slave->backup; } static inline bool bond_is_active_slave(struct slave *slave) { return !bond_slave_state(slave); } static inline bool bond_slave_can_tx(struct slave *slave) { return bond_slave_is_up(slave) && slave->link == BOND_LINK_UP && bond_is_active_slave(slave); } static inline bool bond_is_active_slave_dev(const struct net_device *slave_dev) { struct slave *slave; bool active; rcu_read_lock(); slave = bond_slave_get_rcu(slave_dev); active = bond_is_active_slave(slave); rcu_read_unlock(); return active; } static inline void bond_hw_addr_copy(u8 *dst, const u8 *src, unsigned int len) { if (len == ETH_ALEN) { ether_addr_copy(dst, src); return; } memcpy(dst, src, len); } #define BOND_PRI_RESELECT_ALWAYS 0 #define BOND_PRI_RESELECT_BETTER 1 #define BOND_PRI_RESELECT_FAILURE 2 #define BOND_FOM_NONE 0 #define BOND_FOM_ACTIVE 1 #define BOND_FOM_FOLLOW 2 #define BOND_ARP_TARGETS_ANY 0 #define BOND_ARP_TARGETS_ALL 1 #define BOND_ARP_VALIDATE_NONE 0 #define BOND_ARP_VALIDATE_ACTIVE (1 << BOND_STATE_ACTIVE) #define BOND_ARP_VALIDATE_BACKUP (1 << BOND_STATE_BACKUP) #define BOND_ARP_VALIDATE_ALL (BOND_ARP_VALIDATE_ACTIVE | \ BOND_ARP_VALIDATE_BACKUP) #define BOND_ARP_FILTER (BOND_ARP_VALIDATE_ALL + 1) #define BOND_ARP_FILTER_ACTIVE (BOND_ARP_VALIDATE_ACTIVE | \ BOND_ARP_FILTER) #define BOND_ARP_FILTER_BACKUP (BOND_ARP_VALIDATE_BACKUP | \ BOND_ARP_FILTER) #define BOND_SLAVE_NOTIFY_NOW true #define BOND_SLAVE_NOTIFY_LATER false static inline int slave_do_arp_validate(struct bonding *bond, struct slave *slave) { return bond->params.arp_validate & (1 << bond_slave_state(slave)); } static inline int slave_do_arp_validate_only(struct bonding *bond) { return bond->params.arp_validate & BOND_ARP_FILTER; } static inline int bond_is_ip_target_ok(__be32 addr) { return !ipv4_is_lbcast(addr) && !ipv4_is_zeronet(addr); } #if IS_ENABLED(CONFIG_IPV6) static inline int bond_is_ip6_target_ok(struct in6_addr *addr) { return !ipv6_addr_any(addr) && !ipv6_addr_loopback(addr) && !ipv6_addr_is_multicast(addr); } #endif /* Get the oldest arp which we've received on this slave for bond's * arp_targets. */ static inline unsigned long slave_oldest_target_arp_rx(struct bonding *bond, struct slave *slave) { unsigned long tmp, ret = READ_ONCE(slave->target_last_arp_rx[0]); int i = 1; for (; (i < BOND_MAX_ARP_TARGETS) && bond->params.arp_targets[i]; i++) { tmp = READ_ONCE(slave->target_last_arp_rx[i]); if (time_before(tmp, ret)) ret = tmp; } return ret; } static inline unsigned long slave_last_rx(struct bonding *bond, struct slave *slave) { if (bond->params.arp_all_targets == BOND_ARP_TARGETS_ALL) return slave_oldest_target_arp_rx(bond, slave); return READ_ONCE(slave->last_rx); } static inline void slave_update_last_tx(struct slave *slave) { WRITE_ONCE(slave->last_tx, jiffies); } static inline unsigned long slave_last_tx(struct slave *slave) { return READ_ONCE(slave->last_tx); } #ifdef CONFIG_NET_POLL_CONTROLLER static inline netdev_tx_t bond_netpoll_send_skb(const struct slave *slave, struct sk_buff *skb) { return netpoll_send_skb(slave->np, skb); } #else static inline netdev_tx_t bond_netpoll_send_skb(const struct slave *slave, struct sk_buff *skb) { BUG(); return NETDEV_TX_OK; } #endif static inline void bond_set_slave_inactive_flags(struct slave *slave, bool notify) { if (!bond_is_lb(slave->bond)) bond_set_slave_state(slave, BOND_STATE_BACKUP, notify); if (!slave->bond->params.all_slaves_active) slave->inactive = 1; if (BOND_MODE(slave->bond) == BOND_MODE_8023AD) slave->rx_disabled = 1; } static inline void bond_set_slave_tx_disabled_flags(struct slave *slave, bool notify) { bond_set_slave_state(slave, BOND_STATE_BACKUP, notify); } static inline void bond_set_slave_active_flags(struct slave *slave, bool notify) { bond_set_slave_state(slave, BOND_STATE_ACTIVE, notify); slave->inactive = 0; if (BOND_MODE(slave->bond) == BOND_MODE_8023AD) slave->rx_disabled = 0; } static inline void bond_set_slave_rx_enabled_flags(struct slave *slave, bool notify) { slave->rx_disabled = 0; } static inline bool bond_is_slave_inactive(struct slave *slave) { return slave->inactive; } static inline bool bond_is_slave_rx_disabled(struct slave *slave) { return slave->rx_disabled; } static inline void bond_propose_link_state(struct slave *slave, int state) { slave->link_new_state = state; } static inline void bond_commit_link_state(struct slave *slave, bool notify) { if (slave->link_new_state == BOND_LINK_NOCHANGE) return; slave->link = slave->link_new_state; if (notify) { bond_queue_slave_event(slave); bond_lower_state_changed(slave); slave->should_notify_link = 0; } else { if (slave->should_notify_link) slave->should_notify_link = 0; else slave->should_notify_link = 1; } } static inline void bond_set_slave_link_state(struct slave *slave, int state, bool notify) { bond_propose_link_state(slave, state); bond_commit_link_state(slave, notify); } static inline void bond_slave_link_notify(struct bonding *bond) { struct list_head *iter; struct slave *tmp; bond_for_each_slave(bond, tmp, iter) { if (tmp->should_notify_link) { bond_queue_slave_event(tmp); bond_lower_state_changed(tmp); tmp->should_notify_link = 0; } } } static inline __be32 bond_confirm_addr(struct net_device *dev, __be32 dst, __be32 local) { struct in_device *in_dev; __be32 addr = 0; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); if (in_dev) addr = inet_confirm_addr(dev_net(dev), in_dev, dst, local, RT_SCOPE_HOST); rcu_read_unlock(); return addr; } struct bond_net { struct net *net; /* Associated network namespace */ struct list_head dev_list; #ifdef CONFIG_PROC_FS struct proc_dir_entry *proc_dir; #endif struct class_attribute class_attr_bonding_masters; }; int bond_rcv_validate(const struct sk_buff *skb, struct bonding *bond, struct slave *slave); netdev_tx_t bond_dev_queue_xmit(struct bonding *bond, struct sk_buff *skb, struct net_device *slave_dev); int bond_create(struct net *net, const char *name); int bond_create_sysfs(struct bond_net *net); void bond_destroy_sysfs(struct bond_net *net); void bond_prepare_sysfs_group(struct bonding *bond); int bond_sysfs_slave_add(struct slave *slave); void bond_sysfs_slave_del(struct slave *slave); void bond_xdp_set_features(struct net_device *bond_dev); int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, struct netlink_ext_ack *extack); int bond_release(struct net_device *bond_dev, struct net_device *slave_dev); u32 bond_xmit_hash(struct bonding *bond, struct sk_buff *skb); int bond_set_carrier(struct bonding *bond); void bond_select_active_slave(struct bonding *bond); void bond_change_active_slave(struct bonding *bond, struct slave *new_active); void bond_create_debugfs(void); void bond_destroy_debugfs(void); void bond_debug_register(struct bonding *bond); void bond_debug_unregister(struct bonding *bond); void bond_debug_reregister(struct bonding *bond); const char *bond_mode_name(int mode); bool __bond_xdp_check(int mode, int xmit_policy); bool bond_xdp_check(struct bonding *bond, int mode); void bond_setup(struct net_device *bond_dev); unsigned int bond_get_num_tx_queues(void); int bond_netlink_init(void); void bond_netlink_fini(void); struct net_device *bond_option_active_slave_get_rcu(struct bonding *bond); const char *bond_slave_link_status(s8 link); struct bond_vlan_tag *bond_verify_device_path(struct net_device *start_dev, struct net_device *end_dev, int level); int bond_update_slave_arr(struct bonding *bond, struct slave *skipslave); void bond_slave_arr_work_rearm(struct bonding *bond, unsigned long delay); void bond_peer_notify_work_rearm(struct bonding *bond, unsigned long delay); void bond_work_init_all(struct bonding *bond); void bond_work_cancel_all(struct bonding *bond); #ifdef CONFIG_PROC_FS void bond_create_proc_entry(struct bonding *bond); void bond_remove_proc_entry(struct bonding *bond); void bond_create_proc_dir(struct bond_net *bn); void bond_destroy_proc_dir(struct bond_net *bn); #else static inline void bond_create_proc_entry(struct bonding *bond) { } static inline void bond_remove_proc_entry(struct bonding *bond) { } static inline void bond_create_proc_dir(struct bond_net *bn) { } static inline void bond_destroy_proc_dir(struct bond_net *bn) { } #endif static inline struct slave *bond_slave_has_mac(struct bonding *bond, const u8 *mac) { struct list_head *iter; struct slave *tmp; bond_for_each_slave(bond, tmp, iter) if (ether_addr_equal_64bits(mac, tmp->dev->dev_addr)) return tmp; return NULL; } /* Caller must hold rcu_read_lock() for read */ static inline bool bond_slave_has_mac_rcu(struct bonding *bond, const u8 *mac) { struct list_head *iter; struct slave *tmp; bond_for_each_slave_rcu(bond, tmp, iter) if (ether_addr_equal_64bits(mac, tmp->dev->dev_addr)) return true; return false; } /* Check if the ip is present in arp ip list, or first free slot if ip == 0 * Returns -1 if not found, index if found */ static inline int bond_get_targets_ip(__be32 *targets, __be32 ip) { int i; for (i = 0; i < BOND_MAX_ARP_TARGETS; i++) if (targets[i] == ip) return i; else if (targets[i] == 0) break; return -1; } #if IS_ENABLED(CONFIG_IPV6) static inline int bond_get_targets_ip6(struct in6_addr *targets, struct in6_addr *ip) { struct in6_addr mcaddr; int i; for (i = 0; i < BOND_MAX_NS_TARGETS; i++) { addrconf_addr_solict_mult(&targets[i], &mcaddr); if ((ipv6_addr_equal(&targets[i], ip)) || (ipv6_addr_equal(&mcaddr, ip))) return i; else if (ipv6_addr_any(&targets[i])) break; } return -1; } #endif /* exported from bond_main.c */ extern unsigned int bond_net_id; /* exported from bond_netlink.c */ extern struct rtnl_link_ops bond_link_ops; /* exported from bond_sysfs_slave.c */ extern const struct sysfs_ops slave_sysfs_ops; /* exported from bond_3ad.c */ extern const u8 lacpdu_mcast_addr[]; static inline netdev_tx_t bond_tx_drop(struct net_device *dev, struct sk_buff *skb) { dev_core_stats_tx_dropped_inc(dev); dev_kfree_skb_any(skb); return NET_XMIT_DROP; } #endif /* _NET_BONDING_H */
1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 2 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 // SPDX-License-Identifier: GPL-2.0 #include "backref.h" #include "btrfs_inode.h" #include "fiemap.h" #include "file.h" #include "file-item.h" struct btrfs_fiemap_entry { u64 offset; u64 phys; u64 len; u32 flags; }; /* * Indicate the caller of emit_fiemap_extent() that it needs to unlock the file * range from the inode's io tree, unlock the subvolume tree search path, flush * the fiemap cache and relock the file range and research the subvolume tree. * The value here is something negative that can't be confused with a valid * errno value and different from 1 because that's also a return value from * fiemap_fill_next_extent() and also it's often used to mean some btree search * did not find a key, so make it some distinct negative value. */ #define BTRFS_FIEMAP_FLUSH_CACHE (-(MAX_ERRNO + 1)) /* * Used to: * * - Cache the next entry to be emitted to the fiemap buffer, so that we can * merge extents that are contiguous and can be grouped as a single one; * * - Store extents ready to be written to the fiemap buffer in an intermediary * buffer. This intermediary buffer is to ensure that in case the fiemap * buffer is memory mapped to the fiemap target file, we don't deadlock * during btrfs_page_mkwrite(). This is because during fiemap we are locking * an extent range in order to prevent races with delalloc flushing and * ordered extent completion, which is needed in order to reliably detect * delalloc in holes and prealloc extents. And this can lead to a deadlock * if the fiemap buffer is memory mapped to the file we are running fiemap * against (a silly, useless in practice scenario, but possible) because * btrfs_page_mkwrite() will try to lock the same extent range. */ struct fiemap_cache { /* An array of ready fiemap entries. */ struct btrfs_fiemap_entry *entries; /* Number of entries in the entries array. */ int entries_size; /* Index of the next entry in the entries array to write to. */ int entries_pos; /* * Once the entries array is full, this indicates what's the offset for * the next file extent item we must search for in the inode's subvolume * tree after unlocking the extent range in the inode's io tree and * releasing the search path. */ u64 next_search_offset; /* * This matches struct fiemap_extent_info::fi_mapped_extents, we use it * to count ourselves emitted extents and stop instead of relying on * fiemap_fill_next_extent() because we buffer ready fiemap entries at * the @entries array, and we want to stop as soon as we hit the max * amount of extents to map, not just to save time but also to make the * logic at extent_fiemap() simpler. */ unsigned int extents_mapped; /* Fields for the cached extent (unsubmitted, not ready, extent). */ u64 offset; u64 phys; u64 len; u32 flags; bool cached; }; static int flush_fiemap_cache(struct fiemap_extent_info *fieinfo, struct fiemap_cache *cache) { for (int i = 0; i < cache->entries_pos; i++) { struct btrfs_fiemap_entry *entry = &cache->entries[i]; int ret; ret = fiemap_fill_next_extent(fieinfo, entry->offset, entry->phys, entry->len, entry->flags); /* * Ignore 1 (reached max entries) because we keep track of that * ourselves in emit_fiemap_extent(). */ if (ret < 0) return ret; } cache->entries_pos = 0; return 0; } /* * Helper to submit fiemap extent. * * Will try to merge current fiemap extent specified by @offset, @phys, * @len and @flags with cached one. * And only when we fails to merge, cached one will be submitted as * fiemap extent. * * Return value is the same as fiemap_fill_next_extent(). */ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, struct fiemap_cache *cache, u64 offset, u64 phys, u64 len, u32 flags) { struct btrfs_fiemap_entry *entry; u64 cache_end; /* Set at the end of extent_fiemap(). */ ASSERT((flags & FIEMAP_EXTENT_LAST) == 0); if (!cache->cached) goto assign; /* * When iterating the extents of the inode, at extent_fiemap(), we may * find an extent that starts at an offset behind the end offset of the * previous extent we processed. This happens if fiemap is called * without FIEMAP_FLAG_SYNC and there are ordered extents completing * after we had to unlock the file range, release the search path, emit * the fiemap extents stored in the buffer (cache->entries array) and * the lock the remainder of the range and re-search the btree. * * For example we are in leaf X processing its last item, which is the * file extent item for file range [512K, 1M[, and after * btrfs_next_leaf() releases the path, there's an ordered extent that * completes for the file range [768K, 2M[, and that results in trimming * the file extent item so that it now corresponds to the file range * [512K, 768K[ and a new file extent item is inserted for the file * range [768K, 2M[, which may end up as the last item of leaf X or as * the first item of the next leaf - in either case btrfs_next_leaf() * will leave us with a path pointing to the new extent item, for the * file range [768K, 2M[, since that's the first key that follows the * last one we processed. So in order not to report overlapping extents * to user space, we trim the length of the previously cached extent and * emit it. * * Upon calling btrfs_next_leaf() we may also find an extent with an * offset smaller than or equals to cache->offset, and this happens * when we had a hole or prealloc extent with several delalloc ranges in * it, but after btrfs_next_leaf() released the path, delalloc was * flushed and the resulting ordered extents were completed, so we can * now have found a file extent item for an offset that is smaller than * or equals to what we have in cache->offset. We deal with this as * described below. */ cache_end = cache->offset + cache->len; if (cache_end > offset) { if (offset == cache->offset) { /* * We cached a delalloc range (found in the io tree) for * a hole or prealloc extent and we have now found a * file extent item for the same offset. What we have * now is more recent and up to date, so discard what * we had in the cache and use what we have just found. */ goto assign; } else if (offset > cache->offset) { /* * The extent range we previously found ends after the * offset of the file extent item we found and that * offset falls somewhere in the middle of that previous * extent range. So adjust the range we previously found * to end at the offset of the file extent item we have * just found, since this extent is more up to date. * Emit that adjusted range and cache the file extent * item we have just found. This corresponds to the case * where a previously found file extent item was split * due to an ordered extent completing. */ cache->len = offset - cache->offset; goto emit; } else { const u64 range_end = offset + len; /* * The offset of the file extent item we have just found * is behind the cached offset. This means we were * processing a hole or prealloc extent for which we * have found delalloc ranges (in the io tree), so what * we have in the cache is the last delalloc range we * found while the file extent item we found can be * either for a whole delalloc range we previously * emitted or only a part of that range. * * We have two cases here: * * 1) The file extent item's range ends at or behind the * cached extent's end. In this case just ignore the * current file extent item because we don't want to * overlap with previous ranges that may have been * emitted already; * * 2) The file extent item starts behind the currently * cached extent but its end offset goes beyond the * end offset of the cached extent. We don't want to * overlap with a previous range that may have been * emitted already, so we emit the currently cached * extent and then partially store the current file * extent item's range in the cache, for the subrange * going the cached extent's end to the end of the * file extent item. */ if (range_end <= cache_end) return 0; if (!(flags & (FIEMAP_EXTENT_ENCODED | FIEMAP_EXTENT_DELALLOC))) phys += cache_end - offset; offset = cache_end; len = range_end - cache_end; goto emit; } } /* * Only merges fiemap extents if * 1) Their logical addresses are continuous * * 2) Their physical addresses are continuous * So truly compressed (physical size smaller than logical size) * extents won't get merged with each other * * 3) Share same flags */ if (cache->offset + cache->len == offset && cache->phys + cache->len == phys && cache->flags == flags) { cache->len += len; return 0; } emit: /* Not mergeable, need to submit cached one */ if (cache->entries_pos == cache->entries_size) { /* * We will need to research for the end offset of the last * stored extent and not from the current offset, because after * unlocking the range and releasing the path, if there's a hole * between that end offset and this current offset, a new extent * may have been inserted due to a new write, so we don't want * to miss it. */ entry = &cache->entries[cache->entries_size - 1]; cache->next_search_offset = entry->offset + entry->len; cache->cached = false; return BTRFS_FIEMAP_FLUSH_CACHE; } entry = &cache->entries[cache->entries_pos]; entry->offset = cache->offset; entry->phys = cache->phys; entry->len = cache->len; entry->flags = cache->flags; cache->entries_pos++; cache->extents_mapped++; if (cache->extents_mapped == fieinfo->fi_extents_max) { cache->cached = false; return 1; } assign: cache->cached = true; cache->offset = offset; cache->phys = phys; cache->len = len; cache->flags = flags; return 0; } /* * Emit last fiemap cache * * The last fiemap cache may still be cached in the following case: * 0 4k 8k * |<- Fiemap range ->| * |<------------ First extent ----------->| * * In this case, the first extent range will be cached but not emitted. * So we must emit it before ending extent_fiemap(). */ static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo, struct fiemap_cache *cache) { int ret; if (!cache->cached) return 0; ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys, cache->len, cache->flags); cache->cached = false; if (ret > 0) ret = 0; return ret; } static int fiemap_next_leaf_item(struct btrfs_inode *inode, struct btrfs_path *path) { struct extent_buffer *clone = path->nodes[0]; struct btrfs_key key; int slot; int ret; path->slots[0]++; if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) return 0; /* * Add a temporary extra ref to an already cloned extent buffer to * prevent btrfs_next_leaf() freeing it, we want to reuse it to avoid * the cost of allocating a new one. */ ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED, &clone->bflags)); refcount_inc(&clone->refs); ret = btrfs_next_leaf(inode->root, path); if (ret != 0) goto out; /* * Don't bother with cloning if there are no more file extent items for * our inode. */ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY) { ret = 1; goto out; } /* * Important to preserve the start field, for the optimizations when * checking if extents are shared (see extent_fiemap()). * * We must set ->start before calling copy_extent_buffer_full(). If we * are on sub-pagesize blocksize, we use ->start to determine the offset * into the folio where our eb exists, and if we update ->start after * the fact then any subsequent reads of the eb may read from a * different offset in the folio than where we originally copied into. */ clone->start = path->nodes[0]->start; /* See the comment at fiemap_search_slot() about why we clone. */ copy_extent_buffer_full(clone, path->nodes[0]); slot = path->slots[0]; btrfs_release_path(path); path->nodes[0] = clone; path->slots[0] = slot; out: if (ret) free_extent_buffer(clone); return ret; } /* * Search for the first file extent item that starts at a given file offset or * the one that starts immediately before that offset. * Returns: 0 on success, < 0 on error, 1 if not found. */ static int fiemap_search_slot(struct btrfs_inode *inode, struct btrfs_path *path, u64 file_offset) { const u64 ino = btrfs_ino(inode); struct btrfs_root *root = inode->root; struct extent_buffer *clone; struct btrfs_key key; int slot; int ret; key.objectid = ino; key.type = BTRFS_EXTENT_DATA_KEY; key.offset = file_offset; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) return ret; if (ret > 0 && path->slots[0] > 0) { btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1); if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY) path->slots[0]--; } if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { ret = btrfs_next_leaf(root, path); if (ret != 0) return ret; btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) return 1; } /* * We clone the leaf and use it during fiemap. This is because while * using the leaf we do expensive things like checking if an extent is * shared, which can take a long time. In order to prevent blocking * other tasks for too long, we use a clone of the leaf. We have locked * the file range in the inode's io tree, so we know none of our file * extent items can change. This way we avoid blocking other tasks that * want to insert items for other inodes in the same leaf or b+tree * rebalance operations (triggered for example when someone is trying * to push items into this leaf when trying to insert an item in a * neighbour leaf). * We also need the private clone because holding a read lock on an * extent buffer of the subvolume's b+tree will make lockdep unhappy * when we check if extents are shared, as backref walking may need to * lock the same leaf we are processing. */ clone = btrfs_clone_extent_buffer(path->nodes[0]); if (!clone) return -ENOMEM; slot = path->slots[0]; btrfs_release_path(path); path->nodes[0] = clone; path->slots[0] = slot; return 0; } /* * Process a range which is a hole or a prealloc extent in the inode's subvolume * btree. If @disk_bytenr is 0, we are dealing with a hole, otherwise a prealloc * extent. The end offset (@end) is inclusive. */ static int fiemap_process_hole(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, struct fiemap_cache *cache, struct extent_state **delalloc_cached_state, struct btrfs_backref_share_check_ctx *backref_ctx, u64 disk_bytenr, u64 extent_offset, u64 extent_gen, u64 start, u64 end) { const u64 i_size = i_size_read(&inode->vfs_inode); u64 cur_offset = start; u64 last_delalloc_end = 0; u32 prealloc_flags = FIEMAP_EXTENT_UNWRITTEN; bool checked_extent_shared = false; int ret; /* * There can be no delalloc past i_size, so don't waste time looking for * it beyond i_size. */ while (cur_offset < end && cur_offset < i_size) { u64 delalloc_start; u64 delalloc_end; u64 prealloc_start; u64 prealloc_len = 0; bool delalloc; delalloc = btrfs_find_delalloc_in_range(inode, cur_offset, end, delalloc_cached_state, &delalloc_start, &delalloc_end); if (!delalloc) break; /* * If this is a prealloc extent we have to report every section * of it that has no delalloc. */ if (disk_bytenr != 0) { if (last_delalloc_end == 0) { prealloc_start = start; prealloc_len = delalloc_start - start; } else { prealloc_start = last_delalloc_end + 1; prealloc_len = delalloc_start - prealloc_start; } } if (prealloc_len > 0) { if (!checked_extent_shared && fieinfo->fi_extents_max) { ret = btrfs_is_data_extent_shared(inode, disk_bytenr, extent_gen, backref_ctx); if (ret < 0) return ret; else if (ret > 0) prealloc_flags |= FIEMAP_EXTENT_SHARED; checked_extent_shared = true; } ret = emit_fiemap_extent(fieinfo, cache, prealloc_start, disk_bytenr + extent_offset, prealloc_len, prealloc_flags); if (ret) return ret; extent_offset += prealloc_len; } ret = emit_fiemap_extent(fieinfo, cache, delalloc_start, 0, delalloc_end + 1 - delalloc_start, FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN); if (ret) return ret; last_delalloc_end = delalloc_end; cur_offset = delalloc_end + 1; extent_offset += cur_offset - delalloc_start; cond_resched(); } /* * Either we found no delalloc for the whole prealloc extent or we have * a prealloc extent that spans i_size or starts at or after i_size. */ if (disk_bytenr != 0 && last_delalloc_end < end) { u64 prealloc_start; u64 prealloc_len; if (last_delalloc_end == 0) { prealloc_start = start; prealloc_len = end + 1 - start; } else { prealloc_start = last_delalloc_end + 1; prealloc_len = end + 1 - prealloc_start; } if (!checked_extent_shared && fieinfo->fi_extents_max) { ret = btrfs_is_data_extent_shared(inode, disk_bytenr, extent_gen, backref_ctx); if (ret < 0) return ret; else if (ret > 0) prealloc_flags |= FIEMAP_EXTENT_SHARED; } ret = emit_fiemap_extent(fieinfo, cache, prealloc_start, disk_bytenr + extent_offset, prealloc_len, prealloc_flags); if (ret) return ret; } return 0; } static int fiemap_find_last_extent_offset(struct btrfs_inode *inode, struct btrfs_path *path, u64 *last_extent_end_ret) { const u64 ino = btrfs_ino(inode); struct btrfs_root *root = inode->root; struct extent_buffer *leaf; struct btrfs_file_extent_item *ei; struct btrfs_key key; u64 disk_bytenr; int ret; /* * Lookup the last file extent. We're not using i_size here because * there might be preallocation past i_size. */ ret = btrfs_lookup_file_extent(NULL, root, path, ino, (u64)-1, 0); /* There can't be a file extent item at offset (u64)-1 */ ASSERT(ret != 0); if (ret < 0) return ret; /* * For a non-existing key, btrfs_search_slot() always leaves us at a * slot > 0, except if the btree is empty, which is impossible because * at least it has the inode item for this inode and all the items for * the root inode 256. */ ASSERT(path->slots[0] > 0); path->slots[0]--; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) { /* No file extent items in the subvolume tree. */ *last_extent_end_ret = 0; return 0; } /* * For an inline extent, the disk_bytenr is where inline data starts at, * so first check if we have an inline extent item before checking if we * have an implicit hole (disk_bytenr == 0). */ ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) { *last_extent_end_ret = btrfs_file_extent_end(path); return 0; } /* * Find the last file extent item that is not a hole (when NO_HOLES is * not enabled). This should take at most 2 iterations in the worst * case: we have one hole file extent item at slot 0 of a leaf and * another hole file extent item as the last item in the previous leaf. * This is because we merge file extent items that represent holes. */ disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); while (disk_bytenr == 0) { ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY); if (ret < 0) { return ret; } else if (ret > 0) { /* No file extent items that are not holes. */ *last_extent_end_ret = 0; return 0; } leaf = path->nodes[0]; ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); } *last_extent_end_ret = btrfs_file_extent_end(path); return 0; } static int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { const u64 ino = btrfs_ino(inode); struct extent_state *cached_state = NULL; struct extent_state *delalloc_cached_state = NULL; BTRFS_PATH_AUTO_FREE(path); struct fiemap_cache cache = { 0 }; struct btrfs_backref_share_check_ctx *backref_ctx; u64 last_extent_end = 0; u64 prev_extent_end; u64 range_start; u64 range_end; const u64 sectorsize = inode->root->fs_info->sectorsize; bool stopped = false; int ret; cache.entries_size = PAGE_SIZE / sizeof(struct btrfs_fiemap_entry); cache.entries = kmalloc_objs(struct btrfs_fiemap_entry, cache.entries_size); backref_ctx = btrfs_alloc_backref_share_check_ctx(); path = btrfs_alloc_path(); if (!cache.entries || !backref_ctx || !path) { ret = -ENOMEM; goto out; } restart: range_start = round_down(start, sectorsize); range_end = round_up(start + len, sectorsize); prev_extent_end = range_start; btrfs_lock_extent(&inode->io_tree, range_start, range_end, &cached_state); ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end); if (ret < 0) goto out_unlock; btrfs_release_path(path); path->reada = READA_FORWARD; ret = fiemap_search_slot(inode, path, range_start); if (ret < 0) { goto out_unlock; } else if (ret > 0) { /* * No file extent item found, but we may have delalloc between * the current offset and i_size. So check for that. */ ret = 0; goto check_eof_delalloc; } while (prev_extent_end < range_end) { struct extent_buffer *leaf = path->nodes[0]; struct btrfs_file_extent_item *ei; struct btrfs_key key; u64 extent_end; u64 extent_len; u64 extent_offset = 0; u64 extent_gen; u64 disk_bytenr = 0; u64 flags = 0; int extent_type; u8 compression; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) break; extent_end = btrfs_file_extent_end(path); /* * The first iteration can leave us at an extent item that ends * before our range's start. Move to the next item. */ if (extent_end <= range_start) goto next_item; backref_ctx->curr_leaf_bytenr = leaf->start; /* We have in implicit hole (NO_HOLES feature enabled). */ if (prev_extent_end < key.offset) { const u64 hole_end = min(key.offset, range_end) - 1; ret = fiemap_process_hole(inode, fieinfo, &cache, &delalloc_cached_state, backref_ctx, 0, 0, 0, prev_extent_end, hole_end); if (ret < 0) { goto out_unlock; } else if (ret > 0) { /* fiemap_fill_next_extent() told us to stop. */ stopped = true; break; } /* We've reached the end of the fiemap range, stop. */ if (key.offset >= range_end) { stopped = true; break; } } extent_len = extent_end - key.offset; ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); compression = btrfs_file_extent_compression(leaf, ei); extent_type = btrfs_file_extent_type(leaf, ei); extent_gen = btrfs_file_extent_generation(leaf, ei); if (extent_type != BTRFS_FILE_EXTENT_INLINE) { disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); if (compression == BTRFS_COMPRESS_NONE) extent_offset = btrfs_file_extent_offset(leaf, ei); } if (compression != BTRFS_COMPRESS_NONE) flags |= FIEMAP_EXTENT_ENCODED; if (extent_type == BTRFS_FILE_EXTENT_INLINE) { flags |= FIEMAP_EXTENT_DATA_INLINE; flags |= FIEMAP_EXTENT_NOT_ALIGNED; ret = emit_fiemap_extent(fieinfo, &cache, key.offset, 0, extent_len, flags); } else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) { ret = fiemap_process_hole(inode, fieinfo, &cache, &delalloc_cached_state, backref_ctx, disk_bytenr, extent_offset, extent_gen, key.offset, extent_end - 1); } else if (disk_bytenr == 0) { /* We have an explicit hole. */ ret = fiemap_process_hole(inode, fieinfo, &cache, &delalloc_cached_state, backref_ctx, 0, 0, 0, key.offset, extent_end - 1); } else { /* We have a regular extent. */ if (fieinfo->fi_extents_max) { ret = btrfs_is_data_extent_shared(inode, disk_bytenr, extent_gen, backref_ctx); if (ret < 0) goto out_unlock; else if (ret > 0) flags |= FIEMAP_EXTENT_SHARED; } ret = emit_fiemap_extent(fieinfo, &cache, key.offset, disk_bytenr + extent_offset, extent_len, flags); } if (ret < 0) { goto out_unlock; } else if (ret > 0) { /* emit_fiemap_extent() told us to stop. */ stopped = true; break; } prev_extent_end = extent_end; next_item: if (fatal_signal_pending(current)) { ret = -EINTR; goto out_unlock; } ret = fiemap_next_leaf_item(inode, path); if (ret < 0) { goto out_unlock; } else if (ret > 0) { /* No more file extent items for this inode. */ break; } cond_resched(); } check_eof_delalloc: if (!stopped && prev_extent_end < range_end) { ret = fiemap_process_hole(inode, fieinfo, &cache, &delalloc_cached_state, backref_ctx, 0, 0, 0, prev_extent_end, range_end - 1); if (ret < 0) goto out_unlock; prev_extent_end = range_end; } if (cache.cached && cache.offset + cache.len >= last_extent_end) { const u64 i_size = i_size_read(&inode->vfs_inode); if (prev_extent_end < i_size) { u64 delalloc_start; u64 delalloc_end; bool delalloc; delalloc = btrfs_find_delalloc_in_range(inode, prev_extent_end, i_size - 1, &delalloc_cached_state, &delalloc_start, &delalloc_end); if (!delalloc) cache.flags |= FIEMAP_EXTENT_LAST; } else { cache.flags |= FIEMAP_EXTENT_LAST; } } out_unlock: btrfs_unlock_extent(&inode->io_tree, range_start, range_end, &cached_state); if (ret == BTRFS_FIEMAP_FLUSH_CACHE) { btrfs_release_path(path); ret = flush_fiemap_cache(fieinfo, &cache); if (ret) goto out; len -= cache.next_search_offset - start; start = cache.next_search_offset; goto restart; } else if (ret < 0) { goto out; } /* * Must free the path before emitting to the fiemap buffer because we * may have a non-cloned leaf and if the fiemap buffer is memory mapped * to a file, a write into it (through btrfs_page_mkwrite()) may trigger * waiting for an ordered extent that in order to complete needs to * modify that leaf, therefore leading to a deadlock. */ btrfs_free_path(path); path = NULL; ret = flush_fiemap_cache(fieinfo, &cache); if (ret) goto out; ret = emit_last_fiemap_cache(fieinfo, &cache); out: btrfs_free_extent_state(delalloc_cached_state); kfree(cache.entries); btrfs_free_backref_share_ctx(backref_ctx); return ret; } int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { struct btrfs_inode *btrfs_inode = BTRFS_I(inode); int ret; ret = fiemap_prep(inode, fieinfo, start, &len, 0); if (ret) return ret; /* * fiemap_prep() called filemap_write_and_wait() for the whole possible * file range (0 to LLONG_MAX), but that is not enough if we have * compression enabled. The first filemap_fdatawrite_range() only kicks * in the compression of data (in an async thread) and will return * before the compression is done and writeback is started. A second * filemap_fdatawrite_range() is needed to wait for the compression to * complete and writeback to start. We also need to wait for ordered * extents to complete, because our fiemap implementation uses mainly * file extent items to list the extents, searching for extent maps * only for file ranges with holes or prealloc extents to figure out * if we have delalloc in those ranges. */ if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) { ret = btrfs_wait_ordered_range(btrfs_inode, 0, LLONG_MAX); if (ret) return ret; } btrfs_inode_lock(btrfs_inode, BTRFS_ILOCK_SHARED); /* * We did an initial flush to avoid holding the inode's lock while * triggering writeback and waiting for the completion of IO and ordered * extents. Now after we locked the inode we do it again, because it's * possible a new write may have happened in between those two steps. */ if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) { ret = btrfs_wait_ordered_range(btrfs_inode, 0, LLONG_MAX); if (ret) { btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED); return ret; } } ret = extent_fiemap(btrfs_inode, fieinfo, start, len); btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED); return ret; }
122 3278 3543 91 1 3450 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 /* SPDX-License-Identifier: GPL-2.0-only */ /* * kref.h - library routines for handling generic reference counted objects * * Copyright (C) 2004 Greg Kroah-Hartman <greg@kroah.com> * Copyright (C) 2004 IBM Corp. * * based on kobject.h which was: * Copyright (C) 2002-2003 Patrick Mochel <mochel@osdl.org> * Copyright (C) 2002-2003 Open Source Development Labs */ #ifndef _KREF_H_ #define _KREF_H_ #include <linux/spinlock.h> #include <linux/refcount.h> struct kref { refcount_t refcount; }; #define KREF_INIT(n) { .refcount = REFCOUNT_INIT(n), } /** * kref_init - initialize object. * @kref: object in question. */ static inline void kref_init(struct kref *kref) { refcount_set(&kref->refcount, 1); } static inline unsigned int kref_read(const struct kref *kref) { return refcount_read(&kref->refcount); } /** * kref_get - increment refcount for object. * @kref: object. */ static inline void kref_get(struct kref *kref) { refcount_inc(&kref->refcount); } /** * kref_put - Decrement refcount for object * @kref: Object * @release: Pointer to the function that will clean up the object when the * last reference to the object is released. * * Decrement the refcount, and if 0, call @release. The caller may not * pass NULL or kfree() as the release function. * * Return: 1 if this call removed the object, otherwise return 0. Beware, * if this function returns 0, another caller may have removed the object * by the time this function returns. The return value is only certain * if you want to see if the object is definitely released. */ static inline int kref_put(struct kref *kref, void (*release)(struct kref *kref)) { if (refcount_dec_and_test(&kref->refcount)) { release(kref); return 1; } return 0; } /** * kref_put_mutex - Decrement refcount for object * @kref: Object * @release: Pointer to the function that will clean up the object when the * last reference to the object is released. * @mutex: Mutex which protects the release function. * * This variant of kref_lock() calls the @release function with the @mutex * held. The @release function will release the mutex. */ static inline int kref_put_mutex(struct kref *kref, void (*release)(struct kref *kref), struct mutex *mutex) __cond_acquires(true, mutex) { if (refcount_dec_and_mutex_lock(&kref->refcount, mutex)) { release(kref); return 1; } return 0; } /** * kref_put_lock - Decrement refcount for object * @kref: Object * @release: Pointer to the function that will clean up the object when the * last reference to the object is released. * @lock: Spinlock which protects the release function. * * This variant of kref_lock() calls the @release function with the @lock * held. The @release function will release the lock. */ static inline int kref_put_lock(struct kref *kref, void (*release)(struct kref *kref), spinlock_t *lock) __cond_acquires(true, lock) { if (refcount_dec_and_lock(&kref->refcount, lock)) { release(kref); return 1; } return 0; } /** * kref_get_unless_zero - Increment refcount for object unless it is zero. * @kref: object. * * This function is intended to simplify locking around refcounting for * objects that can be looked up from a lookup structure, and which are * removed from that lookup structure in the object destructor. * Operations on such objects require at least a read lock around * lookup + kref_get, and a write lock around kref_put + remove from lookup * structure. Furthermore, RCU implementations become extremely tricky. * With a lookup followed by a kref_get_unless_zero *with return value check* * locking in the kref_put path can be deferred to the actual removal from * the lookup structure and RCU lookups become trivial. * * Return: non-zero if the increment succeeded. Otherwise return 0. */ static inline int __must_check kref_get_unless_zero(struct kref *kref) { return refcount_inc_not_zero(&kref->refcount); } #endif /* _KREF_H_ */
1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 // SPDX-License-Identifier: GPL-2.0-or-later /* * Cryptographic API. * * Single-block cipher operations. * * Copyright (c) 2002 James Morris <jmorris@intercode.com.au> * Copyright (c) 2005 Herbert Xu <herbert@gondor.apana.org.au> */ #include <crypto/algapi.h> #include <crypto/internal/cipher.h> #include <linux/kernel.h> #include <linux/crypto.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/string.h> #include "internal.h" static int setkey_unaligned(struct crypto_cipher *tfm, const u8 *key, unsigned int keylen) { struct cipher_alg *cia = crypto_cipher_alg(tfm); unsigned long alignmask = crypto_cipher_alignmask(tfm); int ret; u8 *buffer, *alignbuffer; unsigned long absize; absize = keylen + alignmask; buffer = kmalloc(absize, GFP_ATOMIC); if (!buffer) return -ENOMEM; alignbuffer = (u8 *)ALIGN((unsigned long)buffer, alignmask + 1); memcpy(alignbuffer, key, keylen); ret = cia->cia_setkey(crypto_cipher_tfm(tfm), alignbuffer, keylen); kfree_sensitive(buffer); return ret; } int crypto_cipher_setkey(struct crypto_cipher *tfm, const u8 *key, unsigned int keylen) { struct cipher_alg *cia = crypto_cipher_alg(tfm); unsigned long alignmask = crypto_cipher_alignmask(tfm); if (keylen < cia->cia_min_keysize || keylen > cia->cia_max_keysize) return -EINVAL; if ((unsigned long)key & alignmask) return setkey_unaligned(tfm, key, keylen); return cia->cia_setkey(crypto_cipher_tfm(tfm), key, keylen); } EXPORT_SYMBOL_NS_GPL(crypto_cipher_setkey, "CRYPTO_INTERNAL"); static inline void cipher_crypt_one(struct crypto_cipher *tfm, u8 *dst, const u8 *src, bool enc) { unsigned long alignmask = crypto_cipher_alignmask(tfm); struct cipher_alg *cia = crypto_cipher_alg(tfm); void (*fn)(struct crypto_tfm *, u8 *, const u8 *) = enc ? cia->cia_encrypt : cia->cia_decrypt; if (unlikely(((unsigned long)dst | (unsigned long)src) & alignmask)) { unsigned int bs = crypto_cipher_blocksize(tfm); u8 buffer[MAX_CIPHER_BLOCKSIZE + MAX_CIPHER_ALIGNMASK]; u8 *tmp = (u8 *)ALIGN((unsigned long)buffer, alignmask + 1); memcpy(tmp, src, bs); fn(crypto_cipher_tfm(tfm), tmp, tmp); memcpy(dst, tmp, bs); } else { fn(crypto_cipher_tfm(tfm), dst, src); } } void crypto_cipher_encrypt_one(struct crypto_cipher *tfm, u8 *dst, const u8 *src) { cipher_crypt_one(tfm, dst, src, true); } EXPORT_SYMBOL_NS_GPL(crypto_cipher_encrypt_one, "CRYPTO_INTERNAL"); void crypto_cipher_decrypt_one(struct crypto_cipher *tfm, u8 *dst, const u8 *src) { cipher_crypt_one(tfm, dst, src, false); } EXPORT_SYMBOL_NS_GPL(crypto_cipher_decrypt_one, "CRYPTO_INTERNAL"); struct crypto_cipher *crypto_clone_cipher(struct crypto_cipher *cipher) { struct crypto_tfm *tfm = crypto_cipher_tfm(cipher); struct crypto_alg *alg = tfm->__crt_alg; struct crypto_cipher *ncipher; struct crypto_tfm *ntfm; if (alg->cra_init) return ERR_PTR(-ENOSYS); if (unlikely(!crypto_mod_get(alg))) return ERR_PTR(-ESTALE); ntfm = __crypto_alloc_tfmgfp(alg, CRYPTO_ALG_TYPE_CIPHER, CRYPTO_ALG_TYPE_MASK, GFP_ATOMIC); if (IS_ERR(ntfm)) { crypto_mod_put(alg); return ERR_CAST(ntfm); } ntfm->crt_flags = tfm->crt_flags; ncipher = __crypto_cipher_cast(ntfm); return ncipher; } EXPORT_SYMBOL_GPL(crypto_clone_cipher);
12 1 13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * AES block cipher using AES-NI instructions * * Copyright 2026 Google LLC */ #include <asm/fpu/api.h> static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_aes); void aes128_expandkey_aesni(u32 rndkeys[], u32 *inv_rndkeys, const u8 in_key[AES_KEYSIZE_128]); void aes256_expandkey_aesni(u32 rndkeys[], u32 *inv_rndkeys, const u8 in_key[AES_KEYSIZE_256]); void aes_encrypt_aesni(const u32 rndkeys[], int nrounds, u8 out[AES_BLOCK_SIZE], const u8 in[AES_BLOCK_SIZE]); void aes_decrypt_aesni(const u32 inv_rndkeys[], int nrounds, u8 out[AES_BLOCK_SIZE], const u8 in[AES_BLOCK_SIZE]); /* * Expand an AES key using AES-NI if supported and usable or generic code * otherwise. The expanded key format is compatible between the two cases. The * outputs are @k->rndkeys (required) and @inv_k->inv_rndkeys (optional). * * We could just always use the generic key expansion code. AES key expansion * is usually less performance-critical than AES en/decryption. However, * there's still *some* value in speed here, as well as in non-key-dependent * execution time which AES-NI provides. So, do use AES-NI to expand AES-128 * and AES-256 keys. (Don't bother with AES-192, as it's almost never used.) */ static void aes_preparekey_arch(union aes_enckey_arch *k, union aes_invkey_arch *inv_k, const u8 *in_key, int key_len, int nrounds) { u32 *rndkeys = k->rndkeys; u32 *inv_rndkeys = inv_k ? inv_k->inv_rndkeys : NULL; if (static_branch_likely(&have_aes) && key_len != AES_KEYSIZE_192 && irq_fpu_usable()) { kernel_fpu_begin(); if (key_len == AES_KEYSIZE_128) aes128_expandkey_aesni(rndkeys, inv_rndkeys, in_key); else aes256_expandkey_aesni(rndkeys, inv_rndkeys, in_key); kernel_fpu_end(); } else { aes_expandkey_generic(rndkeys, inv_rndkeys, in_key, key_len); } } static void aes_encrypt_arch(const struct aes_enckey *key, u8 out[AES_BLOCK_SIZE], const u8 in[AES_BLOCK_SIZE]) { if (static_branch_likely(&have_aes) && irq_fpu_usable()) { kernel_fpu_begin(); aes_encrypt_aesni(key->k.rndkeys, key->nrounds, out, in); kernel_fpu_end(); } else { aes_encrypt_generic(key->k.rndkeys, key->nrounds, out, in); } } static void aes_decrypt_arch(const struct aes_key *key, u8 out[AES_BLOCK_SIZE], const u8 in[AES_BLOCK_SIZE]) { if (static_branch_likely(&have_aes) && irq_fpu_usable()) { kernel_fpu_begin(); aes_decrypt_aesni(key->inv_k.inv_rndkeys, key->nrounds, out, in); kernel_fpu_end(); } else { aes_decrypt_generic(key->inv_k.inv_rndkeys, key->nrounds, out, in); } } #define aes_mod_init_arch aes_mod_init_arch static void aes_mod_init_arch(void) { if (boot_cpu_has(X86_FEATURE_AES)) static_branch_enable(&have_aes); }
15 61 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_GENERIC_BITOPS_LE_H_ #define _ASM_GENERIC_BITOPS_LE_H_ #include <asm/types.h> #include <asm/byteorder.h> #if defined(__LITTLE_ENDIAN) #define BITOP_LE_SWIZZLE 0 #elif defined(__BIG_ENDIAN) #define BITOP_LE_SWIZZLE ((BITS_PER_LONG-1) & ~0x7) #endif static inline int test_bit_le(int nr, const void *addr) { return test_bit(nr ^ BITOP_LE_SWIZZLE, addr); } static inline void set_bit_le(int nr, void *addr) { set_bit(nr ^ BITOP_LE_SWIZZLE, addr); } static inline void clear_bit_le(int nr, void *addr) { clear_bit(nr ^ BITOP_LE_SWIZZLE, addr); } static inline void __set_bit_le(int nr, void *addr) { __set_bit(nr ^ BITOP_LE_SWIZZLE, addr); } static inline void __clear_bit_le(int nr, void *addr) { __clear_bit(nr ^ BITOP_LE_SWIZZLE, addr); } static inline int test_and_set_bit_le(int nr, void *addr) { return test_and_set_bit(nr ^ BITOP_LE_SWIZZLE, addr); } static inline int test_and_clear_bit_le(int nr, void *addr) { return test_and_clear_bit(nr ^ BITOP_LE_SWIZZLE, addr); } static inline int __test_and_set_bit_le(int nr, void *addr) { return __test_and_set_bit(nr ^ BITOP_LE_SWIZZLE, addr); } static inline int __test_and_clear_bit_le(int nr, void *addr) { return __test_and_clear_bit(nr ^ BITOP_LE_SWIZZLE, addr); } #endif /* _ASM_GENERIC_BITOPS_LE_H_ */
25 25 25 23 25 25 83 62 83 4 81 84 83 17 25 1 4 4 4 4 69 45 43 69 28 28 23 27 26 18 3 3 3 84 25 25 25 25 17 2 25 25 51 51 51 16 40 37 13 22 16 49 16 40 23 23 23 23 6 6 6 6 3 1 3 3 3 3 8 8 8 4 4 3 8 33 33 33 33 33 33 5 1 1 3 3 3 1 2 3 13 13 6 8 5 13 13 17 17 16 17 16 17 17 17 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 // SPDX-License-Identifier: GPL-2.0 /* * /proc/sys support */ #include <linux/init.h> #include <linux/sysctl.h> #include <linux/poll.h> #include <linux/proc_fs.h> #include <linux/printk.h> #include <linux/security.h> #include <linux/sched.h> #include <linux/cred.h> #include <linux/namei.h> #include <linux/mm.h> #include <linux/uio.h> #include <linux/module.h> #include <linux/bpf-cgroup.h> #include <linux/mount.h> #include <linux/kmemleak.h> #include <linux/lockdep.h> #include "internal.h" #define list_for_each_table_entry(entry, header) \ entry = header->ctl_table; \ for (size_t i = 0 ; i < header->ctl_table_size; ++i, entry++) static const struct dentry_operations proc_sys_dentry_operations; static const struct file_operations proc_sys_file_operations; static const struct inode_operations proc_sys_inode_operations; static const struct file_operations proc_sys_dir_file_operations; static const struct inode_operations proc_sys_dir_operations; /* * Support for permanently empty directories. * Must be non-empty to avoid sharing an address with other tables. */ static const struct ctl_table sysctl_mount_point[] = { { } }; /** * register_sysctl_mount_point() - registers a sysctl mount point * @path: path for the mount point * * Used to create a permanently empty directory to serve as mount point. * There are some subtle but important permission checks this allows in the * case of unprivileged mounts. */ struct ctl_table_header *register_sysctl_mount_point(const char *path) { return register_sysctl_sz(path, sysctl_mount_point, 0); } EXPORT_SYMBOL(register_sysctl_mount_point); #define sysctl_is_perm_empty_ctl_header(hptr) \ (hptr->type == SYSCTL_TABLE_TYPE_PERMANENTLY_EMPTY) #define sysctl_set_perm_empty_ctl_header(hptr) \ (hptr->type = SYSCTL_TABLE_TYPE_PERMANENTLY_EMPTY) #define sysctl_clear_perm_empty_ctl_header(hptr) \ (hptr->type = SYSCTL_TABLE_TYPE_DEFAULT) void proc_sys_poll_notify(struct ctl_table_poll *poll) { if (!poll) return; atomic_inc(&poll->event); wake_up_interruptible(&poll->wait); } static const struct ctl_table root_table[] = { { .procname = "", .mode = S_IFDIR|S_IRUGO|S_IXUGO, }, }; static struct ctl_table_root sysctl_table_root = { .default_set.dir.header = { {{.count = 1, .nreg = 1, .ctl_table = root_table }}, .ctl_table_arg = root_table, .root = &sysctl_table_root, .set = &sysctl_table_root.default_set, }, }; static DEFINE_SPINLOCK(sysctl_lock); static void drop_sysctl_table(struct ctl_table_header *header); static int sysctl_follow_link(struct ctl_table_header **phead, const struct ctl_table **pentry); static int insert_links(struct ctl_table_header *head); static void put_links(struct ctl_table_header *header); static void sysctl_print_dir(struct ctl_dir *dir) { if (dir->header.parent) sysctl_print_dir(dir->header.parent); pr_cont("%s/", dir->header.ctl_table[0].procname); } static int namecmp(const char *name1, int len1, const char *name2, int len2) { int cmp; cmp = memcmp(name1, name2, min(len1, len2)); if (cmp == 0) cmp = len1 - len2; return cmp; } static const struct ctl_table *find_entry(struct ctl_table_header **phead, struct ctl_dir *dir, const char *name, int namelen) { struct ctl_table_header *head; const struct ctl_table *entry; struct rb_node *node = dir->root.rb_node; lockdep_assert_held(&sysctl_lock); while (node) { struct ctl_node *ctl_node; const char *procname; int cmp; ctl_node = rb_entry(node, struct ctl_node, node); head = ctl_node->header; entry = &head->ctl_table[ctl_node - head->node]; procname = entry->procname; cmp = namecmp(name, namelen, procname, strlen(procname)); if (cmp < 0) node = node->rb_left; else if (cmp > 0) node = node->rb_right; else { *phead = head; return entry; } } return NULL; } static int insert_entry(struct ctl_table_header *head, const struct ctl_table *entry) { struct rb_node *node = &head->node[entry - head->ctl_table].node; struct rb_node **p = &head->parent->root.rb_node; struct rb_node *parent = NULL; const char *name = entry->procname; int namelen = strlen(name); while (*p) { struct ctl_table_header *parent_head; const struct ctl_table *parent_entry; struct ctl_node *parent_node; const char *parent_name; int cmp; parent = *p; parent_node = rb_entry(parent, struct ctl_node, node); parent_head = parent_node->header; parent_entry = &parent_head->ctl_table[parent_node - parent_head->node]; parent_name = parent_entry->procname; cmp = namecmp(name, namelen, parent_name, strlen(parent_name)); if (cmp < 0) p = &(*p)->rb_left; else if (cmp > 0) p = &(*p)->rb_right; else { pr_err("sysctl duplicate entry: "); sysctl_print_dir(head->parent); pr_cont("%s\n", entry->procname); return -EEXIST; } } rb_link_node(node, parent, p); rb_insert_color(node, &head->parent->root); return 0; } static void erase_entry(struct ctl_table_header *head, const struct ctl_table *entry) { struct rb_node *node = &head->node[entry - head->ctl_table].node; rb_erase(node, &head->parent->root); } static void init_header(struct ctl_table_header *head, struct ctl_table_root *root, struct ctl_table_set *set, struct ctl_node *node, const struct ctl_table *table, size_t table_size) { head->ctl_table = table; head->ctl_table_size = table_size; head->ctl_table_arg = table; head->used = 0; head->count = 1; head->nreg = 1; head->unregistering = NULL; head->root = root; head->set = set; head->parent = NULL; head->node = node; INIT_HLIST_HEAD(&head->inodes); if (node) { const struct ctl_table *entry; list_for_each_table_entry(entry, head) { node->header = head; node++; } } if (table == sysctl_mount_point) sysctl_set_perm_empty_ctl_header(head); } static void erase_header(struct ctl_table_header *head) { const struct ctl_table *entry; list_for_each_table_entry(entry, head) erase_entry(head, entry); } static int insert_header(struct ctl_dir *dir, struct ctl_table_header *header) { const struct ctl_table *entry; struct ctl_table_header *dir_h = &dir->header; int err; /* Is this a permanently empty directory? */ if (sysctl_is_perm_empty_ctl_header(dir_h)) return -EROFS; /* Am I creating a permanently empty directory? */ if (sysctl_is_perm_empty_ctl_header(header)) { if (!RB_EMPTY_ROOT(&dir->root)) return -EINVAL; sysctl_set_perm_empty_ctl_header(dir_h); } dir_h->nreg++; header->parent = dir; err = insert_links(header); if (err) goto fail_links; list_for_each_table_entry(entry, header) { err = insert_entry(header, entry); if (err) goto fail; } return 0; fail: erase_header(header); put_links(header); fail_links: if (header->ctl_table == sysctl_mount_point) sysctl_clear_perm_empty_ctl_header(dir_h); header->parent = NULL; drop_sysctl_table(dir_h); return err; } static int use_table(struct ctl_table_header *p) { lockdep_assert_held(&sysctl_lock); if (unlikely(p->unregistering)) return 0; p->used++; return 1; } static void unuse_table(struct ctl_table_header *p) { lockdep_assert_held(&sysctl_lock); if (!--p->used) if (unlikely(p->unregistering)) complete(p->unregistering); } static void proc_sys_invalidate_dcache(struct ctl_table_header *head) { proc_invalidate_siblings_dcache(&head->inodes, &sysctl_lock); } static void start_unregistering(struct ctl_table_header *p) { /* will reacquire if has to wait */ lockdep_assert_held(&sysctl_lock); /* * if p->used is 0, nobody will ever touch that entry again; * we'll eliminate all paths to it before dropping sysctl_lock */ if (unlikely(p->used)) { struct completion wait; init_completion(&wait); p->unregistering = &wait; spin_unlock(&sysctl_lock); wait_for_completion(&wait); } else { /* anything non-NULL; we'll never dereference it */ p->unregistering = ERR_PTR(-EINVAL); spin_unlock(&sysctl_lock); } /* * Invalidate dentries for unregistered sysctls: namespaced sysctls * can have duplicate names and contaminate dcache very badly. */ proc_sys_invalidate_dcache(p); /* * do not remove from the list until nobody holds it; walking the * list in do_sysctl() relies on that. */ spin_lock(&sysctl_lock); erase_header(p); } static struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head) { BUG_ON(!head); spin_lock(&sysctl_lock); if (!use_table(head)) head = ERR_PTR(-ENOENT); spin_unlock(&sysctl_lock); return head; } static void sysctl_head_finish(struct ctl_table_header *head) { if (!head) return; spin_lock(&sysctl_lock); unuse_table(head); spin_unlock(&sysctl_lock); } static struct ctl_table_set * lookup_header_set(struct ctl_table_root *root) { struct ctl_table_set *set = &root->default_set; if (root->lookup) set = root->lookup(root); return set; } static const struct ctl_table *lookup_entry(struct ctl_table_header **phead, struct ctl_dir *dir, const char *name, int namelen) { struct ctl_table_header *head; const struct ctl_table *entry; spin_lock(&sysctl_lock); entry = find_entry(&head, dir, name, namelen); if (entry && use_table(head)) *phead = head; else entry = NULL; spin_unlock(&sysctl_lock); return entry; } static struct ctl_node *first_usable_entry(struct rb_node *node) { struct ctl_node *ctl_node; for (;node; node = rb_next(node)) { ctl_node = rb_entry(node, struct ctl_node, node); if (use_table(ctl_node->header)) return ctl_node; } return NULL; } static void first_entry(struct ctl_dir *dir, struct ctl_table_header **phead, const struct ctl_table **pentry) { struct ctl_table_header *head = NULL; const struct ctl_table *entry = NULL; struct ctl_node *ctl_node; spin_lock(&sysctl_lock); ctl_node = first_usable_entry(rb_first(&dir->root)); spin_unlock(&sysctl_lock); if (ctl_node) { head = ctl_node->header; entry = &head->ctl_table[ctl_node - head->node]; } *phead = head; *pentry = entry; } static void next_entry(struct ctl_table_header **phead, const struct ctl_table **pentry) { struct ctl_table_header *head = *phead; const struct ctl_table *entry = *pentry; struct ctl_node *ctl_node = &head->node[entry - head->ctl_table]; spin_lock(&sysctl_lock); unuse_table(head); ctl_node = first_usable_entry(rb_next(&ctl_node->node)); spin_unlock(&sysctl_lock); head = NULL; if (ctl_node) { head = ctl_node->header; entry = &head->ctl_table[ctl_node - head->node]; } *phead = head; *pentry = entry; } /* * sysctl_perm does NOT grant the superuser all rights automatically, because * some sysctl variables are readonly even to root. */ static int test_perm(int mode, int op) { if (uid_eq(current_euid(), GLOBAL_ROOT_UID)) mode >>= 6; else if (in_egroup_p(GLOBAL_ROOT_GID)) mode >>= 3; if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0) return 0; return -EACCES; } static int sysctl_perm(struct ctl_table_header *head, const struct ctl_table *table, int op) { struct ctl_table_root *root = head->root; int mode; if (root->permissions) mode = root->permissions(head, table); else mode = table->mode; return test_perm(mode, op); } static struct inode *proc_sys_make_inode(struct super_block *sb, struct ctl_table_header *head, const struct ctl_table *table) { struct ctl_table_root *root = head->root; struct inode *inode; struct proc_inode *ei; inode = new_inode(sb); if (!inode) return ERR_PTR(-ENOMEM); inode->i_ino = get_next_ino(); ei = PROC_I(inode); spin_lock(&sysctl_lock); if (unlikely(head->unregistering)) { spin_unlock(&sysctl_lock); iput(inode); return ERR_PTR(-ENOENT); } ei->sysctl = head; ei->sysctl_entry = table; hlist_add_head_rcu(&ei->sibling_inodes, &head->inodes); head->count++; spin_unlock(&sysctl_lock); simple_inode_init_ts(inode); inode->i_mode = table->mode; if (!S_ISDIR(table->mode)) { inode->i_mode |= S_IFREG; inode->i_op = &proc_sys_inode_operations; inode->i_fop = &proc_sys_file_operations; } else { inode->i_mode |= S_IFDIR; inode->i_op = &proc_sys_dir_operations; inode->i_fop = &proc_sys_dir_file_operations; if (sysctl_is_perm_empty_ctl_header(head)) make_empty_dir_inode(inode); } inode->i_uid = GLOBAL_ROOT_UID; inode->i_gid = GLOBAL_ROOT_GID; if (root->set_ownership) root->set_ownership(head, &inode->i_uid, &inode->i_gid); return inode; } void proc_sys_evict_inode(struct inode *inode, struct ctl_table_header *head) { spin_lock(&sysctl_lock); hlist_del_init_rcu(&PROC_I(inode)->sibling_inodes); if (!--head->count) kfree_rcu(head, rcu); spin_unlock(&sysctl_lock); } static struct ctl_table_header *grab_header(struct inode *inode) { struct ctl_table_header *head = PROC_I(inode)->sysctl; if (!head) head = &sysctl_table_root.default_set.dir.header; return sysctl_head_grab(head); } static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct ctl_table_header *head = grab_header(dir); struct ctl_table_header *h = NULL; const struct qstr *name = &dentry->d_name; const struct ctl_table *p; struct inode *inode; struct dentry *err = ERR_PTR(-ENOENT); struct ctl_dir *ctl_dir; int ret; if (IS_ERR(head)) return ERR_CAST(head); ctl_dir = container_of(head, struct ctl_dir, header); p = lookup_entry(&h, ctl_dir, name->name, name->len); if (!p) goto out; if (S_ISLNK(p->mode)) { ret = sysctl_follow_link(&h, &p); err = ERR_PTR(ret); if (ret) goto out; } inode = proc_sys_make_inode(dir->i_sb, h ? h : head, p); err = d_splice_alias_ops(inode, dentry, &proc_sys_dentry_operations); out: if (h) sysctl_head_finish(h); sysctl_head_finish(head); return err; } static ssize_t proc_sys_call_handler(struct kiocb *iocb, struct iov_iter *iter, int write) { struct inode *inode = file_inode(iocb->ki_filp); struct ctl_table_header *head = grab_header(inode); const struct ctl_table *table = PROC_I(inode)->sysctl_entry; size_t count = iov_iter_count(iter); char *kbuf; ssize_t error; if (IS_ERR(head)) return PTR_ERR(head); /* * At this point we know that the sysctl was not unregistered * and won't be until we finish. */ error = -EPERM; if (sysctl_perm(head, table, write ? MAY_WRITE : MAY_READ)) goto out; /* if that can happen at all, it should be -EINVAL, not -EISDIR */ error = -EINVAL; if (!table->proc_handler) goto out; /* don't even try if the size is too large */ error = -ENOMEM; if (count >= KMALLOC_MAX_SIZE) goto out; kbuf = kvzalloc(count + 1, GFP_KERNEL); if (!kbuf) goto out; if (write) { error = -EFAULT; if (!copy_from_iter_full(kbuf, count, iter)) goto out_free_buf; kbuf[count] = '\0'; } error = BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, &kbuf, &count, &iocb->ki_pos); if (error) goto out_free_buf; /* careful: calling conventions are nasty here */ error = table->proc_handler(table, write, kbuf, &count, &iocb->ki_pos); if (error) goto out_free_buf; if (!write) { error = -EFAULT; if (copy_to_iter(kbuf, count, iter) < count) goto out_free_buf; } error = count; out_free_buf: kvfree(kbuf); out: sysctl_head_finish(head); return error; } static ssize_t proc_sys_read(struct kiocb *iocb, struct iov_iter *iter) { return proc_sys_call_handler(iocb, iter, 0); } static ssize_t proc_sys_write(struct kiocb *iocb, struct iov_iter *iter) { return proc_sys_call_handler(iocb, iter, 1); } static int proc_sys_open(struct inode *inode, struct file *filp) { struct ctl_table_header *head = grab_header(inode); const struct ctl_table *table = PROC_I(inode)->sysctl_entry; /* sysctl was unregistered */ if (IS_ERR(head)) return PTR_ERR(head); if (table->poll) filp->private_data = proc_sys_poll_event(table->poll); sysctl_head_finish(head); return 0; } static __poll_t proc_sys_poll(struct file *filp, poll_table *wait) { struct inode *inode = file_inode(filp); struct ctl_table_header *head = grab_header(inode); const struct ctl_table *table = PROC_I(inode)->sysctl_entry; __poll_t ret = DEFAULT_POLLMASK; unsigned long event; /* sysctl was unregistered */ if (IS_ERR(head)) return EPOLLERR | EPOLLHUP; if (!table->proc_handler) goto out; if (!table->poll) goto out; event = (unsigned long)filp->private_data; poll_wait(filp, &table->poll->wait, wait); if (event != atomic_read(&table->poll->event)) { filp->private_data = proc_sys_poll_event(table->poll); ret = EPOLLIN | EPOLLRDNORM | EPOLLERR | EPOLLPRI; } out: sysctl_head_finish(head); return ret; } static bool proc_sys_fill_cache(struct file *file, struct dir_context *ctx, struct ctl_table_header *head, const struct ctl_table *table) { struct dentry *child, *dir = file->f_path.dentry; struct inode *inode; struct qstr qname; ino_t ino = 0; unsigned type = DT_UNKNOWN; qname.name = table->procname; qname.len = strlen(table->procname); qname.hash = full_name_hash(dir, qname.name, qname.len); child = d_lookup(dir, &qname); if (!child) { DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); child = d_alloc_parallel(dir, &qname, &wq); if (IS_ERR(child)) return false; if (d_in_lookup(child)) { struct dentry *res; inode = proc_sys_make_inode(dir->d_sb, head, table); res = d_splice_alias_ops(inode, child, &proc_sys_dentry_operations); d_lookup_done(child); if (unlikely(res)) { dput(child); if (IS_ERR(res)) return false; child = res; } } } inode = d_inode(child); ino = inode->i_ino; type = inode->i_mode >> 12; dput(child); return dir_emit(ctx, qname.name, qname.len, ino, type); } static bool proc_sys_link_fill_cache(struct file *file, struct dir_context *ctx, struct ctl_table_header *head, const struct ctl_table *table) { bool ret = true; head = sysctl_head_grab(head); if (IS_ERR(head)) return false; /* It is not an error if we can not follow the link ignore it */ if (sysctl_follow_link(&head, &table)) goto out; ret = proc_sys_fill_cache(file, ctx, head, table); out: sysctl_head_finish(head); return ret; } static int scan(struct ctl_table_header *head, const struct ctl_table *table, unsigned long *pos, struct file *file, struct dir_context *ctx) { bool res; if ((*pos)++ < ctx->pos) return true; if (unlikely(S_ISLNK(table->mode))) res = proc_sys_link_fill_cache(file, ctx, head, table); else res = proc_sys_fill_cache(file, ctx, head, table); if (res) ctx->pos = *pos; return res; } static int proc_sys_readdir(struct file *file, struct dir_context *ctx) { struct ctl_table_header *head = grab_header(file_inode(file)); struct ctl_table_header *h = NULL; const struct ctl_table *entry; struct ctl_dir *ctl_dir; unsigned long pos; if (IS_ERR(head)) return PTR_ERR(head); ctl_dir = container_of(head, struct ctl_dir, header); if (!dir_emit_dots(file, ctx)) goto out; pos = 2; for (first_entry(ctl_dir, &h, &entry); h; next_entry(&h, &entry)) { if (!scan(h, entry, &pos, file, ctx)) { sysctl_head_finish(h); break; } } out: sysctl_head_finish(head); return 0; } static int proc_sys_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { /* * sysctl entries that are not writeable, * are _NOT_ writeable, capabilities or not. */ struct ctl_table_header *head; const struct ctl_table *table; int error; /* Executable files are not allowed under /proc/sys/ */ if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) return -EACCES; head = grab_header(inode); if (IS_ERR(head)) return PTR_ERR(head); table = PROC_I(inode)->sysctl_entry; if (!table) /* global root - r-xr-xr-x */ error = mask & MAY_WRITE ? -EACCES : 0; else /* Use the permissions on the sysctl table entry */ error = sysctl_perm(head, table, mask & ~MAY_NOT_BLOCK); sysctl_head_finish(head); return error; } static int proc_sys_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); int error; if (attr->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)) return -EPERM; error = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) return error; setattr_copy(&nop_mnt_idmap, inode, attr); return 0; } static int proc_sys_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct ctl_table_header *head = grab_header(inode); const struct ctl_table *table = PROC_I(inode)->sysctl_entry; if (IS_ERR(head)) return PTR_ERR(head); generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); if (table) stat->mode = (stat->mode & S_IFMT) | table->mode; sysctl_head_finish(head); return 0; } static const struct file_operations proc_sys_file_operations = { .open = proc_sys_open, .poll = proc_sys_poll, .read_iter = proc_sys_read, .write_iter = proc_sys_write, .splice_read = copy_splice_read, .splice_write = iter_file_splice_write, .llseek = default_llseek, }; static const struct file_operations proc_sys_dir_file_operations = { .read = generic_read_dir, .iterate_shared = proc_sys_readdir, .llseek = generic_file_llseek, }; static const struct inode_operations proc_sys_inode_operations = { .permission = proc_sys_permission, .setattr = proc_sys_setattr, .getattr = proc_sys_getattr, }; static const struct inode_operations proc_sys_dir_operations = { .lookup = proc_sys_lookup, .permission = proc_sys_permission, .setattr = proc_sys_setattr, .getattr = proc_sys_getattr, }; static int proc_sys_revalidate(struct inode *dir, const struct qstr *name, struct dentry *dentry, unsigned int flags) { if (flags & LOOKUP_RCU) return -ECHILD; return !PROC_I(d_inode(dentry))->sysctl->unregistering; } static int proc_sys_delete(const struct dentry *dentry) { return !!PROC_I(d_inode(dentry))->sysctl->unregistering; } static int sysctl_is_seen(struct ctl_table_header *p) { struct ctl_table_set *set = p->set; int res; spin_lock(&sysctl_lock); if (p->unregistering) res = 0; else if (!set->is_seen) res = 1; else res = set->is_seen(set); spin_unlock(&sysctl_lock); return res; } static int proc_sys_compare(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { struct ctl_table_header *head; struct inode *inode; if (name->len != len) return 1; if (memcmp(name->name, str, len)) return 1; // false positive is fine here - we'll recheck anyway if (d_in_lookup(dentry)) return 0; inode = d_inode_rcu(dentry); // we just might have run into dentry in the middle of __dentry_kill() if (!inode) return 1; head = READ_ONCE(PROC_I(inode)->sysctl); return !head || !sysctl_is_seen(head); } static const struct dentry_operations proc_sys_dentry_operations = { .d_revalidate = proc_sys_revalidate, .d_delete = proc_sys_delete, .d_compare = proc_sys_compare, }; static struct ctl_dir *find_subdir(struct ctl_dir *dir, const char *name, int namelen) { struct ctl_table_header *head; const struct ctl_table *entry; entry = find_entry(&head, dir, name, namelen); if (!entry) return ERR_PTR(-ENOENT); if (!S_ISDIR(entry->mode)) return ERR_PTR(-ENOTDIR); return container_of(head, struct ctl_dir, header); } static struct ctl_dir *new_dir(struct ctl_table_set *set, const char *name, int namelen) { struct ctl_table *table; struct ctl_dir *new; struct ctl_node *node; char *new_name; new = kzalloc(sizeof(*new) + sizeof(struct ctl_node) + sizeof(struct ctl_table) + namelen + 1, GFP_KERNEL); if (!new) return NULL; node = (struct ctl_node *)(new + 1); table = (struct ctl_table *)(node + 1); new_name = (char *)(table + 1); memcpy(new_name, name, namelen); table[0].procname = new_name; table[0].mode = S_IFDIR|S_IRUGO|S_IXUGO; init_header(&new->header, set->dir.header.root, set, node, table, 1); return new; } /** * get_subdir - find or create a subdir with the specified name. * @dir: Directory to create the subdirectory in * @name: The name of the subdirectory to find or create * @namelen: The length of name * * Takes a directory with an elevated reference count so we know that * if we drop the lock the directory will not go away. Upon success * the reference is moved from @dir to the returned subdirectory. * Upon error an error code is returned and the reference on @dir is * simply dropped. */ static struct ctl_dir *get_subdir(struct ctl_dir *dir, const char *name, int namelen) { struct ctl_table_set *set = dir->header.set; struct ctl_dir *subdir, *new = NULL; int err; spin_lock(&sysctl_lock); subdir = find_subdir(dir, name, namelen); if (!IS_ERR(subdir)) goto found; if (PTR_ERR(subdir) != -ENOENT) goto failed; spin_unlock(&sysctl_lock); new = new_dir(set, name, namelen); spin_lock(&sysctl_lock); subdir = ERR_PTR(-ENOMEM); if (!new) goto failed; /* Was the subdir added while we dropped the lock? */ subdir = find_subdir(dir, name, namelen); if (!IS_ERR(subdir)) goto found; if (PTR_ERR(subdir) != -ENOENT) goto failed; /* Nope. Use the our freshly made directory entry. */ err = insert_header(dir, &new->header); subdir = ERR_PTR(err); if (err) goto failed; subdir = new; found: subdir->header.nreg++; failed: if (IS_ERR(subdir)) { pr_err("sysctl could not get directory: "); sysctl_print_dir(dir); pr_cont("%*.*s %ld\n", namelen, namelen, name, PTR_ERR(subdir)); } drop_sysctl_table(&dir->header); if (new) drop_sysctl_table(&new->header); spin_unlock(&sysctl_lock); return subdir; } static struct ctl_dir *xlate_dir(struct ctl_table_set *set, struct ctl_dir *dir) { struct ctl_dir *parent; const char *procname; if (!dir->header.parent) return &set->dir; parent = xlate_dir(set, dir->header.parent); if (IS_ERR(parent)) return parent; procname = dir->header.ctl_table[0].procname; return find_subdir(parent, procname, strlen(procname)); } static int sysctl_follow_link(struct ctl_table_header **phead, const struct ctl_table **pentry) { struct ctl_table_header *head; const struct ctl_table *entry; struct ctl_table_root *root; struct ctl_table_set *set; struct ctl_dir *dir; int ret; spin_lock(&sysctl_lock); root = (*pentry)->data; set = lookup_header_set(root); dir = xlate_dir(set, (*phead)->parent); if (IS_ERR(dir)) ret = PTR_ERR(dir); else { const char *procname = (*pentry)->procname; head = NULL; entry = find_entry(&head, dir, procname, strlen(procname)); ret = -ENOENT; if (entry && use_table(head)) { unuse_table(*phead); *phead = head; *pentry = entry; ret = 0; } } spin_unlock(&sysctl_lock); return ret; } static int sysctl_err(const char *path, const struct ctl_table *table, char *fmt, ...) { struct va_format vaf; va_list args; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; pr_err("sysctl table check failed: %s/%s %pV\n", path, table->procname, &vaf); va_end(args); return -EINVAL; } static int sysctl_check_table_array(const char *path, const struct ctl_table *table) { unsigned int extra; int err = 0; if ((table->proc_handler == proc_douintvec) || (table->proc_handler == proc_douintvec_minmax)) { if (table->maxlen != sizeof(unsigned int)) err |= sysctl_err(path, table, "array not allowed"); } if (table->proc_handler == proc_dou8vec_minmax) { if (table->maxlen != sizeof(u8)) err |= sysctl_err(path, table, "array not allowed"); if (table->extra1) { extra = *(unsigned int *) table->extra1; if (extra > 255U) err |= sysctl_err(path, table, "range value too large for proc_dou8vec_minmax"); } if (table->extra2) { extra = *(unsigned int *) table->extra2; if (extra > 255U) err |= sysctl_err(path, table, "range value too large for proc_dou8vec_minmax"); } } if (table->proc_handler == proc_dobool) { if (table->maxlen != sizeof(bool)) err |= sysctl_err(path, table, "array not allowed"); } return err; } static int sysctl_check_table(const char *path, struct ctl_table_header *header) { const struct ctl_table *entry; int err = 0; list_for_each_table_entry(entry, header) { if (!entry->procname) err |= sysctl_err(path, entry, "procname is null"); if ((entry->proc_handler == proc_dostring) || (entry->proc_handler == proc_dobool) || (entry->proc_handler == proc_dointvec) || (entry->proc_handler == proc_douintvec) || (entry->proc_handler == proc_douintvec_minmax) || (entry->proc_handler == proc_dointvec_minmax) || (entry->proc_handler == proc_dou8vec_minmax) || (entry->proc_handler == proc_dointvec_jiffies) || (entry->proc_handler == proc_dointvec_userhz_jiffies) || (entry->proc_handler == proc_dointvec_ms_jiffies) || (entry->proc_handler == proc_doulongvec_minmax) || (entry->proc_handler == proc_doulongvec_ms_jiffies_minmax)) { if (!entry->data) err |= sysctl_err(path, entry, "No data"); if (!entry->maxlen) err |= sysctl_err(path, entry, "No maxlen"); else err |= sysctl_check_table_array(path, entry); } if (!entry->proc_handler) err |= sysctl_err(path, entry, "No proc_handler"); if ((entry->mode & (S_IRUGO|S_IWUGO)) != entry->mode) err |= sysctl_err(path, entry, "bogus .mode 0%o", entry->mode); } return err; } static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table_header *head) { struct ctl_table *link_table, *link; struct ctl_table_header *links; const struct ctl_table *entry; struct ctl_node *node; char *link_name; int name_bytes; name_bytes = 0; list_for_each_table_entry(entry, head) { name_bytes += strlen(entry->procname) + 1; } links = kzalloc(sizeof(struct ctl_table_header) + sizeof(struct ctl_node)*head->ctl_table_size + sizeof(struct ctl_table)*head->ctl_table_size + name_bytes, GFP_KERNEL); if (!links) return NULL; node = (struct ctl_node *)(links + 1); link_table = (struct ctl_table *)(node + head->ctl_table_size); link_name = (char *)(link_table + head->ctl_table_size); link = link_table; list_for_each_table_entry(entry, head) { int len = strlen(entry->procname) + 1; memcpy(link_name, entry->procname, len); link->procname = link_name; link->mode = S_IFLNK|S_IRWXUGO; link->data = head->root; link_name += len; link++; } init_header(links, dir->header.root, dir->header.set, node, link_table, head->ctl_table_size); links->nreg = head->ctl_table_size; return links; } static bool get_links(struct ctl_dir *dir, struct ctl_table_header *header, struct ctl_table_root *link_root) { struct ctl_table_header *tmp_head; const struct ctl_table *entry, *link; if (header->ctl_table_size == 0 || sysctl_is_perm_empty_ctl_header(header)) return true; /* Are there links available for every entry in table? */ list_for_each_table_entry(entry, header) { const char *procname = entry->procname; link = find_entry(&tmp_head, dir, procname, strlen(procname)); if (!link) return false; if (S_ISDIR(link->mode) && S_ISDIR(entry->mode)) continue; if (S_ISLNK(link->mode) && (link->data == link_root)) continue; return false; } /* The checks passed. Increase the registration count on the links */ list_for_each_table_entry(entry, header) { const char *procname = entry->procname; link = find_entry(&tmp_head, dir, procname, strlen(procname)); tmp_head->nreg++; } return true; } static int insert_links(struct ctl_table_header *head) { struct ctl_table_set *root_set = &sysctl_table_root.default_set; struct ctl_dir *core_parent; struct ctl_table_header *links; int err; if (head->set == root_set) return 0; core_parent = xlate_dir(root_set, head->parent); if (IS_ERR(core_parent)) return 0; if (get_links(core_parent, head, head->root)) return 0; core_parent->header.nreg++; spin_unlock(&sysctl_lock); links = new_links(core_parent, head); spin_lock(&sysctl_lock); err = -ENOMEM; if (!links) goto out; err = 0; if (get_links(core_parent, head, head->root)) { kfree(links); goto out; } err = insert_header(core_parent, links); if (err) kfree(links); out: drop_sysctl_table(&core_parent->header); return err; } /* Find the directory for the ctl_table. If one is not found create it. */ static struct ctl_dir *sysctl_mkdir_p(struct ctl_dir *dir, const char *path) { const char *name, *nextname; for (name = path; name; name = nextname) { int namelen; nextname = strchr(name, '/'); if (nextname) { namelen = nextname - name; nextname++; } else { namelen = strlen(name); } if (namelen == 0) continue; /* * namelen ensures if name is "foo/bar/yay" only foo is * registered first. We traverse as if using mkdir -p and * return a ctl_dir for the last directory entry. */ dir = get_subdir(dir, name, namelen); if (IS_ERR(dir)) break; } return dir; } /** * __register_sysctl_table - register a leaf sysctl table * @set: Sysctl tree to register on * @path: The path to the directory the sysctl table is in. * * @table: the top-level table structure. This table should not be free'd * after registration. So it should not be used on stack. It can either * be a global or dynamically allocated by the caller and free'd later * after sysctl unregistration. * @table_size : The number of elements in table * * Register a sysctl table hierarchy. @table should be a filled in ctl_table * array. * * The members of the &struct ctl_table structure are used as follows: * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not * enter a sysctl file * data - a pointer to data for use by proc_handler * maxlen - the maximum size in bytes of the data * mode - the file permissions for the /proc/sys file * type - Defines the target type (described in struct definition) * proc_handler - the text handler routine (described below) * * extra1, extra2 - extra pointers usable by the proc handler routines * XXX: we should eventually modify these to use long min / max [0] * [0] https://lkml.kernel.org/87zgpte9o4.fsf@email.froward.int.ebiederm.org * * Leaf nodes in the sysctl tree will be represented by a single file * under /proc; non-leaf nodes are not allowed. * * There must be a proc_handler routine for any terminal nodes. * Several default handlers are available to cover common cases - * * proc_dostring(), proc_dointvec(), proc_dointvec_jiffies(), * proc_dointvec_userhz_jiffies(), proc_dointvec_minmax(), * proc_doulongvec_ms_jiffies_minmax(), proc_doulongvec_minmax() * * It is the handler's job to read the input buffer from user memory * and process it. The handler should return 0 on success. * * This routine returns %NULL on a failure to register, and a pointer * to the table header on success. */ struct ctl_table_header *__register_sysctl_table( struct ctl_table_set *set, const char *path, const struct ctl_table *table, size_t table_size) { struct ctl_table_root *root = set->dir.header.root; struct ctl_table_header *header; struct ctl_dir *dir; struct ctl_node *node; header = kzalloc(sizeof(struct ctl_table_header) + sizeof(struct ctl_node)*table_size, GFP_KERNEL_ACCOUNT); if (!header) return NULL; node = (struct ctl_node *)(header + 1); init_header(header, root, set, node, table, table_size); if (sysctl_check_table(path, header)) goto fail; spin_lock(&sysctl_lock); dir = &set->dir; /* Reference moved down the directory tree get_subdir */ dir->header.nreg++; spin_unlock(&sysctl_lock); dir = sysctl_mkdir_p(dir, path); if (IS_ERR(dir)) goto fail; spin_lock(&sysctl_lock); if (insert_header(dir, header)) goto fail_put_dir_locked; drop_sysctl_table(&dir->header); spin_unlock(&sysctl_lock); return header; fail_put_dir_locked: drop_sysctl_table(&dir->header); spin_unlock(&sysctl_lock); fail: kfree(header); return NULL; } /** * register_sysctl_sz - register a sysctl table * @path: The path to the directory the sysctl table is in. If the path * doesn't exist we will create it for you. * @table: the table structure. The calller must ensure the life of the @table * will be kept during the lifetime use of the syctl. It must not be freed * until unregister_sysctl_table() is called with the given returned table * with this registration. If your code is non modular then you don't need * to call unregister_sysctl_table() and can instead use something like * register_sysctl_init() which does not care for the result of the syctl * registration. * @table_size: The number of elements in table. * * Register a sysctl table. @table should be a filled in ctl_table * array. A completely 0 filled entry terminates the table. * * See __register_sysctl_table for more details. */ struct ctl_table_header *register_sysctl_sz(const char *path, const struct ctl_table *table, size_t table_size) { return __register_sysctl_table(&sysctl_table_root.default_set, path, table, table_size); } EXPORT_SYMBOL(register_sysctl_sz); /** * __register_sysctl_init() - register sysctl table to path * @path: path name for sysctl base. If that path doesn't exist we will create * it for you. * @table: This is the sysctl table that needs to be registered to the path. * The caller must ensure the life of the @table will be kept during the * lifetime use of the sysctl. * @table_name: The name of sysctl table, only used for log printing when * registration fails * @table_size: The number of elements in table * * The sysctl interface is used by userspace to query or modify at runtime * a predefined value set on a variable. These variables however have default * values pre-set. Code which depends on these variables will always work even * if register_sysctl() fails. If register_sysctl() fails you'd just loose the * ability to query or modify the sysctls dynamically at run time. Chances of * register_sysctl() failing on init are extremely low, and so for both reasons * this function does not return any error as it is used by initialization code. * * Context: if your base directory does not exist it will be created for you. */ void __init __register_sysctl_init(const char *path, const struct ctl_table *table, const char *table_name, size_t table_size) { struct ctl_table_header *hdr = register_sysctl_sz(path, table, table_size); if (unlikely(!hdr)) { pr_err("failed when register_sysctl_sz %s to %s\n", table_name, path); return; } kmemleak_not_leak(hdr); } static void put_links(struct ctl_table_header *header) { struct ctl_table_set *root_set = &sysctl_table_root.default_set; struct ctl_table_root *root = header->root; struct ctl_dir *parent = header->parent; struct ctl_dir *core_parent; const struct ctl_table *entry; if (header->set == root_set) return; core_parent = xlate_dir(root_set, parent); if (IS_ERR(core_parent)) return; list_for_each_table_entry(entry, header) { struct ctl_table_header *link_head; const struct ctl_table *link; const char *name = entry->procname; link = find_entry(&link_head, core_parent, name, strlen(name)); if (link && ((S_ISDIR(link->mode) && S_ISDIR(entry->mode)) || (S_ISLNK(link->mode) && (link->data == root)))) { drop_sysctl_table(link_head); } else { pr_err("sysctl link missing during unregister: "); sysctl_print_dir(parent); pr_cont("%s\n", name); } } } static void drop_sysctl_table(struct ctl_table_header *header) { struct ctl_dir *parent = header->parent; if (--header->nreg) return; if (parent) { put_links(header); start_unregistering(header); } if (!--header->count) kfree_rcu(header, rcu); if (parent) drop_sysctl_table(&parent->header); } /** * unregister_sysctl_table - unregister a sysctl table hierarchy * @header: the header returned from register_sysctl or __register_sysctl_table * * Unregisters the sysctl table and all children. proc entries may not * actually be removed until they are no longer used by anyone. */ void unregister_sysctl_table(struct ctl_table_header * header) { might_sleep(); if (header == NULL) return; spin_lock(&sysctl_lock); drop_sysctl_table(header); spin_unlock(&sysctl_lock); } EXPORT_SYMBOL(unregister_sysctl_table); void setup_sysctl_set(struct ctl_table_set *set, struct ctl_table_root *root, int (*is_seen)(struct ctl_table_set *)) { memset(set, 0, sizeof(*set)); set->is_seen = is_seen; init_header(&set->dir.header, root, set, NULL, root_table, 1); } void retire_sysctl_set(struct ctl_table_set *set) { WARN_ON(!RB_EMPTY_ROOT(&set->dir.root)); } int __init proc_sys_init(void) { struct proc_dir_entry *proc_sys_root; proc_sys_root = proc_mkdir("sys", NULL); proc_sys_root->proc_iops = &proc_sys_dir_operations; proc_sys_root->proc_dir_ops = &proc_sys_dir_file_operations; proc_sys_root->nlink = 0; return sysctl_init_bases(); } struct sysctl_alias { const char *kernel_param; const char *sysctl_param; }; /* * Historically some settings had both sysctl and a command line parameter. * With the generic sysctl. parameter support, we can handle them at a single * place and only keep the historical name for compatibility. This is not meant * to add brand new aliases. When adding existing aliases, consider whether * the possibly different moment of changing the value (e.g. from early_param * to the moment do_sysctl_args() is called) is an issue for the specific * parameter. */ static const struct sysctl_alias sysctl_aliases[] = { {"hardlockup_all_cpu_backtrace", "kernel.hardlockup_all_cpu_backtrace" }, {"hung_task_panic", "kernel.hung_task_panic" }, {"numa_zonelist_order", "vm.numa_zonelist_order" }, {"softlockup_all_cpu_backtrace", "kernel.softlockup_all_cpu_backtrace" }, { } }; static const char *sysctl_find_alias(char *param) { const struct sysctl_alias *alias; for (alias = &sysctl_aliases[0]; alias->kernel_param != NULL; alias++) { if (strcmp(alias->kernel_param, param) == 0) return alias->sysctl_param; } return NULL; } bool sysctl_is_alias(char *param) { const char *alias = sysctl_find_alias(param); return alias != NULL; } /* Set sysctl value passed on kernel command line. */ static int process_sysctl_arg(char *param, char *val, const char *unused, void *arg) { char *path; struct vfsmount **proc_mnt = arg; struct file_system_type *proc_fs_type; struct file *file; int len; int err; loff_t pos = 0; ssize_t wret; if (strncmp(param, "sysctl", sizeof("sysctl") - 1) == 0) { param += sizeof("sysctl") - 1; if (param[0] != '/' && param[0] != '.') return 0; param++; } else { param = (char *) sysctl_find_alias(param); if (!param) return 0; } if (!val) return -EINVAL; len = strlen(val); if (len == 0) return -EINVAL; /* * To set sysctl options, we use a temporary mount of proc, look up the * respective sys/ file and write to it. To avoid mounting it when no * options were given, we mount it only when the first sysctl option is * found. Why not a persistent mount? There are problems with a * persistent mount of proc in that it forces userspace not to use any * proc mount options. */ if (!*proc_mnt) { proc_fs_type = get_fs_type("proc"); if (!proc_fs_type) { pr_err("Failed to find procfs to set sysctl from command line\n"); return 0; } *proc_mnt = kern_mount(proc_fs_type); put_filesystem(proc_fs_type); if (IS_ERR(*proc_mnt)) { pr_err("Failed to mount procfs to set sysctl from command line\n"); return 0; } } path = kasprintf(GFP_KERNEL, "sys/%s", param); if (!path) panic("%s: Failed to allocate path for %s\n", __func__, param); strreplace(path, '.', '/'); file = file_open_root_mnt(*proc_mnt, path, O_WRONLY, 0); if (IS_ERR(file)) { err = PTR_ERR(file); if (err == -ENOENT) pr_err("Failed to set sysctl parameter '%s=%s': parameter not found\n", param, val); else if (err == -EACCES) pr_err("Failed to set sysctl parameter '%s=%s': permission denied (read-only?)\n", param, val); else pr_err("Error %pe opening proc file to set sysctl parameter '%s=%s'\n", file, param, val); goto out; } wret = kernel_write(file, val, len, &pos); if (wret < 0) { err = wret; if (err == -EINVAL) pr_err("Failed to set sysctl parameter '%s=%s': invalid value\n", param, val); else pr_err("Error %pe writing to proc file to set sysctl parameter '%s=%s'\n", ERR_PTR(err), param, val); } else if (wret != len) { pr_err("Wrote only %zd bytes of %d writing to proc file %s to set sysctl parameter '%s=%s\n", wret, len, path, param, val); } err = filp_close(file, NULL); if (err) pr_err("Error %pe closing proc file to set sysctl parameter '%s=%s\n", ERR_PTR(err), param, val); out: kfree(path); return 0; } void do_sysctl_args(void) { char *command_line; struct vfsmount *proc_mnt = NULL; command_line = kstrdup(saved_command_line, GFP_KERNEL); if (!command_line) panic("%s: Failed to allocate copy of command line\n", __func__); parse_args("Setting sysctl args", command_line, NULL, 0, -1, -1, &proc_mnt, process_sysctl_arg); if (proc_mnt) kern_unmount(proc_mnt); kfree(command_line); }
2 2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar * Copyright (C) 2005-2006, Thomas Gleixner * * This file contains the IRQ-resend code * * If the interrupt is waiting to be processed, we try to re-run it. * We can't directly run it from here since the caller might be in an * interrupt-protected region. Not all irq controller chips can * retrigger interrupts at the hardware level, so in those cases * we allow the resending of IRQs via a tasklet. */ #include <linux/irq.h> #include <linux/module.h> #include <linux/random.h> #include <linux/interrupt.h> #include "internals.h" #ifdef CONFIG_HARDIRQS_SW_RESEND /* hlist_head to handle software resend of interrupts: */ static HLIST_HEAD(irq_resend_list); static DEFINE_RAW_SPINLOCK(irq_resend_lock); /* * Run software resends of IRQ's */ static void resend_irqs(struct tasklet_struct *unused) { guard(raw_spinlock_irq)(&irq_resend_lock); while (!hlist_empty(&irq_resend_list)) { struct irq_desc *desc; desc = hlist_entry(irq_resend_list.first, struct irq_desc, resend_node); hlist_del_init(&desc->resend_node); raw_spin_unlock(&irq_resend_lock); desc->handle_irq(desc); raw_spin_lock(&irq_resend_lock); } } /* Tasklet to handle resend: */ static DECLARE_TASKLET(resend_tasklet, resend_irqs); static int irq_sw_resend(struct irq_desc *desc) { /* * Validate whether this interrupt can be safely injected from * non interrupt context */ if (irqd_is_handle_enforce_irqctx(&desc->irq_data)) return -EINVAL; /* * If the interrupt is running in the thread context of the parent * irq we need to be careful, because we cannot trigger it * directly. */ if (irq_settings_is_nested_thread(desc)) { /* * If the parent_irq is valid, we retrigger the parent, * otherwise we do nothing. */ if (!desc->parent_irq) return -EINVAL; desc = irq_to_desc(desc->parent_irq); if (!desc) return -EINVAL; } /* Add to resend_list and activate the softirq: */ scoped_guard(raw_spinlock, &irq_resend_lock) { if (hlist_unhashed(&desc->resend_node)) hlist_add_head(&desc->resend_node, &irq_resend_list); } tasklet_schedule(&resend_tasklet); return 0; } void clear_irq_resend(struct irq_desc *desc) { guard(raw_spinlock)(&irq_resend_lock); hlist_del_init(&desc->resend_node); } void irq_resend_init(struct irq_desc *desc) { INIT_HLIST_NODE(&desc->resend_node); } #else void clear_irq_resend(struct irq_desc *desc) {} void irq_resend_init(struct irq_desc *desc) {} static int irq_sw_resend(struct irq_desc *desc) { return -EINVAL; } #endif static int try_retrigger(struct irq_desc *desc) { if (desc->irq_data.chip->irq_retrigger) return desc->irq_data.chip->irq_retrigger(&desc->irq_data); #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY return irq_chip_retrigger_hierarchy(&desc->irq_data); #else return 0; #endif } /* * IRQ resend * * Is called with interrupts disabled and desc->lock held. */ int check_irq_resend(struct irq_desc *desc, bool inject) { int err = 0; /* * We do not resend level type interrupts. Level type interrupts * are resent by hardware when they are still active. Clear the * pending bit so suspend/resume does not get confused. */ if (irq_settings_is_level(desc)) { desc->istate &= ~IRQS_PENDING; return -EINVAL; } if (desc->istate & IRQS_REPLAY) return -EBUSY; if (!(desc->istate & IRQS_PENDING) && !inject) return 0; desc->istate &= ~IRQS_PENDING; if (!try_retrigger(desc)) err = irq_sw_resend(desc); /* If the retrigger was successful, mark it with the REPLAY bit */ if (!err) desc->istate |= IRQS_REPLAY; return err; } #ifdef CONFIG_GENERIC_IRQ_INJECTION /** * irq_inject_interrupt - Inject an interrupt for testing/error injection * @irq: The interrupt number * * This function must only be used for debug and testing purposes! * * Especially on x86 this can cause a premature completion of an interrupt * affinity change causing the interrupt line to become stale. Very * unlikely, but possible. * * The injection can fail for various reasons: * - Interrupt is not activated * - Interrupt is NMI type or currently replaying * - Interrupt is level type * - Interrupt does not support hardware retrigger and software resend is * either not enabled or not possible for the interrupt. */ int irq_inject_interrupt(unsigned int irq) { int err = -EINVAL; /* Try the state injection hardware interface first */ if (!irq_set_irqchip_state(irq, IRQCHIP_STATE_PENDING, true)) return 0; /* That failed, try via the resend mechanism */ scoped_irqdesc_get_and_buslock(irq, 0) { struct irq_desc *desc = scoped_irqdesc; /* * Only try to inject when the interrupt is: * - not NMI type * - activated */ if (!irq_is_nmi(desc) && irqd_is_activated(&desc->irq_data)) err = check_irq_resend(desc, true); } return err; } EXPORT_SYMBOL_GPL(irq_inject_interrupt); #endif
86 86 156 156 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2008 Christoph Hellwig. * Portions Copyright (C) 2000-2008 Silicon Graphics, Inc. */ #include "xfs_platform.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_da_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_da_btree.h" #include "xfs_attr.h" #include "xfs_acl.h" #include "xfs_log.h" #include "xfs_xattr.h" #include "xfs_quota.h" #include <linux/posix_acl_xattr.h> /* * Get permission to use log-assisted atomic exchange of file extents. * Callers must not be running any transactions or hold any ILOCKs. */ static inline int xfs_attr_grab_log_assist( struct xfs_mount *mp) { int error = 0; /* xattr update log intent items are already enabled */ if (xfs_is_using_logged_xattrs(mp)) return 0; /* * Check if the filesystem featureset is new enough to set this log * incompat feature bit. Strictly speaking, the minimum requirement is * a V5 filesystem for the superblock field, but we'll require rmap * or reflink to avoid having to deal with really old kernels. */ if (!xfs_has_reflink(mp) && !xfs_has_rmapbt(mp)) return -EOPNOTSUPP; /* Enable log-assisted xattrs. */ error = xfs_add_incompat_log_feature(mp, XFS_SB_FEAT_INCOMPAT_LOG_XATTRS); if (error) return error; xfs_set_using_logged_xattrs(mp); xfs_warn_experimental(mp, XFS_EXPERIMENTAL_LARP); return 0; } static inline bool xfs_attr_want_log_assist( struct xfs_mount *mp) { #ifdef DEBUG /* Logged xattrs require a V5 super for log_incompat */ return xfs_has_crc(mp) && xfs_globals.larp; #else return false; #endif } /* * Set or remove an xattr, having grabbed the appropriate logging resources * prior to calling libxfs. Callers of this function are only required to * initialize the inode, attr_filter, name, namelen, value, and valuelen fields * of @args. */ int xfs_attr_change( struct xfs_da_args *args, enum xfs_attr_update op) { struct xfs_mount *mp = args->dp->i_mount; int error; if (xfs_is_shutdown(mp)) return -EIO; error = xfs_qm_dqattach(args->dp); if (error) return error; /* * We have no control over the attribute names that userspace passes us * to remove, so we have to allow the name lookup prior to attribute * removal to fail as well. */ args->op_flags = XFS_DA_OP_OKNOENT; if (xfs_attr_want_log_assist(mp)) { error = xfs_attr_grab_log_assist(mp); if (error) return error; args->op_flags |= XFS_DA_OP_LOGGED; } args->owner = args->dp->i_ino; args->geo = mp->m_attr_geo; args->whichfork = XFS_ATTR_FORK; xfs_attr_sethash(args); /* * Some xattrs must be resistant to allocation failure at ENOSPC, e.g. * creating an inode with ACLs or security attributes requires the * allocation of the xattr holding that information to succeed. Hence * we allow xattrs in the VFS TRUSTED, SYSTEM, POSIX_ACL and SECURITY * (LSM xattr) namespaces to dip into the reserve block pool to allow * manipulation of these xattrs when at ENOSPC. These VFS xattr * namespaces translate to the XFS_ATTR_ROOT and XFS_ATTR_SECURE on-disk * namespaces. * * For most of these cases, these special xattrs will fit in the inode * itself and so consume no extra space or only require temporary extra * space while an overwrite is being made. Hence the use of the reserved * pool is largely to avoid the worst case reservation from preventing * the xattr from being created at ENOSPC. */ return xfs_attr_set(args, op, args->attr_filter & (XFS_ATTR_ROOT | XFS_ATTR_SECURE)); } static int xfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused, struct inode *inode, const char *name, void *value, size_t size) { struct xfs_da_args args = { .dp = XFS_I(inode), .attr_filter = handler->flags, .name = name, .namelen = strlen(name), .value = value, .valuelen = size, }; int error; if (xfs_ifork_zapped(XFS_I(inode), XFS_ATTR_FORK)) return -EIO; error = xfs_attr_get(&args); if (error) return error; return args.valuelen; } static inline enum xfs_attr_update xfs_xattr_flags_to_op( int flags, const void *value) { if (!value) return XFS_ATTRUPDATE_REMOVE; if (flags & XATTR_CREATE) return XFS_ATTRUPDATE_CREATE; if (flags & XATTR_REPLACE) return XFS_ATTRUPDATE_REPLACE; return XFS_ATTRUPDATE_UPSERT; } static int xfs_xattr_set(const struct xattr_handler *handler, struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) { struct xfs_da_args args = { .dp = XFS_I(inode), .attr_filter = handler->flags, .name = name, .namelen = strlen(name), .value = (void *)value, .valuelen = size, }; int error; error = xfs_attr_change(&args, xfs_xattr_flags_to_op(flags, value)); if (!error && (handler->flags & XFS_ATTR_ROOT)) xfs_forget_acl(inode, name); return error; } static const struct xattr_handler xfs_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, .flags = 0, /* no flags implies user namespace */ .get = xfs_xattr_get, .set = xfs_xattr_set, }; static const struct xattr_handler xfs_xattr_trusted_handler = { .prefix = XATTR_TRUSTED_PREFIX, .flags = XFS_ATTR_ROOT, .get = xfs_xattr_get, .set = xfs_xattr_set, }; static const struct xattr_handler xfs_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .flags = XFS_ATTR_SECURE, .get = xfs_xattr_get, .set = xfs_xattr_set, }; const struct xattr_handler * const xfs_xattr_handlers[] = { &xfs_xattr_user_handler, &xfs_xattr_trusted_handler, &xfs_xattr_security_handler, NULL }; static void __xfs_xattr_put_listent( struct xfs_attr_list_context *context, char *prefix, int prefix_len, unsigned char *name, int namelen) { char *offset; int arraytop; if (context->count < 0 || context->seen_enough) return; if (!context->buffer) goto compute_size; arraytop = context->count + prefix_len + namelen + 1; if (arraytop > context->firstu) { context->count = -1; /* insufficient space */ context->seen_enough = 1; return; } offset = context->buffer + context->count; memcpy(offset, prefix, prefix_len); offset += prefix_len; memcpy(offset, (char *)name, namelen); /* real name */ offset += namelen; *offset = '\0'; compute_size: context->count += prefix_len + namelen + 1; return; } static void xfs_xattr_put_listent( struct xfs_attr_list_context *context, int flags, unsigned char *name, int namelen, void *value, int valuelen) { char *prefix; int prefix_len; ASSERT(context->count >= 0); /* Don't expose private xattr namespaces. */ if (flags & XFS_ATTR_PRIVATE_NSP_MASK) return; if (flags & XFS_ATTR_ROOT) { #ifdef CONFIG_XFS_POSIX_ACL if (namelen == SGI_ACL_FILE_SIZE && strncmp(name, SGI_ACL_FILE, SGI_ACL_FILE_SIZE) == 0) { __xfs_xattr_put_listent( context, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN, XATTR_POSIX_ACL_ACCESS, strlen(XATTR_POSIX_ACL_ACCESS)); } else if (namelen == SGI_ACL_DEFAULT_SIZE && strncmp(name, SGI_ACL_DEFAULT, SGI_ACL_DEFAULT_SIZE) == 0) { __xfs_xattr_put_listent( context, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN, XATTR_POSIX_ACL_DEFAULT, strlen(XATTR_POSIX_ACL_DEFAULT)); } #endif /* * Only show root namespace entries if we are actually allowed to * see them. */ if (!capable(CAP_SYS_ADMIN)) return; prefix = XATTR_TRUSTED_PREFIX; prefix_len = XATTR_TRUSTED_PREFIX_LEN; } else if (flags & XFS_ATTR_SECURE) { prefix = XATTR_SECURITY_PREFIX; prefix_len = XATTR_SECURITY_PREFIX_LEN; } else { prefix = XATTR_USER_PREFIX; prefix_len = XATTR_USER_PREFIX_LEN; } __xfs_xattr_put_listent(context, prefix, prefix_len, name, namelen); return; } ssize_t xfs_vn_listxattr( struct dentry *dentry, char *data, size_t size) { struct xfs_attr_list_context context; struct inode *inode = d_inode(dentry); int error; if (xfs_ifork_zapped(XFS_I(inode), XFS_ATTR_FORK)) return -EIO; /* * First read the regular on-disk attributes. */ memset(&context, 0, sizeof(context)); context.dp = XFS_I(inode); context.resynch = 1; context.bufsize = size; context.buffer = size ? data : NULL; context.firstu = context.bufsize; context.put_listent = xfs_xattr_put_listent; error = xfs_attr_list(&context); if (error) return error; if (context.count < 0) return -ERANGE; return context.count; }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 /* SPDX-License-Identifier: GPL-2.0 */ /* * NFS internal definitions */ #include "nfs4_fs.h" #include <linux/fs_context.h> #include <linux/security.h> #include <linux/compiler_attributes.h> #include <linux/crc32.h> #include <linux/sunrpc/addr.h> #include <linux/nfs_page.h> #include <linux/nfslocalio.h> #include <linux/wait_bit.h> #define NFS_SB_MASK (SB_RDONLY|SB_NOSUID|SB_NODEV|SB_NOEXEC|SB_SYNCHRONOUS) extern const struct export_operations nfs_export_ops; struct nfs_string; struct nfs_pageio_descriptor; static inline void nfs_attr_check_mountpoint(struct super_block *parent, struct nfs_fattr *fattr) { if (!nfs_fsid_equal(&NFS_SB(parent)->fsid, &fattr->fsid)) fattr->valid |= NFS_ATTR_FATTR_MOUNTPOINT; } static inline int nfs_attr_use_mounted_on_fileid(struct nfs_fattr *fattr) { if (((fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) == 0) || (((fattr->valid & NFS_ATTR_FATTR_MOUNTPOINT) == 0) && ((fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) == 0))) return 0; return 1; } static inline bool nfs_lookup_is_soft_revalidate(const struct dentry *dentry) { if (!(NFS_SB(dentry->d_sb)->flags & NFS_MOUNT_SOFTREVAL)) return false; if (!d_is_positive(dentry) || !NFS_FH(d_inode(dentry))->size) return false; return true; } static inline fmode_t flags_to_mode(int flags) { fmode_t res = (__force fmode_t)flags & FMODE_EXEC; if ((flags & O_ACCMODE) != O_WRONLY) res |= FMODE_READ; if ((flags & O_ACCMODE) != O_RDONLY) res |= FMODE_WRITE; return res; } /* * Note: RFC 1813 doesn't limit the number of auth flavors that * a server can return, so make something up. */ #define NFS_MAX_SECFLAVORS (12) /* * Value used if the user did not specify a port value. */ #define NFS_UNSPEC_PORT (-1) #define NFS_UNSPEC_RETRANS (UINT_MAX) #define NFS_UNSPEC_TIMEO (UINT_MAX) struct nfs_client_initdata { unsigned long init_flags; const char *hostname; /* Hostname of the server */ const struct sockaddr_storage *addr; /* Address of the server */ const char *nodename; /* Hostname of the client */ const char *ip_addr; /* IP address of the client */ size_t addrlen; struct nfs_subversion *nfs_mod; int proto; u32 minorversion; unsigned int nconnect; unsigned int max_connect; struct net *net; const struct rpc_timeout *timeparms; const struct cred *cred; struct xprtsec_parms xprtsec; unsigned long connect_timeout; unsigned long reconnect_timeout; }; /* * In-kernel mount arguments */ struct nfs_fs_context { bool internal; bool skip_reconfig_option_check; bool need_mount; bool sloppy; unsigned int flags; /* NFS{,4}_MOUNT_* flags */ unsigned int rsize, wsize; unsigned int timeo, retrans; unsigned int acregmin, acregmax; unsigned int acdirmin, acdirmax; unsigned int namlen; unsigned int options; unsigned int bsize; struct nfs_auth_info auth_info; rpc_authflavor_t selected_flavor; struct xprtsec_parms xprtsec; char *client_address; unsigned int version; unsigned int minorversion; char *fscache_uniq; unsigned short protofamily; unsigned short mountfamily; bool has_sec_mnt_opts; int lock_status; struct { union { struct sockaddr address; struct sockaddr_storage _address; }; size_t addrlen; char *hostname; u32 version; int port; unsigned short protocol; } mount_server; struct { union { struct sockaddr address; struct sockaddr_storage _address; }; size_t addrlen; char *hostname; char *export_path; int port; unsigned short protocol; unsigned short nconnect; unsigned short max_connect; unsigned short export_path_len; } nfs_server; struct nfs_fh *mntfh; struct nfs_server *server; struct nfs_subversion *nfs_mod; /* Information for a cloned mount. */ struct nfs_clone_mount { struct super_block *sb; struct dentry *dentry; struct nfs_fattr *fattr; } clone_data; }; enum nfs_lock_status { NFS_LOCK_NOT_SET = 0, NFS_LOCK_LOCK = 1, NFS_LOCK_NOLOCK = 2, }; #define nfs_errorf(fc, fmt, ...) ((fc)->log.log ? \ errorf(fc, fmt, ## __VA_ARGS__) : \ ({ dprintk(fmt "\n", ## __VA_ARGS__); })) #define nfs_ferrorf(fc, fac, fmt, ...) ((fc)->log.log ? \ errorf(fc, fmt, ## __VA_ARGS__) : \ ({ dfprintk(fac, fmt "\n", ## __VA_ARGS__); })) #define nfs_invalf(fc, fmt, ...) ((fc)->log.log ? \ invalf(fc, fmt, ## __VA_ARGS__) : \ ({ dprintk(fmt "\n", ## __VA_ARGS__); -EINVAL; })) #define nfs_finvalf(fc, fac, fmt, ...) ((fc)->log.log ? \ invalf(fc, fmt, ## __VA_ARGS__) : \ ({ dfprintk(fac, fmt "\n", ## __VA_ARGS__); -EINVAL; })) #define nfs_warnf(fc, fmt, ...) ((fc)->log.log ? \ warnf(fc, fmt, ## __VA_ARGS__) : \ ({ dprintk(fmt "\n", ## __VA_ARGS__); })) #define nfs_fwarnf(fc, fac, fmt, ...) ((fc)->log.log ? \ warnf(fc, fmt, ## __VA_ARGS__) : \ ({ dfprintk(fac, fmt "\n", ## __VA_ARGS__); })) static inline struct nfs_fs_context *nfs_fc2context(const struct fs_context *fc) { return fc->fs_private; } /* mount_clnt.c */ struct nfs_mount_request { struct sockaddr_storage *sap; size_t salen; char *hostname; char *dirpath; u32 version; unsigned short protocol; struct nfs_fh *fh; int noresvport; unsigned int *auth_flav_len; rpc_authflavor_t *auth_flavs; struct net *net; }; extern int nfs_mount(struct nfs_mount_request *info, int timeo, int retrans); /* client.c */ extern const struct rpc_program nfs_program; extern void nfs_clients_init(struct net *net); extern void nfs_clients_exit(struct net *net); extern struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *); int nfs_create_rpc_client(struct nfs_client *, const struct nfs_client_initdata *, rpc_authflavor_t); struct nfs_client *nfs_get_client(const struct nfs_client_initdata *); int nfs_probe_server(struct nfs_server *, struct nfs_fh *); void nfs_server_insert_lists(struct nfs_server *); void nfs_server_remove_lists(struct nfs_server *); void nfs_init_timeout_values(struct rpc_timeout *to, int proto, int timeo, int retrans); int nfs_init_server_rpcclient(struct nfs_server *, const struct rpc_timeout *t, rpc_authflavor_t); struct nfs_server *nfs_alloc_server(void); void nfs_server_copy_userdata(struct nfs_server *, struct nfs_server *); extern void nfs_put_client(struct nfs_client *); extern void nfs_free_client(struct nfs_client *); extern struct nfs_client *nfs4_find_client_ident(struct net *, int); extern struct nfs_client * nfs4_find_client_sessionid(struct net *, const struct sockaddr *, struct nfs4_sessionid *, u32); extern struct nfs_server *nfs_create_server(struct fs_context *); extern void nfs_server_set_init_caps(struct nfs_server *); extern struct nfs_server *nfs4_create_server(struct fs_context *); extern struct nfs_server *nfs4_create_referral_server(struct fs_context *); extern int nfs4_update_server(struct nfs_server *server, const char *hostname, struct sockaddr_storage *sap, size_t salen, struct net *net); extern void nfs_free_server(struct nfs_server *server); extern struct nfs_server *nfs_clone_server(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *, rpc_authflavor_t); extern bool nfs_client_init_is_complete(const struct nfs_client *clp); extern int nfs_client_init_status(const struct nfs_client *clp); extern int nfs_wait_client_init_complete(const struct nfs_client *clp); extern void nfs_mark_client_ready(struct nfs_client *clp, int state); extern struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, const struct sockaddr_storage *ds_addr, int ds_addrlen, int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans, u32 minor_version); extern struct rpc_clnt *nfs4_find_or_create_ds_client(struct nfs_client *, struct inode *); extern void nfs4_session_limit_rwsize(struct nfs_server *server); extern void nfs4_session_limit_xasize(struct nfs_server *server); extern struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, const struct sockaddr_storage *ds_addr, int ds_addrlen, int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans); #ifdef CONFIG_PROC_FS extern int __init nfs_fs_proc_init(void); extern void nfs_fs_proc_exit(void); extern int nfs_fs_proc_net_init(struct net *net); extern void nfs_fs_proc_net_exit(struct net *net); #else static inline int nfs_fs_proc_net_init(struct net *net) { return 0; } static inline void nfs_fs_proc_net_exit(struct net *net) { } static inline int nfs_fs_proc_init(void) { return 0; } static inline void nfs_fs_proc_exit(void) { } #endif /* callback_xdr.c */ extern const struct svc_version nfs4_callback_version1; extern const struct svc_version nfs4_callback_version4; /* fs_context.c */ extern struct file_system_type nfs_fs_type; /* pagelist.c */ extern int __init nfs_init_nfspagecache(void); extern void nfs_destroy_nfspagecache(void); extern int __init nfs_init_readpagecache(void); extern void nfs_destroy_readpagecache(void); extern int __init nfs_init_writepagecache(void); extern void nfs_destroy_writepagecache(void); extern int __init nfs_init_directcache(void); extern void nfs_destroy_directcache(void); extern void nfs_pgheader_init(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr, void (*release)(struct nfs_pgio_header *hdr)); void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos); int nfs_iocounter_wait(struct nfs_lock_context *l_ctx); extern const struct nfs_pageio_ops nfs_pgio_rw_ops; struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *); void nfs_pgio_header_free(struct nfs_pgio_header *); int nfs_generic_pgio(struct nfs_pageio_descriptor *, struct nfs_pgio_header *); int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr, const struct cred *cred, const struct nfs_rpc_ops *rpc_ops, const struct rpc_call_ops *call_ops, int how, int flags, struct nfsd_file *localio); void nfs_free_request(struct nfs_page *req); struct nfs_pgio_mirror * nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc); static inline bool nfs_match_open_context(const struct nfs_open_context *ctx1, const struct nfs_open_context *ctx2) { return cred_fscmp(ctx1->cred, ctx2->cred) == 0 && ctx1->state == ctx2->state; } /* nfs2xdr.c */ extern const struct rpc_procinfo nfs_procedures[]; extern int nfs2_decode_dirent(struct xdr_stream *, struct nfs_entry *, bool); /* nfs3xdr.c */ extern const struct rpc_procinfo nfs3_procedures[]; extern int nfs3_decode_dirent(struct xdr_stream *, struct nfs_entry *, bool); /* nfs4xdr.c */ #if IS_ENABLED(CONFIG_NFS_V4) extern int nfs4_decode_dirent(struct xdr_stream *, struct nfs_entry *, bool); extern const u32 nfs41_maxread_overhead; extern const u32 nfs41_maxwrite_overhead; extern const u32 nfs41_maxgetdevinfo_overhead; /* nfs4proc.c */ extern const struct rpc_procinfo nfs4_procedures[]; #endif /* CONFIG_NFS_V4 */ #ifdef CONFIG_NFS_V4_SECURITY_LABEL extern struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags); static inline struct nfs4_label * nfs4_label_copy(struct nfs4_label *dst, struct nfs4_label *src) { if (!dst || !src) return NULL; if (src->len > NFS4_MAXLABELLEN) return NULL; dst->lfs = src->lfs; dst->pi = src->pi; dst->len = src->len; memcpy(dst->label, src->label, src->len); return dst; } static inline void nfs_zap_label_cache_locked(struct nfs_inode *nfsi) { if (nfs_server_capable(&nfsi->vfs_inode, NFS_CAP_SECURITY_LABEL)) nfsi->cache_validity |= NFS_INO_INVALID_LABEL; } #else static inline struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags) { return NULL; } static inline void nfs_zap_label_cache_locked(struct nfs_inode *nfsi) { } static inline struct nfs4_label * nfs4_label_copy(struct nfs4_label *dst, struct nfs4_label *src) { return NULL; } #endif /* CONFIG_NFS_V4_SECURITY_LABEL */ /* proc.c */ void nfs_close_context(struct nfs_open_context *ctx, int is_sync); extern struct nfs_client *nfs_init_client(struct nfs_client *clp, const struct nfs_client_initdata *); /* dir.c */ extern void nfs_readdir_record_entry_cache_hit(struct inode *dir); extern void nfs_readdir_record_entry_cache_miss(struct inode *dir); extern unsigned long nfs_access_cache_count(struct shrinker *shrink, struct shrink_control *sc); extern unsigned long nfs_access_cache_scan(struct shrinker *shrink, struct shrink_control *sc); struct dentry *nfs_lookup(struct inode *, struct dentry *, unsigned int); void nfs_d_prune_case_insensitive_aliases(struct inode *inode); int nfs_create(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, bool); struct dentry *nfs_mkdir(struct mnt_idmap *, struct inode *, struct dentry *, umode_t); int nfs_rmdir(struct inode *, struct dentry *); int nfs_unlink(struct inode *, struct dentry *); int nfs_symlink(struct mnt_idmap *, struct inode *, struct dentry *, const char *); int nfs_link(struct dentry *, struct inode *, struct dentry *); int nfs_mknod(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, dev_t); int nfs_rename(struct mnt_idmap *, struct inode *, struct dentry *, struct inode *, struct dentry *, unsigned int); #ifdef CONFIG_NFS_V4_2 static inline __u32 nfs_access_xattr_mask(const struct nfs_server *server) { if (!(server->caps & NFS_CAP_XATTR)) return 0; return NFS4_ACCESS_XAREAD | NFS4_ACCESS_XAWRITE | NFS4_ACCESS_XALIST; } #else static inline __u32 nfs_access_xattr_mask(const struct nfs_server *server) { return 0; } #endif /* file.c */ int nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync); loff_t nfs_file_llseek(struct file *, loff_t, int); ssize_t nfs_file_read(struct kiocb *, struct iov_iter *); ssize_t nfs_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); int nfs_file_mmap_prepare(struct vm_area_desc *); ssize_t nfs_file_write(struct kiocb *, struct iov_iter *); int nfs_file_release(struct inode *, struct file *); int nfs_lock(struct file *, int, struct file_lock *); int nfs_flock(struct file *, int, struct file_lock *); int nfs_check_flags(int); void nfs_truncate_last_folio(struct address_space *mapping, loff_t from, loff_t to); /* inode.c */ extern struct workqueue_struct *nfsiod_workqueue; extern struct workqueue_struct *nfslocaliod_workqueue; extern struct inode *nfs_alloc_inode(struct super_block *sb); extern void nfs_free_inode(struct inode *); extern int nfs_write_inode(struct inode *, struct writeback_control *); extern int nfs_drop_inode(struct inode *); extern void nfs_clear_inode(struct inode *); extern void nfs_evict_inode(struct inode *); extern void nfs_zap_acl_cache(struct inode *inode); extern void nfs_set_cache_invalid(struct inode *inode, unsigned long flags); extern bool nfs_check_cache_invalid(struct inode *, unsigned long); extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode); #if IS_ENABLED(CONFIG_NFS_LOCALIO) /* localio.c */ struct nfs_local_dio { u32 mem_align; u32 offset_align; loff_t middle_offset; loff_t end_offset; ssize_t start_len; /* Length for misaligned first extent */ ssize_t middle_len; /* Length for DIO-aligned middle extent */ ssize_t end_len; /* Length for misaligned last extent */ }; extern void nfs_local_probe_async(struct nfs_client *); extern void nfs_local_probe_async_work(struct work_struct *); extern struct nfsd_file *nfs_local_open_fh(struct nfs_client *, const struct cred *, struct nfs_fh *, struct nfs_file_localio *, const fmode_t); extern int nfs_local_doio(struct nfs_client *, struct nfsd_file *, struct nfs_pgio_header *, const struct rpc_call_ops *); extern int nfs_local_commit(struct nfsd_file *, struct nfs_commit_data *, const struct rpc_call_ops *, int); extern bool nfs_server_is_local(const struct nfs_client *clp); #else /* CONFIG_NFS_LOCALIO */ static inline void nfs_local_probe(struct nfs_client *clp) {} static inline void nfs_local_probe_async(struct nfs_client *clp) {} static inline struct nfsd_file * nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred, struct nfs_fh *fh, struct nfs_file_localio *nfl, const fmode_t mode) { return NULL; } static inline int nfs_local_doio(struct nfs_client *clp, struct nfsd_file *localio, struct nfs_pgio_header *hdr, const struct rpc_call_ops *call_ops) { return -EINVAL; } static inline int nfs_local_commit(struct nfsd_file *localio, struct nfs_commit_data *data, const struct rpc_call_ops *call_ops, int how) { return -EINVAL; } static inline bool nfs_server_is_local(const struct nfs_client *clp) { return false; } #endif /* CONFIG_NFS_LOCALIO */ /* super.c */ extern const struct super_operations nfs_sops; bool nfs_auth_info_match(const struct nfs_auth_info *, rpc_authflavor_t); int nfs_try_get_tree(struct fs_context *); int nfs_get_tree_common(struct fs_context *); void nfs_kill_super(struct super_block *); extern int __init register_nfs_fs(void); extern void __exit unregister_nfs_fs(void); extern bool nfs_sb_active(struct super_block *sb); extern void nfs_sb_deactive(struct super_block *sb); extern int nfs_client_for_each_server(struct nfs_client *clp, int (*fn)(struct nfs_server *, void *), void *data); #ifdef CONFIG_NFS_FSCACHE extern const struct netfs_request_ops nfs_netfs_ops; #endif /* io.c */ extern __must_check int nfs_start_io_read(struct inode *inode); extern void nfs_end_io_read(struct inode *inode); extern __must_check int nfs_start_io_write(struct inode *inode); extern void nfs_end_io_write(struct inode *inode); extern __must_check int nfs_start_io_direct(struct inode *inode); extern void nfs_end_io_direct(struct inode *inode); static inline bool nfs_file_io_is_buffered(struct nfs_inode *nfsi) { return test_bit(NFS_INO_ODIRECT, &nfsi->flags) == 0; } /* Must be called with exclusively locked inode->i_rwsem */ static inline void nfs_file_block_o_direct(struct nfs_inode *nfsi) { if (test_bit(NFS_INO_ODIRECT, &nfsi->flags)) { clear_bit(NFS_INO_ODIRECT, &nfsi->flags); inode_dio_wait(&nfsi->vfs_inode); } } /* namespace.c */ #define NFS_PATH_CANONICAL 1 extern char *nfs_path(char **p, struct dentry *dentry, char *buffer, ssize_t buflen, unsigned flags); extern struct vfsmount *nfs_d_automount(struct path *path); int nfs_submount(struct fs_context *, struct nfs_server *); int nfs_do_submount(struct fs_context *); /* getroot.c */ extern int nfs_get_root(struct super_block *s, struct fs_context *fc); #if IS_ENABLED(CONFIG_NFS_V4) extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool); #endif struct nfs_pgio_completion_ops; /* read.c */ extern const struct nfs_pgio_completion_ops nfs_async_read_completion_ops; extern void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode, bool force_mds, const struct nfs_pgio_completion_ops *compl_ops); extern bool nfs_read_alloc_scratch(struct nfs_pgio_header *hdr, size_t size); extern int nfs_read_add_folio(struct nfs_pageio_descriptor *pgio, struct nfs_open_context *ctx, struct folio *folio); extern void nfs_pageio_complete_read(struct nfs_pageio_descriptor *pgio); extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio); /* super.c */ void nfs_umount_begin(struct super_block *); int nfs_statfs(struct dentry *, struct kstatfs *); int nfs_show_options(struct seq_file *, struct dentry *); int nfs_show_devname(struct seq_file *, struct dentry *); int nfs_show_path(struct seq_file *, struct dentry *); int nfs_show_stats(struct seq_file *, struct dentry *); int nfs_reconfigure(struct fs_context *); /* write.c */ extern void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, int ioflags, bool force_mds, const struct nfs_pgio_completion_ops *compl_ops); extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio); extern void nfs_commit_free(struct nfs_commit_data *p); extern void nfs_commit_prepare(struct rpc_task *task, void *calldata); extern int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data, const struct nfs_rpc_ops *nfs_ops, const struct rpc_call_ops *call_ops, int how, int flags, struct nfsd_file *localio); extern void nfs_init_commit(struct nfs_commit_data *data, struct list_head *head, struct pnfs_layout_segment *lseg, struct nfs_commit_info *cinfo); int nfs_scan_commit_list(struct list_head *src, struct list_head *dst, struct nfs_commit_info *cinfo, int max); unsigned long nfs_reqs_to_commit(struct nfs_commit_info *); int nfs_scan_commit(struct inode *inode, struct list_head *dst, struct nfs_commit_info *cinfo); void nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, struct nfs_commit_info *cinfo, u32 ds_commit_idx); int nfs_write_need_commit(struct nfs_pgio_header *); void nfs_writeback_update_inode(struct nfs_pgio_header *hdr); int nfs_generic_commit_list(struct inode *inode, struct list_head *head, int how, struct nfs_commit_info *cinfo); void nfs_retry_commit(struct list_head *page_list, struct pnfs_layout_segment *lseg, struct nfs_commit_info *cinfo, u32 ds_commit_idx); void nfs_commitdata_release(struct nfs_commit_data *data); void nfs_request_add_commit_list(struct nfs_page *req, struct nfs_commit_info *cinfo); void nfs_request_add_commit_list_locked(struct nfs_page *req, struct list_head *dst, struct nfs_commit_info *cinfo); void nfs_request_remove_commit_list(struct nfs_page *req, struct nfs_commit_info *cinfo); void nfs_init_cinfo(struct nfs_commit_info *cinfo, struct inode *inode, struct nfs_direct_req *dreq); int nfs_key_timeout_notify(struct file *filp, struct inode *inode); bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx, struct inode *inode); void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio); int nfs_filemap_write_and_wait_range(struct address_space *mapping, loff_t lstart, loff_t lend); #ifdef CONFIG_NFS_V4 static inline void pnfs_bucket_clear_pnfs_ds_commit_verifiers(struct pnfs_commit_bucket *buckets, unsigned int nbuckets) { unsigned int i; for (i = 0; i < nbuckets; i++) buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW; } static inline void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo) { struct pnfs_commit_array *array; rcu_read_lock(); list_for_each_entry_rcu(array, &cinfo->commits, cinfo_list) pnfs_bucket_clear_pnfs_ds_commit_verifiers(array->buckets, array->nbuckets); rcu_read_unlock(); } #else /* CONFIG_NFS_V4 */ static inline void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo) { } #endif /* CONFIG_NFS_V4 */ #ifdef CONFIG_MIGRATION int nfs_migrate_folio(struct address_space *, struct folio *dst, struct folio *src, enum migrate_mode); #else #define nfs_migrate_folio NULL #endif static inline int nfs_write_verifier_cmp(const struct nfs_write_verifier *v1, const struct nfs_write_verifier *v2) { return memcmp(v1->data, v2->data, sizeof(v1->data)); } static inline bool nfs_write_match_verf(const struct nfs_writeverf *verf, struct nfs_page *req) { return verf->committed > NFS_UNSTABLE && !nfs_write_verifier_cmp(&req->wb_verf, &verf->verifier); } static inline gfp_t nfs_io_gfp_mask(void) { gfp_t ret = current_gfp_context(GFP_KERNEL); /* For workers __GFP_NORETRY only with __GFP_IO or __GFP_FS */ if ((current->flags & PF_WQ_WORKER) && ret == GFP_KERNEL) ret |= __GFP_NORETRY | __GFP_NOWARN; return ret; } /* * Special version of should_remove_suid() that ignores capabilities. */ static inline int nfs_should_remove_suid(const struct inode *inode) { umode_t mode = inode->i_mode; int kill = 0; /* suid always must be killed */ if (unlikely(mode & S_ISUID)) kill = ATTR_KILL_SUID; /* * sgid without any exec bits is just a mandatory locking mark; leave * it alone. If some exec bits are set, it's a real sgid; kill it. */ if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) kill |= ATTR_KILL_SGID; if (unlikely(kill && S_ISREG(mode))) return kill; return 0; } /* unlink.c */ extern struct rpc_task * nfs_async_rename(struct inode *old_dir, struct inode *new_dir, struct dentry *old_dentry, struct dentry *new_dentry, void (*complete)(struct rpc_task *, struct nfs_renamedata *)); extern int nfs_sillyrename(struct inode *dir, struct dentry *dentry); /* direct.c */ void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo, struct nfs_direct_req *dreq); extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq, loff_t offset); /* nfs4proc.c */ extern struct nfs_client *nfs4_init_client(struct nfs_client *clp, const struct nfs_client_initdata *); extern int nfs41_walk_client_list(struct nfs_client *clp, struct nfs_client **result, const struct cred *cred); extern void nfs4_test_session_trunk(struct rpc_clnt *clnt, struct rpc_xprt *xprt, void *data); static inline struct inode *nfs_igrab_and_active(struct inode *inode) { struct super_block *sb = inode->i_sb; if (sb && nfs_sb_active(sb)) { if (igrab(inode)) return inode; nfs_sb_deactive(sb); } return NULL; } static inline void nfs_iput_and_deactive(struct inode *inode) { if (inode != NULL) { struct super_block *sb = inode->i_sb; iput(inode); nfs_sb_deactive(sb); } } /* * Determine the device name as a string */ static inline char *nfs_devname(struct dentry *dentry, char *buffer, ssize_t buflen) { char *dummy; return nfs_path(&dummy, dentry, buffer, buflen, NFS_PATH_CANONICAL); } /* * Determine the actual block size (and log2 thereof) */ static inline unsigned long nfs_block_bits(unsigned long bsize, unsigned char *nrbitsp) { /* make sure blocksize is a power of two */ if ((bsize & (bsize - 1)) || nrbitsp) { unsigned char nrbits; for (nrbits = 31; nrbits && !(bsize & (1UL << nrbits)); nrbits--) ; bsize = 1UL << nrbits; if (nrbitsp) *nrbitsp = nrbits; } return bsize; } /* * Calculate the number of 512byte blocks used. */ static inline blkcnt_t nfs_calc_block_size(u64 tsize) { blkcnt_t used = (tsize + 511) >> 9; return (used > ULONG_MAX) ? ULONG_MAX : used; } /* * Compute and set NFS server blocksize */ static inline unsigned long nfs_block_size(unsigned long bsize, unsigned char *nrbitsp) { if (bsize < NFS_MIN_FILE_IO_SIZE) bsize = NFS_DEF_FILE_IO_SIZE; else if (bsize >= NFS_MAX_FILE_IO_SIZE) bsize = NFS_MAX_FILE_IO_SIZE; return nfs_block_bits(bsize, nrbitsp); } /* * Compute and set NFS server rsize / wsize */ static inline unsigned long nfs_io_size(unsigned long iosize, enum xprt_transports proto) { if (iosize < NFS_MIN_FILE_IO_SIZE) iosize = NFS_DEF_FILE_IO_SIZE; else if (iosize >= NFS_MAX_FILE_IO_SIZE) iosize = NFS_MAX_FILE_IO_SIZE; if (proto == XPRT_TRANSPORT_UDP || iosize < PAGE_SIZE) return nfs_block_bits(iosize, NULL); return iosize & PAGE_MASK; } /* * Determine the maximum file size for a superblock */ static inline void nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize) { sb->s_maxbytes = (loff_t)maxfilesize; if (sb->s_maxbytes > MAX_LFS_FILESIZE || sb->s_maxbytes <= 0) sb->s_maxbytes = MAX_LFS_FILESIZE; } /* * Record the page as unstable (an extra writeback period) and mark its * inode as dirty. */ static inline void nfs_folio_mark_unstable(struct folio *folio, struct nfs_commit_info *cinfo) { if (folio && !cinfo->dreq) { struct inode *inode = folio->mapping->host; long nr = folio_nr_pages(folio); /* This page is really still in write-back - just that the * writeback is happening on the server now. */ node_stat_mod_folio(folio, NR_WRITEBACK, nr); bdi_wb_stat_mod(inode, WB_WRITEBACK, nr); __mark_inode_dirty(inode, I_DIRTY_DATASYNC); } } /* * Determine the number of bytes of data the page contains */ static inline size_t nfs_folio_length(struct folio *folio) { loff_t i_size = i_size_read(folio->mapping->host); if (i_size > 0) { pgoff_t index = folio->index >> folio_order(folio); pgoff_t end_index = (i_size - 1) >> folio_shift(folio); if (index < end_index) return folio_size(folio); if (index == end_index) return offset_in_folio(folio, i_size - 1) + 1; } return 0; } /* * Convert a umode to a dirent->d_type */ static inline unsigned char nfs_umode_to_dtype(umode_t mode) { return (mode >> 12) & 15; } /* * Determine the number of pages in an array of length 'len' and * with a base offset of 'base' */ static inline unsigned int nfs_page_array_len(unsigned int base, size_t len) { return ((unsigned long)len + (unsigned long)base + PAGE_SIZE - 1) >> PAGE_SHIFT; } /* * Convert a struct timespec64 into a 64-bit change attribute * * This does approximately the same thing as timespec64_to_ns(), * but for calculation efficiency, we multiply the seconds by * 1024*1024*1024. */ static inline u64 nfs_timespec_to_change_attr(const struct timespec64 *ts) { return ((u64)ts->tv_sec << 30) + ts->tv_nsec; } static inline u32 nfs_stateid_hash(const nfs4_stateid *stateid) { return ~crc32_le(0xFFFFFFFF, &stateid->other[0], NFS4_STATEID_OTHER_SIZE); } static inline bool nfs_current_task_exiting(void) { return (current->flags & PF_EXITING) != 0; } static inline bool nfs_error_is_fatal(int err) { switch (err) { case -ERESTARTSYS: case -EINTR: case -EACCES: case -EDQUOT: case -EFBIG: case -EIO: case -ENOSPC: case -EROFS: case -ESTALE: case -E2BIG: case -ENOMEM: case -ETIMEDOUT: return true; default: return false; } } static inline bool nfs_error_is_fatal_on_server(int err) { switch (err) { case 0: case -ERESTARTSYS: case -EINTR: case -ENOMEM: return false; } return nfs_error_is_fatal(err); } /* * Select between a default port value and a user-specified port value. * If a zero value is set, then autobind will be used. */ static inline void nfs_set_port(struct sockaddr_storage *sap, int *port, const unsigned short default_port) { if (*port == NFS_UNSPEC_PORT) *port = default_port; rpc_set_port((struct sockaddr *)sap, *port); } struct nfs_direct_req { struct kref kref; /* release manager */ /* I/O parameters */ struct nfs_open_context *ctx; /* file open context info */ struct nfs_lock_context *l_ctx; /* Lock context info */ struct kiocb * iocb; /* controlling i/o request */ struct inode * inode; /* target file of i/o */ /* completion state */ atomic_t io_count; /* i/os we're waiting for */ spinlock_t lock; /* protect completion state */ loff_t io_start; /* Start offset for I/O */ ssize_t count, /* bytes actually processed */ max_count, /* max expected count */ error; /* any reported error */ struct completion completion; /* wait for i/o completion */ /* commit state */ struct nfs_mds_commit_info mds_cinfo; /* Storage for cinfo */ struct pnfs_ds_commit_info ds_cinfo; /* Storage for cinfo */ struct work_struct work; int flags; /* for write */ #define NFS_ODIRECT_DO_COMMIT (1) /* an unstable reply was received */ #define NFS_ODIRECT_RESCHED_WRITES (2) /* write verification failed */ /* for read */ #define NFS_ODIRECT_SHOULD_DIRTY (3) /* dirty user-space page after read */ #define NFS_ODIRECT_DONE INT_MAX /* write verification failed */ };
2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 // SPDX-License-Identifier: GPL-2.0-only /* * Suspend support specific for i386/x86-64. * * Copyright (c) 2007 Rafael J. Wysocki <rjw@sisk.pl> * Copyright (c) 2002 Pavel Machek <pavel@ucw.cz> * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org> */ #include <linux/suspend.h> #include <linux/export.h> #include <linux/smp.h> #include <linux/perf_event.h> #include <linux/tboot.h> #include <linux/dmi.h> #include <linux/pgtable.h> #include <asm/proto.h> #include <asm/mtrr.h> #include <asm/page.h> #include <asm/mce.h> #include <asm/suspend.h> #include <asm/fpu/api.h> #include <asm/debugreg.h> #include <asm/cpu.h> #include <asm/cacheinfo.h> #include <asm/mmu_context.h> #include <asm/cpu_device_id.h> #include <asm/microcode.h> #include <asm/msr.h> #include <asm/fred.h> #ifdef CONFIG_X86_32 __visible unsigned long saved_context_ebx; __visible unsigned long saved_context_esp, saved_context_ebp; __visible unsigned long saved_context_esi, saved_context_edi; __visible unsigned long saved_context_eflags; #endif struct saved_context saved_context; static void msr_save_context(struct saved_context *ctxt) { struct saved_msr *msr = ctxt->saved_msrs.array; struct saved_msr *end = msr + ctxt->saved_msrs.num; while (msr < end) { if (msr->valid) rdmsrq(msr->info.msr_no, msr->info.reg.q); msr++; } } static void msr_restore_context(struct saved_context *ctxt) { struct saved_msr *msr = ctxt->saved_msrs.array; struct saved_msr *end = msr + ctxt->saved_msrs.num; while (msr < end) { if (msr->valid) wrmsrq(msr->info.msr_no, msr->info.reg.q); msr++; } } /** * __save_processor_state() - Save CPU registers before creating a * hibernation image and before restoring * the memory state from it * @ctxt: Structure to store the registers contents in. * * NOTE: If there is a CPU register the modification of which by the * boot kernel (ie. the kernel used for loading the hibernation image) * might affect the operations of the restored target kernel (ie. the one * saved in the hibernation image), then its contents must be saved by this * function. In other words, if kernel A is hibernated and different * kernel B is used for loading the hibernation image into memory, the * kernel A's __save_processor_state() function must save all registers * needed by kernel A, so that it can operate correctly after the resume * regardless of what kernel B does in the meantime. */ static void __save_processor_state(struct saved_context *ctxt) { #ifdef CONFIG_X86_32 mtrr_save_fixed_ranges(NULL); #endif kernel_fpu_begin(); /* * descriptor tables */ store_idt(&ctxt->idt); /* * We save it here, but restore it only in the hibernate case. * For ACPI S3 resume, this is loaded via 'early_gdt_desc' in 64-bit * mode in "secondary_startup_64". In 32-bit mode it is done via * 'pmode_gdt' in wakeup_start. */ ctxt->gdt_desc.size = GDT_SIZE - 1; ctxt->gdt_desc.address = (unsigned long)get_cpu_gdt_rw(smp_processor_id()); store_tr(ctxt->tr); /* XMM0..XMM15 should be handled by kernel_fpu_begin(). */ /* * segment registers */ savesegment(gs, ctxt->gs); #ifdef CONFIG_X86_64 savesegment(fs, ctxt->fs); savesegment(ds, ctxt->ds); savesegment(es, ctxt->es); rdmsrq(MSR_FS_BASE, ctxt->fs_base); rdmsrq(MSR_GS_BASE, ctxt->kernelmode_gs_base); rdmsrq(MSR_KERNEL_GS_BASE, ctxt->usermode_gs_base); mtrr_save_fixed_ranges(NULL); rdmsrq(MSR_EFER, ctxt->efer); #endif /* * control registers */ ctxt->cr0 = read_cr0(); ctxt->cr2 = read_cr2(); ctxt->cr3 = __read_cr3(); ctxt->cr4 = __read_cr4(); ctxt->misc_enable_saved = !rdmsrq_safe(MSR_IA32_MISC_ENABLE, &ctxt->misc_enable); msr_save_context(ctxt); } /* Needed by apm.c */ void save_processor_state(void) { __save_processor_state(&saved_context); x86_platform.save_sched_clock_state(); } #ifdef CONFIG_X86_32 EXPORT_SYMBOL(save_processor_state); #endif static void do_fpu_end(void) { /* * Restore FPU regs if necessary. */ kernel_fpu_end(); } static void fix_processor_context(void) { int cpu = smp_processor_id(); #ifdef CONFIG_X86_64 struct desc_struct *desc = get_cpu_gdt_rw(cpu); tss_desc tss; #endif /* * We need to reload TR, which requires that we change the * GDT entry to indicate "available" first. * * XXX: This could probably all be replaced by a call to * force_reload_TR(). */ set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); #ifdef CONFIG_X86_64 memcpy(&tss, &desc[GDT_ENTRY_TSS], sizeof(tss_desc)); tss.type = 0x9; /* The available 64-bit TSS (see AMD vol 2, pg 91 */ write_gdt_entry(desc, GDT_ENTRY_TSS, &tss, DESC_TSS); syscall_init(); /* This sets MSR_*STAR and related */ #else if (boot_cpu_has(X86_FEATURE_SEP)) enable_sep_cpu(); #endif load_TR_desc(); /* This does ltr */ load_mm_ldt(current->active_mm); /* This does lldt */ initialize_tlbstate_and_flush(); fpu__resume_cpu(); /* The processor is back on the direct GDT, load back the fixmap */ load_fixmap_gdt(cpu); } /** * __restore_processor_state() - Restore the contents of CPU registers saved * by __save_processor_state() * @ctxt: Structure to load the registers contents from. * * The asm code that gets us here will have restored a usable GDT, although * it will be pointing to the wrong alias. */ static void notrace __restore_processor_state(struct saved_context *ctxt) { struct cpuinfo_x86 *c; if (ctxt->misc_enable_saved) wrmsrq(MSR_IA32_MISC_ENABLE, ctxt->misc_enable); /* * control registers */ /* cr4 was introduced in the Pentium CPU */ #ifdef CONFIG_X86_32 if (ctxt->cr4) __write_cr4(ctxt->cr4); #else /* CONFIG X86_64 */ wrmsrq(MSR_EFER, ctxt->efer); __write_cr4(ctxt->cr4); #endif write_cr3(ctxt->cr3); write_cr2(ctxt->cr2); write_cr0(ctxt->cr0); /* Restore the IDT. */ load_idt(&ctxt->idt); /* * Just in case the asm code got us here with the SS, DS, or ES * out of sync with the GDT, update them. */ loadsegment(ss, __KERNEL_DS); loadsegment(ds, __USER_DS); loadsegment(es, __USER_DS); /* * Restore percpu access. Percpu access can happen in exception * handlers or in complicated helpers like load_gs_index(). */ #ifdef CONFIG_X86_64 wrmsrq(MSR_GS_BASE, ctxt->kernelmode_gs_base); /* * Reinitialize FRED to ensure the FRED MSRs contain the same values * as before hibernation. * * Note, the setup of FRED RSPs requires access to percpu data * structures. Therefore, FRED reinitialization can only occur after * the percpu access pointer (i.e., MSR_GS_BASE) is restored. */ if (ctxt->cr4 & X86_CR4_FRED) { cpu_init_fred_exceptions(); cpu_init_fred_rsps(); } #else loadsegment(fs, __KERNEL_PERCPU); #endif /* Restore the TSS, RO GDT, LDT, and usermode-relevant MSRs. */ fix_processor_context(); /* * Now that we have descriptor tables fully restored and working * exception handling, restore the usermode segments. */ #ifdef CONFIG_X86_64 loadsegment(ds, ctxt->es); loadsegment(es, ctxt->es); loadsegment(fs, ctxt->fs); load_gs_index(ctxt->gs); /* * Restore FSBASE and GSBASE after restoring the selectors, since * restoring the selectors clobbers the bases. Keep in mind * that MSR_KERNEL_GS_BASE is horribly misnamed. */ wrmsrq(MSR_FS_BASE, ctxt->fs_base); wrmsrq(MSR_KERNEL_GS_BASE, ctxt->usermode_gs_base); #else loadsegment(gs, ctxt->gs); #endif do_fpu_end(); tsc_verify_tsc_adjust(true); x86_platform.restore_sched_clock_state(); cache_bp_restore(); perf_restore_debug_store(); c = &cpu_data(smp_processor_id()); if (cpu_has(c, X86_FEATURE_MSR_IA32_FEAT_CTL)) init_ia32_feat_ctl(c); microcode_bsp_resume(); /* * This needs to happen after the microcode has been updated upon resume * because some of the MSRs are "emulated" in microcode. */ msr_restore_context(ctxt); } /* Needed by apm.c */ void notrace restore_processor_state(void) { __restore_processor_state(&saved_context); } #ifdef CONFIG_X86_32 EXPORT_SYMBOL(restore_processor_state); #endif #if defined(CONFIG_HIBERNATION) && defined(CONFIG_HOTPLUG_CPU) static void __noreturn resume_play_dead(void) { play_dead_common(); tboot_shutdown(TB_SHUTDOWN_WFS); hlt_play_dead(); } int hibernate_resume_nonboot_cpu_disable(void) { void (*play_dead)(void) = smp_ops.play_dead; int ret; /* * Ensure that MONITOR/MWAIT will not be used in the "play dead" loop * during hibernate image restoration, because it is likely that the * monitored address will be actually written to at that time and then * the "dead" CPU will attempt to execute instructions again, but the * address in its instruction pointer may not be possible to resolve * any more at that point (the page tables used by it previously may * have been overwritten by hibernate image data). * * First, make sure that we wake up all the potentially disabled SMT * threads which have been initially brought up and then put into * mwait/cpuidle sleep. * Those will be put to proper (not interfering with hibernation * resume) sleep afterwards, and the resumed kernel will decide itself * what to do with them. */ ret = cpuhp_smt_enable(); if (ret) return ret; smp_ops.play_dead = resume_play_dead; ret = freeze_secondary_cpus(0); smp_ops.play_dead = play_dead; return ret; } #endif /* * When bsp_check() is called in hibernate and suspend, cpu hotplug * is disabled already. So it's unnecessary to handle race condition between * cpumask query and cpu hotplug. */ static int bsp_check(void) { if (cpumask_first(cpu_online_mask) != 0) { pr_warn("CPU0 is offline.\n"); return -ENODEV; } return 0; } static int bsp_pm_callback(struct notifier_block *nb, unsigned long action, void *ptr) { int ret = 0; switch (action) { case PM_SUSPEND_PREPARE: case PM_HIBERNATION_PREPARE: ret = bsp_check(); break; default: break; } return notifier_from_errno(ret); } static int __init bsp_pm_check_init(void) { /* * Set this bsp_pm_callback as lower priority than * cpu_hotplug_pm_callback. So cpu_hotplug_pm_callback will be called * earlier to disable cpu hotplug before bsp online check. */ pm_notifier(bsp_pm_callback, -INT_MAX); return 0; } core_initcall(bsp_pm_check_init); static int msr_build_context(const u32 *msr_id, const int num) { struct saved_msrs *saved_msrs = &saved_context.saved_msrs; struct saved_msr *msr_array; int total_num; int i, j; total_num = saved_msrs->num + num; msr_array = kmalloc_objs(struct saved_msr, total_num); if (!msr_array) { pr_err("x86/pm: Can not allocate memory to save/restore MSRs during suspend.\n"); return -ENOMEM; } if (saved_msrs->array) { /* * Multiple callbacks can invoke this function, so copy any * MSR save requests from previous invocations. */ memcpy(msr_array, saved_msrs->array, sizeof(struct saved_msr) * saved_msrs->num); kfree(saved_msrs->array); } for (i = saved_msrs->num, j = 0; i < total_num; i++, j++) { u64 dummy; msr_array[i].info.msr_no = msr_id[j]; msr_array[i].valid = !rdmsrq_safe(msr_id[j], &dummy); msr_array[i].info.reg.q = 0; } saved_msrs->num = total_num; saved_msrs->array = msr_array; return 0; } /* * The following sections are a quirk framework for problematic BIOSen: * Sometimes MSRs are modified by the BIOSen after suspended to * RAM, this might cause unexpected behavior after wakeup. * Thus we save/restore these specified MSRs across suspend/resume * in order to work around it. * * For any further problematic BIOSen/platforms, * please add your own function similar to msr_initialize_bdw. */ static int msr_initialize_bdw(const struct dmi_system_id *d) { /* Add any extra MSR ids into this array. */ u32 bdw_msr_id[] = { MSR_IA32_THERM_CONTROL }; pr_info("x86/pm: %s detected, MSR saving is needed during suspending.\n", d->ident); return msr_build_context(bdw_msr_id, ARRAY_SIZE(bdw_msr_id)); } static const struct dmi_system_id msr_save_dmi_table[] = { { .callback = msr_initialize_bdw, .ident = "BROADWELL BDX_EP", .matches = { DMI_MATCH(DMI_PRODUCT_NAME, "GRANTLEY"), DMI_MATCH(DMI_PRODUCT_VERSION, "E63448-400"), }, }, {} }; static int msr_save_cpuid_features(const struct x86_cpu_id *c) { u32 cpuid_msr_id[] = { MSR_AMD64_CPUID_FN_1, }; pr_info("x86/pm: family %#hx cpu detected, MSR saving is needed during suspending.\n", c->family); return msr_build_context(cpuid_msr_id, ARRAY_SIZE(cpuid_msr_id)); } static const struct x86_cpu_id msr_save_cpu_table[] = { X86_MATCH_VENDOR_FAM(AMD, 0x15, &msr_save_cpuid_features), X86_MATCH_VENDOR_FAM(AMD, 0x16, &msr_save_cpuid_features), {} }; typedef int (*pm_cpu_match_t)(const struct x86_cpu_id *); static int pm_cpu_check(const struct x86_cpu_id *c) { const struct x86_cpu_id *m; int ret = 0; m = x86_match_cpu(msr_save_cpu_table); if (m) { pm_cpu_match_t fn; fn = (pm_cpu_match_t)m->driver_data; ret = fn(m); } return ret; } static void pm_save_spec_msr(void) { struct msr_enumeration { u32 msr_no; u32 feature; } msr_enum[] = { { MSR_IA32_SPEC_CTRL, X86_FEATURE_MSR_SPEC_CTRL }, { MSR_IA32_TSX_CTRL, X86_FEATURE_MSR_TSX_CTRL }, { MSR_TSX_FORCE_ABORT, X86_FEATURE_TSX_FORCE_ABORT }, { MSR_IA32_MCU_OPT_CTRL, X86_FEATURE_SRBDS_CTRL }, { MSR_AMD64_LS_CFG, X86_FEATURE_LS_CFG_SSBD }, { MSR_AMD64_DE_CFG, X86_FEATURE_LFENCE_RDTSC }, }; int i; for (i = 0; i < ARRAY_SIZE(msr_enum); i++) { if (boot_cpu_has(msr_enum[i].feature)) msr_build_context(&msr_enum[i].msr_no, 1); } } static int pm_check_save_msr(void) { dmi_check_system(msr_save_dmi_table); pm_cpu_check(msr_save_cpu_table); pm_save_spec_msr(); return 0; } device_initcall(pm_check_save_msr);
7576 119 7294 1744 1874 3494 4843 309 3519 51 32 32 32 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 /* SPDX-License-Identifier: GPL-2.0 */ /* * Wrapper functions for accessing the file_struct fd array. */ #ifndef __LINUX_FILE_H #define __LINUX_FILE_H #include <linux/compiler.h> #include <linux/types.h> #include <linux/posix_types.h> #include <linux/errno.h> #include <linux/cleanup.h> #include <linux/err.h> struct file; extern void fput(struct file *); struct file_operations; struct task_struct; struct vfsmount; struct dentry; struct inode; struct path; extern struct file *alloc_file_pseudo(struct inode *, struct vfsmount *, const char *, int flags, const struct file_operations *); extern struct file *alloc_file_pseudo_noaccount(struct inode *, struct vfsmount *, const char *, int flags, const struct file_operations *); extern struct file *alloc_file_clone(struct file *, int flags, const struct file_operations *); /* either a reference to struct file + flags * (cloned vs. borrowed, pos locked), with * flags stored in lower bits of value, * or empty (represented by 0). */ struct fd { unsigned long word; }; #define FDPUT_FPUT 1 #define FDPUT_POS_UNLOCK 2 #define fd_file(f) ((struct file *)((f).word & ~(FDPUT_FPUT|FDPUT_POS_UNLOCK))) static inline bool fd_empty(struct fd f) { return unlikely(!f.word); } #define EMPTY_FD (struct fd){0} static inline struct fd BORROWED_FD(struct file *f) { return (struct fd){(unsigned long)f}; } static inline struct fd CLONED_FD(struct file *f) { return (struct fd){(unsigned long)f | FDPUT_FPUT}; } static inline void fdput(struct fd fd) { if (unlikely(fd.word & FDPUT_FPUT)) fput(fd_file(fd)); } extern struct file *fget(unsigned int fd); extern struct file *fget_raw(unsigned int fd); extern struct file *fget_task(struct task_struct *task, unsigned int fd); extern struct file *fget_task_next(struct task_struct *task, unsigned int *fd); extern void __f_unlock_pos(struct file *); struct fd fdget(unsigned int fd); struct fd fdget_raw(unsigned int fd); struct fd fdget_pos(unsigned int fd); static inline void fdput_pos(struct fd f) { if (f.word & FDPUT_POS_UNLOCK) __f_unlock_pos(fd_file(f)); fdput(f); } DEFINE_CLASS(fd, struct fd, fdput(_T), fdget(fd), int fd) DEFINE_CLASS(fd_raw, struct fd, fdput(_T), fdget_raw(fd), int fd) DEFINE_CLASS(fd_pos, struct fd, fdput_pos(_T), fdget_pos(fd), int fd) extern int f_dupfd(unsigned int from, struct file *file, unsigned flags); extern int replace_fd(unsigned fd, struct file *file, unsigned flags); extern void set_close_on_exec(unsigned int fd, int flag); extern bool get_close_on_exec(unsigned int fd); extern int __get_unused_fd_flags(unsigned flags, unsigned long nofile); extern int get_unused_fd_flags(unsigned flags); extern void put_unused_fd(unsigned int fd); DEFINE_CLASS(get_unused_fd, int, if (_T >= 0) put_unused_fd(_T), get_unused_fd_flags(flags), unsigned flags) DEFINE_FREE(fput, struct file *, if (!IS_ERR_OR_NULL(_T)) fput(_T)) /* * take_fd() will take care to set @fd to -EBADF ensuring that * CLASS(get_unused_fd) won't call put_unused_fd(). This makes it * easier to rely on CLASS(get_unused_fd): * * struct file *f; * * CLASS(get_unused_fd, fd)(O_CLOEXEC); * if (fd < 0) * return fd; * * f = dentry_open(&path, O_RDONLY, current_cred()); * if (IS_ERR(f)) * return PTR_ERR(f); * * fd_install(fd, f); * return take_fd(fd); */ #define take_fd(fd) __get_and_null(fd, -EBADF) extern void fd_install(unsigned int fd, struct file *file); int receive_fd(struct file *file, int __user *ufd, unsigned int o_flags); int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags); extern void flush_delayed_fput(void); extern void __fput_sync(struct file *); extern unsigned int sysctl_nr_open_min, sysctl_nr_open_max; /* * fd_prepare: Combined fd + file allocation cleanup class. * @err: Error code to indicate if allocation succeeded. * @__fd: Allocated fd (may not be accessed directly) * @__file: Allocated struct file pointer (may not be accessed directly) * * Allocates an fd and a file together. On error paths, automatically cleans * up whichever resource was successfully allocated. Allows flexible file * allocation with different functions per usage. * * Do not use directly. */ struct fd_prepare { s32 err; s32 __fd; /* do not access directly */ struct file *__file; /* do not access directly */ }; /* Typedef for fd_prepare cleanup guards. */ typedef struct fd_prepare class_fd_prepare_t; /* * Accessors for fd_prepare class members. * _Generic() is used for zero-cost type safety. */ #define fd_prepare_fd(_fdf) \ (_Generic((_fdf), struct fd_prepare: (_fdf).__fd)) #define fd_prepare_file(_fdf) \ (_Generic((_fdf), struct fd_prepare: (_fdf).__file)) /* Do not use directly. */ static inline void class_fd_prepare_destructor(const struct fd_prepare *fdf) { if (unlikely(fdf->__fd >= 0)) put_unused_fd(fdf->__fd); if (unlikely(!IS_ERR_OR_NULL(fdf->__file))) fput(fdf->__file); } /* Do not use directly. */ static inline int class_fd_prepare_lock_err(const struct fd_prepare *fdf) { if (unlikely(fdf->err)) return fdf->err; if (unlikely(fdf->__fd < 0)) return fdf->__fd; if (unlikely(IS_ERR(fdf->__file))) return PTR_ERR(fdf->__file); if (unlikely(!fdf->__file)) return -ENOMEM; return 0; } /* * __FD_PREPARE_INIT - Helper to initialize fd_prepare class. * @_fd_flags: flags for get_unused_fd_flags() * @_file_owned: expression that returns struct file * * * Returns a struct fd_prepare with fd, file, and err set. * If fd allocation fails, fd will be negative and err will be set. If * fd succeeds but file_init_expr fails, file will be ERR_PTR and err * will be set. The err field is the single source of truth for error * checking. */ #define __FD_PREPARE_INIT(_fd_flags, _file_owned) \ ({ \ struct fd_prepare fdf = { \ .__fd = get_unused_fd_flags((_fd_flags)), \ }; \ if (likely(fdf.__fd >= 0)) \ fdf.__file = (_file_owned); \ fdf.err = ACQUIRE_ERR(fd_prepare, &fdf); \ fdf; \ }) /* * FD_PREPARE - Macro to declare and initialize an fd_prepare variable. * * Declares and initializes an fd_prepare variable with automatic * cleanup. No separate scope required - cleanup happens when variable * goes out of scope. * * @_fdf: name of struct fd_prepare variable to define * @_fd_flags: flags for get_unused_fd_flags() * @_file_owned: struct file to take ownership of (can be expression) */ #define FD_PREPARE(_fdf, _fd_flags, _file_owned) \ CLASS_INIT(fd_prepare, _fdf, __FD_PREPARE_INIT(_fd_flags, _file_owned)) /* * fd_publish - Publish prepared fd and file to the fd table. * @_fdf: struct fd_prepare variable */ #define fd_publish(_fdf) \ ({ \ struct fd_prepare *fdp = &(_fdf); \ VFS_WARN_ON_ONCE(fdp->err); \ VFS_WARN_ON_ONCE(fdp->__fd < 0); \ VFS_WARN_ON_ONCE(IS_ERR_OR_NULL(fdp->__file)); \ fd_install(fdp->__fd, fdp->__file); \ retain_and_null_ptr(fdp->__file); \ take_fd(fdp->__fd); \ }) /* Do not use directly. */ #define __FD_ADD(_fdf, _fd_flags, _file_owned) \ ({ \ FD_PREPARE(_fdf, _fd_flags, _file_owned); \ s32 ret = _fdf.err; \ if (likely(!ret)) \ ret = fd_publish(_fdf); \ ret; \ }) /* * FD_ADD - Allocate and install an fd and file in one step. * @_fd_flags: flags for get_unused_fd_flags() * @_file_owned: struct file to take ownership of * * Returns the allocated fd number, or negative error code on failure. */ #define FD_ADD(_fd_flags, _file_owned) \ __FD_ADD(__UNIQUE_ID(fd_prepare), _fd_flags, _file_owned) #endif /* __LINUX_FILE_H */
3 3 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NDISC_H #define _NDISC_H /* * ICMP codes for neighbour discovery messages */ #define NDISC_ROUTER_SOLICITATION 133 #define NDISC_ROUTER_ADVERTISEMENT 134 #define NDISC_NEIGHBOUR_SOLICITATION 135 #define NDISC_NEIGHBOUR_ADVERTISEMENT 136 #define NDISC_REDIRECT 137 /* * Router type: cross-layer information from link-layer to * IPv6 layer reported by certain link types (e.g., RFC4214). */ #define NDISC_NODETYPE_UNSPEC 0 /* unspecified (default) */ #define NDISC_NODETYPE_HOST 1 /* host or unauthorized router */ #define NDISC_NODETYPE_NODEFAULT 2 /* non-default router */ #define NDISC_NODETYPE_DEFAULT 3 /* default router */ /* * ndisc options */ enum { __ND_OPT_PREFIX_INFO_END = 0, ND_OPT_SOURCE_LL_ADDR = 1, /* RFC2461 */ ND_OPT_TARGET_LL_ADDR = 2, /* RFC2461 */ ND_OPT_PREFIX_INFO = 3, /* RFC2461 */ ND_OPT_REDIRECT_HDR = 4, /* RFC2461 */ ND_OPT_MTU = 5, /* RFC2461 */ ND_OPT_NONCE = 14, /* RFC7527 */ __ND_OPT_ARRAY_MAX, ND_OPT_ROUTE_INFO = 24, /* RFC4191 */ ND_OPT_RDNSS = 25, /* RFC5006 */ ND_OPT_DNSSL = 31, /* RFC6106 */ ND_OPT_6CO = 34, /* RFC6775 */ ND_OPT_CAPTIVE_PORTAL = 37, /* RFC7710 */ ND_OPT_PREF64 = 38, /* RFC8781 */ __ND_OPT_MAX }; #define MAX_RTR_SOLICITATION_DELAY HZ #define ND_REACHABLE_TIME (30*HZ) #define ND_RETRANS_TIMER HZ #include <linux/compiler.h> #include <linux/icmpv6.h> #include <linux/in6.h> #include <linux/types.h> #include <linux/if_arp.h> #include <linux/netdevice.h> #include <linux/hash.h> #include <net/neighbour.h> struct ctl_table; struct inet6_dev; struct net_device; struct net_proto_family; struct sk_buff; struct prefix_info; extern struct neigh_table nd_tbl; struct nd_msg { struct icmp6hdr icmph; struct in6_addr target; __u8 opt[]; }; struct rs_msg { struct icmp6hdr icmph; __u8 opt[]; }; struct ra_msg { struct icmp6hdr icmph; __be32 reachable_time; __be32 retrans_timer; }; struct rd_msg { struct icmp6hdr icmph; struct in6_addr target; struct in6_addr dest; __u8 opt[]; }; struct nd_opt_hdr { __u8 nd_opt_type; __u8 nd_opt_len; } __packed; /* ND options */ struct ndisc_options { struct nd_opt_hdr *nd_opt_array[__ND_OPT_ARRAY_MAX]; #ifdef CONFIG_IPV6_ROUTE_INFO struct nd_opt_hdr *nd_opts_ri; struct nd_opt_hdr *nd_opts_ri_end; #endif struct nd_opt_hdr *nd_useropts; struct nd_opt_hdr *nd_useropts_end; #if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN) struct nd_opt_hdr *nd_802154_opt_array[ND_OPT_TARGET_LL_ADDR + 1]; #endif }; #define nd_opts_src_lladdr nd_opt_array[ND_OPT_SOURCE_LL_ADDR] #define nd_opts_tgt_lladdr nd_opt_array[ND_OPT_TARGET_LL_ADDR] #define nd_opts_pi nd_opt_array[ND_OPT_PREFIX_INFO] #define nd_opts_pi_end nd_opt_array[__ND_OPT_PREFIX_INFO_END] #define nd_opts_rh nd_opt_array[ND_OPT_REDIRECT_HDR] #define nd_opts_mtu nd_opt_array[ND_OPT_MTU] #define nd_opts_nonce nd_opt_array[ND_OPT_NONCE] #define nd_802154_opts_src_lladdr nd_802154_opt_array[ND_OPT_SOURCE_LL_ADDR] #define nd_802154_opts_tgt_lladdr nd_802154_opt_array[ND_OPT_TARGET_LL_ADDR] #define NDISC_OPT_SPACE(len) (((len)+2+7)&~7) struct ndisc_options *ndisc_parse_options(const struct net_device *dev, u8 *opt, int opt_len, struct ndisc_options *ndopts); void __ndisc_fill_addr_option(struct sk_buff *skb, int type, const void *data, int data_len, int pad); #define NDISC_OPS_REDIRECT_DATA_SPACE 2 /* * This structure defines the hooks for IPv6 neighbour discovery. * The following hooks can be defined; unless noted otherwise, they are * optional and can be filled with a null pointer. * * int (*parse_options)(const struct net_device *dev, * struct nd_opt_hdr *nd_opt, * struct ndisc_options *ndopts): * This function is called while parsing ndisc ops and put each position * as pointer into ndopts. If this function return unequal 0, then this * function took care about the ndisc option, if 0 then the IPv6 ndisc * option parser will take care about that option. * * void (*update)(const struct net_device *dev, struct neighbour *n, * u32 flags, u8 icmp6_type, * const struct ndisc_options *ndopts): * This function is called when IPv6 ndisc updates the neighbour cache * entry. Additional options which can be updated may be previously * parsed by parse_opts callback and accessible over ndopts parameter. * * int (*opt_addr_space)(const struct net_device *dev, u8 icmp6_type, * struct neighbour *neigh, u8 *ha_buf, * u8 **ha): * This function is called when the necessary option space will be * calculated before allocating a skb. The parameters neigh, ha_buf * abd ha are available on NDISC_REDIRECT messages only. * * void (*fill_addr_option)(const struct net_device *dev, * struct sk_buff *skb, u8 icmp6_type, * const u8 *ha): * This function is called when the skb will finally fill the option * fields inside skb. NOTE: this callback should fill the option * fields to the skb which are previously indicated by opt_space * parameter. That means the decision to add such option should * not lost between these two callbacks, e.g. protected by interface * up state. * * void (*prefix_rcv_add_addr)(struct net *net, struct net_device *dev, * const struct prefix_info *pinfo, * struct inet6_dev *in6_dev, * struct in6_addr *addr, * int addr_type, u32 addr_flags, * bool sllao, bool tokenized, * __u32 valid_lft, u32 prefered_lft, * bool dev_addr_generated): * This function is called when a RA messages is received with valid * PIO option fields and an IPv6 address will be added to the interface * for autoconfiguration. The parameter dev_addr_generated reports about * if the address was based on dev->dev_addr or not. This can be used * to add a second address if link-layer operates with two link layer * addresses. E.g. 802.15.4 6LoWPAN. */ struct ndisc_ops { int (*parse_options)(const struct net_device *dev, struct nd_opt_hdr *nd_opt, struct ndisc_options *ndopts); void (*update)(const struct net_device *dev, struct neighbour *n, u32 flags, u8 icmp6_type, const struct ndisc_options *ndopts); int (*opt_addr_space)(const struct net_device *dev, u8 icmp6_type, struct neighbour *neigh, u8 *ha_buf, u8 **ha); void (*fill_addr_option)(const struct net_device *dev, struct sk_buff *skb, u8 icmp6_type, const u8 *ha); void (*prefix_rcv_add_addr)(struct net *net, struct net_device *dev, const struct prefix_info *pinfo, struct inet6_dev *in6_dev, struct in6_addr *addr, int addr_type, u32 addr_flags, bool sllao, bool tokenized, __u32 valid_lft, u32 prefered_lft, bool dev_addr_generated); }; #if IS_ENABLED(CONFIG_IPV6) static inline int ndisc_ops_parse_options(const struct net_device *dev, struct nd_opt_hdr *nd_opt, struct ndisc_options *ndopts) { if (dev->ndisc_ops && dev->ndisc_ops->parse_options) return dev->ndisc_ops->parse_options(dev, nd_opt, ndopts); else return 0; } static inline void ndisc_ops_update(const struct net_device *dev, struct neighbour *n, u32 flags, u8 icmp6_type, const struct ndisc_options *ndopts) { if (dev->ndisc_ops && dev->ndisc_ops->update) dev->ndisc_ops->update(dev, n, flags, icmp6_type, ndopts); } static inline int ndisc_ops_opt_addr_space(const struct net_device *dev, u8 icmp6_type) { if (dev->ndisc_ops && dev->ndisc_ops->opt_addr_space && icmp6_type != NDISC_REDIRECT) return dev->ndisc_ops->opt_addr_space(dev, icmp6_type, NULL, NULL, NULL); else return 0; } static inline int ndisc_ops_redirect_opt_addr_space(const struct net_device *dev, struct neighbour *neigh, u8 *ha_buf, u8 **ha) { if (dev->ndisc_ops && dev->ndisc_ops->opt_addr_space) return dev->ndisc_ops->opt_addr_space(dev, NDISC_REDIRECT, neigh, ha_buf, ha); else return 0; } static inline void ndisc_ops_fill_addr_option(const struct net_device *dev, struct sk_buff *skb, u8 icmp6_type) { if (dev->ndisc_ops && dev->ndisc_ops->fill_addr_option && icmp6_type != NDISC_REDIRECT) dev->ndisc_ops->fill_addr_option(dev, skb, icmp6_type, NULL); } static inline void ndisc_ops_fill_redirect_addr_option(const struct net_device *dev, struct sk_buff *skb, const u8 *ha) { if (dev->ndisc_ops && dev->ndisc_ops->fill_addr_option) dev->ndisc_ops->fill_addr_option(dev, skb, NDISC_REDIRECT, ha); } static inline void ndisc_ops_prefix_rcv_add_addr(struct net *net, struct net_device *dev, const struct prefix_info *pinfo, struct inet6_dev *in6_dev, struct in6_addr *addr, int addr_type, u32 addr_flags, bool sllao, bool tokenized, __u32 valid_lft, u32 prefered_lft, bool dev_addr_generated) { if (dev->ndisc_ops && dev->ndisc_ops->prefix_rcv_add_addr) dev->ndisc_ops->prefix_rcv_add_addr(net, dev, pinfo, in6_dev, addr, addr_type, addr_flags, sllao, tokenized, valid_lft, prefered_lft, dev_addr_generated); } #endif /* * Return the padding between the option length and the start of the * link addr. Currently only IP-over-InfiniBand needs this, although * if RFC 3831 IPv6-over-Fibre Channel is ever implemented it may * also need a pad of 2. */ static inline int ndisc_addr_option_pad(unsigned short type) { switch (type) { case ARPHRD_INFINIBAND: return 2; default: return 0; } } static inline int __ndisc_opt_addr_space(unsigned char addr_len, int pad) { return NDISC_OPT_SPACE(addr_len + pad); } #if IS_ENABLED(CONFIG_IPV6) static inline int ndisc_opt_addr_space(struct net_device *dev, u8 icmp6_type) { return __ndisc_opt_addr_space(dev->addr_len, ndisc_addr_option_pad(dev->type)) + ndisc_ops_opt_addr_space(dev, icmp6_type); } static inline int ndisc_redirect_opt_addr_space(struct net_device *dev, struct neighbour *neigh, u8 *ops_data_buf, u8 **ops_data) { return __ndisc_opt_addr_space(dev->addr_len, ndisc_addr_option_pad(dev->type)) + ndisc_ops_redirect_opt_addr_space(dev, neigh, ops_data_buf, ops_data); } #endif static inline u8 *__ndisc_opt_addr_data(struct nd_opt_hdr *p, unsigned char addr_len, int prepad) { u8 *lladdr = (u8 *)(p + 1); int lladdrlen = p->nd_opt_len << 3; if (lladdrlen != __ndisc_opt_addr_space(addr_len, prepad)) return NULL; return lladdr + prepad; } static inline u8 *ndisc_opt_addr_data(struct nd_opt_hdr *p, struct net_device *dev) { return __ndisc_opt_addr_data(p, dev->addr_len, ndisc_addr_option_pad(dev->type)); } static inline u32 ndisc_hashfn(const void *pkey, const struct net_device *dev, __u32 *hash_rnd) { const u32 *p32 = pkey; return (((p32[0] ^ hash32_ptr(dev)) * hash_rnd[0]) + (p32[1] * hash_rnd[1]) + (p32[2] * hash_rnd[2]) + (p32[3] * hash_rnd[3])); } static inline struct neighbour *__ipv6_neigh_lookup_noref(struct net_device *dev, const void *pkey) { return ___neigh_lookup_noref(&nd_tbl, neigh_key_eq128, ndisc_hashfn, pkey, dev); } static inline struct neighbour *__ipv6_neigh_lookup(struct net_device *dev, const void *pkey) { struct neighbour *n; rcu_read_lock(); n = __ipv6_neigh_lookup_noref(dev, pkey); if (n && !refcount_inc_not_zero(&n->refcnt)) n = NULL; rcu_read_unlock(); return n; } static inline void __ipv6_confirm_neigh(struct net_device *dev, const void *pkey) { struct neighbour *n; rcu_read_lock(); n = __ipv6_neigh_lookup_noref(dev, pkey); neigh_confirm(n); rcu_read_unlock(); } static inline struct neighbour *ip_neigh_gw6(struct net_device *dev, const void *addr) { #if IS_ENABLED(CONFIG_IPV6) struct neighbour *neigh; neigh = __ipv6_neigh_lookup_noref(dev, addr); if (unlikely(!neigh)) neigh = __neigh_create(&nd_tbl, addr, dev, false); return neigh; #else return ERR_PTR(-EAFNOSUPPORT); #endif } int ndisc_init(void); int ndisc_late_init(void); void ndisc_late_cleanup(void); void ndisc_cleanup(void); enum skb_drop_reason ndisc_rcv(struct sk_buff *skb); struct sk_buff *ndisc_ns_create(struct net_device *dev, const struct in6_addr *solicit, const struct in6_addr *saddr, u64 nonce); void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit, const struct in6_addr *daddr, const struct in6_addr *saddr, u64 nonce); void ndisc_send_skb(struct sk_buff *skb, const struct in6_addr *daddr, const struct in6_addr *saddr); void ndisc_send_rs(struct net_device *dev, const struct in6_addr *saddr, const struct in6_addr *daddr); void ndisc_send_na(struct net_device *dev, const struct in6_addr *daddr, const struct in6_addr *solicited_addr, bool router, bool solicited, bool override, bool inc_opt); void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target); int ndisc_mc_map(const struct in6_addr *addr, char *buf, struct net_device *dev, int dir); void ndisc_update(const struct net_device *dev, struct neighbour *neigh, const u8 *lladdr, u8 new, u32 flags, u8 icmp6_type, struct ndisc_options *ndopts); /* * IGMP */ int igmp6_init(void); int igmp6_late_init(void); void igmp6_cleanup(void); void igmp6_late_cleanup(void); void igmp6_event_query(struct sk_buff *skb); void igmp6_event_report(struct sk_buff *skb); #ifdef CONFIG_SYSCTL int ndisc_ifinfo_sysctl_change(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); #endif void inet6_ifinfo_notify(int event, struct inet6_dev *idev); #endif
13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_NSPROXY_H #define _LINUX_NSPROXY_H #include <linux/refcount.h> #include <linux/spinlock.h> #include <linux/sched.h> struct mnt_namespace; struct uts_namespace; struct ipc_namespace; struct pid_namespace; struct cgroup_namespace; struct fs_struct; /* * A structure to contain pointers to all per-process * namespaces - fs (mount), uts, network, sysvipc, etc. * * The pid namespace is an exception -- it's accessed using * task_active_pid_ns. The pid namespace here is the * namespace that children will use. * * 'count' is the number of tasks holding a reference. * The count for each namespace, then, will be the number * of nsproxies pointing to it, not the number of tasks. * * The nsproxy is shared by tasks which share all namespaces. * As soon as a single namespace is cloned or unshared, the * nsproxy is copied. */ struct nsproxy { refcount_t count; struct uts_namespace *uts_ns; struct ipc_namespace *ipc_ns; struct mnt_namespace *mnt_ns; struct pid_namespace *pid_ns_for_children; struct net *net_ns; struct time_namespace *time_ns; struct time_namespace *time_ns_for_children; struct cgroup_namespace *cgroup_ns; }; extern struct nsproxy init_nsproxy; /* * A structure to encompass all bits needed to install * a partial or complete new set of namespaces. * * If a new user namespace is requested cred will * point to a modifiable set of credentials. If a pointer * to a modifiable set is needed nsset_cred() must be * used and tested. */ struct nsset { unsigned flags; struct nsproxy *nsproxy; struct fs_struct *fs; const struct cred *cred; }; static inline struct cred *nsset_cred(struct nsset *set) { if (set->flags & CLONE_NEWUSER) return (struct cred *)set->cred; return NULL; } /* * the namespaces access rules are: * * 1. only current task is allowed to change tsk->nsproxy pointer or * any pointer on the nsproxy itself. Current must hold the task_lock * when changing tsk->nsproxy. * * 2. when accessing (i.e. reading) current task's namespaces - no * precautions should be taken - just dereference the pointers * * 3. the access to other task namespaces is performed like this * task_lock(task); * nsproxy = task->nsproxy; * if (nsproxy != NULL) { * / * * * work with the namespaces here * * e.g. get the reference on one of them * * / * } / * * * NULL task->nsproxy means that this task is * * almost dead (zombie) * * / * task_unlock(task); * */ int copy_namespaces(u64 flags, struct task_struct *tsk); void switch_cred_namespaces(const struct cred *old, const struct cred *new); void exit_nsproxy_namespaces(struct task_struct *tsk); void get_cred_namespaces(struct task_struct *tsk); void exit_cred_namespaces(struct task_struct *tsk); void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new); int exec_task_namespaces(void); void deactivate_nsproxy(struct nsproxy *ns); int unshare_nsproxy_namespaces(unsigned long, struct nsproxy **, struct cred *, struct fs_struct *); int __init nsproxy_cache_init(void); static inline void put_nsproxy(struct nsproxy *ns) { if (refcount_dec_and_test(&ns->count)) deactivate_nsproxy(ns); } static inline void get_nsproxy(struct nsproxy *ns) { refcount_inc(&ns->count); } DEFINE_FREE(put_nsproxy, struct nsproxy *, if (_T) put_nsproxy(_T)) #endif
1 1 1 1 1 1 1 1 1 7 7 7 14 14 13 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 // SPDX-License-Identifier: GPL-2.0-or-later /* * ALSA sequencer Memory Manager * Copyright (c) 1998 by Frank van de Pol <fvdpol@coil.demon.nl> * Jaroslav Kysela <perex@perex.cz> * 2000 by Takashi Iwai <tiwai@suse.de> */ #include <linux/init.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include <linux/mm.h> #include <sound/core.h> #include <sound/seq_kernel.h> #include "seq_memory.h" #include "seq_queue.h" #include "seq_info.h" #include "seq_lock.h" static inline int snd_seq_pool_available(struct snd_seq_pool *pool) { return pool->total_elements - atomic_read(&pool->counter); } static inline int snd_seq_output_ok(struct snd_seq_pool *pool) { return snd_seq_pool_available(pool) >= pool->room; } /* * Variable length event: * The event like sysex uses variable length type. * The external data may be stored in three different formats. * 1) kernel space * This is the normal case. * ext.data.len = length * ext.data.ptr = buffer pointer * 2) user space * When an event is generated via read(), the external data is * kept in user space until expanded. * ext.data.len = length | SNDRV_SEQ_EXT_USRPTR * ext.data.ptr = userspace pointer * 3) chained cells * When the variable length event is enqueued (in prioq or fifo), * the external data is decomposed to several cells. * ext.data.len = length | SNDRV_SEQ_EXT_CHAINED * ext.data.ptr = the additiona cell head * -> cell.next -> cell.next -> .. */ /* * exported: * call dump function to expand external data. */ static int get_var_len(const struct snd_seq_event *event) { if ((event->flags & SNDRV_SEQ_EVENT_LENGTH_MASK) != SNDRV_SEQ_EVENT_LENGTH_VARIABLE) return -EINVAL; return event->data.ext.len & ~SNDRV_SEQ_EXT_MASK; } static int dump_var_event(const struct snd_seq_event *event, snd_seq_dump_func_t func, void *private_data, int offset, int maxlen) { int len, err; struct snd_seq_event_cell *cell; len = get_var_len(event); if (len <= 0) return len; if (len <= offset) return 0; if (maxlen && len > offset + maxlen) len = offset + maxlen; if (event->data.ext.len & SNDRV_SEQ_EXT_USRPTR) { char buf[32]; char __user *curptr = (char __force __user *)event->data.ext.ptr; curptr += offset; len -= offset; while (len > 0) { int size = sizeof(buf); if (len < size) size = len; if (copy_from_user(buf, curptr, size)) return -EFAULT; err = func(private_data, buf, size); if (err < 0) return err; curptr += size; len -= size; } return 0; } if (!(event->data.ext.len & SNDRV_SEQ_EXT_CHAINED)) return func(private_data, event->data.ext.ptr + offset, len - offset); cell = (struct snd_seq_event_cell *)event->data.ext.ptr; for (; len > 0 && cell; cell = cell->next) { int size = sizeof(struct snd_seq_event); char *curptr = (char *)&cell->event; if (offset >= size) { offset -= size; len -= size; continue; } if (len < size) size = len; err = func(private_data, curptr + offset, size - offset); if (err < 0) return err; offset = 0; len -= size; } return 0; } int snd_seq_dump_var_event(const struct snd_seq_event *event, snd_seq_dump_func_t func, void *private_data) { return dump_var_event(event, func, private_data, 0, 0); } EXPORT_SYMBOL(snd_seq_dump_var_event); /* * exported: * expand the variable length event to linear buffer space. */ static int seq_copy_in_kernel(void *ptr, void *src, int size) { char **bufptr = ptr; memcpy(*bufptr, src, size); *bufptr += size; return 0; } static int seq_copy_in_user(void *ptr, void *src, int size) { char __user **bufptr = ptr; if (copy_to_user(*bufptr, src, size)) return -EFAULT; *bufptr += size; return 0; } static int expand_var_event(const struct snd_seq_event *event, int offset, int size, char *buf, bool in_kernel) { if (event->data.ext.len & SNDRV_SEQ_EXT_USRPTR) { if (! in_kernel) return -EINVAL; if (copy_from_user(buf, (char __force __user *)event->data.ext.ptr + offset, size)) return -EFAULT; return 0; } return dump_var_event(event, in_kernel ? seq_copy_in_kernel : seq_copy_in_user, &buf, offset, size); } int snd_seq_expand_var_event(const struct snd_seq_event *event, int count, char *buf, int in_kernel, int size_aligned) { int len, newlen, err; len = get_var_len(event); if (len < 0) return len; newlen = len; if (size_aligned > 0) newlen = roundup(len, size_aligned); if (count < newlen) return -EAGAIN; err = expand_var_event(event, 0, len, buf, in_kernel); if (err < 0) return err; if (len != newlen) { if (in_kernel) memset(buf + len, 0, newlen - len); else if (clear_user((__force void __user *)buf + len, newlen - len)) return -EFAULT; } return newlen; } EXPORT_SYMBOL(snd_seq_expand_var_event); int snd_seq_expand_var_event_at(const struct snd_seq_event *event, int count, char *buf, int offset) { int len, err; len = get_var_len(event); if (len < 0) return len; if (len <= offset) return 0; len -= offset; if (len > count) len = count; err = expand_var_event(event, offset, count, buf, true); if (err < 0) return err; return len; } EXPORT_SYMBOL_GPL(snd_seq_expand_var_event_at); /* * release this cell, free extended data if available */ static inline void free_cell(struct snd_seq_pool *pool, struct snd_seq_event_cell *cell) { cell->next = pool->free; pool->free = cell; atomic_dec(&pool->counter); } void snd_seq_cell_free(struct snd_seq_event_cell * cell) { struct snd_seq_pool *pool; if (snd_BUG_ON(!cell)) return; pool = cell->pool; if (snd_BUG_ON(!pool)) return; guard(spinlock_irqsave)(&pool->lock); free_cell(pool, cell); if (snd_seq_ev_is_variable(&cell->event)) { if (cell->event.data.ext.len & SNDRV_SEQ_EXT_CHAINED) { struct snd_seq_event_cell *curp, *nextptr; curp = cell->event.data.ext.ptr; for (; curp; curp = nextptr) { nextptr = curp->next; curp->next = pool->free; free_cell(pool, curp); } } } if (waitqueue_active(&pool->output_sleep)) { /* has enough space now? */ if (snd_seq_output_ok(pool)) wake_up(&pool->output_sleep); } } /* * allocate an event cell. */ static int snd_seq_cell_alloc(struct snd_seq_pool *pool, struct snd_seq_event_cell **cellp, int nonblock, struct file *file, struct mutex *mutexp) { struct snd_seq_event_cell *cell; unsigned long flags; int err = -EAGAIN; wait_queue_entry_t wait; if (pool == NULL) return -EINVAL; *cellp = NULL; init_waitqueue_entry(&wait, current); spin_lock_irqsave(&pool->lock, flags); if (pool->ptr == NULL) { /* not initialized */ pr_debug("ALSA: seq: pool is not initialized\n"); err = -EINVAL; goto __error; } while (pool->free == NULL && ! nonblock && ! pool->closing) { set_current_state(TASK_INTERRUPTIBLE); add_wait_queue(&pool->output_sleep, &wait); spin_unlock_irqrestore(&pool->lock, flags); if (mutexp) mutex_unlock(mutexp); schedule(); if (mutexp) mutex_lock(mutexp); spin_lock_irqsave(&pool->lock, flags); remove_wait_queue(&pool->output_sleep, &wait); /* interrupted? */ if (signal_pending(current)) { err = -ERESTARTSYS; goto __error; } } if (pool->closing) { /* closing.. */ err = -ENOMEM; goto __error; } cell = pool->free; if (cell) { int used; pool->free = cell->next; atomic_inc(&pool->counter); used = atomic_read(&pool->counter); if (pool->max_used < used) pool->max_used = used; pool->event_alloc_success++; /* clear cell pointers */ cell->next = NULL; err = 0; } else pool->event_alloc_failures++; *cellp = cell; __error: spin_unlock_irqrestore(&pool->lock, flags); return err; } /* * duplicate the event to a cell. * if the event has external data, the data is decomposed to additional * cells. */ int snd_seq_event_dup(struct snd_seq_pool *pool, struct snd_seq_event *event, struct snd_seq_event_cell **cellp, int nonblock, struct file *file, struct mutex *mutexp) { int ncells, err; unsigned int extlen; struct snd_seq_event_cell *cell; int size; *cellp = NULL; ncells = 0; extlen = 0; if (snd_seq_ev_is_variable(event)) { extlen = event->data.ext.len & ~SNDRV_SEQ_EXT_MASK; ncells = DIV_ROUND_UP(extlen, sizeof(struct snd_seq_event)); } if (ncells >= pool->total_elements) return -ENOMEM; err = snd_seq_cell_alloc(pool, &cell, nonblock, file, mutexp); if (err < 0) return err; /* copy the event */ size = snd_seq_event_packet_size(event); memcpy(&cell->ump, event, size); #if IS_ENABLED(CONFIG_SND_SEQ_UMP) if (size < sizeof(cell->event)) cell->ump.raw.extra = 0; #endif /* decompose */ if (snd_seq_ev_is_variable(event)) { int len = extlen; int is_chained = event->data.ext.len & SNDRV_SEQ_EXT_CHAINED; int is_usrptr = event->data.ext.len & SNDRV_SEQ_EXT_USRPTR; struct snd_seq_event_cell *src, *tmp, *tail; char *buf; cell->event.data.ext.len = extlen | SNDRV_SEQ_EXT_CHAINED; cell->event.data.ext.ptr = NULL; src = (struct snd_seq_event_cell *)event->data.ext.ptr; buf = (char *)event->data.ext.ptr; tail = NULL; while (ncells-- > 0) { size = sizeof(struct snd_seq_event); if (len < size) size = len; err = snd_seq_cell_alloc(pool, &tmp, nonblock, file, mutexp); if (err < 0) goto __error; if (cell->event.data.ext.ptr == NULL) cell->event.data.ext.ptr = tmp; if (tail) tail->next = tmp; tail = tmp; /* copy chunk */ if (is_chained && src) { tmp->event = src->event; src = src->next; } else if (is_usrptr) { if (copy_from_user(&tmp->event, (char __force __user *)buf, size)) { err = -EFAULT; goto __error; } } else { memcpy(&tmp->event, buf, size); } buf += size; len -= size; } } *cellp = cell; return 0; __error: snd_seq_cell_free(cell); return err; } /* poll wait */ int snd_seq_pool_poll_wait(struct snd_seq_pool *pool, struct file *file, poll_table *wait) { poll_wait(file, &pool->output_sleep, wait); guard(spinlock_irq)(&pool->lock); return snd_seq_output_ok(pool); } /* allocate room specified number of events */ int snd_seq_pool_init(struct snd_seq_pool *pool) { int cell; struct snd_seq_event_cell *cellptr; if (snd_BUG_ON(!pool)) return -EINVAL; cellptr = kvmalloc_objs(struct snd_seq_event_cell, pool->size); if (!cellptr) return -ENOMEM; /* add new cells to the free cell list */ guard(spinlock_irq)(&pool->lock); if (pool->ptr) { kvfree(cellptr); return 0; } pool->ptr = cellptr; pool->free = NULL; for (cell = 0; cell < pool->size; cell++) { cellptr = pool->ptr + cell; cellptr->pool = pool; cellptr->next = pool->free; pool->free = cellptr; } pool->room = (pool->size + 1) / 2; /* init statistics */ pool->max_used = 0; pool->total_elements = pool->size; return 0; } /* refuse the further insertion to the pool */ void snd_seq_pool_mark_closing(struct snd_seq_pool *pool) { if (snd_BUG_ON(!pool)) return; guard(spinlock_irqsave)(&pool->lock); pool->closing = 1; } /* remove events */ int snd_seq_pool_done(struct snd_seq_pool *pool) { struct snd_seq_event_cell *ptr; if (snd_BUG_ON(!pool)) return -EINVAL; /* wait for closing all threads */ if (waitqueue_active(&pool->output_sleep)) wake_up(&pool->output_sleep); while (atomic_read(&pool->counter) > 0) schedule_timeout_uninterruptible(1); /* release all resources */ scoped_guard(spinlock_irq, &pool->lock) { ptr = pool->ptr; pool->ptr = NULL; pool->free = NULL; pool->total_elements = 0; } kvfree(ptr); guard(spinlock_irq)(&pool->lock); pool->closing = 0; return 0; } /* init new memory pool */ struct snd_seq_pool *snd_seq_pool_new(int poolsize) { struct snd_seq_pool *pool; /* create pool block */ pool = kzalloc_obj(*pool); if (!pool) return NULL; spin_lock_init(&pool->lock); pool->ptr = NULL; pool->free = NULL; pool->total_elements = 0; atomic_set(&pool->counter, 0); pool->closing = 0; init_waitqueue_head(&pool->output_sleep); pool->size = poolsize; /* init statistics */ pool->max_used = 0; return pool; } /* remove memory pool */ int snd_seq_pool_delete(struct snd_seq_pool **ppool) { struct snd_seq_pool *pool = *ppool; *ppool = NULL; if (pool == NULL) return 0; snd_seq_pool_mark_closing(pool); snd_seq_pool_done(pool); kfree(pool); return 0; } /* exported to seq_clientmgr.c */ void snd_seq_info_pool(struct snd_info_buffer *buffer, struct snd_seq_pool *pool, char *space) { if (pool == NULL) return; snd_iprintf(buffer, "%sPool size : %d\n", space, pool->total_elements); snd_iprintf(buffer, "%sCells in use : %d\n", space, atomic_read(&pool->counter)); snd_iprintf(buffer, "%sPeak cells in use : %d\n", space, pool->max_used); snd_iprintf(buffer, "%sAlloc success : %d\n", space, pool->event_alloc_success); snd_iprintf(buffer, "%sAlloc failures : %d\n", space, pool->event_alloc_failures); }
1 1 1 1 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 // SPDX-License-Identifier: GPL-2.0-only /* * VMware vSockets Driver * * Copyright (C) 2007-2013 VMware, Inc. All rights reserved. */ /* Implementation notes: * * - There are two kinds of sockets: those created by user action (such as * calling socket(2)) and those created by incoming connection request packets. * * - There are two "global" tables, one for bound sockets (sockets that have * specified an address that they are responsible for) and one for connected * sockets (sockets that have established a connection with another socket). * These tables are "global" in that all sockets on the system are placed * within them. - Note, though, that the bound table contains an extra entry * for a list of unbound sockets and SOCK_DGRAM sockets will always remain in * that list. The bound table is used solely for lookup of sockets when packets * are received and that's not necessary for SOCK_DGRAM sockets since we create * a datagram handle for each and need not perform a lookup. Keeping SOCK_DGRAM * sockets out of the bound hash buckets will reduce the chance of collisions * when looking for SOCK_STREAM sockets and prevents us from having to check the * socket type in the hash table lookups. * * - Sockets created by user action will either be "client" sockets that * initiate a connection or "server" sockets that listen for connections; we do * not support simultaneous connects (two "client" sockets connecting). * * - "Server" sockets are referred to as listener sockets throughout this * implementation because they are in the TCP_LISTEN state. When a * connection request is received (the second kind of socket mentioned above), * we create a new socket and refer to it as a pending socket. These pending * sockets are placed on the pending connection list of the listener socket. * When future packets are received for the address the listener socket is * bound to, we check if the source of the packet is from one that has an * existing pending connection. If it does, we process the packet for the * pending socket. When that socket reaches the connected state, it is removed * from the listener socket's pending list and enqueued in the listener * socket's accept queue. Callers of accept(2) will accept connected sockets * from the listener socket's accept queue. If the socket cannot be accepted * for some reason then it is marked rejected. Once the connection is * accepted, it is owned by the user process and the responsibility for cleanup * falls with that user process. * * - It is possible that these pending sockets will never reach the connected * state; in fact, we may never receive another packet after the connection * request. Because of this, we must schedule a cleanup function to run in the * future, after some amount of time passes where a connection should have been * established. This function ensures that the socket is off all lists so it * cannot be retrieved, then drops all references to the socket so it is cleaned * up (sock_put() -> sk_free() -> our sk_destruct implementation). Note this * function will also cleanup rejected sockets, those that reach the connected * state but leave it before they have been accepted. * * - Lock ordering for pending or accept queue sockets is: * * lock_sock(listener); * lock_sock_nested(pending, SINGLE_DEPTH_NESTING); * * Using explicit nested locking keeps lockdep happy since normally only one * lock of a given class may be taken at a time. * * - Sockets created by user action will be cleaned up when the user process * calls close(2), causing our release implementation to be called. Our release * implementation will perform some cleanup then drop the last reference so our * sk_destruct implementation is invoked. Our sk_destruct implementation will * perform additional cleanup that's common for both types of sockets. * * - A socket's reference count is what ensures that the structure won't be * freed. Each entry in a list (such as the "global" bound and connected tables * and the listener socket's pending list and connected queue) ensures a * reference. When we defer work until process context and pass a socket as our * argument, we must ensure the reference count is increased to ensure the * socket isn't freed before the function is run; the deferred function will * then drop the reference. * * - sk->sk_state uses the TCP state constants because they are widely used by * other address families and exposed to userspace tools like ss(8): * * TCP_CLOSE - unconnected * TCP_SYN_SENT - connecting * TCP_ESTABLISHED - connected * TCP_CLOSING - disconnecting * TCP_LISTEN - listening * * - Namespaces in vsock support two different modes: "local" and "global". * Each mode defines how the namespace interacts with CIDs. * Each namespace exposes two sysctl files: * * - /proc/sys/net/vsock/ns_mode (read-only) reports the current namespace's * mode, which is set at namespace creation and immutable thereafter. * - /proc/sys/net/vsock/child_ns_mode (write-once) controls what mode future * child namespaces will inherit when created. The initial value matches * the namespace's own ns_mode. * * Changing child_ns_mode only affects newly created namespaces, not the * current namespace or existing children. A "local" namespace cannot set * child_ns_mode to "global". child_ns_mode is write-once, so that it may be * configured and locked down by a namespace manager. Writing a different * value after the first write returns -EBUSY. At namespace creation, ns_mode * is inherited from the parent's child_ns_mode. * * The init_net mode is "global" and cannot be modified. The init_net * child_ns_mode is also write-once, so an init process (e.g. systemd) can * set it to "local" to ensure all new namespaces inherit local mode. * * The modes affect the allocation and accessibility of CIDs as follows: * * - global - access and allocation are all system-wide * - all CID allocation from global namespaces draw from the same * system-wide pool. * - if one global namespace has already allocated some CID, another * global namespace will not be able to allocate the same CID. * - global mode AF_VSOCK sockets can reach any VM or socket in any global * namespace, they are not contained to only their own namespace. * - AF_VSOCK sockets in a global mode namespace cannot reach VMs or * sockets in any local mode namespace. * - local - access and allocation are contained within the namespace * - CID allocation draws only from a private pool local only to the * namespace, and does not affect the CIDs available for allocation in any * other namespace (global or local). * - VMs in a local namespace do not collide with CIDs in any other local * namespace or any global namespace. For example, if a VM in a local mode * namespace is given CID 10, then CID 10 is still available for * allocation in any other namespace, but not in the same namespace. * - AF_VSOCK sockets in a local mode namespace can connect only to VMs or * other sockets within their own namespace. * - sockets bound to VMADDR_CID_ANY in local namespaces will never resolve * to any transport that is not compatible with local mode. There is no * error that propagates to the user (as there is for connection attempts) * because it is possible for some packet to reach this socket from * a different transport that *does* support local mode. For * example, virtio-vsock may not support local mode, but the socket * may still accept a connection from vhost-vsock which does. */ #include <linux/compat.h> #include <linux/types.h> #include <linux/bitops.h> #include <linux/cred.h> #include <linux/errqueue.h> #include <linux/init.h> #include <linux/io.h> #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/kmod.h> #include <linux/list.h> #include <linux/miscdevice.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/net.h> #include <linux/proc_fs.h> #include <linux/poll.h> #include <linux/random.h> #include <linux/skbuff.h> #include <linux/smp.h> #include <linux/socket.h> #include <linux/stddef.h> #include <linux/sysctl.h> #include <linux/unistd.h> #include <linux/wait.h> #include <linux/workqueue.h> #include <net/sock.h> #include <net/af_vsock.h> #include <net/netns/vsock.h> #include <uapi/linux/vm_sockets.h> #include <uapi/asm-generic/ioctls.h> #define VSOCK_NET_MODE_STR_GLOBAL "global" #define VSOCK_NET_MODE_STR_LOCAL "local" /* 6 chars for "global", 1 for null-terminator, and 1 more for '\n'. * The newline is added by proc_dostring() for read operations. */ #define VSOCK_NET_MODE_STR_MAX 8 static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr); static void vsock_sk_destruct(struct sock *sk); static int vsock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); static void vsock_close(struct sock *sk, long timeout); /* Protocol family. */ struct proto vsock_proto = { .name = "AF_VSOCK", .owner = THIS_MODULE, .obj_size = sizeof(struct vsock_sock), .close = vsock_close, #ifdef CONFIG_BPF_SYSCALL .psock_update_sk_prot = vsock_bpf_update_proto, #endif }; /* The default peer timeout indicates how long we will wait for a peer response * to a control message. */ #define VSOCK_DEFAULT_CONNECT_TIMEOUT (2 * HZ) #define VSOCK_DEFAULT_BUFFER_SIZE (1024 * 256) #define VSOCK_DEFAULT_BUFFER_MAX_SIZE (1024 * 256) #define VSOCK_DEFAULT_BUFFER_MIN_SIZE 128 /* Transport used for host->guest communication */ static const struct vsock_transport *transport_h2g; /* Transport used for guest->host communication */ static const struct vsock_transport *transport_g2h; /* Transport used for DGRAM communication */ static const struct vsock_transport *transport_dgram; /* Transport used for local communication */ static const struct vsock_transport *transport_local; static DEFINE_MUTEX(vsock_register_mutex); /**** UTILS ****/ /* Each bound VSocket is stored in the bind hash table and each connected * VSocket is stored in the connected hash table. * * Unbound sockets are all put on the same list attached to the end of the hash * table (vsock_unbound_sockets). Bound sockets are added to the hash table in * the bucket that their local address hashes to (vsock_bound_sockets(addr) * represents the list that addr hashes to). * * Specifically, we initialize the vsock_bind_table array to a size of * VSOCK_HASH_SIZE + 1 so that vsock_bind_table[0] through * vsock_bind_table[VSOCK_HASH_SIZE - 1] are for bound sockets and * vsock_bind_table[VSOCK_HASH_SIZE] is for unbound sockets. The hash function * mods with VSOCK_HASH_SIZE to ensure this. */ #define MAX_PORT_RETRIES 24 #define VSOCK_HASH(addr) ((addr)->svm_port % VSOCK_HASH_SIZE) #define vsock_bound_sockets(addr) (&vsock_bind_table[VSOCK_HASH(addr)]) #define vsock_unbound_sockets (&vsock_bind_table[VSOCK_HASH_SIZE]) /* XXX This can probably be implemented in a better way. */ #define VSOCK_CONN_HASH(src, dst) \ (((src)->svm_cid ^ (dst)->svm_port) % VSOCK_HASH_SIZE) #define vsock_connected_sockets(src, dst) \ (&vsock_connected_table[VSOCK_CONN_HASH(src, dst)]) #define vsock_connected_sockets_vsk(vsk) \ vsock_connected_sockets(&(vsk)->remote_addr, &(vsk)->local_addr) struct list_head vsock_bind_table[VSOCK_HASH_SIZE + 1]; EXPORT_SYMBOL_GPL(vsock_bind_table); struct list_head vsock_connected_table[VSOCK_HASH_SIZE]; EXPORT_SYMBOL_GPL(vsock_connected_table); DEFINE_SPINLOCK(vsock_table_lock); EXPORT_SYMBOL_GPL(vsock_table_lock); /* Autobind this socket to the local address if necessary. */ static int vsock_auto_bind(struct vsock_sock *vsk) { struct sock *sk = sk_vsock(vsk); struct sockaddr_vm local_addr; if (vsock_addr_bound(&vsk->local_addr)) return 0; vsock_addr_init(&local_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY); return __vsock_bind(sk, &local_addr); } static void vsock_init_tables(void) { int i; for (i = 0; i < ARRAY_SIZE(vsock_bind_table); i++) INIT_LIST_HEAD(&vsock_bind_table[i]); for (i = 0; i < ARRAY_SIZE(vsock_connected_table); i++) INIT_LIST_HEAD(&vsock_connected_table[i]); } static void __vsock_insert_bound(struct list_head *list, struct vsock_sock *vsk) { sock_hold(&vsk->sk); list_add(&vsk->bound_table, list); } static void __vsock_insert_connected(struct list_head *list, struct vsock_sock *vsk) { sock_hold(&vsk->sk); list_add(&vsk->connected_table, list); } static void __vsock_remove_bound(struct vsock_sock *vsk) { list_del_init(&vsk->bound_table); sock_put(&vsk->sk); } static void __vsock_remove_connected(struct vsock_sock *vsk) { list_del_init(&vsk->connected_table); sock_put(&vsk->sk); } static struct sock *__vsock_find_bound_socket_net(struct sockaddr_vm *addr, struct net *net) { struct vsock_sock *vsk; list_for_each_entry(vsk, vsock_bound_sockets(addr), bound_table) { struct sock *sk = sk_vsock(vsk); if (vsock_addr_equals_addr(addr, &vsk->local_addr) && vsock_net_check_mode(sock_net(sk), net)) return sk; if (addr->svm_port == vsk->local_addr.svm_port && (vsk->local_addr.svm_cid == VMADDR_CID_ANY || addr->svm_cid == VMADDR_CID_ANY) && vsock_net_check_mode(sock_net(sk), net)) return sk; } return NULL; } static struct sock * __vsock_find_connected_socket_net(struct sockaddr_vm *src, struct sockaddr_vm *dst, struct net *net) { struct vsock_sock *vsk; list_for_each_entry(vsk, vsock_connected_sockets(src, dst), connected_table) { struct sock *sk = sk_vsock(vsk); if (vsock_addr_equals_addr(src, &vsk->remote_addr) && dst->svm_port == vsk->local_addr.svm_port && vsock_net_check_mode(sock_net(sk), net)) { return sk; } } return NULL; } static void vsock_insert_unbound(struct vsock_sock *vsk) { spin_lock_bh(&vsock_table_lock); __vsock_insert_bound(vsock_unbound_sockets, vsk); spin_unlock_bh(&vsock_table_lock); } void vsock_insert_connected(struct vsock_sock *vsk) { struct list_head *list = vsock_connected_sockets( &vsk->remote_addr, &vsk->local_addr); spin_lock_bh(&vsock_table_lock); __vsock_insert_connected(list, vsk); spin_unlock_bh(&vsock_table_lock); } EXPORT_SYMBOL_GPL(vsock_insert_connected); void vsock_remove_bound(struct vsock_sock *vsk) { spin_lock_bh(&vsock_table_lock); if (__vsock_in_bound_table(vsk)) __vsock_remove_bound(vsk); spin_unlock_bh(&vsock_table_lock); } EXPORT_SYMBOL_GPL(vsock_remove_bound); void vsock_remove_connected(struct vsock_sock *vsk) { spin_lock_bh(&vsock_table_lock); if (__vsock_in_connected_table(vsk)) __vsock_remove_connected(vsk); spin_unlock_bh(&vsock_table_lock); } EXPORT_SYMBOL_GPL(vsock_remove_connected); /* Find a bound socket, filtering by namespace and namespace mode. * * Use this in transports that are namespace-aware and can provide the * network namespace context. */ struct sock *vsock_find_bound_socket_net(struct sockaddr_vm *addr, struct net *net) { struct sock *sk; spin_lock_bh(&vsock_table_lock); sk = __vsock_find_bound_socket_net(addr, net); if (sk) sock_hold(sk); spin_unlock_bh(&vsock_table_lock); return sk; } EXPORT_SYMBOL_GPL(vsock_find_bound_socket_net); /* Find a bound socket without namespace filtering. * * Use this in transports that lack namespace context. All sockets are * treated as if in global mode. */ struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr) { return vsock_find_bound_socket_net(addr, NULL); } EXPORT_SYMBOL_GPL(vsock_find_bound_socket); /* Find a connected socket, filtering by namespace and namespace mode. * * Use this in transports that are namespace-aware and can provide the * network namespace context. */ struct sock *vsock_find_connected_socket_net(struct sockaddr_vm *src, struct sockaddr_vm *dst, struct net *net) { struct sock *sk; spin_lock_bh(&vsock_table_lock); sk = __vsock_find_connected_socket_net(src, dst, net); if (sk) sock_hold(sk); spin_unlock_bh(&vsock_table_lock); return sk; } EXPORT_SYMBOL_GPL(vsock_find_connected_socket_net); /* Find a connected socket without namespace filtering. * * Use this in transports that lack namespace context. All sockets are * treated as if in global mode. */ struct sock *vsock_find_connected_socket(struct sockaddr_vm *src, struct sockaddr_vm *dst) { return vsock_find_connected_socket_net(src, dst, NULL); } EXPORT_SYMBOL_GPL(vsock_find_connected_socket); void vsock_remove_sock(struct vsock_sock *vsk) { /* Transport reassignment must not remove the binding. */ if (sock_flag(sk_vsock(vsk), SOCK_DEAD)) vsock_remove_bound(vsk); vsock_remove_connected(vsk); } EXPORT_SYMBOL_GPL(vsock_remove_sock); void vsock_for_each_connected_socket(struct vsock_transport *transport, void (*fn)(struct sock *sk)) { int i; spin_lock_bh(&vsock_table_lock); for (i = 0; i < ARRAY_SIZE(vsock_connected_table); i++) { struct vsock_sock *vsk; list_for_each_entry(vsk, &vsock_connected_table[i], connected_table) { if (vsk->transport != transport) continue; fn(sk_vsock(vsk)); } } spin_unlock_bh(&vsock_table_lock); } EXPORT_SYMBOL_GPL(vsock_for_each_connected_socket); void vsock_add_pending(struct sock *listener, struct sock *pending) { struct vsock_sock *vlistener; struct vsock_sock *vpending; vlistener = vsock_sk(listener); vpending = vsock_sk(pending); sock_hold(pending); sock_hold(listener); list_add_tail(&vpending->pending_links, &vlistener->pending_links); } EXPORT_SYMBOL_GPL(vsock_add_pending); void vsock_remove_pending(struct sock *listener, struct sock *pending) { struct vsock_sock *vpending = vsock_sk(pending); list_del_init(&vpending->pending_links); sock_put(listener); sock_put(pending); } EXPORT_SYMBOL_GPL(vsock_remove_pending); void vsock_enqueue_accept(struct sock *listener, struct sock *connected) { struct vsock_sock *vlistener; struct vsock_sock *vconnected; vlistener = vsock_sk(listener); vconnected = vsock_sk(connected); sock_hold(connected); sock_hold(listener); list_add_tail(&vconnected->accept_queue, &vlistener->accept_queue); } EXPORT_SYMBOL_GPL(vsock_enqueue_accept); static bool vsock_use_local_transport(unsigned int remote_cid) { lockdep_assert_held(&vsock_register_mutex); if (!transport_local) return false; if (remote_cid == VMADDR_CID_LOCAL) return true; if (transport_g2h) { return remote_cid == transport_g2h->get_local_cid(); } else { return remote_cid == VMADDR_CID_HOST; } } static void vsock_deassign_transport(struct vsock_sock *vsk) { if (!vsk->transport) return; vsk->transport->destruct(vsk); module_put(vsk->transport->module); vsk->transport = NULL; } /* Assign a transport to a socket and call the .init transport callback. * * Note: for connection oriented socket this must be called when vsk->remote_addr * is set (e.g. during the connect() or when a connection request on a listener * socket is received). * The vsk->remote_addr is used to decide which transport to use: * - remote CID == VMADDR_CID_LOCAL or g2h->local_cid or VMADDR_CID_HOST if * g2h is not loaded, will use local transport; * - remote CID <= VMADDR_CID_HOST or remote flags field includes * VMADDR_FLAG_TO_HOST, will use guest->host transport; * - remote CID > VMADDR_CID_HOST and h2g is loaded and h2g claims that CID, * will use host->guest transport; * - h2g not loaded or h2g does not claim that CID and g2h claims the CID via * has_remote_cid, will use guest->host transport (when g2h_fallback=1) * - anything else goes to h2g or returns -ENODEV if no h2g is available */ int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk) { const struct vsock_transport *new_transport; struct sock *sk = sk_vsock(vsk); unsigned int remote_cid = vsk->remote_addr.svm_cid; __u8 remote_flags; int ret; /* If the packet is coming with the source and destination CIDs higher * than VMADDR_CID_HOST, then a vsock channel where all the packets are * forwarded to the host should be established. Then the host will * need to forward the packets to the guest. * * The flag is set on the (listen) receive path (psk is not NULL). On * the connect path the flag can be set by the user space application. */ if (psk && vsk->local_addr.svm_cid > VMADDR_CID_HOST && vsk->remote_addr.svm_cid > VMADDR_CID_HOST) vsk->remote_addr.svm_flags |= VMADDR_FLAG_TO_HOST; remote_flags = vsk->remote_addr.svm_flags; mutex_lock(&vsock_register_mutex); switch (sk->sk_type) { case SOCK_DGRAM: new_transport = transport_dgram; break; case SOCK_STREAM: case SOCK_SEQPACKET: if (vsock_use_local_transport(remote_cid)) new_transport = transport_local; else if (remote_cid <= VMADDR_CID_HOST || (remote_flags & VMADDR_FLAG_TO_HOST)) new_transport = transport_g2h; else if (transport_h2g && (!transport_h2g->has_remote_cid || transport_h2g->has_remote_cid(vsk, remote_cid))) new_transport = transport_h2g; else if (sock_net(sk)->vsock.g2h_fallback && transport_g2h && transport_g2h->has_remote_cid && transport_g2h->has_remote_cid(vsk, remote_cid)) { vsk->remote_addr.svm_flags |= VMADDR_FLAG_TO_HOST; new_transport = transport_g2h; } else { new_transport = transport_h2g; } break; default: ret = -ESOCKTNOSUPPORT; goto err; } if (vsk->transport && vsk->transport == new_transport) { ret = 0; goto err; } /* We increase the module refcnt to prevent the transport unloading * while there are open sockets assigned to it. */ if (!new_transport || !try_module_get(new_transport->module)) { ret = -ENODEV; goto err; } /* It's safe to release the mutex after a successful try_module_get(). * Whichever transport `new_transport` points at, it won't go away until * the last module_put() below or in vsock_deassign_transport(). */ mutex_unlock(&vsock_register_mutex); if (vsk->transport) { /* transport->release() must be called with sock lock acquired. * This path can only be taken during vsock_connect(), where we * have already held the sock lock. In the other cases, this * function is called on a new socket which is not assigned to * any transport. */ vsk->transport->release(vsk); vsock_deassign_transport(vsk); /* transport's release() and destruct() can touch some socket * state, since we are reassigning the socket to a new transport * during vsock_connect(), let's reset these fields to have a * clean state. */ sock_reset_flag(sk, SOCK_DONE); sk->sk_state = TCP_CLOSE; vsk->peer_shutdown = 0; } if (sk->sk_type == SOCK_SEQPACKET) { if (!new_transport->seqpacket_allow || !new_transport->seqpacket_allow(vsk, remote_cid)) { module_put(new_transport->module); return -ESOCKTNOSUPPORT; } } ret = new_transport->init(vsk, psk); if (ret) { module_put(new_transport->module); return ret; } vsk->transport = new_transport; return 0; err: mutex_unlock(&vsock_register_mutex); return ret; } EXPORT_SYMBOL_GPL(vsock_assign_transport); /* * Provide safe access to static transport_{h2g,g2h,dgram,local} callbacks. * Otherwise we may race with module removal. Do not use on `vsk->transport`. */ static u32 vsock_registered_transport_cid(const struct vsock_transport **transport) { u32 cid = VMADDR_CID_ANY; mutex_lock(&vsock_register_mutex); if (*transport) cid = (*transport)->get_local_cid(); mutex_unlock(&vsock_register_mutex); return cid; } bool vsock_find_cid(unsigned int cid) { if (cid == vsock_registered_transport_cid(&transport_g2h)) return true; if (transport_h2g && cid == VMADDR_CID_HOST) return true; if (transport_local && cid == VMADDR_CID_LOCAL) return true; return false; } EXPORT_SYMBOL_GPL(vsock_find_cid); static struct sock *vsock_dequeue_accept(struct sock *listener) { struct vsock_sock *vlistener; struct vsock_sock *vconnected; vlistener = vsock_sk(listener); if (list_empty(&vlistener->accept_queue)) return NULL; vconnected = list_entry(vlistener->accept_queue.next, struct vsock_sock, accept_queue); list_del_init(&vconnected->accept_queue); sock_put(listener); /* The caller will need a reference on the connected socket so we let * it call sock_put(). */ return sk_vsock(vconnected); } static bool vsock_is_accept_queue_empty(struct sock *sk) { struct vsock_sock *vsk = vsock_sk(sk); return list_empty(&vsk->accept_queue); } static bool vsock_is_pending(struct sock *sk) { struct vsock_sock *vsk = vsock_sk(sk); return !list_empty(&vsk->pending_links); } static int vsock_send_shutdown(struct sock *sk, int mode) { struct vsock_sock *vsk = vsock_sk(sk); if (!vsk->transport) return -ENODEV; return vsk->transport->shutdown(vsk, mode); } static void vsock_pending_work(struct work_struct *work) { struct sock *sk; struct sock *listener; struct vsock_sock *vsk; bool cleanup; vsk = container_of(work, struct vsock_sock, pending_work.work); sk = sk_vsock(vsk); listener = vsk->listener; cleanup = true; lock_sock(listener); lock_sock_nested(sk, SINGLE_DEPTH_NESTING); if (vsock_is_pending(sk)) { vsock_remove_pending(listener, sk); sk_acceptq_removed(listener); } else if (!vsk->rejected) { /* We are not on the pending list and accept() did not reject * us, so we must have been accepted by our user process. We * just need to drop our references to the sockets and be on * our way. */ cleanup = false; goto out; } /* We need to remove ourself from the global connected sockets list so * incoming packets can't find this socket, and to reduce the reference * count. */ vsock_remove_connected(vsk); sk->sk_state = TCP_CLOSE; out: release_sock(sk); release_sock(listener); if (cleanup) sock_put(sk); sock_put(sk); sock_put(listener); } /**** SOCKET OPERATIONS ****/ static int __vsock_bind_connectible(struct vsock_sock *vsk, struct sockaddr_vm *addr) { struct net *net = sock_net(sk_vsock(vsk)); struct sockaddr_vm new_addr; if (!net->vsock.port) net->vsock.port = get_random_u32_above(LAST_RESERVED_PORT); vsock_addr_init(&new_addr, addr->svm_cid, addr->svm_port); if (addr->svm_port == VMADDR_PORT_ANY) { bool found = false; unsigned int i; for (i = 0; i < MAX_PORT_RETRIES; i++) { if (net->vsock.port == VMADDR_PORT_ANY || net->vsock.port <= LAST_RESERVED_PORT) net->vsock.port = LAST_RESERVED_PORT + 1; new_addr.svm_port = net->vsock.port++; if (!__vsock_find_bound_socket_net(&new_addr, net)) { found = true; break; } } if (!found) return -EADDRNOTAVAIL; } else { /* If port is in reserved range, ensure caller * has necessary privileges. */ if (addr->svm_port <= LAST_RESERVED_PORT && !capable(CAP_NET_BIND_SERVICE)) { return -EACCES; } if (__vsock_find_bound_socket_net(&new_addr, net)) return -EADDRINUSE; } vsock_addr_init(&vsk->local_addr, new_addr.svm_cid, new_addr.svm_port); /* Remove connection oriented sockets from the unbound list and add them * to the hash table for easy lookup by its address. The unbound list * is simply an extra entry at the end of the hash table, a trick used * by AF_UNIX. */ __vsock_remove_bound(vsk); __vsock_insert_bound(vsock_bound_sockets(&vsk->local_addr), vsk); return 0; } static int __vsock_bind_dgram(struct vsock_sock *vsk, struct sockaddr_vm *addr) { return vsk->transport->dgram_bind(vsk, addr); } static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr) { struct vsock_sock *vsk = vsock_sk(sk); int retval; /* First ensure this socket isn't already bound. */ if (vsock_addr_bound(&vsk->local_addr)) return -EINVAL; /* Now bind to the provided address or select appropriate values if * none are provided (VMADDR_CID_ANY and VMADDR_PORT_ANY). Note that * like AF_INET prevents binding to a non-local IP address (in most * cases), we only allow binding to a local CID. */ if (addr->svm_cid != VMADDR_CID_ANY && !vsock_find_cid(addr->svm_cid)) return -EADDRNOTAVAIL; switch (sk->sk_socket->type) { case SOCK_STREAM: case SOCK_SEQPACKET: spin_lock_bh(&vsock_table_lock); retval = __vsock_bind_connectible(vsk, addr); spin_unlock_bh(&vsock_table_lock); break; case SOCK_DGRAM: retval = __vsock_bind_dgram(vsk, addr); break; default: retval = -EINVAL; break; } return retval; } static void vsock_connect_timeout(struct work_struct *work); static struct sock *__vsock_create(struct net *net, struct socket *sock, struct sock *parent, gfp_t priority, unsigned short type, int kern) { struct sock *sk; struct vsock_sock *psk; struct vsock_sock *vsk; sk = sk_alloc(net, AF_VSOCK, priority, &vsock_proto, kern); if (!sk) return NULL; sock_init_data(sock, sk); /* sk->sk_type is normally set in sock_init_data, but only if sock is * non-NULL. We make sure that our sockets always have a type by * setting it here if needed. */ if (!sock) sk->sk_type = type; vsk = vsock_sk(sk); vsock_addr_init(&vsk->local_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY); vsock_addr_init(&vsk->remote_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY); sk->sk_destruct = vsock_sk_destruct; sk->sk_backlog_rcv = vsock_queue_rcv_skb; sock_reset_flag(sk, SOCK_DONE); INIT_LIST_HEAD(&vsk->bound_table); INIT_LIST_HEAD(&vsk->connected_table); vsk->listener = NULL; INIT_LIST_HEAD(&vsk->pending_links); INIT_LIST_HEAD(&vsk->accept_queue); vsk->rejected = false; vsk->sent_request = false; vsk->ignore_connecting_rst = false; vsk->peer_shutdown = 0; INIT_DELAYED_WORK(&vsk->connect_work, vsock_connect_timeout); INIT_DELAYED_WORK(&vsk->pending_work, vsock_pending_work); psk = parent ? vsock_sk(parent) : NULL; if (parent) { vsk->trusted = psk->trusted; vsk->owner = get_cred(psk->owner); vsk->connect_timeout = psk->connect_timeout; vsk->buffer_size = psk->buffer_size; vsk->buffer_min_size = psk->buffer_min_size; vsk->buffer_max_size = psk->buffer_max_size; security_sk_clone(parent, sk); } else { vsk->trusted = ns_capable_noaudit(&init_user_ns, CAP_NET_ADMIN); vsk->owner = get_current_cred(); vsk->connect_timeout = VSOCK_DEFAULT_CONNECT_TIMEOUT; vsk->buffer_size = VSOCK_DEFAULT_BUFFER_SIZE; vsk->buffer_min_size = VSOCK_DEFAULT_BUFFER_MIN_SIZE; vsk->buffer_max_size = VSOCK_DEFAULT_BUFFER_MAX_SIZE; } return sk; } static bool sock_type_connectible(u16 type) { return (type == SOCK_STREAM) || (type == SOCK_SEQPACKET); } static void __vsock_release(struct sock *sk, int level) { struct vsock_sock *vsk; struct sock *pending; vsk = vsock_sk(sk); pending = NULL; /* Compiler warning. */ /* When "level" is SINGLE_DEPTH_NESTING, use the nested * version to avoid the warning "possible recursive locking * detected". When "level" is 0, lock_sock_nested(sk, level) * is the same as lock_sock(sk). */ lock_sock_nested(sk, level); /* Indicate to vsock_remove_sock() that the socket is being released and * can be removed from the bound_table. Unlike transport reassignment * case, where the socket must remain bound despite vsock_remove_sock() * being called from the transport release() callback. */ sock_set_flag(sk, SOCK_DEAD); if (vsk->transport) vsk->transport->release(vsk); else if (sock_type_connectible(sk->sk_type)) vsock_remove_sock(vsk); sock_orphan(sk); sk->sk_shutdown = SHUTDOWN_MASK; skb_queue_purge(&sk->sk_receive_queue); /* Clean up any sockets that never were accepted. */ while ((pending = vsock_dequeue_accept(sk)) != NULL) { __vsock_release(pending, SINGLE_DEPTH_NESTING); sock_put(pending); } release_sock(sk); sock_put(sk); } static void vsock_sk_destruct(struct sock *sk) { struct vsock_sock *vsk = vsock_sk(sk); /* Flush MSG_ZEROCOPY leftovers. */ __skb_queue_purge(&sk->sk_error_queue); vsock_deassign_transport(vsk); /* When clearing these addresses, there's no need to set the family and * possibly register the address family with the kernel. */ vsock_addr_init(&vsk->local_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY); vsock_addr_init(&vsk->remote_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY); put_cred(vsk->owner); } static int vsock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { int err; err = sock_queue_rcv_skb(sk, skb); if (err) kfree_skb(skb); return err; } struct sock *vsock_create_connected(struct sock *parent) { return __vsock_create(sock_net(parent), NULL, parent, GFP_KERNEL, parent->sk_type, 0); } EXPORT_SYMBOL_GPL(vsock_create_connected); s64 vsock_stream_has_data(struct vsock_sock *vsk) { if (WARN_ON(!vsk->transport)) return 0; return vsk->transport->stream_has_data(vsk); } EXPORT_SYMBOL_GPL(vsock_stream_has_data); s64 vsock_connectible_has_data(struct vsock_sock *vsk) { struct sock *sk = sk_vsock(vsk); if (WARN_ON(!vsk->transport)) return 0; if (sk->sk_type == SOCK_SEQPACKET) return vsk->transport->seqpacket_has_data(vsk); else return vsock_stream_has_data(vsk); } EXPORT_SYMBOL_GPL(vsock_connectible_has_data); s64 vsock_stream_has_space(struct vsock_sock *vsk) { if (WARN_ON(!vsk->transport)) return 0; return vsk->transport->stream_has_space(vsk); } EXPORT_SYMBOL_GPL(vsock_stream_has_space); void vsock_data_ready(struct sock *sk) { struct vsock_sock *vsk = vsock_sk(sk); if (vsock_stream_has_data(vsk) >= sk->sk_rcvlowat || sock_flag(sk, SOCK_DONE)) sk->sk_data_ready(sk); } EXPORT_SYMBOL_GPL(vsock_data_ready); /* Dummy callback required by sockmap. * See unconditional call of saved_close() in sock_map_close(). */ static void vsock_close(struct sock *sk, long timeout) { } static int vsock_release(struct socket *sock) { struct sock *sk = sock->sk; if (!sk) return 0; sk->sk_prot->close(sk, 0); __vsock_release(sk, 0); sock->sk = NULL; sock->state = SS_FREE; return 0; } static int vsock_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr_len) { int err; struct sock *sk; struct sockaddr_vm *vm_addr; sk = sock->sk; if (vsock_addr_cast(addr, addr_len, &vm_addr) != 0) return -EINVAL; lock_sock(sk); err = __vsock_bind(sk, vm_addr); release_sock(sk); return err; } static int vsock_getname(struct socket *sock, struct sockaddr *addr, int peer) { int err; struct sock *sk; struct vsock_sock *vsk; struct sockaddr_vm *vm_addr; sk = sock->sk; vsk = vsock_sk(sk); err = 0; lock_sock(sk); if (peer) { if (sock->state != SS_CONNECTED) { err = -ENOTCONN; goto out; } vm_addr = &vsk->remote_addr; } else { vm_addr = &vsk->local_addr; } BUILD_BUG_ON(sizeof(*vm_addr) > sizeof(struct sockaddr_storage)); memcpy(addr, vm_addr, sizeof(*vm_addr)); err = sizeof(*vm_addr); out: release_sock(sk); return err; } void vsock_linger(struct sock *sk) { DEFINE_WAIT_FUNC(wait, woken_wake_function); ssize_t (*unsent)(struct vsock_sock *vsk); struct vsock_sock *vsk = vsock_sk(sk); long timeout; if (!sock_flag(sk, SOCK_LINGER)) return; timeout = sk->sk_lingertime; if (!timeout) return; /* Transports must implement `unsent_bytes` if they want to support * SOCK_LINGER through `vsock_linger()` since we use it to check when * the socket can be closed. */ unsent = vsk->transport->unsent_bytes; if (!unsent) return; add_wait_queue(sk_sleep(sk), &wait); do { if (sk_wait_event(sk, &timeout, unsent(vsk) == 0, &wait)) break; } while (!signal_pending(current) && timeout); remove_wait_queue(sk_sleep(sk), &wait); } EXPORT_SYMBOL_GPL(vsock_linger); static int vsock_shutdown(struct socket *sock, int mode) { int err; struct sock *sk; /* User level uses SHUT_RD (0) and SHUT_WR (1), but the kernel uses * RCV_SHUTDOWN (1) and SEND_SHUTDOWN (2), so we must increment mode * here like the other address families do. Note also that the * increment makes SHUT_RDWR (2) into RCV_SHUTDOWN | SEND_SHUTDOWN (3), * which is what we want. */ mode++; if ((mode & ~SHUTDOWN_MASK) || !mode) return -EINVAL; /* If this is a connection oriented socket and it is not connected then * bail out immediately. If it is a DGRAM socket then we must first * kick the socket so that it wakes up from any sleeping calls, for * example recv(), and then afterwards return the error. */ sk = sock->sk; lock_sock(sk); if (sock->state == SS_UNCONNECTED) { err = -ENOTCONN; if (sock_type_connectible(sk->sk_type)) goto out; } else { sock->state = SS_DISCONNECTING; err = 0; } /* Receive and send shutdowns are treated alike. */ mode = mode & (RCV_SHUTDOWN | SEND_SHUTDOWN); if (mode) { sk->sk_shutdown |= mode; sk->sk_state_change(sk); if (sock_type_connectible(sk->sk_type)) { sock_reset_flag(sk, SOCK_DONE); vsock_send_shutdown(sk, mode); } } out: release_sock(sk); return err; } static __poll_t vsock_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk; __poll_t mask; struct vsock_sock *vsk; sk = sock->sk; vsk = vsock_sk(sk); poll_wait(file, sk_sleep(sk), wait); mask = 0; if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue)) /* Signify that there has been an error on this socket. */ mask |= EPOLLERR; /* INET sockets treat local write shutdown and peer write shutdown as a * case of EPOLLHUP set. */ if ((sk->sk_shutdown == SHUTDOWN_MASK) || ((sk->sk_shutdown & SEND_SHUTDOWN) && (vsk->peer_shutdown & SEND_SHUTDOWN))) { mask |= EPOLLHUP; } if (sk->sk_shutdown & RCV_SHUTDOWN || vsk->peer_shutdown & SEND_SHUTDOWN) { mask |= EPOLLRDHUP; } if (sk_is_readable(sk)) mask |= EPOLLIN | EPOLLRDNORM; if (sock->type == SOCK_DGRAM) { /* For datagram sockets we can read if there is something in * the queue and write as long as the socket isn't shutdown for * sending. */ if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || (sk->sk_shutdown & RCV_SHUTDOWN)) { mask |= EPOLLIN | EPOLLRDNORM; } if (!(sk->sk_shutdown & SEND_SHUTDOWN)) mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; } else if (sock_type_connectible(sk->sk_type)) { const struct vsock_transport *transport; lock_sock(sk); transport = vsk->transport; /* Listening sockets that have connections in their accept * queue can be read. */ if (sk->sk_state == TCP_LISTEN && !vsock_is_accept_queue_empty(sk)) mask |= EPOLLIN | EPOLLRDNORM; /* If there is something in the queue then we can read. */ if (transport && transport->stream_is_active(vsk) && !(sk->sk_shutdown & RCV_SHUTDOWN)) { bool data_ready_now = false; int target = sock_rcvlowat(sk, 0, INT_MAX); int ret = transport->notify_poll_in( vsk, target, &data_ready_now); if (ret < 0) { mask |= EPOLLERR; } else { if (data_ready_now) mask |= EPOLLIN | EPOLLRDNORM; } } /* Sockets whose connections have been closed, reset, or * terminated should also be considered read, and we check the * shutdown flag for that. */ if (sk->sk_shutdown & RCV_SHUTDOWN || vsk->peer_shutdown & SEND_SHUTDOWN) { mask |= EPOLLIN | EPOLLRDNORM; } /* Connected sockets that can produce data can be written. */ if (transport && sk->sk_state == TCP_ESTABLISHED) { if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { bool space_avail_now = false; int ret = transport->notify_poll_out( vsk, 1, &space_avail_now); if (ret < 0) { mask |= EPOLLERR; } else { if (space_avail_now) /* Remove EPOLLWRBAND since INET * sockets are not setting it. */ mask |= EPOLLOUT | EPOLLWRNORM; } } } /* Simulate INET socket poll behaviors, which sets * EPOLLOUT|EPOLLWRNORM when peer is closed and nothing to read, * but local send is not shutdown. */ if (sk->sk_state == TCP_CLOSE || sk->sk_state == TCP_CLOSING) { if (!(sk->sk_shutdown & SEND_SHUTDOWN)) mask |= EPOLLOUT | EPOLLWRNORM; } release_sock(sk); } return mask; } static int vsock_read_skb(struct sock *sk, skb_read_actor_t read_actor) { struct vsock_sock *vsk = vsock_sk(sk); if (WARN_ON_ONCE(!vsk->transport)) return -ENODEV; return vsk->transport->read_skb(vsk, read_actor); } static int vsock_dgram_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { int err; struct sock *sk; struct vsock_sock *vsk; struct sockaddr_vm *remote_addr; const struct vsock_transport *transport; if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; /* For now, MSG_DONTWAIT is always assumed... */ err = 0; sk = sock->sk; vsk = vsock_sk(sk); lock_sock(sk); transport = vsk->transport; err = vsock_auto_bind(vsk); if (err) goto out; /* If the provided message contains an address, use that. Otherwise * fall back on the socket's remote handle (if it has been connected). */ if (msg->msg_name && vsock_addr_cast(msg->msg_name, msg->msg_namelen, &remote_addr) == 0) { /* Ensure this address is of the right type and is a valid * destination. */ if (remote_addr->svm_cid == VMADDR_CID_ANY) remote_addr->svm_cid = transport->get_local_cid(); if (!vsock_addr_bound(remote_addr)) { err = -EINVAL; goto out; } } else if (sock->state == SS_CONNECTED) { remote_addr = &vsk->remote_addr; if (remote_addr->svm_cid == VMADDR_CID_ANY) remote_addr->svm_cid = transport->get_local_cid(); /* XXX Should connect() or this function ensure remote_addr is * bound? */ if (!vsock_addr_bound(&vsk->remote_addr)) { err = -EINVAL; goto out; } } else { err = -EINVAL; goto out; } if (!transport->dgram_allow(vsk, remote_addr->svm_cid, remote_addr->svm_port)) { err = -EINVAL; goto out; } err = transport->dgram_enqueue(vsk, remote_addr, msg, len); out: release_sock(sk); return err; } static int vsock_dgram_connect(struct socket *sock, struct sockaddr_unsized *addr, int addr_len, int flags) { int err; struct sock *sk; struct vsock_sock *vsk; struct sockaddr_vm *remote_addr; sk = sock->sk; vsk = vsock_sk(sk); err = vsock_addr_cast(addr, addr_len, &remote_addr); if (err == -EAFNOSUPPORT && remote_addr->svm_family == AF_UNSPEC) { lock_sock(sk); vsock_addr_init(&vsk->remote_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY); sock->state = SS_UNCONNECTED; release_sock(sk); return 0; } else if (err != 0) return -EINVAL; lock_sock(sk); err = vsock_auto_bind(vsk); if (err) goto out; if (!vsk->transport->dgram_allow(vsk, remote_addr->svm_cid, remote_addr->svm_port)) { err = -EINVAL; goto out; } memcpy(&vsk->remote_addr, remote_addr, sizeof(vsk->remote_addr)); sock->state = SS_CONNECTED; /* sock map disallows redirection of non-TCP sockets with sk_state != * TCP_ESTABLISHED (see sock_map_redirect_allowed()), so we set * TCP_ESTABLISHED here to allow redirection of connected vsock dgrams. * * This doesn't seem to be abnormal state for datagram sockets, as the * same approach can be see in other datagram socket types as well * (such as unix sockets). */ sk->sk_state = TCP_ESTABLISHED; out: release_sock(sk); return err; } int __vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct sock *sk = sock->sk; struct vsock_sock *vsk = vsock_sk(sk); return vsk->transport->dgram_dequeue(vsk, msg, len, flags); } int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { #ifdef CONFIG_BPF_SYSCALL struct sock *sk = sock->sk; const struct proto *prot; prot = READ_ONCE(sk->sk_prot); if (prot != &vsock_proto) return prot->recvmsg(sk, msg, len, flags); #endif return __vsock_dgram_recvmsg(sock, msg, len, flags); } EXPORT_SYMBOL_GPL(vsock_dgram_recvmsg); static int vsock_do_ioctl(struct socket *sock, unsigned int cmd, int __user *arg) { struct sock *sk = sock->sk; struct vsock_sock *vsk; int ret; vsk = vsock_sk(sk); switch (cmd) { case SIOCINQ: { ssize_t n_bytes; if (!vsk->transport) { ret = -EOPNOTSUPP; break; } if (sock_type_connectible(sk->sk_type) && sk->sk_state == TCP_LISTEN) { ret = -EINVAL; break; } n_bytes = vsock_stream_has_data(vsk); if (n_bytes < 0) { ret = n_bytes; break; } ret = put_user(n_bytes, arg); break; } case SIOCOUTQ: { ssize_t n_bytes; if (!vsk->transport || !vsk->transport->unsent_bytes) { ret = -EOPNOTSUPP; break; } if (sock_type_connectible(sk->sk_type) && sk->sk_state == TCP_LISTEN) { ret = -EINVAL; break; } n_bytes = vsk->transport->unsent_bytes(vsk); if (n_bytes < 0) { ret = n_bytes; break; } ret = put_user(n_bytes, arg); break; } default: ret = -ENOIOCTLCMD; } return ret; } static int vsock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { int ret; lock_sock(sock->sk); ret = vsock_do_ioctl(sock, cmd, (int __user *)arg); release_sock(sock->sk); return ret; } static const struct proto_ops vsock_dgram_ops = { .family = PF_VSOCK, .owner = THIS_MODULE, .release = vsock_release, .bind = vsock_bind, .connect = vsock_dgram_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = vsock_getname, .poll = vsock_poll, .ioctl = vsock_ioctl, .listen = sock_no_listen, .shutdown = vsock_shutdown, .sendmsg = vsock_dgram_sendmsg, .recvmsg = vsock_dgram_recvmsg, .mmap = sock_no_mmap, .read_skb = vsock_read_skb, }; static int vsock_transport_cancel_pkt(struct vsock_sock *vsk) { const struct vsock_transport *transport = vsk->transport; if (!transport || !transport->cancel_pkt) return -EOPNOTSUPP; return transport->cancel_pkt(vsk); } static void vsock_connect_timeout(struct work_struct *work) { struct sock *sk; struct vsock_sock *vsk; vsk = container_of(work, struct vsock_sock, connect_work.work); sk = sk_vsock(vsk); lock_sock(sk); if (sk->sk_state == TCP_SYN_SENT && (sk->sk_shutdown != SHUTDOWN_MASK)) { sk->sk_state = TCP_CLOSE; sk->sk_socket->state = SS_UNCONNECTED; sk->sk_err = ETIMEDOUT; sk_error_report(sk); vsock_transport_cancel_pkt(vsk); } release_sock(sk); sock_put(sk); } static int vsock_connect(struct socket *sock, struct sockaddr_unsized *addr, int addr_len, int flags) { int err; struct sock *sk; struct vsock_sock *vsk; const struct vsock_transport *transport; struct sockaddr_vm *remote_addr; long timeout; DEFINE_WAIT(wait); err = 0; sk = sock->sk; vsk = vsock_sk(sk); lock_sock(sk); /* XXX AF_UNSPEC should make us disconnect like AF_INET. */ switch (sock->state) { case SS_CONNECTED: err = -EISCONN; goto out; case SS_DISCONNECTING: err = -EINVAL; goto out; case SS_CONNECTING: /* This continues on so we can move sock into the SS_CONNECTED * state once the connection has completed (at which point err * will be set to zero also). Otherwise, we will either wait * for the connection or return -EALREADY should this be a * non-blocking call. */ err = -EALREADY; if (flags & O_NONBLOCK) goto out; break; default: if ((sk->sk_state == TCP_LISTEN) || vsock_addr_cast(addr, addr_len, &remote_addr) != 0) { err = -EINVAL; goto out; } /* Set the remote address that we are connecting to. */ memcpy(&vsk->remote_addr, remote_addr, sizeof(vsk->remote_addr)); err = vsock_assign_transport(vsk, NULL); if (err) goto out; transport = vsk->transport; /* The hypervisor and well-known contexts do not have socket * endpoints. */ if (!transport || !transport->stream_allow(vsk, remote_addr->svm_cid, remote_addr->svm_port)) { err = -ENETUNREACH; goto out; } if (vsock_msgzerocopy_allow(transport)) { set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); } else if (sock_flag(sk, SOCK_ZEROCOPY)) { /* If this option was set before 'connect()', * when transport was unknown, check that this * feature is supported here. */ err = -EOPNOTSUPP; goto out; } err = vsock_auto_bind(vsk); if (err) goto out; sk->sk_state = TCP_SYN_SENT; err = transport->connect(vsk); if (err < 0) goto out; /* sk_err might have been set as a result of an earlier * (failed) connect attempt. */ sk->sk_err = 0; /* Mark sock as connecting and set the error code to in * progress in case this is a non-blocking connect. */ sock->state = SS_CONNECTING; err = -EINPROGRESS; } /* The receive path will handle all communication until we are able to * enter the connected state. Here we wait for the connection to be * completed or a notification of an error. */ timeout = vsk->connect_timeout; prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); /* If the socket is already closing or it is in an error state, there * is no point in waiting. */ while (sk->sk_state != TCP_ESTABLISHED && sk->sk_state != TCP_CLOSING && sk->sk_err == 0) { if (flags & O_NONBLOCK) { /* If we're not going to block, we schedule a timeout * function to generate a timeout on the connection * attempt, in case the peer doesn't respond in a * timely manner. We hold on to the socket until the * timeout fires. */ sock_hold(sk); /* If the timeout function is already scheduled, * reschedule it, then ungrab the socket refcount to * keep it balanced. */ if (mod_delayed_work(system_percpu_wq, &vsk->connect_work, timeout)) sock_put(sk); /* Skip ahead to preserve error code set above. */ goto out_wait; } release_sock(sk); timeout = schedule_timeout(timeout); lock_sock(sk); /* Connection established. Whatever happens to socket once we * release it, that's not connect()'s concern. No need to go * into signal and timeout handling. Call it a day. * * Note that allowing to "reset" an already established socket * here is racy and insecure. */ if (sk->sk_state == TCP_ESTABLISHED) break; /* If connection was _not_ established and a signal/timeout came * to be, we want the socket's state reset. User space may want * to retry. * * sk_state != TCP_ESTABLISHED implies that socket is not on * vsock_connected_table. We keep the binding and the transport * assigned. */ if (signal_pending(current) || timeout == 0) { err = timeout == 0 ? -ETIMEDOUT : sock_intr_errno(timeout); /* Listener might have already responded with * VIRTIO_VSOCK_OP_RESPONSE. Its handling expects our * sk_state == TCP_SYN_SENT, which hereby we break. * In such case VIRTIO_VSOCK_OP_RST will follow. */ sk->sk_state = TCP_CLOSE; sock->state = SS_UNCONNECTED; /* Try to cancel VIRTIO_VSOCK_OP_REQUEST skb sent out by * transport->connect(). */ vsock_transport_cancel_pkt(vsk); goto out_wait; } prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); } if (sk->sk_err) { err = -sk->sk_err; sk->sk_state = TCP_CLOSE; sock->state = SS_UNCONNECTED; } else { err = 0; } out_wait: finish_wait(sk_sleep(sk), &wait); out: release_sock(sk); return err; } static int vsock_accept(struct socket *sock, struct socket *newsock, struct proto_accept_arg *arg) { struct sock *listener; int err; struct sock *connected; struct vsock_sock *vconnected; long timeout; DEFINE_WAIT(wait); err = 0; listener = sock->sk; lock_sock(listener); if (!sock_type_connectible(sock->type)) { err = -EOPNOTSUPP; goto out; } if (listener->sk_state != TCP_LISTEN) { err = -EINVAL; goto out; } /* Wait for children sockets to appear; these are the new sockets * created upon connection establishment. */ timeout = sock_rcvtimeo(listener, arg->flags & O_NONBLOCK); while ((connected = vsock_dequeue_accept(listener)) == NULL && listener->sk_err == 0 && timeout != 0) { prepare_to_wait(sk_sleep(listener), &wait, TASK_INTERRUPTIBLE); release_sock(listener); timeout = schedule_timeout(timeout); finish_wait(sk_sleep(listener), &wait); lock_sock(listener); if (signal_pending(current)) { err = sock_intr_errno(timeout); goto out; } } if (listener->sk_err) { err = -listener->sk_err; } else if (!connected) { err = -EAGAIN; } if (connected) { sk_acceptq_removed(listener); lock_sock_nested(connected, SINGLE_DEPTH_NESTING); vconnected = vsock_sk(connected); /* If the listener socket has received an error, then we should * reject this socket and return. Note that we simply mark the * socket rejected, drop our reference, and let the cleanup * function handle the cleanup; the fact that we found it in * the listener's accept queue guarantees that the cleanup * function hasn't run yet. */ if (err) { vconnected->rejected = true; } else { newsock->state = SS_CONNECTED; sock_graft(connected, newsock); set_bit(SOCK_CUSTOM_SOCKOPT, &connected->sk_socket->flags); if (vsock_msgzerocopy_allow(vconnected->transport)) set_bit(SOCK_SUPPORT_ZC, &connected->sk_socket->flags); } release_sock(connected); sock_put(connected); } out: release_sock(listener); return err; } static int vsock_listen(struct socket *sock, int backlog) { int err; struct sock *sk; struct vsock_sock *vsk; sk = sock->sk; lock_sock(sk); if (!sock_type_connectible(sk->sk_type)) { err = -EOPNOTSUPP; goto out; } if (sock->state != SS_UNCONNECTED) { err = -EINVAL; goto out; } vsk = vsock_sk(sk); if (!vsock_addr_bound(&vsk->local_addr)) { err = -EINVAL; goto out; } sk->sk_max_ack_backlog = backlog; sk->sk_state = TCP_LISTEN; err = 0; out: release_sock(sk); return err; } static void vsock_update_buffer_size(struct vsock_sock *vsk, const struct vsock_transport *transport, u64 val) { if (val < vsk->buffer_min_size) val = vsk->buffer_min_size; if (val > vsk->buffer_max_size) val = vsk->buffer_max_size; if (val != vsk->buffer_size && transport && transport->notify_buffer_size) transport->notify_buffer_size(vsk, &val); vsk->buffer_size = val; } static int vsock_connectible_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { int err; struct sock *sk; struct vsock_sock *vsk; const struct vsock_transport *transport; u64 val; if (level != AF_VSOCK && level != SOL_SOCKET) return -ENOPROTOOPT; #define COPY_IN(_v) \ do { \ if (optlen < sizeof(_v)) { \ err = -EINVAL; \ goto exit; \ } \ if (copy_from_sockptr(&_v, optval, sizeof(_v)) != 0) { \ err = -EFAULT; \ goto exit; \ } \ } while (0) err = 0; sk = sock->sk; vsk = vsock_sk(sk); lock_sock(sk); transport = vsk->transport; if (level == SOL_SOCKET) { int zerocopy; if (optname != SO_ZEROCOPY) { release_sock(sk); return sock_setsockopt(sock, level, optname, optval, optlen); } /* Use 'int' type here, because variable to * set this option usually has this type. */ COPY_IN(zerocopy); if (zerocopy < 0 || zerocopy > 1) { err = -EINVAL; goto exit; } if (transport && !vsock_msgzerocopy_allow(transport)) { err = -EOPNOTSUPP; goto exit; } sock_valbool_flag(sk, SOCK_ZEROCOPY, zerocopy); goto exit; } switch (optname) { case SO_VM_SOCKETS_BUFFER_SIZE: COPY_IN(val); vsock_update_buffer_size(vsk, transport, val); break; case SO_VM_SOCKETS_BUFFER_MAX_SIZE: COPY_IN(val); vsk->buffer_max_size = val; vsock_update_buffer_size(vsk, transport, vsk->buffer_size); break; case SO_VM_SOCKETS_BUFFER_MIN_SIZE: COPY_IN(val); vsk->buffer_min_size = val; vsock_update_buffer_size(vsk, transport, vsk->buffer_size); break; case SO_VM_SOCKETS_CONNECT_TIMEOUT_NEW: case SO_VM_SOCKETS_CONNECT_TIMEOUT_OLD: { struct __kernel_sock_timeval tv; err = sock_copy_user_timeval(&tv, optval, optlen, optname == SO_VM_SOCKETS_CONNECT_TIMEOUT_OLD); if (err) break; if (tv.tv_sec >= 0 && tv.tv_usec < USEC_PER_SEC && tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)) { vsk->connect_timeout = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, (USEC_PER_SEC / HZ)); if (vsk->connect_timeout == 0) vsk->connect_timeout = VSOCK_DEFAULT_CONNECT_TIMEOUT; } else { err = -ERANGE; } break; } default: err = -ENOPROTOOPT; break; } #undef COPY_IN exit: release_sock(sk); return err; } static int vsock_connectible_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; struct vsock_sock *vsk = vsock_sk(sk); union { u64 val64; struct old_timeval32 tm32; struct __kernel_old_timeval tm; struct __kernel_sock_timeval stm; } v; int lv = sizeof(v.val64); int len; if (level != AF_VSOCK) return -ENOPROTOOPT; if (get_user(len, optlen)) return -EFAULT; memset(&v, 0, sizeof(v)); switch (optname) { case SO_VM_SOCKETS_BUFFER_SIZE: v.val64 = vsk->buffer_size; break; case SO_VM_SOCKETS_BUFFER_MAX_SIZE: v.val64 = vsk->buffer_max_size; break; case SO_VM_SOCKETS_BUFFER_MIN_SIZE: v.val64 = vsk->buffer_min_size; break; case SO_VM_SOCKETS_CONNECT_TIMEOUT_NEW: case SO_VM_SOCKETS_CONNECT_TIMEOUT_OLD: lv = sock_get_timeout(vsk->connect_timeout, &v, optname == SO_VM_SOCKETS_CONNECT_TIMEOUT_OLD); break; default: return -ENOPROTOOPT; } if (len < lv) return -EINVAL; if (len > lv) len = lv; if (copy_to_user(optval, &v, len)) return -EFAULT; if (put_user(len, optlen)) return -EFAULT; return 0; } static int vsock_connectible_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk; struct vsock_sock *vsk; const struct vsock_transport *transport; ssize_t total_written; long timeout; int err; struct vsock_transport_send_notify_data send_data; DEFINE_WAIT_FUNC(wait, woken_wake_function); sk = sock->sk; vsk = vsock_sk(sk); total_written = 0; err = 0; if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; lock_sock(sk); transport = vsk->transport; /* Callers should not provide a destination with connection oriented * sockets. */ if (msg->msg_namelen) { err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP; goto out; } /* Send data only if both sides are not shutdown in the direction. */ if (sk->sk_shutdown & SEND_SHUTDOWN || vsk->peer_shutdown & RCV_SHUTDOWN) { err = -EPIPE; goto out; } if (!transport || sk->sk_state != TCP_ESTABLISHED || !vsock_addr_bound(&vsk->local_addr)) { err = -ENOTCONN; goto out; } if (!vsock_addr_bound(&vsk->remote_addr)) { err = -EDESTADDRREQ; goto out; } if (msg->msg_flags & MSG_ZEROCOPY && !vsock_msgzerocopy_allow(transport)) { err = -EOPNOTSUPP; goto out; } /* Wait for room in the produce queue to enqueue our user's data. */ timeout = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); err = transport->notify_send_init(vsk, &send_data); if (err < 0) goto out; while (total_written < len) { ssize_t written; add_wait_queue(sk_sleep(sk), &wait); while (vsock_stream_has_space(vsk) == 0 && sk->sk_err == 0 && !(sk->sk_shutdown & SEND_SHUTDOWN) && !(vsk->peer_shutdown & RCV_SHUTDOWN)) { /* Don't wait for non-blocking sockets. */ if (timeout == 0) { err = -EAGAIN; remove_wait_queue(sk_sleep(sk), &wait); goto out_err; } err = transport->notify_send_pre_block(vsk, &send_data); if (err < 0) { remove_wait_queue(sk_sleep(sk), &wait); goto out_err; } release_sock(sk); timeout = wait_woken(&wait, TASK_INTERRUPTIBLE, timeout); lock_sock(sk); if (signal_pending(current)) { err = sock_intr_errno(timeout); remove_wait_queue(sk_sleep(sk), &wait); goto out_err; } else if (timeout == 0) { err = -EAGAIN; remove_wait_queue(sk_sleep(sk), &wait); goto out_err; } } remove_wait_queue(sk_sleep(sk), &wait); /* These checks occur both as part of and after the loop * conditional since we need to check before and after * sleeping. */ if (sk->sk_err) { err = -sk->sk_err; goto out_err; } else if ((sk->sk_shutdown & SEND_SHUTDOWN) || (vsk->peer_shutdown & RCV_SHUTDOWN)) { err = -EPIPE; goto out_err; } err = transport->notify_send_pre_enqueue(vsk, &send_data); if (err < 0) goto out_err; /* Note that enqueue will only write as many bytes as are free * in the produce queue, so we don't need to ensure len is * smaller than the queue size. It is the caller's * responsibility to check how many bytes we were able to send. */ if (sk->sk_type == SOCK_SEQPACKET) { written = transport->seqpacket_enqueue(vsk, msg, len - total_written); } else { written = transport->stream_enqueue(vsk, msg, len - total_written); } if (written < 0) { err = written; goto out_err; } total_written += written; err = transport->notify_send_post_enqueue( vsk, written, &send_data); if (err < 0) goto out_err; } out_err: if (total_written > 0) { /* Return number of written bytes only if: * 1) SOCK_STREAM socket. * 2) SOCK_SEQPACKET socket when whole buffer is sent. */ if (sk->sk_type == SOCK_STREAM || total_written == len) err = total_written; } out: if (sk->sk_type == SOCK_STREAM) err = sk_stream_error(sk, msg->msg_flags, err); release_sock(sk); return err; } static int vsock_connectible_wait_data(struct sock *sk, struct wait_queue_entry *wait, long timeout, struct vsock_transport_recv_notify_data *recv_data, size_t target) { const struct vsock_transport *transport; struct vsock_sock *vsk; s64 data; int err; vsk = vsock_sk(sk); err = 0; transport = vsk->transport; while (1) { prepare_to_wait(sk_sleep(sk), wait, TASK_INTERRUPTIBLE); data = vsock_connectible_has_data(vsk); if (data != 0) break; if (sk->sk_err != 0 || (sk->sk_shutdown & RCV_SHUTDOWN) || (vsk->peer_shutdown & SEND_SHUTDOWN)) { break; } /* Don't wait for non-blocking sockets. */ if (timeout == 0) { err = -EAGAIN; break; } if (recv_data) { err = transport->notify_recv_pre_block(vsk, target, recv_data); if (err < 0) break; } release_sock(sk); timeout = schedule_timeout(timeout); lock_sock(sk); if (signal_pending(current)) { err = sock_intr_errno(timeout); break; } else if (timeout == 0) { err = -EAGAIN; break; } } finish_wait(sk_sleep(sk), wait); if (err) return err; /* Internal transport error when checking for available * data. XXX This should be changed to a connection * reset in a later change. */ if (data < 0) return -ENOMEM; return data; } static int __vsock_stream_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags) { struct vsock_transport_recv_notify_data recv_data; const struct vsock_transport *transport; struct vsock_sock *vsk; ssize_t copied; size_t target; long timeout; int err; DEFINE_WAIT(wait); vsk = vsock_sk(sk); transport = vsk->transport; /* We must not copy less than target bytes into the user's buffer * before returning successfully, so we wait for the consume queue to * have that much data to consume before dequeueing. Note that this * makes it impossible to handle cases where target is greater than the * queue size. */ target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); if (target >= transport->stream_rcvhiwat(vsk)) { err = -ENOMEM; goto out; } timeout = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); copied = 0; err = transport->notify_recv_init(vsk, target, &recv_data); if (err < 0) goto out; while (1) { ssize_t read; err = vsock_connectible_wait_data(sk, &wait, timeout, &recv_data, target); if (err <= 0) break; err = transport->notify_recv_pre_dequeue(vsk, target, &recv_data); if (err < 0) break; read = transport->stream_dequeue(vsk, msg, len - copied, flags); if (read < 0) { err = read; break; } copied += read; err = transport->notify_recv_post_dequeue(vsk, target, read, !(flags & MSG_PEEK), &recv_data); if (err < 0) goto out; if (read >= target || flags & MSG_PEEK) break; target -= read; } if (sk->sk_err) err = -sk->sk_err; else if (sk->sk_shutdown & RCV_SHUTDOWN) err = 0; if (copied > 0) err = copied; out: return err; } static int __vsock_seqpacket_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags) { const struct vsock_transport *transport; struct vsock_sock *vsk; ssize_t msg_len; long timeout; int err = 0; DEFINE_WAIT(wait); vsk = vsock_sk(sk); transport = vsk->transport; timeout = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); err = vsock_connectible_wait_data(sk, &wait, timeout, NULL, 0); if (err <= 0) goto out; msg_len = transport->seqpacket_dequeue(vsk, msg, flags); if (msg_len < 0) { err = msg_len; goto out; } if (sk->sk_err) { err = -sk->sk_err; } else if (sk->sk_shutdown & RCV_SHUTDOWN) { err = 0; } else { /* User sets MSG_TRUNC, so return real length of * packet. */ if (flags & MSG_TRUNC) err = msg_len; else err = len - msg_data_left(msg); /* Always set MSG_TRUNC if real length of packet is * bigger than user's buffer. */ if (msg_len > len) msg->msg_flags |= MSG_TRUNC; } out: return err; } int __vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct sock *sk; struct vsock_sock *vsk; const struct vsock_transport *transport; int err; sk = sock->sk; if (unlikely(flags & MSG_ERRQUEUE)) return sock_recv_errqueue(sk, msg, len, SOL_VSOCK, VSOCK_RECVERR); vsk = vsock_sk(sk); err = 0; lock_sock(sk); transport = vsk->transport; if (!transport || sk->sk_state != TCP_ESTABLISHED) { /* Recvmsg is supposed to return 0 if a peer performs an * orderly shutdown. Differentiate between that case and when a * peer has not connected or a local shutdown occurred with the * SOCK_DONE flag. */ if (sock_flag(sk, SOCK_DONE)) err = 0; else err = -ENOTCONN; goto out; } if (flags & MSG_OOB) { err = -EOPNOTSUPP; goto out; } /* We don't check peer_shutdown flag here since peer may actually shut * down, but there can be data in the queue that a local socket can * receive. */ if (sk->sk_shutdown & RCV_SHUTDOWN) { err = 0; goto out; } /* It is valid on Linux to pass in a zero-length receive buffer. This * is not an error. We may as well bail out now. */ if (!len) { err = 0; goto out; } if (sk->sk_type == SOCK_STREAM) err = __vsock_stream_recvmsg(sk, msg, len, flags); else err = __vsock_seqpacket_recvmsg(sk, msg, len, flags); out: release_sock(sk); return err; } int vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { #ifdef CONFIG_BPF_SYSCALL struct sock *sk = sock->sk; const struct proto *prot; prot = READ_ONCE(sk->sk_prot); if (prot != &vsock_proto) return prot->recvmsg(sk, msg, len, flags); #endif return __vsock_connectible_recvmsg(sock, msg, len, flags); } EXPORT_SYMBOL_GPL(vsock_connectible_recvmsg); static int vsock_set_rcvlowat(struct sock *sk, int val) { const struct vsock_transport *transport; struct vsock_sock *vsk; vsk = vsock_sk(sk); if (val > vsk->buffer_size) return -EINVAL; transport = vsk->transport; if (transport && transport->notify_set_rcvlowat) { int err; err = transport->notify_set_rcvlowat(vsk, val); if (err) return err; } WRITE_ONCE(sk->sk_rcvlowat, val ? : 1); return 0; } static const struct proto_ops vsock_stream_ops = { .family = PF_VSOCK, .owner = THIS_MODULE, .release = vsock_release, .bind = vsock_bind, .connect = vsock_connect, .socketpair = sock_no_socketpair, .accept = vsock_accept, .getname = vsock_getname, .poll = vsock_poll, .ioctl = vsock_ioctl, .listen = vsock_listen, .shutdown = vsock_shutdown, .setsockopt = vsock_connectible_setsockopt, .getsockopt = vsock_connectible_getsockopt, .sendmsg = vsock_connectible_sendmsg, .recvmsg = vsock_connectible_recvmsg, .mmap = sock_no_mmap, .set_rcvlowat = vsock_set_rcvlowat, .read_skb = vsock_read_skb, }; static const struct proto_ops vsock_seqpacket_ops = { .family = PF_VSOCK, .owner = THIS_MODULE, .release = vsock_release, .bind = vsock_bind, .connect = vsock_connect, .socketpair = sock_no_socketpair, .accept = vsock_accept, .getname = vsock_getname, .poll = vsock_poll, .ioctl = vsock_ioctl, .listen = vsock_listen, .shutdown = vsock_shutdown, .setsockopt = vsock_connectible_setsockopt, .getsockopt = vsock_connectible_getsockopt, .sendmsg = vsock_connectible_sendmsg, .recvmsg = vsock_connectible_recvmsg, .mmap = sock_no_mmap, .read_skb = vsock_read_skb, }; static int vsock_create(struct net *net, struct socket *sock, int protocol, int kern) { struct vsock_sock *vsk; struct sock *sk; int ret; if (!sock) return -EINVAL; if (protocol && protocol != PF_VSOCK) return -EPROTONOSUPPORT; switch (sock->type) { case SOCK_DGRAM: sock->ops = &vsock_dgram_ops; break; case SOCK_STREAM: sock->ops = &vsock_stream_ops; break; case SOCK_SEQPACKET: sock->ops = &vsock_seqpacket_ops; break; default: return -ESOCKTNOSUPPORT; } sock->state = SS_UNCONNECTED; sk = __vsock_create(net, sock, NULL, GFP_KERNEL, 0, kern); if (!sk) return -ENOMEM; vsk = vsock_sk(sk); if (sock->type == SOCK_DGRAM) { ret = vsock_assign_transport(vsk, NULL); if (ret < 0) { sock->sk = NULL; sock_put(sk); return ret; } } /* SOCK_DGRAM doesn't have 'setsockopt' callback set in its * proto_ops, so there is no handler for custom logic. */ if (sock_type_connectible(sock->type)) set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags); vsock_insert_unbound(vsk); return 0; } static const struct net_proto_family vsock_family_ops = { .family = AF_VSOCK, .create = vsock_create, .owner = THIS_MODULE, }; static long vsock_dev_do_ioctl(struct file *filp, unsigned int cmd, void __user *ptr) { u32 __user *p = ptr; int retval = 0; u32 cid; switch (cmd) { case IOCTL_VM_SOCKETS_GET_LOCAL_CID: /* To be compatible with the VMCI behavior, we prioritize the * guest CID instead of well-know host CID (VMADDR_CID_HOST). */ cid = vsock_registered_transport_cid(&transport_g2h); if (cid == VMADDR_CID_ANY) cid = vsock_registered_transport_cid(&transport_h2g); if (cid == VMADDR_CID_ANY) cid = vsock_registered_transport_cid(&transport_local); if (put_user(cid, p) != 0) retval = -EFAULT; break; default: retval = -ENOIOCTLCMD; } return retval; } static long vsock_dev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { return vsock_dev_do_ioctl(filp, cmd, (void __user *)arg); } #ifdef CONFIG_COMPAT static long vsock_dev_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { return vsock_dev_do_ioctl(filp, cmd, compat_ptr(arg)); } #endif static const struct file_operations vsock_device_ops = { .owner = THIS_MODULE, .unlocked_ioctl = vsock_dev_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = vsock_dev_compat_ioctl, #endif .open = nonseekable_open, }; static struct miscdevice vsock_device = { .name = "vsock", .fops = &vsock_device_ops, }; static int __vsock_net_mode_string(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos, enum vsock_net_mode mode, enum vsock_net_mode *new_mode) { char data[VSOCK_NET_MODE_STR_MAX] = {0}; struct ctl_table tmp; int ret; if (!table->data || !table->maxlen || !*lenp) { *lenp = 0; return 0; } tmp = *table; tmp.data = data; if (!write) { const char *p; switch (mode) { case VSOCK_NET_MODE_GLOBAL: p = VSOCK_NET_MODE_STR_GLOBAL; break; case VSOCK_NET_MODE_LOCAL: p = VSOCK_NET_MODE_STR_LOCAL; break; default: WARN_ONCE(true, "netns has invalid vsock mode"); *lenp = 0; return 0; } strscpy(data, p, sizeof(data)); tmp.maxlen = strlen(p); } ret = proc_dostring(&tmp, write, buffer, lenp, ppos); if (ret || !write) return ret; if (*lenp >= sizeof(data)) return -EINVAL; if (!strncmp(data, VSOCK_NET_MODE_STR_GLOBAL, sizeof(data))) *new_mode = VSOCK_NET_MODE_GLOBAL; else if (!strncmp(data, VSOCK_NET_MODE_STR_LOCAL, sizeof(data))) *new_mode = VSOCK_NET_MODE_LOCAL; else return -EINVAL; return 0; } static int vsock_net_mode_string(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net; if (write) return -EPERM; net = container_of(table->data, struct net, vsock.mode); return __vsock_net_mode_string(table, write, buffer, lenp, ppos, vsock_net_mode(net), NULL); } static int vsock_net_child_mode_string(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { enum vsock_net_mode new_mode; struct net *net; int ret; net = container_of(table->data, struct net, vsock.child_ns_mode); ret = __vsock_net_mode_string(table, write, buffer, lenp, ppos, vsock_net_child_mode(net), &new_mode); if (ret) return ret; if (write) { /* Prevent a "local" namespace from escalating to "global", * which would give nested namespaces access to global CIDs. */ if (vsock_net_mode(net) == VSOCK_NET_MODE_LOCAL && new_mode == VSOCK_NET_MODE_GLOBAL) return -EPERM; if (!vsock_net_set_child_mode(net, new_mode)) return -EBUSY; } return 0; } static struct ctl_table vsock_table[] = { { .procname = "ns_mode", .data = &init_net.vsock.mode, .maxlen = VSOCK_NET_MODE_STR_MAX, .mode = 0444, .proc_handler = vsock_net_mode_string }, { .procname = "child_ns_mode", .data = &init_net.vsock.child_ns_mode, .maxlen = VSOCK_NET_MODE_STR_MAX, .mode = 0644, .proc_handler = vsock_net_child_mode_string }, { .procname = "g2h_fallback", .data = &init_net.vsock.g2h_fallback, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, }; static int __net_init vsock_sysctl_register(struct net *net) { struct ctl_table *table; if (net_eq(net, &init_net)) { table = vsock_table; } else { table = kmemdup(vsock_table, sizeof(vsock_table), GFP_KERNEL); if (!table) goto err_alloc; table[0].data = &net->vsock.mode; table[1].data = &net->vsock.child_ns_mode; table[2].data = &net->vsock.g2h_fallback; } net->vsock.sysctl_hdr = register_net_sysctl_sz(net, "net/vsock", table, ARRAY_SIZE(vsock_table)); if (!net->vsock.sysctl_hdr) goto err_reg; return 0; err_reg: if (!net_eq(net, &init_net)) kfree(table); err_alloc: return -ENOMEM; } static void vsock_sysctl_unregister(struct net *net) { const struct ctl_table *table; table = net->vsock.sysctl_hdr->ctl_table_arg; unregister_net_sysctl_table(net->vsock.sysctl_hdr); if (!net_eq(net, &init_net)) kfree(table); } static void vsock_net_init(struct net *net) { if (net_eq(net, &init_net)) net->vsock.mode = VSOCK_NET_MODE_GLOBAL; else net->vsock.mode = vsock_net_child_mode(current->nsproxy->net_ns); net->vsock.child_ns_mode = net->vsock.mode; net->vsock.child_ns_mode_locked = 0; net->vsock.g2h_fallback = 1; } static __net_init int vsock_sysctl_init_net(struct net *net) { vsock_net_init(net); if (vsock_sysctl_register(net)) return -ENOMEM; return 0; } static __net_exit void vsock_sysctl_exit_net(struct net *net) { vsock_sysctl_unregister(net); } static struct pernet_operations vsock_sysctl_ops = { .init = vsock_sysctl_init_net, .exit = vsock_sysctl_exit_net, }; static int __init vsock_init(void) { int err = 0; vsock_init_tables(); vsock_proto.owner = THIS_MODULE; vsock_device.minor = MISC_DYNAMIC_MINOR; err = misc_register(&vsock_device); if (err) { pr_err("Failed to register misc device\n"); goto err_reset_transport; } err = proto_register(&vsock_proto, 1); /* we want our slab */ if (err) { pr_err("Cannot register vsock protocol\n"); goto err_deregister_misc; } err = sock_register(&vsock_family_ops); if (err) { pr_err("could not register af_vsock (%d) address family: %d\n", AF_VSOCK, err); goto err_unregister_proto; } if (register_pernet_subsys(&vsock_sysctl_ops)) { err = -ENOMEM; goto err_unregister_sock; } vsock_bpf_build_proto(); return 0; err_unregister_sock: sock_unregister(AF_VSOCK); err_unregister_proto: proto_unregister(&vsock_proto); err_deregister_misc: misc_deregister(&vsock_device); err_reset_transport: return err; } static void __exit vsock_exit(void) { misc_deregister(&vsock_device); sock_unregister(AF_VSOCK); proto_unregister(&vsock_proto); unregister_pernet_subsys(&vsock_sysctl_ops); } const struct vsock_transport *vsock_core_get_transport(struct vsock_sock *vsk) { return vsk->transport; } EXPORT_SYMBOL_GPL(vsock_core_get_transport); int vsock_core_register(const struct vsock_transport *t, int features) { const struct vsock_transport *t_h2g, *t_g2h, *t_dgram, *t_local; int err = mutex_lock_interruptible(&vsock_register_mutex); if (err) return err; t_h2g = transport_h2g; t_g2h = transport_g2h; t_dgram = transport_dgram; t_local = transport_local; if (features & VSOCK_TRANSPORT_F_H2G) { if (t_h2g) { err = -EBUSY; goto err_busy; } t_h2g = t; } if (features & VSOCK_TRANSPORT_F_G2H) { if (t_g2h) { err = -EBUSY; goto err_busy; } t_g2h = t; } if (features & VSOCK_TRANSPORT_F_DGRAM) { if (t_dgram) { err = -EBUSY; goto err_busy; } t_dgram = t; } if (features & VSOCK_TRANSPORT_F_LOCAL) { if (t_local) { err = -EBUSY; goto err_busy; } t_local = t; } transport_h2g = t_h2g; transport_g2h = t_g2h; transport_dgram = t_dgram; transport_local = t_local; err_busy: mutex_unlock(&vsock_register_mutex); return err; } EXPORT_SYMBOL_GPL(vsock_core_register); void vsock_core_unregister(const struct vsock_transport *t) { mutex_lock(&vsock_register_mutex); if (transport_h2g == t) transport_h2g = NULL; if (transport_g2h == t) transport_g2h = NULL; if (transport_dgram == t) transport_dgram = NULL; if (transport_local == t) transport_local = NULL; mutex_unlock(&vsock_register_mutex); } EXPORT_SYMBOL_GPL(vsock_core_unregister); module_init(vsock_init); module_exit(vsock_exit); MODULE_AUTHOR("VMware, Inc."); MODULE_DESCRIPTION("VMware Virtual Socket Family"); MODULE_VERSION("1.0.2.0-k"); MODULE_LICENSE("GPL v2");
2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 // SPDX-License-Identifier: GPL-2.0 /* * (C) 2001 Clemson University and The University of Chicago * * See COPYING in top-level directory. */ #include "protocol.h" #include "orangefs-kernel.h" /* tags assigned to kernel upcall operations */ static __u64 next_tag_value; static DEFINE_SPINLOCK(next_tag_value_lock); /* the orangefs memory caches */ /* a cache for orangefs upcall/downcall operations */ static struct kmem_cache *op_cache; int op_cache_initialize(void) { op_cache = kmem_cache_create_usercopy("orangefs_op_cache", sizeof(struct orangefs_kernel_op_s), 0, 0, offsetof(struct orangefs_kernel_op_s, tag), offsetof(struct orangefs_kernel_op_s, upcall) + sizeof(struct orangefs_upcall_s) - offsetof(struct orangefs_kernel_op_s, tag), NULL); if (!op_cache) { gossip_err("Cannot create orangefs_op_cache\n"); return -ENOMEM; } /* initialize our atomic tag counter */ spin_lock(&next_tag_value_lock); next_tag_value = 100; spin_unlock(&next_tag_value_lock); return 0; } int op_cache_finalize(void) { kmem_cache_destroy(op_cache); return 0; } char *get_opname_string(struct orangefs_kernel_op_s *new_op) { if (new_op) { __s32 type = new_op->upcall.type; if (type == ORANGEFS_VFS_OP_FILE_IO) return "OP_FILE_IO"; else if (type == ORANGEFS_VFS_OP_LOOKUP) return "OP_LOOKUP"; else if (type == ORANGEFS_VFS_OP_CREATE) return "OP_CREATE"; else if (type == ORANGEFS_VFS_OP_GETATTR) return "OP_GETATTR"; else if (type == ORANGEFS_VFS_OP_REMOVE) return "OP_REMOVE"; else if (type == ORANGEFS_VFS_OP_MKDIR) return "OP_MKDIR"; else if (type == ORANGEFS_VFS_OP_READDIR) return "OP_READDIR"; else if (type == ORANGEFS_VFS_OP_READDIRPLUS) return "OP_READDIRPLUS"; else if (type == ORANGEFS_VFS_OP_SETATTR) return "OP_SETATTR"; else if (type == ORANGEFS_VFS_OP_SYMLINK) return "OP_SYMLINK"; else if (type == ORANGEFS_VFS_OP_RENAME) return "OP_RENAME"; else if (type == ORANGEFS_VFS_OP_STATFS) return "OP_STATFS"; else if (type == ORANGEFS_VFS_OP_TRUNCATE) return "OP_TRUNCATE"; else if (type == ORANGEFS_VFS_OP_RA_FLUSH) return "OP_RA_FLUSH"; else if (type == ORANGEFS_VFS_OP_FS_MOUNT) return "OP_FS_MOUNT"; else if (type == ORANGEFS_VFS_OP_FS_UMOUNT) return "OP_FS_UMOUNT"; else if (type == ORANGEFS_VFS_OP_GETXATTR) return "OP_GETXATTR"; else if (type == ORANGEFS_VFS_OP_SETXATTR) return "OP_SETXATTR"; else if (type == ORANGEFS_VFS_OP_LISTXATTR) return "OP_LISTXATTR"; else if (type == ORANGEFS_VFS_OP_REMOVEXATTR) return "OP_REMOVEXATTR"; else if (type == ORANGEFS_VFS_OP_PARAM) return "OP_PARAM"; else if (type == ORANGEFS_VFS_OP_PERF_COUNT) return "OP_PERF_COUNT"; else if (type == ORANGEFS_VFS_OP_CANCEL) return "OP_CANCEL"; else if (type == ORANGEFS_VFS_OP_FSYNC) return "OP_FSYNC"; else if (type == ORANGEFS_VFS_OP_FSKEY) return "OP_FSKEY"; else if (type == ORANGEFS_VFS_OP_FEATURES) return "OP_FEATURES"; } return "OP_UNKNOWN?"; } void orangefs_new_tag(struct orangefs_kernel_op_s *op) { spin_lock(&next_tag_value_lock); op->tag = next_tag_value++; if (next_tag_value == 0) next_tag_value = 100; spin_unlock(&next_tag_value_lock); } struct orangefs_kernel_op_s *op_alloc(__s32 type) { struct orangefs_kernel_op_s *new_op = NULL; new_op = kmem_cache_zalloc(op_cache, GFP_KERNEL); if (new_op) { INIT_LIST_HEAD(&new_op->list); spin_lock_init(&new_op->lock); init_completion(&new_op->waitq); new_op->upcall.type = ORANGEFS_VFS_OP_INVALID; new_op->downcall.type = ORANGEFS_VFS_OP_INVALID; new_op->downcall.status = -1; new_op->op_state = OP_VFS_STATE_UNKNOWN; /* initialize the op specific tag and upcall credentials */ orangefs_new_tag(new_op); new_op->upcall.type = type; new_op->attempts = 0; gossip_debug(GOSSIP_CACHE_DEBUG, "Alloced OP (%p: %llu %s)\n", new_op, llu(new_op->tag), get_opname_string(new_op)); new_op->upcall.uid = from_kuid(&init_user_ns, current_fsuid()); new_op->upcall.gid = from_kgid(&init_user_ns, current_fsgid()); } else { gossip_err("op_alloc: kmem_cache_zalloc failed!\n"); } return new_op; } void op_release(struct orangefs_kernel_op_s *orangefs_op) { if (orangefs_op) { gossip_debug(GOSSIP_CACHE_DEBUG, "Releasing OP (%p: %llu)\n", orangefs_op, llu(orangefs_op->tag)); kmem_cache_free(op_cache, orangefs_op); } else { gossip_err("NULL pointer in op_release\n"); } }
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2017 Facebook */ #include <linux/kernel.h> #include <linux/blkdev.h> #include <linux/build_bug.h> #include <linux/debugfs.h> #include "blk.h" #include "blk-mq.h" #include "blk-mq-debugfs.h" #include "blk-mq-sched.h" #include "blk-rq-qos.h" static int queue_poll_stat_show(void *data, struct seq_file *m) { return 0; } static void *queue_requeue_list_start(struct seq_file *m, loff_t *pos) __acquires(&q->requeue_lock) { struct request_queue *q = m->private; spin_lock_irq(&q->requeue_lock); return seq_list_start(&q->requeue_list, *pos); } static void *queue_requeue_list_next(struct seq_file *m, void *v, loff_t *pos) { struct request_queue *q = m->private; return seq_list_next(v, &q->requeue_list, pos); } static void queue_requeue_list_stop(struct seq_file *m, void *v) __releases(&q->requeue_lock) { struct request_queue *q = m->private; spin_unlock_irq(&q->requeue_lock); } static const struct seq_operations queue_requeue_list_seq_ops = { .start = queue_requeue_list_start, .next = queue_requeue_list_next, .stop = queue_requeue_list_stop, .show = blk_mq_debugfs_rq_show, }; static int blk_flags_show(struct seq_file *m, const unsigned long flags, const char *const *flag_name, int flag_name_count) { bool sep = false; int i; for (i = 0; i < sizeof(flags) * BITS_PER_BYTE; i++) { if (!(flags & BIT(i))) continue; if (sep) seq_puts(m, "|"); sep = true; if (i < flag_name_count && flag_name[i]) seq_puts(m, flag_name[i]); else seq_printf(m, "%d", i); } return 0; } static int queue_pm_only_show(void *data, struct seq_file *m) { struct request_queue *q = data; seq_printf(m, "%d\n", atomic_read(&q->pm_only)); return 0; } #define QUEUE_FLAG_NAME(name) [QUEUE_FLAG_##name] = #name static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(DYING), QUEUE_FLAG_NAME(NOMERGES), QUEUE_FLAG_NAME(SAME_COMP), QUEUE_FLAG_NAME(FAIL_IO), QUEUE_FLAG_NAME(NOXMERGES), QUEUE_FLAG_NAME(SAME_FORCE), QUEUE_FLAG_NAME(INIT_DONE), QUEUE_FLAG_NAME(STATS), QUEUE_FLAG_NAME(REGISTERED), QUEUE_FLAG_NAME(QUIESCED), QUEUE_FLAG_NAME(RQ_ALLOC_TIME), QUEUE_FLAG_NAME(HCTX_ACTIVE), QUEUE_FLAG_NAME(SQ_SCHED), QUEUE_FLAG_NAME(DISABLE_WBT_DEF), QUEUE_FLAG_NAME(NO_ELV_SWITCH), QUEUE_FLAG_NAME(QOS_ENABLED), QUEUE_FLAG_NAME(BIO_ISSUE_TIME), QUEUE_FLAG_NAME(ZONED_QD1_WRITES), }; #undef QUEUE_FLAG_NAME static int queue_state_show(void *data, struct seq_file *m) { struct request_queue *q = data; BUILD_BUG_ON(ARRAY_SIZE(blk_queue_flag_name) != QUEUE_FLAG_MAX); blk_flags_show(m, q->queue_flags, blk_queue_flag_name, ARRAY_SIZE(blk_queue_flag_name)); seq_puts(m, "\n"); return 0; } static ssize_t queue_state_write(void *data, const char __user *buf, size_t count, loff_t *ppos) { struct request_queue *q = data; char opbuf[16] = { }, *op; /* * The "state" attribute is removed when the queue is removed. Don't * allow setting the state on a dying queue to avoid a use-after-free. */ if (blk_queue_dying(q)) return -ENOENT; if (count >= sizeof(opbuf)) { pr_err("%s: operation too long\n", __func__); goto inval; } if (copy_from_user(opbuf, buf, count)) return -EFAULT; op = strstrip(opbuf); if (strcmp(op, "run") == 0) { blk_mq_run_hw_queues(q, true); } else if (strcmp(op, "start") == 0) { blk_mq_start_stopped_hw_queues(q, true); } else if (strcmp(op, "kick") == 0) { blk_mq_kick_requeue_list(q); } else { pr_err("%s: unsupported operation '%s'\n", __func__, op); inval: pr_err("%s: use 'run', 'start' or 'kick'\n", __func__); return -EINVAL; } return count; } static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = { { "poll_stat", 0400, queue_poll_stat_show }, { "requeue_list", 0400, .seq_ops = &queue_requeue_list_seq_ops }, { "pm_only", 0600, queue_pm_only_show, NULL }, { "state", 0600, queue_state_show, queue_state_write }, { "zone_wplugs", 0400, queue_zone_wplugs_show, NULL }, { }, }; #define HCTX_STATE_NAME(name) [BLK_MQ_S_##name] = #name static const char *const hctx_state_name[] = { HCTX_STATE_NAME(STOPPED), HCTX_STATE_NAME(TAG_ACTIVE), HCTX_STATE_NAME(SCHED_RESTART), HCTX_STATE_NAME(INACTIVE), }; #undef HCTX_STATE_NAME static int hctx_state_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; BUILD_BUG_ON(ARRAY_SIZE(hctx_state_name) != BLK_MQ_S_MAX); blk_flags_show(m, hctx->state, hctx_state_name, ARRAY_SIZE(hctx_state_name)); seq_puts(m, "\n"); return 0; } #define HCTX_FLAG_NAME(name) [ilog2(BLK_MQ_F_##name)] = #name static const char *const hctx_flag_name[] = { HCTX_FLAG_NAME(TAG_QUEUE_SHARED), HCTX_FLAG_NAME(STACKING), HCTX_FLAG_NAME(TAG_HCTX_SHARED), HCTX_FLAG_NAME(BLOCKING), HCTX_FLAG_NAME(TAG_RR), HCTX_FLAG_NAME(NO_SCHED_BY_DEFAULT), }; #undef HCTX_FLAG_NAME static int hctx_flags_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; BUILD_BUG_ON(ARRAY_SIZE(hctx_flag_name) != ilog2(BLK_MQ_F_MAX)); blk_flags_show(m, hctx->flags, hctx_flag_name, ARRAY_SIZE(hctx_flag_name)); seq_puts(m, "\n"); return 0; } #define CMD_FLAG_NAME(name) [__REQ_##name] = #name static const char *const cmd_flag_name[] = { CMD_FLAG_NAME(FAILFAST_DEV), CMD_FLAG_NAME(FAILFAST_TRANSPORT), CMD_FLAG_NAME(FAILFAST_DRIVER), CMD_FLAG_NAME(SYNC), CMD_FLAG_NAME(META), CMD_FLAG_NAME(PRIO), CMD_FLAG_NAME(NOMERGE), CMD_FLAG_NAME(IDLE), CMD_FLAG_NAME(INTEGRITY), CMD_FLAG_NAME(FUA), CMD_FLAG_NAME(PREFLUSH), CMD_FLAG_NAME(RAHEAD), CMD_FLAG_NAME(BACKGROUND), CMD_FLAG_NAME(NOWAIT), CMD_FLAG_NAME(POLLED), CMD_FLAG_NAME(ALLOC_CACHE), CMD_FLAG_NAME(SWAP), CMD_FLAG_NAME(DRV), CMD_FLAG_NAME(FS_PRIVATE), CMD_FLAG_NAME(ATOMIC), CMD_FLAG_NAME(NOUNMAP), }; #undef CMD_FLAG_NAME #define RQF_NAME(name) [__RQF_##name] = #name static const char *const rqf_name[] = { RQF_NAME(STARTED), RQF_NAME(FLUSH_SEQ), RQF_NAME(MIXED_MERGE), RQF_NAME(DONTPREP), RQF_NAME(SCHED_TAGS), RQF_NAME(USE_SCHED), RQF_NAME(FAILED), RQF_NAME(QUIET), RQF_NAME(IO_STAT), RQF_NAME(PM), RQF_NAME(HASHED), RQF_NAME(STATS), RQF_NAME(SPECIAL_PAYLOAD), RQF_NAME(ZONE_WRITE_PLUGGING), RQF_NAME(TIMED_OUT), RQF_NAME(RESV), }; #undef RQF_NAME static const char *const blk_mq_rq_state_name_array[] = { [MQ_RQ_IDLE] = "idle", [MQ_RQ_IN_FLIGHT] = "in_flight", [MQ_RQ_COMPLETE] = "complete", }; static const char *blk_mq_rq_state_name(enum mq_rq_state rq_state) { if (WARN_ON_ONCE((unsigned int)rq_state >= ARRAY_SIZE(blk_mq_rq_state_name_array))) return "(?)"; return blk_mq_rq_state_name_array[rq_state]; } int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq) { const struct blk_mq_ops *const mq_ops = rq->q->mq_ops; const enum req_op op = req_op(rq); const char *op_str = blk_op_str(op); BUILD_BUG_ON(ARRAY_SIZE(cmd_flag_name) != __REQ_NR_BITS); BUILD_BUG_ON(ARRAY_SIZE(rqf_name) != __RQF_BITS); seq_printf(m, "%p {.op=", rq); if (strcmp(op_str, "UNKNOWN") == 0) seq_printf(m, "%u", op); else seq_printf(m, "%s", op_str); seq_puts(m, ", .cmd_flags="); blk_flags_show(m, (__force unsigned int)(rq->cmd_flags & ~REQ_OP_MASK), cmd_flag_name, ARRAY_SIZE(cmd_flag_name)); seq_puts(m, ", .rq_flags="); blk_flags_show(m, (__force unsigned int)rq->rq_flags, rqf_name, ARRAY_SIZE(rqf_name)); seq_printf(m, ", .state=%s", blk_mq_rq_state_name(blk_mq_rq_state(rq))); seq_printf(m, ", .tag=%d, .internal_tag=%d", rq->tag, rq->internal_tag); if (mq_ops->show_rq) mq_ops->show_rq(m, rq); seq_puts(m, "}\n"); return 0; } EXPORT_SYMBOL_GPL(__blk_mq_debugfs_rq_show); int blk_mq_debugfs_rq_show(struct seq_file *m, void *v) { return __blk_mq_debugfs_rq_show(m, list_entry_rq(v)); } EXPORT_SYMBOL_GPL(blk_mq_debugfs_rq_show); static void *hctx_dispatch_start(struct seq_file *m, loff_t *pos) __acquires(&hctx->lock) { struct blk_mq_hw_ctx *hctx = m->private; spin_lock(&hctx->lock); return seq_list_start(&hctx->dispatch, *pos); } static void *hctx_dispatch_next(struct seq_file *m, void *v, loff_t *pos) { struct blk_mq_hw_ctx *hctx = m->private; return seq_list_next(v, &hctx->dispatch, pos); } static void hctx_dispatch_stop(struct seq_file *m, void *v) __releases(&hctx->lock) { struct blk_mq_hw_ctx *hctx = m->private; spin_unlock(&hctx->lock); } static const struct seq_operations hctx_dispatch_seq_ops = { .start = hctx_dispatch_start, .next = hctx_dispatch_next, .stop = hctx_dispatch_stop, .show = blk_mq_debugfs_rq_show, }; struct show_busy_params { struct seq_file *m; struct blk_mq_hw_ctx *hctx; }; /* * Note: the state of a request may change while this function is in progress, * e.g. due to a concurrent blk_mq_finish_request() call. Returns true to * keep iterating requests. */ static bool hctx_show_busy_rq(struct request *rq, void *data) { const struct show_busy_params *params = data; if (rq->mq_hctx == params->hctx) __blk_mq_debugfs_rq_show(params->m, rq); return true; } static int hctx_busy_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; struct show_busy_params params = { .m = m, .hctx = hctx }; int res; res = mutex_lock_interruptible(&hctx->queue->elevator_lock); if (res) return res; blk_mq_tagset_busy_iter(hctx->queue->tag_set, hctx_show_busy_rq, &params); mutex_unlock(&hctx->queue->elevator_lock); return 0; } static const char *const hctx_types[] = { [HCTX_TYPE_DEFAULT] = "default", [HCTX_TYPE_READ] = "read", [HCTX_TYPE_POLL] = "poll", }; static int hctx_type_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; BUILD_BUG_ON(ARRAY_SIZE(hctx_types) != HCTX_MAX_TYPES); seq_printf(m, "%s\n", hctx_types[hctx->type]); return 0; } static int hctx_ctx_map_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; sbitmap_bitmap_show(&hctx->ctx_map, m); return 0; } static void blk_mq_debugfs_tags_show(struct seq_file *m, struct blk_mq_tags *tags) { seq_printf(m, "nr_tags=%u\n", tags->nr_tags); seq_printf(m, "nr_reserved_tags=%u\n", tags->nr_reserved_tags); seq_printf(m, "active_queues=%d\n", READ_ONCE(tags->active_queues)); seq_puts(m, "\nbitmap_tags:\n"); sbitmap_queue_show(&tags->bitmap_tags, m); if (tags->nr_reserved_tags) { seq_puts(m, "\nbreserved_tags:\n"); sbitmap_queue_show(&tags->breserved_tags, m); } } static int hctx_tags_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; struct request_queue *q = hctx->queue; int res; res = mutex_lock_interruptible(&q->elevator_lock); if (res) return res; if (hctx->tags) blk_mq_debugfs_tags_show(m, hctx->tags); mutex_unlock(&q->elevator_lock); return 0; } static int hctx_tags_bitmap_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; struct request_queue *q = hctx->queue; int res; res = mutex_lock_interruptible(&q->elevator_lock); if (res) return res; if (hctx->tags) sbitmap_bitmap_show(&hctx->tags->bitmap_tags.sb, m); mutex_unlock(&q->elevator_lock); return 0; } static int hctx_sched_tags_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; struct request_queue *q = hctx->queue; int res; res = mutex_lock_interruptible(&q->elevator_lock); if (res) return res; if (hctx->sched_tags) blk_mq_debugfs_tags_show(m, hctx->sched_tags); mutex_unlock(&q->elevator_lock); return 0; } static int hctx_sched_tags_bitmap_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; struct request_queue *q = hctx->queue; int res; res = mutex_lock_interruptible(&q->elevator_lock); if (res) return res; if (hctx->sched_tags) sbitmap_bitmap_show(&hctx->sched_tags->bitmap_tags.sb, m); mutex_unlock(&q->elevator_lock); return 0; } static int hctx_active_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; seq_printf(m, "%d\n", __blk_mq_active_requests(hctx)); return 0; } static int hctx_dispatch_busy_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; seq_printf(m, "%u\n", hctx->dispatch_busy); return 0; } #define CTX_RQ_SEQ_OPS(name, type) \ static void *ctx_##name##_rq_list_start(struct seq_file *m, loff_t *pos) \ __acquires(&ctx->lock) \ { \ struct blk_mq_ctx *ctx = m->private; \ \ spin_lock(&ctx->lock); \ return seq_list_start(&ctx->rq_lists[type], *pos); \ } \ \ static void *ctx_##name##_rq_list_next(struct seq_file *m, void *v, \ loff_t *pos) \ { \ struct blk_mq_ctx *ctx = m->private; \ \ return seq_list_next(v, &ctx->rq_lists[type], pos); \ } \ \ static void ctx_##name##_rq_list_stop(struct seq_file *m, void *v) \ __releases(&ctx->lock) \ { \ struct blk_mq_ctx *ctx = m->private; \ \ spin_unlock(&ctx->lock); \ } \ \ static const struct seq_operations ctx_##name##_rq_list_seq_ops = { \ .start = ctx_##name##_rq_list_start, \ .next = ctx_##name##_rq_list_next, \ .stop = ctx_##name##_rq_list_stop, \ .show = blk_mq_debugfs_rq_show, \ } CTX_RQ_SEQ_OPS(default, HCTX_TYPE_DEFAULT); CTX_RQ_SEQ_OPS(read, HCTX_TYPE_READ); CTX_RQ_SEQ_OPS(poll, HCTX_TYPE_POLL); static int blk_mq_debugfs_show(struct seq_file *m, void *v) { const struct blk_mq_debugfs_attr *attr = m->private; void *data = debugfs_get_aux(m->file); return attr->show(data, m); } static ssize_t blk_mq_debugfs_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct seq_file *m = file->private_data; const struct blk_mq_debugfs_attr *attr = m->private; void *data = debugfs_get_aux(file); /* * Attributes that only implement .seq_ops are read-only and 'attr' is * the same with 'data' in this case. */ if (attr == data || !attr->write) return -EPERM; return attr->write(data, buf, count, ppos); } static int blk_mq_debugfs_open(struct inode *inode, struct file *file) { const struct blk_mq_debugfs_attr *attr = inode->i_private; void *data = debugfs_get_aux(file); struct seq_file *m; int ret; if (attr->seq_ops) { ret = seq_open(file, attr->seq_ops); if (!ret) { m = file->private_data; m->private = data; } return ret; } if (WARN_ON_ONCE(!attr->show)) return -EPERM; return single_open(file, blk_mq_debugfs_show, inode->i_private); } static int blk_mq_debugfs_release(struct inode *inode, struct file *file) { const struct blk_mq_debugfs_attr *attr = inode->i_private; if (attr->show) return single_release(inode, file); return seq_release(inode, file); } static const struct file_operations blk_mq_debugfs_fops = { .open = blk_mq_debugfs_open, .read = seq_read, .write = blk_mq_debugfs_write, .llseek = seq_lseek, .release = blk_mq_debugfs_release, }; static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = { {"state", 0400, hctx_state_show}, {"flags", 0400, hctx_flags_show}, {"dispatch", 0400, .seq_ops = &hctx_dispatch_seq_ops}, {"busy", 0400, hctx_busy_show}, {"ctx_map", 0400, hctx_ctx_map_show}, {"tags", 0400, hctx_tags_show}, {"tags_bitmap", 0400, hctx_tags_bitmap_show}, {"sched_tags", 0400, hctx_sched_tags_show}, {"sched_tags_bitmap", 0400, hctx_sched_tags_bitmap_show}, {"active", 0400, hctx_active_show}, {"dispatch_busy", 0400, hctx_dispatch_busy_show}, {"type", 0400, hctx_type_show}, {}, }; static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = { {"default_rq_list", 0400, .seq_ops = &ctx_default_rq_list_seq_ops}, {"read_rq_list", 0400, .seq_ops = &ctx_read_rq_list_seq_ops}, {"poll_rq_list", 0400, .seq_ops = &ctx_poll_rq_list_seq_ops}, {}, }; static void debugfs_create_files(struct request_queue *q, struct dentry *parent, void *data, const struct blk_mq_debugfs_attr *attr) { lockdep_assert_held(&q->debugfs_mutex); /* * debugfs_mutex should not be nested under other locks that can be * grabbed while queue is frozen. */ lockdep_assert_not_held(&q->elevator_lock); lockdep_assert_not_held(&q->rq_qos_mutex); if (IS_ERR_OR_NULL(parent)) return; for (; attr->name; attr++) debugfs_create_file_aux(attr->name, attr->mode, parent, (void *)attr, data, &blk_mq_debugfs_fops); } void blk_mq_debugfs_register(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; unsigned long i; debugfs_create_files(q, q->debugfs_dir, q, blk_mq_debugfs_queue_attrs); queue_for_each_hw_ctx(q, hctx, i) { if (!hctx->debugfs_dir) blk_mq_debugfs_register_hctx(q, hctx); } blk_mq_debugfs_register_rq_qos(q); } static void blk_mq_debugfs_register_ctx(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx) { struct dentry *ctx_dir; char name[20]; snprintf(name, sizeof(name), "cpu%u", ctx->cpu); ctx_dir = debugfs_create_dir(name, hctx->debugfs_dir); debugfs_create_files(hctx->queue, ctx_dir, ctx, blk_mq_debugfs_ctx_attrs); } void blk_mq_debugfs_register_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx) { struct blk_mq_ctx *ctx; char name[20]; int i; if (!q->debugfs_dir) return; snprintf(name, sizeof(name), "hctx%u", hctx->queue_num); hctx->debugfs_dir = debugfs_create_dir(name, q->debugfs_dir); debugfs_create_files(q, hctx->debugfs_dir, hctx, blk_mq_debugfs_hctx_attrs); hctx_for_each_ctx(hctx, ctx, i) blk_mq_debugfs_register_ctx(hctx, ctx); } void blk_mq_debugfs_unregister_hctx(struct blk_mq_hw_ctx *hctx) { if (!hctx->queue->debugfs_dir) return; debugfs_remove_recursive(hctx->debugfs_dir); hctx->sched_debugfs_dir = NULL; hctx->debugfs_dir = NULL; } void blk_mq_debugfs_register_hctxs(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; unsigned int memflags; unsigned long i; memflags = blk_debugfs_lock(q); queue_for_each_hw_ctx(q, hctx, i) blk_mq_debugfs_register_hctx(q, hctx); blk_debugfs_unlock(q, memflags); } void blk_mq_debugfs_unregister_hctxs(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; unsigned long i; queue_for_each_hw_ctx(q, hctx, i) blk_mq_debugfs_unregister_hctx(hctx); } void blk_mq_debugfs_register_sched(struct request_queue *q) { struct elevator_type *e = q->elevator->type; lockdep_assert_held(&q->debugfs_mutex); /* * If the parent directory has not been created yet, return, we will be * called again later on and the directory/files will be created then. */ if (!q->debugfs_dir) return; if (!e->queue_debugfs_attrs) return; q->sched_debugfs_dir = debugfs_create_dir("sched", q->debugfs_dir); debugfs_create_files(q, q->sched_debugfs_dir, q, e->queue_debugfs_attrs); } void blk_mq_debugfs_unregister_sched(struct request_queue *q) { lockdep_assert_held(&q->debugfs_mutex); debugfs_remove_recursive(q->sched_debugfs_dir); q->sched_debugfs_dir = NULL; } static const char *rq_qos_id_to_name(enum rq_qos_id id) { switch (id) { case RQ_QOS_WBT: return "wbt"; case RQ_QOS_LATENCY: return "latency"; case RQ_QOS_COST: return "cost"; } return "unknown"; } static void blk_mq_debugfs_register_rqos(struct rq_qos *rqos) { struct request_queue *q = rqos->disk->queue; const char *dir_name = rq_qos_id_to_name(rqos->id); lockdep_assert_held(&q->debugfs_mutex); if (rqos->debugfs_dir || !rqos->ops->debugfs_attrs) return; if (!q->rqos_debugfs_dir) q->rqos_debugfs_dir = debugfs_create_dir("rqos", q->debugfs_dir); rqos->debugfs_dir = debugfs_create_dir(dir_name, q->rqos_debugfs_dir); debugfs_create_files(q, rqos->debugfs_dir, rqos, rqos->ops->debugfs_attrs); } void blk_mq_debugfs_register_rq_qos(struct request_queue *q) { lockdep_assert_held(&q->debugfs_mutex); if (q->rq_qos) { struct rq_qos *rqos = q->rq_qos; while (rqos) { blk_mq_debugfs_register_rqos(rqos); rqos = rqos->next; } } } void blk_mq_debugfs_register_sched_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx) { struct elevator_type *e = q->elevator->type; lockdep_assert_held(&q->debugfs_mutex); /* * If the parent debugfs directory has not been created yet, return; * We will be called again later on with appropriate parent debugfs * directory from blk_register_queue() */ if (!hctx->debugfs_dir) return; if (!e->hctx_debugfs_attrs) return; hctx->sched_debugfs_dir = debugfs_create_dir("sched", hctx->debugfs_dir); debugfs_create_files(q, hctx->sched_debugfs_dir, hctx, e->hctx_debugfs_attrs); } void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx) { lockdep_assert_held(&hctx->queue->debugfs_mutex); if (!hctx->queue->debugfs_dir) return; debugfs_remove_recursive(hctx->sched_debugfs_dir); hctx->sched_debugfs_dir = NULL; }
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 /* SPDX-License-Identifier: GPL-2.0 */ #include <linux/ceph/ceph_debug.h> #include <linux/types.h> #include <linux/percpu_counter.h> #include <linux/math64.h> #include <linux/ratelimit.h> #include <linux/ceph/decode.h> #include "metric.h" #include "mds_client.h" static bool metrics_disable_warned; static inline u32 ceph_subvolume_entry_payload_len(void) { return sizeof(struct ceph_subvolume_metric_entry_wire); } static inline u32 ceph_subvolume_entry_encoded_len(void) { return CEPH_ENCODING_START_BLK_LEN + ceph_subvolume_entry_payload_len(); } static inline u32 ceph_subvolume_outer_payload_len(u32 nr_subvols) { /* count is encoded as le64 (size_t on wire) to match FUSE client */ return sizeof(__le64) + nr_subvols * ceph_subvolume_entry_encoded_len(); } static inline u32 ceph_subvolume_metric_data_len(u32 nr_subvols) { return CEPH_ENCODING_START_BLK_LEN + ceph_subvolume_outer_payload_len(nr_subvols); } static inline u32 ceph_subvolume_clamp_u32(u64 val) { return val > U32_MAX ? U32_MAX : (u32)val; } static void ceph_init_subvolume_wire_entry( struct ceph_subvolume_metric_entry_wire *dst, const struct ceph_subvol_metric_snapshot *src) { dst->subvolume_id = cpu_to_le64(src->subvolume_id); dst->read_ops = cpu_to_le32(ceph_subvolume_clamp_u32(src->read_ops)); dst->write_ops = cpu_to_le32(ceph_subvolume_clamp_u32(src->write_ops)); dst->read_bytes = cpu_to_le64(src->read_bytes); dst->write_bytes = cpu_to_le64(src->write_bytes); dst->read_latency_us = cpu_to_le64(src->read_latency_us); dst->write_latency_us = cpu_to_le64(src->write_latency_us); dst->time_stamp = 0; } static int ceph_encode_subvolume_metrics(void **p, void *end, struct ceph_subvol_metric_snapshot *subvols, u32 nr_subvols) { u32 i; ceph_start_encoding(p, 1, 1, ceph_subvolume_outer_payload_len(nr_subvols)); /* count is encoded as le64 (size_t on wire) to match FUSE client */ ceph_encode_64_safe(p, end, (u64)nr_subvols, enc_err); for (i = 0; i < nr_subvols; i++) { struct ceph_subvolume_metric_entry_wire wire_entry; ceph_init_subvolume_wire_entry(&wire_entry, &subvols[i]); ceph_start_encoding(p, 1, 1, ceph_subvolume_entry_payload_len()); ceph_encode_copy_safe(p, end, &wire_entry, sizeof(wire_entry), enc_err); } return 0; enc_err: return -ERANGE; } static void ktime_to_ceph_timespec(struct ceph_timespec *ts, ktime_t val) { struct timespec64 t = ktime_to_timespec64(val); ceph_encode_timespec64(ts, &t); } static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, struct ceph_mds_session *s) { struct ceph_metric_head *head; struct ceph_metric_cap *cap; struct ceph_metric_read_latency *read; struct ceph_metric_write_latency *write; struct ceph_metric_metadata_latency *meta; struct ceph_metric_dlease *dlease; struct ceph_opened_files *files; struct ceph_pinned_icaps *icaps; struct ceph_opened_inodes *inodes; struct ceph_read_io_size *rsize; struct ceph_write_io_size *wsize; struct ceph_client_metric *m = &mdsc->metric; struct ceph_subvol_metric_snapshot *subvols = NULL; u64 nr_caps = atomic64_read(&m->total_caps); u32 header_len = sizeof(struct ceph_metric_header); struct ceph_client *cl = mdsc->fsc->client; struct ceph_msg *msg; u32 nr_subvols = 0; size_t subvol_len = 0; void *cursor; s64 sum; s32 items = 0; s32 len; /* Do not send the metrics until the MDS rank is ready */ mutex_lock(&mdsc->mutex); if (ceph_mdsmap_get_state(mdsc->mdsmap, s->s_mds) != CEPH_MDS_STATE_ACTIVE) { mutex_unlock(&mdsc->mutex); return false; } mutex_unlock(&mdsc->mutex); if (ceph_subvolume_metrics_enabled(&mdsc->subvol_metrics) && test_bit(CEPHFS_FEATURE_SUBVOLUME_METRICS, &s->s_features)) { int ret; ret = ceph_subvolume_metrics_snapshot(&mdsc->subvol_metrics, &subvols, &nr_subvols, true); if (ret) { pr_warn_client(cl, "failed to snapshot subvolume metrics: %d\n", ret); /* * On error, ceph_subvolume_metrics_snapshot() guarantees * *out = NULL and *nr = 0 at function entry, so subvols * is already NULL here - no cleanup needed. */ nr_subvols = 0; subvols = NULL; } } if (nr_subvols) { /* type (le32) + ENCODE_START payload - no metric header */ subvol_len = sizeof(__le32) + ceph_subvolume_metric_data_len(nr_subvols); } len = sizeof(*head) + sizeof(*cap) + sizeof(*read) + sizeof(*write) + sizeof(*meta) + sizeof(*dlease) + sizeof(*files) + sizeof(*icaps) + sizeof(*inodes) + sizeof(*rsize) + sizeof(*wsize) + subvol_len; msg = ceph_msg_new(CEPH_MSG_CLIENT_METRICS, len, GFP_NOFS, true); if (!msg) { pr_err_client(cl, "to mds%d, failed to allocate message\n", s->s_mds); kfree(subvols); return false; } head = msg->front.iov_base; /* encode the cap metric */ cap = (struct ceph_metric_cap *)(head + 1); cap->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_CAP_INFO); cap->header.ver = 1; cap->header.compat = 1; cap->header.data_len = cpu_to_le32(sizeof(*cap) - header_len); cap->hit = cpu_to_le64(percpu_counter_sum(&m->i_caps_hit)); cap->mis = cpu_to_le64(percpu_counter_sum(&m->i_caps_mis)); cap->total = cpu_to_le64(nr_caps); items++; /* encode the read latency metric */ read = (struct ceph_metric_read_latency *)(cap + 1); read->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_READ_LATENCY); read->header.ver = 2; read->header.compat = 1; read->header.data_len = cpu_to_le32(sizeof(*read) - header_len); sum = m->metric[METRIC_READ].latency_sum; ktime_to_ceph_timespec(&read->lat, sum); ktime_to_ceph_timespec(&read->avg, m->metric[METRIC_READ].latency_avg); read->sq_sum = cpu_to_le64(m->metric[METRIC_READ].latency_sq_sum); read->count = cpu_to_le64(m->metric[METRIC_READ].total); items++; /* encode the write latency metric */ write = (struct ceph_metric_write_latency *)(read + 1); write->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_WRITE_LATENCY); write->header.ver = 2; write->header.compat = 1; write->header.data_len = cpu_to_le32(sizeof(*write) - header_len); sum = m->metric[METRIC_WRITE].latency_sum; ktime_to_ceph_timespec(&write->lat, sum); ktime_to_ceph_timespec(&write->avg, m->metric[METRIC_WRITE].latency_avg); write->sq_sum = cpu_to_le64(m->metric[METRIC_WRITE].latency_sq_sum); write->count = cpu_to_le64(m->metric[METRIC_WRITE].total); items++; /* encode the metadata latency metric */ meta = (struct ceph_metric_metadata_latency *)(write + 1); meta->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_METADATA_LATENCY); meta->header.ver = 2; meta->header.compat = 1; meta->header.data_len = cpu_to_le32(sizeof(*meta) - header_len); sum = m->metric[METRIC_METADATA].latency_sum; ktime_to_ceph_timespec(&meta->lat, sum); ktime_to_ceph_timespec(&meta->avg, m->metric[METRIC_METADATA].latency_avg); meta->sq_sum = cpu_to_le64(m->metric[METRIC_METADATA].latency_sq_sum); meta->count = cpu_to_le64(m->metric[METRIC_METADATA].total); items++; /* encode the dentry lease metric */ dlease = (struct ceph_metric_dlease *)(meta + 1); dlease->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_DENTRY_LEASE); dlease->header.ver = 1; dlease->header.compat = 1; dlease->header.data_len = cpu_to_le32(sizeof(*dlease) - header_len); dlease->hit = cpu_to_le64(percpu_counter_sum(&m->d_lease_hit)); dlease->mis = cpu_to_le64(percpu_counter_sum(&m->d_lease_mis)); dlease->total = cpu_to_le64(atomic64_read(&m->total_dentries)); items++; sum = percpu_counter_sum(&m->total_inodes); /* encode the opened files metric */ files = (struct ceph_opened_files *)(dlease + 1); files->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_OPENED_FILES); files->header.ver = 1; files->header.compat = 1; files->header.data_len = cpu_to_le32(sizeof(*files) - header_len); files->opened_files = cpu_to_le64(atomic64_read(&m->opened_files)); files->total = cpu_to_le64(sum); items++; /* encode the pinned icaps metric */ icaps = (struct ceph_pinned_icaps *)(files + 1); icaps->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_PINNED_ICAPS); icaps->header.ver = 1; icaps->header.compat = 1; icaps->header.data_len = cpu_to_le32(sizeof(*icaps) - header_len); icaps->pinned_icaps = cpu_to_le64(nr_caps); icaps->total = cpu_to_le64(sum); items++; /* encode the opened inodes metric */ inodes = (struct ceph_opened_inodes *)(icaps + 1); inodes->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_OPENED_INODES); inodes->header.ver = 1; inodes->header.compat = 1; inodes->header.data_len = cpu_to_le32(sizeof(*inodes) - header_len); inodes->opened_inodes = cpu_to_le64(percpu_counter_sum(&m->opened_inodes)); inodes->total = cpu_to_le64(sum); items++; /* encode the read io size metric */ rsize = (struct ceph_read_io_size *)(inodes + 1); rsize->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_READ_IO_SIZES); rsize->header.ver = 1; rsize->header.compat = 1; rsize->header.data_len = cpu_to_le32(sizeof(*rsize) - header_len); rsize->total_ops = cpu_to_le64(m->metric[METRIC_READ].total); rsize->total_size = cpu_to_le64(m->metric[METRIC_READ].size_sum); items++; /* encode the write io size metric */ wsize = (struct ceph_write_io_size *)(rsize + 1); wsize->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_WRITE_IO_SIZES); wsize->header.ver = 1; wsize->header.compat = 1; wsize->header.data_len = cpu_to_le32(sizeof(*wsize) - header_len); wsize->total_ops = cpu_to_le64(m->metric[METRIC_WRITE].total); wsize->total_size = cpu_to_le64(m->metric[METRIC_WRITE].size_sum); items++; cursor = wsize + 1; if (nr_subvols) { void *payload; void *payload_end; int ret; /* Emit only the type (le32), no ver/compat/data_len */ ceph_encode_32(&cursor, CLIENT_METRIC_TYPE_SUBVOLUME_METRICS); items++; payload = cursor; payload_end = (char *)payload + ceph_subvolume_metric_data_len(nr_subvols); ret = ceph_encode_subvolume_metrics(&payload, payload_end, subvols, nr_subvols); if (ret) { pr_warn_client(cl, "failed to encode subvolume metrics\n"); kfree(subvols); ceph_msg_put(msg); return false; } WARN_ON(payload != payload_end); cursor = payload; } put_unaligned_le32(items, &head->num); msg->front.iov_len = (char *)cursor - (char *)head; msg->hdr.version = cpu_to_le16(1); msg->hdr.compat_version = cpu_to_le16(1); msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); ceph_con_send(&s->s_con, msg); if (nr_subvols) { mutex_lock(&mdsc->subvol_metrics_last_mutex); kfree(mdsc->subvol_metrics_last); mdsc->subvol_metrics_last = subvols; mdsc->subvol_metrics_last_nr = nr_subvols; mdsc->subvol_metrics_sent += nr_subvols; mdsc->subvol_metrics_nonzero_sends++; mutex_unlock(&mdsc->subvol_metrics_last_mutex); subvols = NULL; } kfree(subvols); return true; } static void metric_get_session(struct ceph_mds_client *mdsc) { struct ceph_mds_session *s; int i; mutex_lock(&mdsc->mutex); for (i = 0; i < mdsc->max_sessions; i++) { s = __ceph_lookup_mds_session(mdsc, i); if (!s) continue; /* * Skip it if MDS doesn't support the metric collection, * or the MDS will close the session's socket connection * directly when it get this message. * * Also skip sessions that don't support SUBVOLUME_METRICS * when subvolume metrics collection is enabled. This ensures * we only send subvolume metrics to MDSs that understand them. * If no session supports the feature, metrics won't be sent. */ if (check_session_state(s) && test_bit(CEPHFS_FEATURE_METRIC_COLLECT, &s->s_features)) { if (ceph_subvolume_metrics_enabled(&mdsc->subvol_metrics) && !test_bit(CEPHFS_FEATURE_SUBVOLUME_METRICS, &s->s_features)) { ceph_put_mds_session(s); continue; } mdsc->metric.session = s; break; } ceph_put_mds_session(s); } mutex_unlock(&mdsc->mutex); } static void metric_delayed_work(struct work_struct *work) { struct ceph_client_metric *m = container_of(work, struct ceph_client_metric, delayed_work.work); struct ceph_mds_client *mdsc = container_of(m, struct ceph_mds_client, metric); if (mdsc->stopping) return; if (disable_send_metrics) { if (!metrics_disable_warned) { pr_info("ceph: metrics sending disabled via module parameter\n"); metrics_disable_warned = true; } return; } metrics_disable_warned = false; if (!m->session || !check_session_state(m->session)) { if (m->session) { ceph_put_mds_session(m->session); m->session = NULL; } metric_get_session(mdsc); } if (m->session) ceph_mdsc_send_metrics(mdsc, m->session); else pr_warn_ratelimited("ceph: metrics worker has no MDS session\n"); metric_schedule_delayed(m); } int ceph_metric_init(struct ceph_client_metric *m) { struct ceph_metric *metric; int ret, i; if (!m) return -EINVAL; atomic64_set(&m->total_dentries, 0); ret = percpu_counter_init(&m->d_lease_hit, 0, GFP_KERNEL); if (ret) return ret; ret = percpu_counter_init(&m->d_lease_mis, 0, GFP_KERNEL); if (ret) goto err_d_lease_mis; atomic64_set(&m->total_caps, 0); ret = percpu_counter_init(&m->i_caps_hit, 0, GFP_KERNEL); if (ret) goto err_i_caps_hit; ret = percpu_counter_init(&m->i_caps_mis, 0, GFP_KERNEL); if (ret) goto err_i_caps_mis; for (i = 0; i < METRIC_MAX; i++) { metric = &m->metric[i]; spin_lock_init(&metric->lock); metric->size_sum = 0; metric->size_min = U64_MAX; metric->size_max = 0; metric->total = 0; metric->latency_sum = 0; metric->latency_avg = 0; metric->latency_sq_sum = 0; metric->latency_min = KTIME_MAX; metric->latency_max = 0; } atomic64_set(&m->opened_files, 0); ret = percpu_counter_init(&m->opened_inodes, 0, GFP_KERNEL); if (ret) goto err_opened_inodes; ret = percpu_counter_init(&m->total_inodes, 0, GFP_KERNEL); if (ret) goto err_total_inodes; m->session = NULL; INIT_DELAYED_WORK(&m->delayed_work, metric_delayed_work); return 0; err_total_inodes: percpu_counter_destroy(&m->opened_inodes); err_opened_inodes: percpu_counter_destroy(&m->i_caps_mis); err_i_caps_mis: percpu_counter_destroy(&m->i_caps_hit); err_i_caps_hit: percpu_counter_destroy(&m->d_lease_mis); err_d_lease_mis: percpu_counter_destroy(&m->d_lease_hit); return ret; } void ceph_metric_destroy(struct ceph_client_metric *m) { if (!m) return; cancel_delayed_work_sync(&m->delayed_work); percpu_counter_destroy(&m->total_inodes); percpu_counter_destroy(&m->opened_inodes); percpu_counter_destroy(&m->i_caps_mis); percpu_counter_destroy(&m->i_caps_hit); percpu_counter_destroy(&m->d_lease_mis); percpu_counter_destroy(&m->d_lease_hit); ceph_put_mds_session(m->session); } #define METRIC_UPDATE_MIN_MAX(min, max, new) \ { \ if (unlikely(new < min)) \ min = new; \ if (unlikely(new > max)) \ max = new; \ } static inline void __update_mean_and_stdev(ktime_t total, ktime_t *lavg, ktime_t *sq_sump, ktime_t lat) { ktime_t avg; if (unlikely(total == 1)) { *lavg = lat; } else { /* the sq is (lat - old_avg) * (lat - new_avg) */ avg = *lavg + div64_s64(lat - *lavg, total); *sq_sump += (lat - *lavg)*(lat - avg); *lavg = avg; } } void ceph_update_metrics(struct ceph_metric *m, ktime_t r_start, ktime_t r_end, unsigned int size, int rc) { ktime_t lat = ktime_sub(r_end, r_start); ktime_t total; if (unlikely(rc < 0 && rc != -ENOENT && rc != -ETIMEDOUT)) return; spin_lock(&m->lock); total = ++m->total; m->size_sum += size; METRIC_UPDATE_MIN_MAX(m->size_min, m->size_max, size); m->latency_sum += lat; METRIC_UPDATE_MIN_MAX(m->latency_min, m->latency_max, lat); __update_mean_and_stdev(total, &m->latency_avg, &m->latency_sq_sum, lat); spin_unlock(&m->lock); }
9 9 9 9 5 5 5 1 1 1 15 4 2 1 2 1 1 4 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ocfs2/ioctl.c * * Copyright (C) 2006 Herbert Poetzl * adapted from Remy Card's ext2/ioctl.c */ #include <linux/fs.h> #include <linux/mount.h> #include <linux/blkdev.h> #include <linux/compat.h> #include <linux/fileattr.h> #include <cluster/masklog.h> #include "ocfs2.h" #include "alloc.h" #include "dlmglue.h" #include "file.h" #include "inode.h" #include "journal.h" #include "ocfs2_fs.h" #include "ioctl.h" #include "resize.h" #include "refcounttree.h" #include "sysfile.h" #include "dir.h" #include "buffer_head_io.h" #include "suballoc.h" #include "move_extents.h" #define o2info_from_user(a, b) \ copy_from_user(&(a), (b), sizeof(a)) #define o2info_to_user(a, b) \ copy_to_user((typeof(a) __user *)b, &(a), sizeof(a)) /* * This is just a best-effort to tell userspace that this request * caused the error. */ static inline void o2info_set_request_error(struct ocfs2_info_request *kreq, struct ocfs2_info_request __user *req) { kreq->ir_flags |= OCFS2_INFO_FL_ERROR; (void)put_user(kreq->ir_flags, (__u32 __user *)&(req->ir_flags)); } static inline void o2info_set_request_filled(struct ocfs2_info_request *req) { req->ir_flags |= OCFS2_INFO_FL_FILLED; } static inline void o2info_clear_request_filled(struct ocfs2_info_request *req) { req->ir_flags &= ~OCFS2_INFO_FL_FILLED; } static inline int o2info_coherent(struct ocfs2_info_request *req) { return (!(req->ir_flags & OCFS2_INFO_FL_NON_COHERENT)); } int ocfs2_fileattr_get(struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); unsigned int flags; int status; status = ocfs2_inode_lock(inode, NULL, 0); if (status < 0) { mlog_errno(status); return status; } ocfs2_get_inode_flags(OCFS2_I(inode)); flags = OCFS2_I(inode)->ip_attr; ocfs2_inode_unlock(inode, 0); fileattr_fill_flags(fa, flags & OCFS2_FL_VISIBLE); return status; } int ocfs2_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); unsigned int flags = fa->flags; struct ocfs2_inode_info *ocfs2_inode = OCFS2_I(inode); struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); handle_t *handle = NULL; struct buffer_head *bh = NULL; unsigned oldflags; int status; if (fileattr_has_fsx(fa)) return -EOPNOTSUPP; status = ocfs2_inode_lock(inode, &bh, 1); if (status < 0) { mlog_errno(status); goto bail; } if (!S_ISDIR(inode->i_mode)) flags &= ~OCFS2_DIRSYNC_FL; oldflags = ocfs2_inode->ip_attr; flags = flags & OCFS2_FL_MODIFIABLE; flags |= oldflags & ~OCFS2_FL_MODIFIABLE; /* Check already done by VFS, but repeat with ocfs lock */ status = -EPERM; if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL) && !capable(CAP_LINUX_IMMUTABLE)) goto bail_unlock; handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); if (IS_ERR(handle)) { status = PTR_ERR(handle); mlog_errno(status); goto bail_unlock; } ocfs2_inode->ip_attr = flags; ocfs2_set_inode_flags(inode); inode_set_ctime_current(inode); status = ocfs2_mark_inode_dirty(handle, inode, bh); if (status < 0) mlog_errno(status); ocfs2_commit_trans(osb, handle); bail_unlock: ocfs2_inode_unlock(inode, 1); bail: brelse(bh); return status; } static int ocfs2_info_handle_blocksize(struct inode *inode, struct ocfs2_info_request __user *req) { struct ocfs2_info_blocksize oib; if (o2info_from_user(oib, req)) return -EFAULT; oib.ib_blocksize = inode->i_sb->s_blocksize; o2info_set_request_filled(&oib.ib_req); if (o2info_to_user(oib, req)) return -EFAULT; return 0; } static int ocfs2_info_handle_clustersize(struct inode *inode, struct ocfs2_info_request __user *req) { struct ocfs2_info_clustersize oic; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (o2info_from_user(oic, req)) return -EFAULT; oic.ic_clustersize = osb->s_clustersize; o2info_set_request_filled(&oic.ic_req); if (o2info_to_user(oic, req)) return -EFAULT; return 0; } static int ocfs2_info_handle_maxslots(struct inode *inode, struct ocfs2_info_request __user *req) { struct ocfs2_info_maxslots oim; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (o2info_from_user(oim, req)) return -EFAULT; oim.im_max_slots = osb->max_slots; o2info_set_request_filled(&oim.im_req); if (o2info_to_user(oim, req)) return -EFAULT; return 0; } static int ocfs2_info_handle_label(struct inode *inode, struct ocfs2_info_request __user *req) { struct ocfs2_info_label oil; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (o2info_from_user(oil, req)) return -EFAULT; memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN); o2info_set_request_filled(&oil.il_req); if (o2info_to_user(oil, req)) return -EFAULT; return 0; } static int ocfs2_info_handle_uuid(struct inode *inode, struct ocfs2_info_request __user *req) { struct ocfs2_info_uuid oiu; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (o2info_from_user(oiu, req)) return -EFAULT; memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1); o2info_set_request_filled(&oiu.iu_req); if (o2info_to_user(oiu, req)) return -EFAULT; return 0; } static int ocfs2_info_handle_fs_features(struct inode *inode, struct ocfs2_info_request __user *req) { struct ocfs2_info_fs_features oif; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (o2info_from_user(oif, req)) return -EFAULT; oif.if_compat_features = osb->s_feature_compat; oif.if_incompat_features = osb->s_feature_incompat; oif.if_ro_compat_features = osb->s_feature_ro_compat; o2info_set_request_filled(&oif.if_req); if (o2info_to_user(oif, req)) return -EFAULT; return 0; } static int ocfs2_info_handle_journal_size(struct inode *inode, struct ocfs2_info_request __user *req) { struct ocfs2_info_journal_size oij; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (o2info_from_user(oij, req)) return -EFAULT; oij.ij_journal_size = i_size_read(osb->journal->j_inode); o2info_set_request_filled(&oij.ij_req); if (o2info_to_user(oij, req)) return -EFAULT; return 0; } static int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb, struct inode *inode_alloc, u64 blkno, struct ocfs2_info_freeinode *fi, u32 slot) { int status = 0, unlock = 0; struct buffer_head *bh = NULL; struct ocfs2_dinode *dinode_alloc = NULL; if (inode_alloc) inode_lock(inode_alloc); if (inode_alloc && o2info_coherent(&fi->ifi_req)) { status = ocfs2_inode_lock(inode_alloc, &bh, 0); if (status < 0) { mlog_errno(status); goto bail; } unlock = 1; } else { status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh); if (status < 0) { mlog_errno(status); goto bail; } } dinode_alloc = (struct ocfs2_dinode *)bh->b_data; fi->ifi_stat[slot].lfi_total = le32_to_cpu(dinode_alloc->id1.bitmap1.i_total); fi->ifi_stat[slot].lfi_free = le32_to_cpu(dinode_alloc->id1.bitmap1.i_total) - le32_to_cpu(dinode_alloc->id1.bitmap1.i_used); bail: if (unlock) ocfs2_inode_unlock(inode_alloc, 0); if (inode_alloc) inode_unlock(inode_alloc); brelse(bh); return status; } static int ocfs2_info_handle_freeinode(struct inode *inode, struct ocfs2_info_request __user *req) { u32 i; u64 blkno = -1; char namebuf[40]; int status, type = INODE_ALLOC_SYSTEM_INODE; struct ocfs2_info_freeinode *oifi = NULL; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct inode *inode_alloc = NULL; oifi = kzalloc_obj(struct ocfs2_info_freeinode); if (!oifi) { status = -ENOMEM; mlog_errno(status); goto out_err; } if (o2info_from_user(*oifi, req)) { status = -EFAULT; goto out_free; } oifi->ifi_slotnum = osb->max_slots; for (i = 0; i < oifi->ifi_slotnum; i++) { if (o2info_coherent(&oifi->ifi_req)) { inode_alloc = ocfs2_get_system_file_inode(osb, type, i); if (!inode_alloc) { mlog(ML_ERROR, "unable to get alloc inode in " "slot %u\n", i); status = -EIO; goto bail; } } else { int len = ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, i); status = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf, len, &blkno); if (status < 0) { status = -ENOENT; goto bail; } } status = ocfs2_info_scan_inode_alloc(osb, inode_alloc, blkno, oifi, i); iput(inode_alloc); inode_alloc = NULL; if (status < 0) goto bail; } o2info_set_request_filled(&oifi->ifi_req); if (o2info_to_user(*oifi, req)) { status = -EFAULT; goto out_free; } status = 0; bail: if (status) o2info_set_request_error(&oifi->ifi_req, req); out_free: kfree(oifi); out_err: return status; } static void o2ffg_update_histogram(struct ocfs2_info_free_chunk_list *hist, unsigned int chunksize) { u32 index; index = __ilog2_u32(chunksize); if (index >= OCFS2_INFO_MAX_HIST) index = OCFS2_INFO_MAX_HIST - 1; hist->fc_chunks[index]++; hist->fc_clusters[index] += chunksize; } static void o2ffg_update_stats(struct ocfs2_info_freefrag_stats *stats, unsigned int chunksize) { if (chunksize > stats->ffs_max) stats->ffs_max = chunksize; if (chunksize < stats->ffs_min) stats->ffs_min = chunksize; stats->ffs_avg += chunksize; stats->ffs_free_chunks_real++; } static void ocfs2_info_update_ffg(struct ocfs2_info_freefrag *ffg, unsigned int chunksize) { o2ffg_update_histogram(&(ffg->iff_ffs.ffs_fc_hist), chunksize); o2ffg_update_stats(&(ffg->iff_ffs), chunksize); } static int ocfs2_info_freefrag_scan_chain(struct ocfs2_super *osb, struct inode *gb_inode, struct ocfs2_dinode *gb_dinode, struct ocfs2_chain_rec *rec, struct ocfs2_info_freefrag *ffg, u32 chunks_in_group) { int status = 0, used; u64 blkno; struct buffer_head *bh = NULL; struct ocfs2_group_desc *bg = NULL; unsigned int max_bits, max_bitmap_bits, num_clusters; unsigned int offset = 0, cluster, chunk; unsigned int chunk_free, last_chunksize = 0; if (!le32_to_cpu(rec->c_free)) goto bail; max_bitmap_bits = 8 * ocfs2_group_bitmap_size(osb->sb, 0, osb->s_feature_incompat); do { if (!bg) blkno = le64_to_cpu(rec->c_blkno); else blkno = le64_to_cpu(bg->bg_next_group); if (bh) { brelse(bh); bh = NULL; } if (o2info_coherent(&ffg->iff_req)) status = ocfs2_read_group_descriptor(gb_inode, gb_dinode, blkno, &bh); else status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh); if (status < 0) { mlog(ML_ERROR, "Can't read the group descriptor # " "%llu from device.", (unsigned long long)blkno); status = -EIO; goto bail; } bg = (struct ocfs2_group_desc *)bh->b_data; if (!le16_to_cpu(bg->bg_free_bits_count)) continue; max_bits = le16_to_cpu(bg->bg_bits); /* * Non-coherent scans read raw blocks and do not get the * bg_bits validation from * ocfs2_read_group_descriptor(). */ if (max_bits > max_bitmap_bits) { mlog(ML_ERROR, "Group desc #%llu has %u bits, max bitmap bits %u\n", (unsigned long long)blkno, max_bits, max_bitmap_bits); max_bits = max_bitmap_bits; } offset = 0; for (chunk = 0; chunk < chunks_in_group; chunk++) { /* * last chunk may be not an entire one. */ if ((offset + ffg->iff_chunksize) > max_bits) num_clusters = max_bits - offset; else num_clusters = ffg->iff_chunksize; chunk_free = 0; for (cluster = 0; cluster < num_clusters; cluster++) { used = ocfs2_test_bit(offset, (unsigned long *)bg->bg_bitmap); /* * - chunk_free counts free clusters in #N chunk. * - last_chunksize records the size(in) clusters * for the last real free chunk being counted. */ if (!used) { last_chunksize++; chunk_free++; } if (used && last_chunksize) { ocfs2_info_update_ffg(ffg, last_chunksize); last_chunksize = 0; } offset++; } if (chunk_free == ffg->iff_chunksize) ffg->iff_ffs.ffs_free_chunks++; } /* * need to update the info for last free chunk. */ if (last_chunksize) ocfs2_info_update_ffg(ffg, last_chunksize); } while (le64_to_cpu(bg->bg_next_group)); bail: brelse(bh); return status; } static int ocfs2_info_freefrag_scan_bitmap(struct ocfs2_super *osb, struct inode *gb_inode, u64 blkno, struct ocfs2_info_freefrag *ffg) { u32 chunks_in_group; int status = 0, unlock = 0, i; struct buffer_head *bh = NULL; struct ocfs2_chain_list *cl = NULL; struct ocfs2_chain_rec *rec = NULL; struct ocfs2_dinode *gb_dinode = NULL; if (gb_inode) inode_lock(gb_inode); if (o2info_coherent(&ffg->iff_req)) { status = ocfs2_inode_lock(gb_inode, &bh, 0); if (status < 0) { mlog_errno(status); goto bail; } unlock = 1; } else { status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh); if (status < 0) { mlog_errno(status); goto bail; } } gb_dinode = (struct ocfs2_dinode *)bh->b_data; cl = &(gb_dinode->id2.i_chain); /* * Chunksize(in) clusters from userspace should be * less than clusters in a group. */ if (ffg->iff_chunksize > le16_to_cpu(cl->cl_cpg)) { status = -EINVAL; goto bail; } memset(&ffg->iff_ffs, 0, sizeof(struct ocfs2_info_freefrag_stats)); ffg->iff_ffs.ffs_min = ~0U; ffg->iff_ffs.ffs_clusters = le32_to_cpu(gb_dinode->id1.bitmap1.i_total); ffg->iff_ffs.ffs_free_clusters = ffg->iff_ffs.ffs_clusters - le32_to_cpu(gb_dinode->id1.bitmap1.i_used); chunks_in_group = le16_to_cpu(cl->cl_cpg) / ffg->iff_chunksize + 1; for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) { rec = &(cl->cl_recs[i]); status = ocfs2_info_freefrag_scan_chain(osb, gb_inode, gb_dinode, rec, ffg, chunks_in_group); if (status) goto bail; } if (ffg->iff_ffs.ffs_free_chunks_real) ffg->iff_ffs.ffs_avg = (ffg->iff_ffs.ffs_avg / ffg->iff_ffs.ffs_free_chunks_real); bail: if (unlock) ocfs2_inode_unlock(gb_inode, 0); if (gb_inode) inode_unlock(gb_inode); iput(gb_inode); brelse(bh); return status; } static int ocfs2_info_handle_freefrag(struct inode *inode, struct ocfs2_info_request __user *req) { u64 blkno = -1; char namebuf[40]; int status, type = GLOBAL_BITMAP_SYSTEM_INODE; struct ocfs2_info_freefrag *oiff; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct inode *gb_inode = NULL; oiff = kzalloc_obj(struct ocfs2_info_freefrag); if (!oiff) { status = -ENOMEM; mlog_errno(status); goto out_err; } if (o2info_from_user(*oiff, req)) { status = -EFAULT; goto out_free; } /* * chunksize from userspace should be power of 2. */ if ((oiff->iff_chunksize & (oiff->iff_chunksize - 1)) || (!oiff->iff_chunksize)) { status = -EINVAL; goto bail; } if (o2info_coherent(&oiff->iff_req)) { gb_inode = ocfs2_get_system_file_inode(osb, type, OCFS2_INVALID_SLOT); if (!gb_inode) { mlog(ML_ERROR, "unable to get global_bitmap inode\n"); status = -EIO; goto bail; } } else { int len = ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, OCFS2_INVALID_SLOT); status = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf, len, &blkno); if (status < 0) { status = -ENOENT; goto bail; } } status = ocfs2_info_freefrag_scan_bitmap(osb, gb_inode, blkno, oiff); if (status < 0) goto bail; o2info_set_request_filled(&oiff->iff_req); if (o2info_to_user(*oiff, req)) { status = -EFAULT; goto out_free; } status = 0; bail: if (status) o2info_set_request_error(&oiff->iff_req, req); out_free: kfree(oiff); out_err: return status; } static int ocfs2_info_handle_unknown(struct inode *inode, struct ocfs2_info_request __user *req) { struct ocfs2_info_request oir; if (o2info_from_user(oir, req)) return -EFAULT; o2info_clear_request_filled(&oir); if (o2info_to_user(oir, req)) return -EFAULT; return 0; } /* * Validate and distinguish OCFS2_IOC_INFO requests. * * - validate the magic number. * - distinguish different requests. * - validate size of different requests. */ static int ocfs2_info_handle_request(struct inode *inode, struct ocfs2_info_request __user *req) { int status = -EFAULT; struct ocfs2_info_request oir; if (o2info_from_user(oir, req)) goto bail; status = -EINVAL; if (oir.ir_magic != OCFS2_INFO_MAGIC) goto bail; switch (oir.ir_code) { case OCFS2_INFO_BLOCKSIZE: if (oir.ir_size == sizeof(struct ocfs2_info_blocksize)) status = ocfs2_info_handle_blocksize(inode, req); break; case OCFS2_INFO_CLUSTERSIZE: if (oir.ir_size == sizeof(struct ocfs2_info_clustersize)) status = ocfs2_info_handle_clustersize(inode, req); break; case OCFS2_INFO_MAXSLOTS: if (oir.ir_size == sizeof(struct ocfs2_info_maxslots)) status = ocfs2_info_handle_maxslots(inode, req); break; case OCFS2_INFO_LABEL: if (oir.ir_size == sizeof(struct ocfs2_info_label)) status = ocfs2_info_handle_label(inode, req); break; case OCFS2_INFO_UUID: if (oir.ir_size == sizeof(struct ocfs2_info_uuid)) status = ocfs2_info_handle_uuid(inode, req); break; case OCFS2_INFO_FS_FEATURES: if (oir.ir_size == sizeof(struct ocfs2_info_fs_features)) status = ocfs2_info_handle_fs_features(inode, req); break; case OCFS2_INFO_JOURNAL_SIZE: if (oir.ir_size == sizeof(struct ocfs2_info_journal_size)) status = ocfs2_info_handle_journal_size(inode, req); break; case OCFS2_INFO_FREEINODE: if (oir.ir_size == sizeof(struct ocfs2_info_freeinode)) status = ocfs2_info_handle_freeinode(inode, req); break; case OCFS2_INFO_FREEFRAG: if (oir.ir_size == sizeof(struct ocfs2_info_freefrag)) status = ocfs2_info_handle_freefrag(inode, req); break; default: status = ocfs2_info_handle_unknown(inode, req); break; } bail: return status; } static int ocfs2_get_request_ptr(struct ocfs2_info *info, int idx, u64 *req_addr, int compat_flag) { int status = -EFAULT; u64 __user *bp = NULL; if (compat_flag) { #ifdef CONFIG_COMPAT /* * pointer bp stores the base address of a pointers array, * which collects all addresses of separate request. */ bp = (u64 __user *)(unsigned long)compat_ptr(info->oi_requests); #else BUG(); #endif } else bp = (u64 __user *)(unsigned long)(info->oi_requests); if (o2info_from_user(*req_addr, bp + idx)) goto bail; status = 0; bail: return status; } /* * OCFS2_IOC_INFO handles an array of requests passed from userspace. * * ocfs2_info_handle() receives a large info aggregation, grab and * validate the request count from header, then break it into small * pieces, later specific handlers can handle them one by one. * * Idea here is to make each separate request small enough to ensure * a better backward&forward compatibility, since a small piece of * request will be less likely to be broken if disk layout get changed. */ static noinline_for_stack int ocfs2_info_handle(struct inode *inode, struct ocfs2_info *info, int compat_flag) { int i, status = 0; u64 req_addr; struct ocfs2_info_request __user *reqp; if ((info->oi_count > OCFS2_INFO_MAX_REQUEST) || (!info->oi_requests)) { status = -EINVAL; goto bail; } for (i = 0; i < info->oi_count; i++) { status = ocfs2_get_request_ptr(info, i, &req_addr, compat_flag); if (status) break; reqp = (struct ocfs2_info_request __user *)(unsigned long)req_addr; if (!reqp) { status = -EINVAL; goto bail; } status = ocfs2_info_handle_request(inode, reqp); if (status) break; } bail: return status; } long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); void __user *argp = (void __user *)arg; int status; switch (cmd) { case OCFS2_IOC_RESVSP: case OCFS2_IOC_RESVSP64: case OCFS2_IOC_UNRESVSP: case OCFS2_IOC_UNRESVSP64: { struct ocfs2_space_resv sr; if (copy_from_user(&sr, (int __user *) arg, sizeof(sr))) return -EFAULT; return ocfs2_change_file_space(filp, cmd, &sr); } case OCFS2_IOC_GROUP_EXTEND: { int new_clusters; if (!capable(CAP_SYS_RESOURCE)) return -EPERM; if (get_user(new_clusters, (int __user *)arg)) return -EFAULT; status = mnt_want_write_file(filp); if (status) return status; status = ocfs2_group_extend(inode, new_clusters); mnt_drop_write_file(filp); return status; } case OCFS2_IOC_GROUP_ADD: case OCFS2_IOC_GROUP_ADD64: { struct ocfs2_new_group_input input; if (!capable(CAP_SYS_RESOURCE)) return -EPERM; if (copy_from_user(&input, (int __user *) arg, sizeof(input))) return -EFAULT; status = mnt_want_write_file(filp); if (status) return status; status = ocfs2_group_add(inode, &input); mnt_drop_write_file(filp); return status; } case OCFS2_IOC_REFLINK: { struct reflink_arguments args; const char __user *old_path; const char __user *new_path; bool preserve; if (copy_from_user(&args, argp, sizeof(args))) return -EFAULT; old_path = (const char __user *)(unsigned long)args.old_path; new_path = (const char __user *)(unsigned long)args.new_path; preserve = (args.preserve != 0); return ocfs2_reflink_ioctl(inode, old_path, new_path, preserve); } case OCFS2_IOC_INFO: { struct ocfs2_info info; if (copy_from_user(&info, argp, sizeof(struct ocfs2_info))) return -EFAULT; return ocfs2_info_handle(inode, &info, 0); } case FITRIM: { struct super_block *sb = inode->i_sb; struct fstrim_range range; int ret = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (!bdev_max_discard_sectors(sb->s_bdev)) return -EOPNOTSUPP; if (copy_from_user(&range, argp, sizeof(range))) return -EFAULT; range.minlen = max_t(u64, bdev_discard_granularity(sb->s_bdev), range.minlen); ret = ocfs2_trim_fs(sb, &range); if (ret < 0) return ret; if (copy_to_user(argp, &range, sizeof(range))) return -EFAULT; return 0; } case OCFS2_IOC_MOVE_EXT: return ocfs2_ioctl_move_extents(filp, argp); default: return -ENOTTY; } } #ifdef CONFIG_COMPAT long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg) { bool preserve; struct reflink_arguments args; struct inode *inode = file_inode(file); struct ocfs2_info info; void __user *argp = (void __user *)arg; switch (cmd) { case OCFS2_IOC_RESVSP: case OCFS2_IOC_RESVSP64: case OCFS2_IOC_UNRESVSP: case OCFS2_IOC_UNRESVSP64: case OCFS2_IOC_GROUP_EXTEND: case OCFS2_IOC_GROUP_ADD: case OCFS2_IOC_GROUP_ADD64: break; case OCFS2_IOC_REFLINK: if (copy_from_user(&args, argp, sizeof(args))) return -EFAULT; preserve = (args.preserve != 0); return ocfs2_reflink_ioctl(inode, compat_ptr(args.old_path), compat_ptr(args.new_path), preserve); case OCFS2_IOC_INFO: if (copy_from_user(&info, argp, sizeof(struct ocfs2_info))) return -EFAULT; return ocfs2_info_handle(inode, &info, 1); case FITRIM: case OCFS2_IOC_MOVE_EXT: break; default: return -ENOIOCTLCMD; } return ocfs2_ioctl(file, cmd, arg); } #endif
4 4 1 1 4 4 4 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 // SPDX-License-Identifier: GPL-2.0 /* * HugeTLB Vmemmap Optimization (HVO) * * Copyright (c) 2020, ByteDance. All rights reserved. * * Author: Muchun Song <songmuchun@bytedance.com> * * See Documentation/mm/vmemmap_dedup.rst */ #define pr_fmt(fmt) "HugeTLB: " fmt #include <linux/pgtable.h> #include <linux/moduleparam.h> #include <linux/bootmem_info.h> #include <linux/mmdebug.h> #include <linux/pagewalk.h> #include <linux/pgalloc.h> #include <asm/tlbflush.h> #include "hugetlb_vmemmap.h" #include "internal.h" /** * struct vmemmap_remap_walk - walk vmemmap page table * * @remap_pte: called for each lowest-level entry (PTE). * @nr_walked: the number of walked pte. * @vmemmap_head: the page to be installed as first in the vmemmap range * @vmemmap_tail: the page to be installed as non-first in the vmemmap range * @vmemmap_pages: the list head of the vmemmap pages that can be freed * or is mapped from. * @flags: used to modify behavior in vmemmap page table walking * operations. */ struct vmemmap_remap_walk { void (*remap_pte)(pte_t *pte, unsigned long addr, struct vmemmap_remap_walk *walk); unsigned long nr_walked; struct page *vmemmap_head; struct page *vmemmap_tail; struct list_head *vmemmap_pages; /* Skip the TLB flush when we split the PMD */ #define VMEMMAP_SPLIT_NO_TLB_FLUSH BIT(0) /* Skip the TLB flush when we remap the PTE */ #define VMEMMAP_REMAP_NO_TLB_FLUSH BIT(1) unsigned long flags; }; static int vmemmap_split_pmd(pmd_t *pmd, struct page *head, unsigned long start, struct vmemmap_remap_walk *walk) { pmd_t __pmd; int i; unsigned long addr = start; pte_t *pgtable; pgtable = pte_alloc_one_kernel(&init_mm); if (!pgtable) return -ENOMEM; pmd_populate_kernel(&init_mm, &__pmd, pgtable); for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) { pte_t entry, *pte; pgprot_t pgprot = PAGE_KERNEL; entry = mk_pte(head + i, pgprot); pte = pte_offset_kernel(&__pmd, addr); set_pte_at(&init_mm, addr, pte, entry); } spin_lock(&init_mm.page_table_lock); if (likely(pmd_leaf(*pmd))) { /* * Higher order allocations from buddy allocator must be able to * be treated as independent small pages (as they can be freed * individually). */ if (!PageReserved(head)) split_page(head, get_order(PMD_SIZE)); /* Make pte visible before pmd. See comment in pmd_install(). */ smp_wmb(); pmd_populate_kernel(&init_mm, pmd, pgtable); if (!(walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH)) flush_tlb_kernel_range(start, start + PMD_SIZE); } else { pte_free_kernel(&init_mm, pgtable); } spin_unlock(&init_mm.page_table_lock); return 0; } static int vmemmap_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, struct mm_walk *walk) { int ret = 0; struct page *head; struct vmemmap_remap_walk *vmemmap_walk = walk->private; /* Only splitting, not remapping the vmemmap pages. */ if (!vmemmap_walk->remap_pte) walk->action = ACTION_CONTINUE; spin_lock(&init_mm.page_table_lock); head = pmd_leaf(*pmd) ? pmd_page(*pmd) : NULL; /* * Due to HugeTLB alignment requirements and the vmemmap * pages being at the start of the hotplugged memory * region in memory_hotplug.memmap_on_memory case. Checking * the vmemmap page associated with the first vmemmap page * if it is self-hosted is sufficient. * * [ hotplugged memory ] * [ section ][...][ section ] * [ vmemmap ][ usable memory ] * ^ | ^ | * +--+ | | * +------------------------+ */ if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && unlikely(!vmemmap_walk->nr_walked)) { struct page *page = head ? head + pte_index(addr) : pte_page(ptep_get(pte_offset_kernel(pmd, addr))); if (PageVmemmapSelfHosted(page)) ret = -ENOTSUPP; } spin_unlock(&init_mm.page_table_lock); if (!head || ret) return ret; return vmemmap_split_pmd(pmd, head, addr & PMD_MASK, vmemmap_walk); } static int vmemmap_pte_entry(pte_t *pte, unsigned long addr, unsigned long next, struct mm_walk *walk) { struct vmemmap_remap_walk *vmemmap_walk = walk->private; vmemmap_walk->remap_pte(pte, addr, vmemmap_walk); vmemmap_walk->nr_walked++; return 0; } static const struct mm_walk_ops vmemmap_remap_ops = { .pmd_entry = vmemmap_pmd_entry, .pte_entry = vmemmap_pte_entry, }; static int vmemmap_remap_range(unsigned long start, unsigned long end, struct vmemmap_remap_walk *walk) { int ret; VM_BUG_ON(!PAGE_ALIGNED(start | end)); mmap_read_lock(&init_mm); ret = walk_kernel_page_table_range(start, end, &vmemmap_remap_ops, NULL, walk); mmap_read_unlock(&init_mm); if (ret) return ret; if (walk->remap_pte && !(walk->flags & VMEMMAP_REMAP_NO_TLB_FLUSH)) flush_tlb_kernel_range(start, end); return 0; } /* * Free a vmemmap page. A vmemmap page can be allocated from the memblock * allocator or buddy allocator. If the PG_reserved flag is set, it means * that it allocated from the memblock allocator, just free it via the * free_bootmem_page(). Otherwise, use __free_page(). */ static inline void free_vmemmap_page(struct page *page) { if (PageReserved(page)) { memmap_boot_pages_add(-1); free_bootmem_page(page); } else { memmap_pages_add(-1); __free_page(page); } } /* Free a list of the vmemmap pages */ static void free_vmemmap_page_list(struct list_head *list) { struct page *page, *next; list_for_each_entry_safe(page, next, list, lru) free_vmemmap_page(page); } static void vmemmap_remap_pte(pte_t *pte, unsigned long addr, struct vmemmap_remap_walk *walk) { struct page *page = pte_page(ptep_get(pte)); pte_t entry; /* Remapping the head page requires r/w */ if (unlikely(walk->nr_walked == 0 && walk->vmemmap_head)) { list_del(&walk->vmemmap_head->lru); /* * Makes sure that preceding stores to the page contents from * vmemmap_remap_free() become visible before the set_pte_at() * write. */ smp_wmb(); entry = mk_pte(walk->vmemmap_head, PAGE_KERNEL); } else { /* * Remap the tail pages as read-only to catch illegal write * operation to the tail pages. */ entry = mk_pte(walk->vmemmap_tail, PAGE_KERNEL_RO); } list_add(&page->lru, walk->vmemmap_pages); set_pte_at(&init_mm, addr, pte, entry); } static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, struct vmemmap_remap_walk *walk) { struct page *page; struct page *from, *to; page = list_first_entry(walk->vmemmap_pages, struct page, lru); list_del(&page->lru); /* * Initialize tail pages in the newly allocated vmemmap page. * * There is folio-scope metadata that is encoded in the first few * tail pages. * * Use the value last tail page in the page with the head page * to initialize the rest of tail pages. */ from = compound_head((struct page *)addr) + PAGE_SIZE / sizeof(struct page) - 1; to = page_to_virt(page); for (int i = 0; i < PAGE_SIZE / sizeof(struct page); i++, to++) *to = *from; /* * Makes sure that preceding stores to the page contents become visible * before the set_pte_at() write. */ smp_wmb(); set_pte_at(&init_mm, addr, pte, mk_pte(page, PAGE_KERNEL)); } /** * vmemmap_remap_split - split the vmemmap virtual address range [@start, @end) * backing PMDs of the directmap into PTEs * @start: start address of the vmemmap virtual address range that we want * to remap. * @end: end address of the vmemmap virtual address range that we want to * remap. * Return: %0 on success, negative error code otherwise. */ static int vmemmap_remap_split(unsigned long start, unsigned long end) { struct vmemmap_remap_walk walk = { .remap_pte = NULL, .flags = VMEMMAP_SPLIT_NO_TLB_FLUSH, }; return vmemmap_remap_range(start, end, &walk); } /** * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end) * to use @vmemmap_head/tail, then free vmemmap which * the range are mapped to. * @start: start address of the vmemmap virtual address range that we want * to remap. * @end: end address of the vmemmap virtual address range that we want to * remap. * @vmemmap_head: the page to be installed as first in the vmemmap range * @vmemmap_tail: the page to be installed as non-first in the vmemmap range * @vmemmap_pages: list to deposit vmemmap pages to be freed. It is callers * responsibility to free pages. * @flags: modifications to vmemmap_remap_walk flags * * Return: %0 on success, negative error code otherwise. */ static int vmemmap_remap_free(unsigned long start, unsigned long end, struct page *vmemmap_head, struct page *vmemmap_tail, struct list_head *vmemmap_pages, unsigned long flags) { int ret; struct vmemmap_remap_walk walk = { .remap_pte = vmemmap_remap_pte, .vmemmap_head = vmemmap_head, .vmemmap_tail = vmemmap_tail, .vmemmap_pages = vmemmap_pages, .flags = flags, }; ret = vmemmap_remap_range(start, end, &walk); if (!ret || !walk.nr_walked) return ret; end = start + walk.nr_walked * PAGE_SIZE; /* * vmemmap_pages contains pages from the previous vmemmap_remap_range() * call which failed. These are pages which were removed from * the vmemmap. They will be restored in the following call. */ walk = (struct vmemmap_remap_walk) { .remap_pte = vmemmap_restore_pte, .vmemmap_pages = vmemmap_pages, .flags = 0, }; vmemmap_remap_range(start, end, &walk); return ret; } static int alloc_vmemmap_page_list(unsigned long start, unsigned long end, struct list_head *list) { gfp_t gfp_mask = GFP_KERNEL | __GFP_RETRY_MAYFAIL; unsigned long nr_pages = (end - start) >> PAGE_SHIFT; int nid = page_to_nid((struct page *)start); struct page *page, *next; int i; for (i = 0; i < nr_pages; i++) { page = alloc_pages_node(nid, gfp_mask, 0); if (!page) goto out; list_add(&page->lru, list); } memmap_pages_add(nr_pages); return 0; out: list_for_each_entry_safe(page, next, list, lru) __free_page(page); return -ENOMEM; } /** * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, end) * to the page which is from the @vmemmap_pages * respectively. * @start: start address of the vmemmap virtual address range that we want * to remap. * @end: end address of the vmemmap virtual address range that we want to * remap. * @flags: modifications to vmemmap_remap_walk flags * * Return: %0 on success, negative error code otherwise. */ static int vmemmap_remap_alloc(unsigned long start, unsigned long end, unsigned long flags) { LIST_HEAD(vmemmap_pages); struct vmemmap_remap_walk walk = { .remap_pte = vmemmap_restore_pte, .vmemmap_pages = &vmemmap_pages, .flags = flags, }; if (alloc_vmemmap_page_list(start, end, &vmemmap_pages)) return -ENOMEM; return vmemmap_remap_range(start, end, &walk); } static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON); static int __init hugetlb_vmemmap_optimize_param(char *buf) { return kstrtobool(buf, &vmemmap_optimize_enabled); } early_param("hugetlb_free_vmemmap", hugetlb_vmemmap_optimize_param); static int __hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio, unsigned long flags) { int ret; unsigned long vmemmap_start, vmemmap_end; VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio); if (!folio_test_hugetlb_vmemmap_optimized(folio)) return 0; vmemmap_start = (unsigned long)&folio->page; vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE; /* * The pages which the vmemmap virtual address range [@vmemmap_start, * @vmemmap_end) are mapped to are freed to the buddy allocator. * When a HugeTLB page is freed to the buddy allocator, previously * discarded vmemmap pages must be allocated and remapping. */ ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, flags); if (!ret) folio_clear_hugetlb_vmemmap_optimized(folio); return ret; } /** * hugetlb_vmemmap_restore_folio - restore previously optimized (by * hugetlb_vmemmap_optimize_folio()) vmemmap pages which * will be reallocated and remapped. * @h: struct hstate. * @folio: the folio whose vmemmap pages will be restored. * * Return: %0 if @folio's vmemmap pages have been reallocated and remapped, * negative error code otherwise. */ int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio) { return __hugetlb_vmemmap_restore_folio(h, folio, 0); } /** * hugetlb_vmemmap_restore_folios - restore vmemmap for every folio on the list. * @h: hstate. * @folio_list: list of folios. * @non_hvo_folios: Output list of folios for which vmemmap exists. * * Return: number of folios for which vmemmap was restored, or an error code * if an error was encountered restoring vmemmap for a folio. * Folios that have vmemmap are moved to the non_hvo_folios * list. Processing of entries stops when the first error is * encountered. The folio that experienced the error and all * non-processed folios will remain on folio_list. */ long hugetlb_vmemmap_restore_folios(const struct hstate *h, struct list_head *folio_list, struct list_head *non_hvo_folios) { struct folio *folio, *t_folio; long restored = 0; long ret = 0; unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH; list_for_each_entry_safe(folio, t_folio, folio_list, lru) { if (folio_test_hugetlb_vmemmap_optimized(folio)) { ret = __hugetlb_vmemmap_restore_folio(h, folio, flags); if (ret) break; restored++; } /* Add non-optimized folios to output list */ list_move(&folio->lru, non_hvo_folios); } if (restored) flush_tlb_all(); if (!ret) ret = restored; return ret; } /* Return true iff a HugeTLB whose vmemmap should and can be optimized. */ static bool vmemmap_should_optimize_folio(const struct hstate *h, struct folio *folio) { if (folio_test_hugetlb_vmemmap_optimized(folio)) return false; if (!READ_ONCE(vmemmap_optimize_enabled)) return false; if (!hugetlb_vmemmap_optimizable(h)) return false; return true; } static struct page *vmemmap_get_tail(unsigned int order, struct zone *zone) { const unsigned int idx = order - VMEMMAP_TAIL_MIN_ORDER; struct page *tail, *p; int node = zone_to_nid(zone); tail = READ_ONCE(zone->vmemmap_tails[idx]); if (likely(tail)) return tail; tail = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); if (!tail) return NULL; p = page_to_virt(tail); for (int i = 0; i < PAGE_SIZE / sizeof(struct page); i++) init_compound_tail(p + i, NULL, order, zone); if (cmpxchg(&zone->vmemmap_tails[idx], NULL, tail)) { __free_page(tail); tail = READ_ONCE(zone->vmemmap_tails[idx]); } return tail; } static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio, struct list_head *vmemmap_pages, unsigned long flags) { unsigned long vmemmap_start, vmemmap_end; struct page *vmemmap_head, *vmemmap_tail; int nid, ret = 0; VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio); if (!vmemmap_should_optimize_folio(h, folio)) return ret; nid = folio_nid(folio); vmemmap_tail = vmemmap_get_tail(h->order, folio_zone(folio)); if (!vmemmap_tail) return -ENOMEM; /* * Very Subtle * If VMEMMAP_REMAP_NO_TLB_FLUSH is set, TLB flushing is not performed * immediately after remapping. As a result, subsequent accesses * and modifications to struct pages associated with the hugetlb * page could be to the OLD struct pages. Set the vmemmap optimized * flag here so that it is copied to the new head page. This keeps * the old and new struct pages in sync. * If there is an error during optimization, we will immediately FLUSH * the TLB and clear the flag below. */ folio_set_hugetlb_vmemmap_optimized(folio); vmemmap_head = alloc_pages_node(nid, GFP_KERNEL, 0); if (!vmemmap_head) { ret = -ENOMEM; goto out; } copy_page(page_to_virt(vmemmap_head), folio); list_add(&vmemmap_head->lru, vmemmap_pages); memmap_pages_add(1); vmemmap_start = (unsigned long)&folio->page; vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); /* * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end). * Add pages previously mapping the range to vmemmap_pages list so that * they can be freed by the caller. */ ret = vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_head, vmemmap_tail, vmemmap_pages, flags); out: if (ret) folio_clear_hugetlb_vmemmap_optimized(folio); return ret; } /** * hugetlb_vmemmap_optimize_folio - optimize @folio's vmemmap pages. * @h: struct hstate. * @folio: the folio whose vmemmap pages will be optimized. * * This function only tries to optimize @folio's vmemmap pages and does not * guarantee that the optimization will succeed after it returns. The caller * can use folio_test_hugetlb_vmemmap_optimized(@folio) to detect if @folio's * vmemmap pages have been optimized. */ void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio) { LIST_HEAD(vmemmap_pages); __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, 0); free_vmemmap_page_list(&vmemmap_pages); } static int hugetlb_vmemmap_split_folio(const struct hstate *h, struct folio *folio) { unsigned long vmemmap_start, vmemmap_end; if (!vmemmap_should_optimize_folio(h, folio)) return 0; vmemmap_start = (unsigned long)&folio->page; vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); /* * Split PMDs on the vmemmap virtual address range [@vmemmap_start, * @vmemmap_end] */ return vmemmap_remap_split(vmemmap_start, vmemmap_end); } static void __hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list, bool boot) { struct folio *folio; int nr_to_optimize; LIST_HEAD(vmemmap_pages); unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH; nr_to_optimize = 0; list_for_each_entry(folio, folio_list, lru) { int ret; unsigned long spfn, epfn; if (boot && folio_test_hugetlb_vmemmap_optimized(folio)) { /* * Already optimized by pre-HVO, just map the * mirrored tail page structs RO. */ spfn = (unsigned long)&folio->page; epfn = spfn + pages_per_huge_page(h); vmemmap_wrprotect_hvo(spfn, epfn, folio_nid(folio), HUGETLB_VMEMMAP_RESERVE_SIZE); register_page_bootmem_memmap(pfn_to_section_nr(spfn), &folio->page, HUGETLB_VMEMMAP_RESERVE_SIZE); continue; } nr_to_optimize++; ret = hugetlb_vmemmap_split_folio(h, folio); /* * Splitting the PMD requires allocating a page, thus let's fail * early once we encounter the first OOM. No point in retrying * as it can be dynamically done on remap with the memory * we get back from the vmemmap deduplication. */ if (ret == -ENOMEM) break; } if (!nr_to_optimize) /* * All pre-HVO folios, nothing left to do. It's ok if * there is a mix of pre-HVO and not yet HVO-ed folios * here, as __hugetlb_vmemmap_optimize_folio() will * skip any folios that already have the optimized flag * set, see vmemmap_should_optimize_folio(). */ goto out; flush_tlb_all(); list_for_each_entry(folio, folio_list, lru) { int ret; ret = __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, flags); /* * Pages to be freed may have been accumulated. If we * encounter an ENOMEM, free what we have and try again. * This can occur in the case that both splitting fails * halfway and head page allocation also failed. In this * case __hugetlb_vmemmap_optimize_folio() would free memory * allowing more vmemmap remaps to occur. */ if (ret == -ENOMEM && !list_empty(&vmemmap_pages)) { flush_tlb_all(); free_vmemmap_page_list(&vmemmap_pages); INIT_LIST_HEAD(&vmemmap_pages); __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, flags); } } out: flush_tlb_all(); free_vmemmap_page_list(&vmemmap_pages); } void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list) { __hugetlb_vmemmap_optimize_folios(h, folio_list, false); } void hugetlb_vmemmap_optimize_bootmem_folios(struct hstate *h, struct list_head *folio_list) { __hugetlb_vmemmap_optimize_folios(h, folio_list, true); } #ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT /* Return true of a bootmem allocated HugeTLB page should be pre-HVO-ed */ static bool vmemmap_should_optimize_bootmem_page(struct huge_bootmem_page *m) { unsigned long section_size, psize, pmd_vmemmap_size; phys_addr_t paddr; if (!READ_ONCE(vmemmap_optimize_enabled)) return false; if (!hugetlb_vmemmap_optimizable(m->hstate)) return false; psize = huge_page_size(m->hstate); paddr = virt_to_phys(m); /* * Pre-HVO only works if the bootmem huge page * is aligned to the section size. */ section_size = (1UL << PA_SECTION_SHIFT); if (!IS_ALIGNED(paddr, section_size) || !IS_ALIGNED(psize, section_size)) return false; /* * The pre-HVO code does not deal with splitting PMDS, * so the bootmem page must be aligned to the number * of base pages that can be mapped with one vmemmap PMD. */ pmd_vmemmap_size = (PMD_SIZE / (sizeof(struct page))) << PAGE_SHIFT; if (!IS_ALIGNED(paddr, pmd_vmemmap_size) || !IS_ALIGNED(psize, pmd_vmemmap_size)) return false; return true; } /* * Initialize memmap section for a gigantic page, HVO-style. */ void __init hugetlb_vmemmap_init_early(int nid) { unsigned long psize, paddr, section_size; unsigned long ns, i, pnum, pfn, nr_pages; struct huge_bootmem_page *m = NULL; void *map; if (!READ_ONCE(vmemmap_optimize_enabled)) return; section_size = (1UL << PA_SECTION_SHIFT); list_for_each_entry(m, &huge_boot_pages[nid], list) { if (!vmemmap_should_optimize_bootmem_page(m)) continue; nr_pages = pages_per_huge_page(m->hstate); psize = nr_pages << PAGE_SHIFT; paddr = virt_to_phys(m); pfn = PHYS_PFN(paddr); map = pfn_to_page(pfn); pnum = pfn_to_section_nr(pfn); ns = psize / section_size; for (i = 0; i < ns; i++) { sparse_init_early_section(nid, map, pnum, SECTION_IS_VMEMMAP_PREINIT); map += section_map_size(); pnum++; } m->flags |= HUGE_BOOTMEM_HVO; } } static struct zone *pfn_to_zone(unsigned nid, unsigned long pfn) { struct zone *zone; enum zone_type zone_type; for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { zone = &NODE_DATA(nid)->node_zones[zone_type]; if (zone_spans_pfn(zone, pfn)) return zone; } return NULL; } void __init hugetlb_vmemmap_init_late(int nid) { struct huge_bootmem_page *m, *tm; unsigned long phys, nr_pages, start, end; unsigned long pfn, nr_mmap; struct zone *zone = NULL; struct hstate *h; void *map; if (!READ_ONCE(vmemmap_optimize_enabled)) return; list_for_each_entry_safe(m, tm, &huge_boot_pages[nid], list) { if (!(m->flags & HUGE_BOOTMEM_HVO)) continue; phys = virt_to_phys(m); h = m->hstate; pfn = PHYS_PFN(phys); nr_pages = pages_per_huge_page(h); map = pfn_to_page(pfn); start = (unsigned long)map; end = start + nr_pages * sizeof(struct page); if (!hugetlb_bootmem_page_zones_valid(nid, m)) { /* * Oops, the hugetlb page spans multiple zones. * Remove it from the list, and populate it normally. */ list_del(&m->list); vmemmap_populate(start, end, nid, NULL); nr_mmap = end - start; memmap_boot_pages_add(DIV_ROUND_UP(nr_mmap, PAGE_SIZE)); memblock_phys_free(phys, huge_page_size(h)); continue; } if (!zone || !zone_spans_pfn(zone, pfn)) zone = pfn_to_zone(nid, pfn); if (WARN_ON_ONCE(!zone)) continue; if (vmemmap_populate_hvo(start, end, huge_page_order(h), zone, HUGETLB_VMEMMAP_RESERVE_SIZE) < 0) { /* Fallback if HVO population fails */ vmemmap_populate(start, end, nid, NULL); nr_mmap = end - start; } else { m->flags |= HUGE_BOOTMEM_ZONES_VALID; nr_mmap = HUGETLB_VMEMMAP_RESERVE_SIZE; } memmap_boot_pages_add(DIV_ROUND_UP(nr_mmap, PAGE_SIZE)); } } #endif static const struct ctl_table hugetlb_vmemmap_sysctls[] = { { .procname = "hugetlb_optimize_vmemmap", .data = &vmemmap_optimize_enabled, .maxlen = sizeof(vmemmap_optimize_enabled), .mode = 0644, .proc_handler = proc_dobool, }, }; static int __init hugetlb_vmemmap_init(void) { const struct hstate *h; struct zone *zone; /* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */ BUILD_BUG_ON(__NR_USED_SUBPAGE > HUGETLB_VMEMMAP_RESERVE_PAGES); for_each_zone(zone) { for (int i = 0; i < NR_VMEMMAP_TAILS; i++) { struct page *tail, *p; unsigned int order; tail = zone->vmemmap_tails[i]; if (!tail) continue; order = i + VMEMMAP_TAIL_MIN_ORDER; p = page_to_virt(tail); for (int j = 0; j < PAGE_SIZE / sizeof(struct page); j++) init_compound_tail(p + j, NULL, order, zone); } } for_each_hstate(h) { if (hugetlb_vmemmap_optimizable(h)) { register_sysctl_init("vm", hugetlb_vmemmap_sysctls); break; } } return 0; } late_initcall(hugetlb_vmemmap_init);
1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 // SPDX-License-Identifier: GPL-2.0 OR MIT /* * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. * * This is based in part on Andrew Moon's poly1305-donna, which is in the * public domain. */ #include <crypto/internal/poly1305.h> #include <linux/export.h> #include <linux/kernel.h> #include <linux/unaligned.h> void poly1305_core_setkey(struct poly1305_core_key *key, const u8 raw_key[POLY1305_BLOCK_SIZE]) { u64 t0, t1; /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ t0 = get_unaligned_le64(&raw_key[0]); t1 = get_unaligned_le64(&raw_key[8]); key->key.r64[0] = t0 & 0xffc0fffffffULL; key->key.r64[1] = ((t0 >> 44) | (t1 << 20)) & 0xfffffc0ffffULL; key->key.r64[2] = ((t1 >> 24)) & 0x00ffffffc0fULL; /* s = 20*r */ key->precomputed_s.r64[0] = key->key.r64[1] * 20; key->precomputed_s.r64[1] = key->key.r64[2] * 20; } EXPORT_SYMBOL(poly1305_core_setkey); void poly1305_core_blocks(struct poly1305_state *state, const struct poly1305_core_key *key, const void *src, unsigned int nblocks, u32 hibit) { const u8 *input = src; u64 hibit64; u64 r0, r1, r2; u64 s1, s2; u64 h0, h1, h2; u64 c; u128 d0, d1, d2, d; if (!nblocks) return; hibit64 = ((u64)hibit) << 40; r0 = key->key.r64[0]; r1 = key->key.r64[1]; r2 = key->key.r64[2]; h0 = state->h64[0]; h1 = state->h64[1]; h2 = state->h64[2]; s1 = key->precomputed_s.r64[0]; s2 = key->precomputed_s.r64[1]; do { u64 t0, t1; /* h += m[i] */ t0 = get_unaligned_le64(&input[0]); t1 = get_unaligned_le64(&input[8]); h0 += t0 & 0xfffffffffffULL; h1 += ((t0 >> 44) | (t1 << 20)) & 0xfffffffffffULL; h2 += (((t1 >> 24)) & 0x3ffffffffffULL) | hibit64; /* h *= r */ d0 = (u128)h0 * r0; d = (u128)h1 * s2; d0 += d; d = (u128)h2 * s1; d0 += d; d1 = (u128)h0 * r1; d = (u128)h1 * r0; d1 += d; d = (u128)h2 * s2; d1 += d; d2 = (u128)h0 * r2; d = (u128)h1 * r1; d2 += d; d = (u128)h2 * r0; d2 += d; /* (partial) h %= p */ c = (u64)(d0 >> 44); h0 = (u64)d0 & 0xfffffffffffULL; d1 += c; c = (u64)(d1 >> 44); h1 = (u64)d1 & 0xfffffffffffULL; d2 += c; c = (u64)(d2 >> 42); h2 = (u64)d2 & 0x3ffffffffffULL; h0 += c * 5; c = h0 >> 44; h0 = h0 & 0xfffffffffffULL; h1 += c; input += POLY1305_BLOCK_SIZE; } while (--nblocks); state->h64[0] = h0; state->h64[1] = h1; state->h64[2] = h2; } EXPORT_SYMBOL(poly1305_core_blocks); void poly1305_core_emit(const struct poly1305_state *state, const u32 nonce[4], void *dst) { u8 *mac = dst; u64 h0, h1, h2, c; u64 g0, g1, g2; u64 t0, t1; /* fully carry h */ h0 = state->h64[0]; h1 = state->h64[1]; h2 = state->h64[2]; c = h1 >> 44; h1 &= 0xfffffffffffULL; h2 += c; c = h2 >> 42; h2 &= 0x3ffffffffffULL; h0 += c * 5; c = h0 >> 44; h0 &= 0xfffffffffffULL; h1 += c; c = h1 >> 44; h1 &= 0xfffffffffffULL; h2 += c; c = h2 >> 42; h2 &= 0x3ffffffffffULL; h0 += c * 5; c = h0 >> 44; h0 &= 0xfffffffffffULL; h1 += c; /* compute h + -p */ g0 = h0 + 5; c = g0 >> 44; g0 &= 0xfffffffffffULL; g1 = h1 + c; c = g1 >> 44; g1 &= 0xfffffffffffULL; g2 = h2 + c - (1ULL << 42); /* select h if h < p, or h + -p if h >= p */ c = (g2 >> ((sizeof(u64) * 8) - 1)) - 1; g0 &= c; g1 &= c; g2 &= c; c = ~c; h0 = (h0 & c) | g0; h1 = (h1 & c) | g1; h2 = (h2 & c) | g2; if (likely(nonce)) { /* h = (h + nonce) */ t0 = ((u64)nonce[1] << 32) | nonce[0]; t1 = ((u64)nonce[3] << 32) | nonce[2]; h0 += t0 & 0xfffffffffffULL; c = h0 >> 44; h0 &= 0xfffffffffffULL; h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffffULL) + c; c = h1 >> 44; h1 &= 0xfffffffffffULL; h2 += (((t1 >> 24)) & 0x3ffffffffffULL) + c; h2 &= 0x3ffffffffffULL; } /* mac = h % (2^128) */ h0 = h0 | (h1 << 44); h1 = (h1 >> 20) | (h2 << 24); put_unaligned_le64(h0, &mac[0]); put_unaligned_le64(h1, &mac[8]); } EXPORT_SYMBOL(poly1305_core_emit);
4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM vmscan #if !defined(_TRACE_VMSCAN_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_VMSCAN_H #include <linux/types.h> #include <linux/tracepoint.h> #include <linux/mm.h> #include <linux/memcontrol.h> #include <trace/events/mmflags.h> #define RECLAIM_WB_ANON 0x0001u #define RECLAIM_WB_FILE 0x0002u #define RECLAIM_WB_MIXED 0x0010u #define RECLAIM_WB_SYNC 0x0004u /* Unused, all reclaim async */ #define RECLAIM_WB_ASYNC 0x0008u #define RECLAIM_WB_LRU (RECLAIM_WB_ANON|RECLAIM_WB_FILE) #define show_reclaim_flags(flags) \ (flags) ? __print_flags(flags, "|", \ {RECLAIM_WB_ANON, "RECLAIM_WB_ANON"}, \ {RECLAIM_WB_FILE, "RECLAIM_WB_FILE"}, \ {RECLAIM_WB_MIXED, "RECLAIM_WB_MIXED"}, \ {RECLAIM_WB_SYNC, "RECLAIM_WB_SYNC"}, \ {RECLAIM_WB_ASYNC, "RECLAIM_WB_ASYNC"} \ ) : "RECLAIM_WB_NONE" #define _VMSCAN_THROTTLE_WRITEBACK (1 << VMSCAN_THROTTLE_WRITEBACK) #define _VMSCAN_THROTTLE_ISOLATED (1 << VMSCAN_THROTTLE_ISOLATED) #define _VMSCAN_THROTTLE_NOPROGRESS (1 << VMSCAN_THROTTLE_NOPROGRESS) #define _VMSCAN_THROTTLE_CONGESTED (1 << VMSCAN_THROTTLE_CONGESTED) #define show_throttle_flags(flags) \ (flags) ? __print_flags(flags, "|", \ {_VMSCAN_THROTTLE_WRITEBACK, "VMSCAN_THROTTLE_WRITEBACK"}, \ {_VMSCAN_THROTTLE_ISOLATED, "VMSCAN_THROTTLE_ISOLATED"}, \ {_VMSCAN_THROTTLE_NOPROGRESS, "VMSCAN_THROTTLE_NOPROGRESS"}, \ {_VMSCAN_THROTTLE_CONGESTED, "VMSCAN_THROTTLE_CONGESTED"} \ ) : "VMSCAN_THROTTLE_NONE" TRACE_DEFINE_ENUM(KSWAPD_CLEAR_HOPELESS_OTHER); TRACE_DEFINE_ENUM(KSWAPD_CLEAR_HOPELESS_KSWAPD); TRACE_DEFINE_ENUM(KSWAPD_CLEAR_HOPELESS_DIRECT); TRACE_DEFINE_ENUM(KSWAPD_CLEAR_HOPELESS_PCP); #define kswapd_clear_hopeless_reason_ops \ {KSWAPD_CLEAR_HOPELESS_KSWAPD, "KSWAPD"}, \ {KSWAPD_CLEAR_HOPELESS_DIRECT, "DIRECT"}, \ {KSWAPD_CLEAR_HOPELESS_PCP, "PCP"}, \ {KSWAPD_CLEAR_HOPELESS_OTHER, "OTHER"} #define trace_reclaim_flags(file) ( \ (file ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \ (RECLAIM_WB_ASYNC) \ ) TRACE_EVENT(mm_vmscan_kswapd_sleep, TP_PROTO(int nid), TP_ARGS(nid), TP_STRUCT__entry( __field( int, nid ) ), TP_fast_assign( __entry->nid = nid; ), TP_printk("nid=%d", __entry->nid) ); TRACE_EVENT(mm_vmscan_kswapd_wake, TP_PROTO(int nid, int zid, int order), TP_ARGS(nid, zid, order), TP_STRUCT__entry( __field( int, nid ) __field( int, zid ) __field( int, order ) ), TP_fast_assign( __entry->nid = nid; __entry->zid = zid; __entry->order = order; ), TP_printk("nid=%d order=%d", __entry->nid, __entry->order) ); TRACE_EVENT(mm_vmscan_wakeup_kswapd, TP_PROTO(int nid, int zid, int order, gfp_t gfp_flags), TP_ARGS(nid, zid, order, gfp_flags), TP_STRUCT__entry( __field( int, nid ) __field( int, zid ) __field( int, order ) __field( unsigned long, gfp_flags ) ), TP_fast_assign( __entry->nid = nid; __entry->zid = zid; __entry->order = order; __entry->gfp_flags = (__force unsigned long)gfp_flags; ), TP_printk("nid=%d order=%d gfp_flags=%s", __entry->nid, __entry->order, show_gfp_flags(__entry->gfp_flags)) ); DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_begin_template, TP_PROTO(gfp_t gfp_flags, int order, struct mem_cgroup *memcg), TP_ARGS(gfp_flags, order, memcg), TP_STRUCT__entry( __field( unsigned long, gfp_flags ) __field( u64, memcg_id ) __field( int, order ) ), TP_fast_assign( __entry->gfp_flags = (__force unsigned long)gfp_flags; __entry->order = order; __entry->memcg_id = mem_cgroup_id(memcg); ), TP_printk("order=%d gfp_flags=%s pid=%d memcg_id=%llu %s", __entry->order, show_gfp_flags(__entry->gfp_flags), __entry->ent.pid, __entry->memcg_id, __event_in_irq() ? "(in-irq)" : "") ); DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_direct_reclaim_begin, TP_PROTO(gfp_t gfp_flags, int order, struct mem_cgroup *memcg), TP_ARGS(gfp_flags, order, memcg) ); #ifdef CONFIG_MEMCG DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_memcg_reclaim_begin, TP_PROTO(gfp_t gfp_flags, int order, struct mem_cgroup *memcg), TP_ARGS(gfp_flags, order, memcg) ); DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_memcg_softlimit_reclaim_begin, TP_PROTO(gfp_t gfp_flags, int order, struct mem_cgroup *memcg), TP_ARGS(gfp_flags, order, memcg) ); #endif /* CONFIG_MEMCG */ DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_end_template, TP_PROTO(unsigned long nr_reclaimed, struct mem_cgroup *memcg), TP_ARGS(nr_reclaimed, memcg), TP_STRUCT__entry( __field( unsigned long, nr_reclaimed ) __field( u64, memcg_id ) ), TP_fast_assign( __entry->nr_reclaimed = nr_reclaimed; __entry->memcg_id = mem_cgroup_id(memcg); ), TP_printk("nr_reclaimed=%lu pid=%d memcg_id=%llu %s", __entry->nr_reclaimed, __entry->ent.pid, __entry->memcg_id, __event_in_irq() ? "(in-irq)" : "") ); DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_direct_reclaim_end, TP_PROTO(unsigned long nr_reclaimed, struct mem_cgroup *memcg), TP_ARGS(nr_reclaimed, memcg) ); #ifdef CONFIG_MEMCG DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_memcg_reclaim_end, TP_PROTO(unsigned long nr_reclaimed, struct mem_cgroup *memcg), TP_ARGS(nr_reclaimed, memcg) ); DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_memcg_softlimit_reclaim_end, TP_PROTO(unsigned long nr_reclaimed, struct mem_cgroup *memcg), TP_ARGS(nr_reclaimed, memcg) ); #endif /* CONFIG_MEMCG */ TRACE_EVENT(mm_shrink_slab_start, TP_PROTO(struct shrinker *shr, struct shrink_control *sc, long nr_objects_to_shrink, unsigned long cache_items, unsigned long long delta, unsigned long total_scan, int priority, struct mem_cgroup *memcg), TP_ARGS(shr, sc, nr_objects_to_shrink, cache_items, delta, total_scan, priority, memcg), TP_STRUCT__entry( __field(struct shrinker *, shr) __field(void *, shrink) __field(long, nr_objects_to_shrink) __field(unsigned long, gfp_flags) __field(unsigned long, cache_items) __field(unsigned long long, delta) __field(unsigned long, total_scan) __field(int, priority) __field(int, nid) __field(u64, memcg_id) ), TP_fast_assign( __entry->shr = shr; __entry->shrink = shr->scan_objects; __entry->nr_objects_to_shrink = nr_objects_to_shrink; __entry->gfp_flags = (__force unsigned long)sc->gfp_mask; __entry->cache_items = cache_items; __entry->delta = delta; __entry->total_scan = total_scan; __entry->priority = priority; __entry->nid = sc->nid; __entry->memcg_id = mem_cgroup_id(memcg); ), TP_printk("%pS %p: nid: %d pid: %d memcg_id: %llu objects to shrink %ld gfp_flags %s cache items %ld delta %lld total_scan %ld priority %d %s", __entry->shrink, __entry->shr, __entry->nid, __entry->ent.pid, __entry->memcg_id, __entry->nr_objects_to_shrink, show_gfp_flags(__entry->gfp_flags), __entry->cache_items, __entry->delta, __entry->total_scan, __entry->priority, __event_in_irq() ? "(in-irq)" : "") ); TRACE_EVENT(mm_shrink_slab_end, TP_PROTO(struct shrinker *shr, int nid, int shrinker_retval, long unused_scan_cnt, long new_scan_cnt, long total_scan, struct mem_cgroup *memcg), TP_ARGS(shr, nid, shrinker_retval, unused_scan_cnt, new_scan_cnt, total_scan, memcg), TP_STRUCT__entry( __field(struct shrinker *, shr) __field(void *, shrink) __field(long, unused_scan) __field(long, new_scan) __field(long, total_scan) __field(int, nid) __field(int, retval) __field(u64, memcg_id) ), TP_fast_assign( __entry->shr = shr; __entry->shrink = shr->scan_objects; __entry->unused_scan = unused_scan_cnt; __entry->new_scan = new_scan_cnt; __entry->total_scan = total_scan; __entry->nid = nid; __entry->retval = shrinker_retval; __entry->memcg_id = mem_cgroup_id(memcg); ), TP_printk("%pS %p: nid: %d pid: %d memcg_id: %llu unused scan count %ld new scan count %ld total_scan %ld last shrinker return val %d %s", __entry->shrink, __entry->shr, __entry->nid, __entry->ent.pid, __entry->memcg_id, __entry->unused_scan, __entry->new_scan, __entry->total_scan, __entry->retval, __event_in_irq() ? "(in-irq)" : "") ); TRACE_EVENT(mm_vmscan_lru_isolate, TP_PROTO(int highest_zoneidx, int order, unsigned long nr_requested, unsigned long nr_scanned, unsigned long nr_skipped, unsigned long nr_taken, int lru), TP_ARGS(highest_zoneidx, order, nr_requested, nr_scanned, nr_skipped, nr_taken, lru), TP_STRUCT__entry( __field(int, highest_zoneidx) __field(int, order) __field(unsigned long, nr_requested) __field(unsigned long, nr_scanned) __field(unsigned long, nr_skipped) __field(unsigned long, nr_taken) __field(int, lru) ), TP_fast_assign( __entry->highest_zoneidx = highest_zoneidx; __entry->order = order; __entry->nr_requested = nr_requested; __entry->nr_scanned = nr_scanned; __entry->nr_skipped = nr_skipped; __entry->nr_taken = nr_taken; __entry->lru = lru; ), /* * classzone is previous name of the highest_zoneidx. * Reason not to change it is the ABI requirement of the tracepoint. */ TP_printk("classzone=%d order=%d nr_requested=%lu nr_scanned=%lu nr_skipped=%lu nr_taken=%lu lru=%s", __entry->highest_zoneidx, __entry->order, __entry->nr_requested, __entry->nr_scanned, __entry->nr_skipped, __entry->nr_taken, __print_symbolic(__entry->lru, LRU_NAMES)) ); TRACE_EVENT(mm_vmscan_write_folio, TP_PROTO(struct folio *folio), TP_ARGS(folio), TP_STRUCT__entry( __field(unsigned long, pfn) __field(int, reclaim_flags) ), TP_fast_assign( __entry->pfn = folio_pfn(folio); __entry->reclaim_flags = trace_reclaim_flags( folio_is_file_lru(folio)); ), TP_printk("page=%p pfn=0x%lx flags=%s", pfn_to_page(__entry->pfn), __entry->pfn, show_reclaim_flags(__entry->reclaim_flags)) ); TRACE_EVENT(mm_vmscan_reclaim_pages, TP_PROTO(int nid, unsigned long nr_scanned, unsigned long nr_reclaimed, struct reclaim_stat *stat), TP_ARGS(nid, nr_scanned, nr_reclaimed, stat), TP_STRUCT__entry( __field(int, nid) __field(unsigned long, nr_scanned) __field(unsigned long, nr_reclaimed) __field(unsigned long, nr_dirty) __field(unsigned long, nr_writeback) __field(unsigned long, nr_congested) __field(unsigned long, nr_immediate) __field(unsigned int, nr_activate0) __field(unsigned int, nr_activate1) __field(unsigned long, nr_ref_keep) __field(unsigned long, nr_unmap_fail) ), TP_fast_assign( __entry->nid = nid; __entry->nr_scanned = nr_scanned; __entry->nr_reclaimed = nr_reclaimed; __entry->nr_dirty = stat->nr_dirty; __entry->nr_writeback = stat->nr_writeback; __entry->nr_congested = stat->nr_congested; __entry->nr_immediate = stat->nr_immediate; __entry->nr_activate0 = stat->nr_activate[0]; __entry->nr_activate1 = stat->nr_activate[1]; __entry->nr_ref_keep = stat->nr_ref_keep; __entry->nr_unmap_fail = stat->nr_unmap_fail; ), TP_printk("nid=%d nr_scanned=%ld nr_reclaimed=%ld nr_dirty=%ld nr_writeback=%ld nr_congested=%ld nr_immediate=%ld nr_activate_anon=%d nr_activate_file=%d nr_ref_keep=%ld nr_unmap_fail=%ld", __entry->nid, __entry->nr_scanned, __entry->nr_reclaimed, __entry->nr_dirty, __entry->nr_writeback, __entry->nr_congested, __entry->nr_immediate, __entry->nr_activate0, __entry->nr_activate1, __entry->nr_ref_keep, __entry->nr_unmap_fail) ); TRACE_EVENT(mm_vmscan_lru_shrink_inactive, TP_PROTO(int nid, unsigned long nr_scanned, unsigned long nr_reclaimed, struct reclaim_stat *stat, int priority, int file), TP_ARGS(nid, nr_scanned, nr_reclaimed, stat, priority, file), TP_STRUCT__entry( __field(int, nid) __field(unsigned long, nr_scanned) __field(unsigned long, nr_reclaimed) __field(unsigned long, nr_dirty) __field(unsigned long, nr_writeback) __field(unsigned long, nr_congested) __field(unsigned long, nr_immediate) __field(unsigned int, nr_activate0) __field(unsigned int, nr_activate1) __field(unsigned long, nr_ref_keep) __field(unsigned long, nr_unmap_fail) __field(int, priority) __field(int, reclaim_flags) ), TP_fast_assign( __entry->nid = nid; __entry->nr_scanned = nr_scanned; __entry->nr_reclaimed = nr_reclaimed; __entry->nr_dirty = stat->nr_dirty; __entry->nr_writeback = stat->nr_writeback; __entry->nr_congested = stat->nr_congested; __entry->nr_immediate = stat->nr_immediate; __entry->nr_activate0 = stat->nr_activate[0]; __entry->nr_activate1 = stat->nr_activate[1]; __entry->nr_ref_keep = stat->nr_ref_keep; __entry->nr_unmap_fail = stat->nr_unmap_fail; __entry->priority = priority; __entry->reclaim_flags = trace_reclaim_flags(file); ), TP_printk("nid=%d nr_scanned=%ld nr_reclaimed=%ld nr_dirty=%ld nr_writeback=%ld nr_congested=%ld nr_immediate=%ld nr_activate_anon=%d nr_activate_file=%d nr_ref_keep=%ld nr_unmap_fail=%ld priority=%d flags=%s", __entry->nid, __entry->nr_scanned, __entry->nr_reclaimed, __entry->nr_dirty, __entry->nr_writeback, __entry->nr_congested, __entry->nr_immediate, __entry->nr_activate0, __entry->nr_activate1, __entry->nr_ref_keep, __entry->nr_unmap_fail, __entry->priority, show_reclaim_flags(__entry->reclaim_flags)) ); TRACE_EVENT(mm_vmscan_lru_shrink_active, TP_PROTO(int nid, unsigned long nr_taken, unsigned long nr_active, unsigned long nr_deactivated, unsigned long nr_referenced, int priority, int file), TP_ARGS(nid, nr_taken, nr_active, nr_deactivated, nr_referenced, priority, file), TP_STRUCT__entry( __field(int, nid) __field(unsigned long, nr_taken) __field(unsigned long, nr_active) __field(unsigned long, nr_deactivated) __field(unsigned long, nr_referenced) __field(int, priority) __field(int, reclaim_flags) ), TP_fast_assign( __entry->nid = nid; __entry->nr_taken = nr_taken; __entry->nr_active = nr_active; __entry->nr_deactivated = nr_deactivated; __entry->nr_referenced = nr_referenced; __entry->priority = priority; __entry->reclaim_flags = trace_reclaim_flags(file); ), TP_printk("nid=%d nr_taken=%ld nr_active=%ld nr_deactivated=%ld nr_referenced=%ld priority=%d flags=%s", __entry->nid, __entry->nr_taken, __entry->nr_active, __entry->nr_deactivated, __entry->nr_referenced, __entry->priority, show_reclaim_flags(__entry->reclaim_flags)) ); TRACE_EVENT(mm_vmscan_node_reclaim_begin, TP_PROTO(int nid, int order, gfp_t gfp_flags), TP_ARGS(nid, order, gfp_flags), TP_STRUCT__entry( __field(int, nid) __field(int, order) __field(unsigned long, gfp_flags) ), TP_fast_assign( __entry->nid = nid; __entry->order = order; __entry->gfp_flags = (__force unsigned long)gfp_flags; ), TP_printk("nid=%d order=%d gfp_flags=%s", __entry->nid, __entry->order, show_gfp_flags(__entry->gfp_flags)) ); DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_node_reclaim_end, TP_PROTO(unsigned long nr_reclaimed, struct mem_cgroup *memcg), TP_ARGS(nr_reclaimed, memcg) ); TRACE_EVENT(mm_vmscan_throttled, TP_PROTO(int nid, int usec_timeout, int usec_delayed, int reason), TP_ARGS(nid, usec_timeout, usec_delayed, reason), TP_STRUCT__entry( __field(int, nid) __field(int, usec_timeout) __field(int, usec_delayed) __field(int, reason) ), TP_fast_assign( __entry->nid = nid; __entry->usec_timeout = usec_timeout; __entry->usec_delayed = usec_delayed; __entry->reason = 1U << reason; ), TP_printk("nid=%d usec_timeout=%d usect_delayed=%d reason=%s", __entry->nid, __entry->usec_timeout, __entry->usec_delayed, show_throttle_flags(__entry->reason)) ); TRACE_EVENT(mm_vmscan_kswapd_reclaim_fail, TP_PROTO(int nid, int failures), TP_ARGS(nid, failures), TP_STRUCT__entry( __field(int, nid) __field(int, failures) ), TP_fast_assign( __entry->nid = nid; __entry->failures = failures; ), TP_printk("nid=%d failures=%d", __entry->nid, __entry->failures) ); TRACE_EVENT(mm_vmscan_kswapd_clear_hopeless, TP_PROTO(int nid, int reason), TP_ARGS(nid, reason), TP_STRUCT__entry( __field(int, nid) __field(int, reason) ), TP_fast_assign( __entry->nid = nid; __entry->reason = reason; ), TP_printk("nid=%d reason=%s", __entry->nid, __print_symbolic(__entry->reason, kswapd_clear_hopeless_reason_ops)) ); #endif /* _TRACE_VMSCAN_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _MM_SWAP_H #define _MM_SWAP_H #include <linux/atomic.h> /* for atomic_long_t */ struct mempolicy; struct swap_iocb; extern int page_cluster; #ifdef CONFIG_THP_SWAP #define SWAPFILE_CLUSTER HPAGE_PMD_NR #define swap_entry_order(order) (order) #else #define SWAPFILE_CLUSTER 256 #define swap_entry_order(order) 0 #endif extern struct swap_info_struct *swap_info[]; /* * We use this to track usage of a cluster. A cluster is a block of swap disk * space with SWAPFILE_CLUSTER pages long and naturally aligns in disk. All * free clusters are organized into a list. We fetch an entry from the list to * get a free cluster. * * The flags field determines if a cluster is free. This is * protected by cluster lock. */ struct swap_cluster_info { spinlock_t lock; /* * Protect swap_cluster_info fields * other than list, and swap_info_struct->swap_map * elements corresponding to the swap cluster. */ u16 count; u8 flags; u8 order; atomic_long_t __rcu *table; /* Swap table entries, see mm/swap_table.h */ unsigned int *extend_table; /* For large swap count, protected by ci->lock */ struct list_head list; }; /* All on-list cluster must have a non-zero flag. */ enum swap_cluster_flags { CLUSTER_FLAG_NONE = 0, /* For temporary off-list cluster */ CLUSTER_FLAG_FREE, CLUSTER_FLAG_NONFULL, CLUSTER_FLAG_FRAG, /* Clusters with flags above are allocatable */ CLUSTER_FLAG_USABLE = CLUSTER_FLAG_FRAG, CLUSTER_FLAG_FULL, CLUSTER_FLAG_DISCARD, CLUSTER_FLAG_MAX, }; #ifdef CONFIG_SWAP #include <linux/swapops.h> /* for swp_offset */ #include <linux/blk_types.h> /* for bio_end_io_t */ static inline unsigned int swp_cluster_offset(swp_entry_t entry) { return swp_offset(entry) % SWAPFILE_CLUSTER; } /* * Callers of all helpers below must ensure the entry, type, or offset is * valid, and protect the swap device with reference count or locks. */ static inline struct swap_info_struct *__swap_type_to_info(int type) { struct swap_info_struct *si; si = READ_ONCE(swap_info[type]); /* rcu_dereference() */ VM_WARN_ON_ONCE(percpu_ref_is_zero(&si->users)); /* race with swapoff */ return si; } static inline struct swap_info_struct *__swap_entry_to_info(swp_entry_t entry) { return __swap_type_to_info(swp_type(entry)); } static inline struct swap_cluster_info *__swap_offset_to_cluster( struct swap_info_struct *si, pgoff_t offset) { VM_WARN_ON_ONCE(percpu_ref_is_zero(&si->users)); /* race with swapoff */ VM_WARN_ON_ONCE(offset >= roundup(si->max, SWAPFILE_CLUSTER)); return &si->cluster_info[offset / SWAPFILE_CLUSTER]; } static inline struct swap_cluster_info *__swap_entry_to_cluster(swp_entry_t entry) { return __swap_offset_to_cluster(__swap_entry_to_info(entry), swp_offset(entry)); } static __always_inline struct swap_cluster_info *__swap_cluster_lock( struct swap_info_struct *si, unsigned long offset, bool irq) { struct swap_cluster_info *ci = __swap_offset_to_cluster(si, offset); /* * Nothing modifies swap cache in an IRQ context. All access to * swap cache is wrapped by swap_cache_* helpers, and swap cache * writeback is handled outside of IRQs. Swapin or swapout never * occurs in IRQ, and neither does in-place split or replace. * * Besides, modifying swap cache requires synchronization with * swap_map, which was never IRQ safe. */ VM_WARN_ON_ONCE(!in_task()); VM_WARN_ON_ONCE(percpu_ref_is_zero(&si->users)); /* race with swapoff */ if (irq) spin_lock_irq(&ci->lock); else spin_lock(&ci->lock); return ci; } /** * swap_cluster_lock - Lock and return the swap cluster of given offset. * @si: swap device the cluster belongs to. * @offset: the swap entry offset, pointing to a valid slot. * * Context: The caller must ensure the offset is in the valid range and * protect the swap device with reference count or locks. */ static inline struct swap_cluster_info *swap_cluster_lock( struct swap_info_struct *si, unsigned long offset) { return __swap_cluster_lock(si, offset, false); } static inline struct swap_cluster_info *__swap_cluster_get_and_lock( const struct folio *folio, bool irq) { VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio); return __swap_cluster_lock(__swap_entry_to_info(folio->swap), swp_offset(folio->swap), irq); } /* * swap_cluster_get_and_lock - Locks the cluster that holds a folio's entries. * @folio: The folio. * * This locks and returns the swap cluster that contains a folio's swap * entries. The swap entries of a folio are always in one single cluster. * The folio has to be locked so its swap entries won't change and the * cluster won't be freed. * * Context: Caller must ensure the folio is locked and in the swap cache. * Return: Pointer to the swap cluster. */ static inline struct swap_cluster_info *swap_cluster_get_and_lock( const struct folio *folio) { return __swap_cluster_get_and_lock(folio, false); } /* * swap_cluster_get_and_lock_irq - Locks the cluster that holds a folio's entries. * @folio: The folio. * * Same as swap_cluster_get_and_lock but also disable IRQ. * * Context: Caller must ensure the folio is locked and in the swap cache. * Return: Pointer to the swap cluster. */ static inline struct swap_cluster_info *swap_cluster_get_and_lock_irq( const struct folio *folio) { return __swap_cluster_get_and_lock(folio, true); } static inline void swap_cluster_unlock(struct swap_cluster_info *ci) { spin_unlock(&ci->lock); } static inline void swap_cluster_unlock_irq(struct swap_cluster_info *ci) { spin_unlock_irq(&ci->lock); } extern int swap_retry_table_alloc(swp_entry_t entry, gfp_t gfp); /* * Below are the core routines for doing swap for a folio. * All helpers requires the folio to be locked, and a locked folio * in the swap cache pins the swap entries / slots allocated to the * folio, swap relies heavily on the swap cache and folio lock for * synchronization. * * folio_alloc_swap(): the entry point for a folio to be swapped * out. It allocates swap slots and pins the slots with swap cache. * The slots start with a swap count of zero. The slots are pinned * by swap cache reference which doesn't contribute to swap count. * * folio_dup_swap(): increases the swap count of a folio, usually * during it gets unmapped and a swap entry is installed to replace * it (e.g., swap entry in page table). A swap slot with swap * count == 0 can only be increased by this helper. * * folio_put_swap(): does the opposite thing of folio_dup_swap(). */ int folio_alloc_swap(struct folio *folio); int folio_dup_swap(struct folio *folio, struct page *subpage); void folio_put_swap(struct folio *folio, struct page *subpage); /* For internal use */ extern void __swap_cluster_free_entries(struct swap_info_struct *si, struct swap_cluster_info *ci, unsigned int ci_off, unsigned int nr_pages); /* linux/mm/page_io.c */ int sio_pool_init(void); struct swap_iocb; void swap_read_folio(struct folio *folio, struct swap_iocb **plug); void __swap_read_unplug(struct swap_iocb *plug); static inline void swap_read_unplug(struct swap_iocb *plug) { if (unlikely(plug)) __swap_read_unplug(plug); } void swap_write_unplug(struct swap_iocb *sio); int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug); void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug); /* linux/mm/swap_state.c */ extern struct address_space swap_space __read_mostly; static inline struct address_space *swap_address_space(swp_entry_t entry) { return &swap_space; } /* * Return the swap device position of the swap entry. */ static inline loff_t swap_dev_pos(swp_entry_t entry) { return ((loff_t)swp_offset(entry)) << PAGE_SHIFT; } /** * folio_matches_swap_entry - Check if a folio matches a given swap entry. * @folio: The folio. * @entry: The swap entry to check against. * * Context: The caller should have the folio locked to ensure it's stable * and nothing will move it in or out of the swap cache. * Return: true or false. */ static inline bool folio_matches_swap_entry(const struct folio *folio, swp_entry_t entry) { swp_entry_t folio_entry = folio->swap; long nr_pages = folio_nr_pages(folio); VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); if (!folio_test_swapcache(folio)) return false; VM_WARN_ON_ONCE_FOLIO(!IS_ALIGNED(folio_entry.val, nr_pages), folio); return folio_entry.val == round_down(entry.val, nr_pages); } /* * All swap cache helpers below require the caller to ensure the swap entries * used are valid and stabilize the device by any of the following ways: * - Hold a reference by get_swap_device(): this ensures a single entry is * valid and increases the swap device's refcount. * - Locking a folio in the swap cache: this ensures the folio's swap entries * are valid and pinned, also implies reference to the device. * - Locking anything referencing the swap entry: e.g. PTL that protects * swap entries in the page table, similar to locking swap cache folio. * - See the comment of get_swap_device() for more complex usage. */ bool swap_cache_has_folio(swp_entry_t entry); struct folio *swap_cache_get_folio(swp_entry_t entry); void *swap_cache_get_shadow(swp_entry_t entry); void swap_cache_del_folio(struct folio *folio); struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_flags, struct mempolicy *mpol, pgoff_t ilx, bool *alloced); /* Below helpers require the caller to lock and pass in the swap cluster. */ void __swap_cache_add_folio(struct swap_cluster_info *ci, struct folio *folio, swp_entry_t entry); void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio, swp_entry_t entry, void *shadow); void __swap_cache_replace_folio(struct swap_cluster_info *ci, struct folio *old, struct folio *new); void show_swap_cache_info(void); void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr); struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr, struct swap_iocb **plug); struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t flag, struct mempolicy *mpol, pgoff_t ilx); struct folio *swapin_readahead(swp_entry_t entry, gfp_t flag, struct vm_fault *vmf); struct folio *swapin_folio(swp_entry_t entry, struct folio *folio); void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma, unsigned long addr); static inline unsigned int folio_swap_flags(struct folio *folio) { return __swap_entry_to_info(folio->swap)->flags; } /* * Return the count of contiguous swap entries that share the same * zeromap status as the starting entry. If is_zeromap is not NULL, * it will return the zeromap status of the starting entry. */ static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr, bool *is_zeromap) { struct swap_info_struct *sis = __swap_entry_to_info(entry); unsigned long start = swp_offset(entry); unsigned long end = start + max_nr; bool first_bit; first_bit = test_bit(start, sis->zeromap); if (is_zeromap) *is_zeromap = first_bit; if (max_nr <= 1) return max_nr; if (first_bit) return find_next_zero_bit(sis->zeromap, end, start) - start; else return find_next_bit(sis->zeromap, end, start) - start; } static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) { int i; /* * While allocating a large folio and doing mTHP swapin, we need to * ensure all entries are not cached, otherwise, the mTHP folio will * be in conflict with the folio in swap cache. */ for (i = 0; i < max_nr; i++) { if (swap_cache_has_folio(entry)) return i; entry.val++; } return i; } #else /* CONFIG_SWAP */ struct swap_iocb; static inline struct swap_cluster_info *swap_cluster_lock( struct swap_info_struct *si, pgoff_t offset, bool irq) { return NULL; } static inline struct swap_cluster_info *swap_cluster_get_and_lock( struct folio *folio) { return NULL; } static inline struct swap_cluster_info *swap_cluster_get_and_lock_irq( struct folio *folio) { return NULL; } static inline void swap_cluster_unlock(struct swap_cluster_info *ci) { } static inline void swap_cluster_unlock_irq(struct swap_cluster_info *ci) { } static inline struct swap_info_struct *__swap_entry_to_info(swp_entry_t entry) { return NULL; } static inline int folio_alloc_swap(struct folio *folio) { return -EINVAL; } static inline int folio_dup_swap(struct folio *folio, struct page *page) { return -EINVAL; } static inline void folio_put_swap(struct folio *folio, struct page *page) { } static inline void swap_read_folio(struct folio *folio, struct swap_iocb **plug) { } static inline void swap_write_unplug(struct swap_iocb *sio) { } static inline struct address_space *swap_address_space(swp_entry_t entry) { return NULL; } static inline bool folio_matches_swap_entry(const struct folio *folio, swp_entry_t entry) { return false; } static inline void show_swap_cache_info(void) { } static inline struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, struct mempolicy *mpol, pgoff_t ilx) { return NULL; } static inline struct folio *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask, struct vm_fault *vmf) { return NULL; } static inline struct folio *swapin_folio(swp_entry_t entry, struct folio *folio) { return NULL; } static inline void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma, unsigned long addr) { } static inline int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug) { return 0; } static inline int swap_retry_table_alloc(swp_entry_t entry, gfp_t gfp) { return -EINVAL; } static inline bool swap_cache_has_folio(swp_entry_t entry) { return false; } static inline struct folio *swap_cache_get_folio(swp_entry_t entry) { return NULL; } static inline void *swap_cache_get_shadow(swp_entry_t entry) { return NULL; } static inline void swap_cache_del_folio(struct folio *folio) { } static inline void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio, swp_entry_t entry, void *shadow) { } static inline void __swap_cache_replace_folio(struct swap_cluster_info *ci, struct folio *old, struct folio *new) { } static inline unsigned int folio_swap_flags(struct folio *folio) { return 0; } static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr, bool *has_zeromap) { return 0; } static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) { return 0; } #endif /* CONFIG_SWAP */ #endif /* _MM_SWAP_H */
155 95 95 95 95 94 95 122 122 120 121 3 4 118 118 119 3 4 117 120 181 182 181 113 182 1 170 156 182 169 156 156 111 155 111 5 155 114 6 1 82 158 170 156 182 182 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 // SPDX-License-Identifier: GPL-2.0-or-later /* * io.c * * Buffer cache handling * * Copyright (C) 2002, 2004 Oracle. All rights reserved. */ #include <linux/fs.h> #include <linux/types.h> #include <linux/highmem.h> #include <linux/bio.h> #include <cluster/masklog.h> #include "ocfs2.h" #include "alloc.h" #include "inode.h" #include "journal.h" #include "uptodate.h" #include "buffer_head_io.h" #include "ocfs2_trace.h" /* * Bits on bh->b_state used by ocfs2. * * These MUST be after the JBD2 bits. Hence, we use BH_JBDPrivateStart. */ enum ocfs2_state_bits { BH_NeedsValidate = BH_JBDPrivateStart, }; /* Expand the magic b_state functions */ BUFFER_FNS(NeedsValidate, needs_validate); int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, struct ocfs2_caching_info *ci) { int ret = 0; trace_ocfs2_write_block((unsigned long long)bh->b_blocknr, ci); BUG_ON(bh->b_blocknr < OCFS2_SUPER_BLOCK_BLKNO); BUG_ON(buffer_jbd(bh)); /* No need to check for a soft readonly file system here. non * journalled writes are only ever done on system files which * can get modified during recovery even if read-only. */ if (ocfs2_is_hard_readonly(osb)) { ret = -EROFS; mlog_errno(ret); goto out; } ocfs2_metadata_cache_io_lock(ci); lock_buffer(bh); set_buffer_uptodate(bh); /* remove from dirty list before I/O. */ clear_buffer_dirty(bh); get_bh(bh); /* for end_buffer_write_sync() */ bh->b_end_io = end_buffer_write_sync; submit_bh(REQ_OP_WRITE, bh); wait_on_buffer(bh); if (buffer_uptodate(bh)) { ocfs2_set_buffer_uptodate(ci, bh); } else { /* We don't need to remove the clustered uptodate * information for this bh as it's not marked locally * uptodate. */ ret = -EIO; mlog_errno(ret); } ocfs2_metadata_cache_io_unlock(ci); out: return ret; } /* Caller must provide a bhs[] with all NULL or non-NULL entries, so it * will be easier to handle read failure. */ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block, unsigned int nr, struct buffer_head *bhs[]) { int status = 0; unsigned int i; struct buffer_head *bh; int new_bh = 0; trace_ocfs2_read_blocks_sync((unsigned long long)block, nr); if (!nr) goto bail; /* Don't put buffer head and re-assign it to NULL if it is allocated * outside since the caller can't be aware of this alternation! */ new_bh = (bhs[0] == NULL); for (i = 0 ; i < nr ; i++) { if (bhs[i] == NULL) { bhs[i] = sb_getblk(osb->sb, block++); if (bhs[i] == NULL) { status = -ENOMEM; mlog_errno(status); break; } } bh = bhs[i]; if (buffer_jbd(bh)) { trace_ocfs2_read_blocks_sync_jbd( (unsigned long long)bh->b_blocknr); continue; } if (buffer_dirty(bh)) { /* This should probably be a BUG, or * at least return an error. */ mlog(ML_ERROR, "trying to sync read a dirty " "buffer! (blocknr = %llu), skipping\n", (unsigned long long)bh->b_blocknr); continue; } lock_buffer(bh); if (buffer_jbd(bh)) { #ifdef CATCH_BH_JBD_RACES mlog(ML_ERROR, "block %llu had the JBD bit set " "while I was in lock_buffer!", (unsigned long long)bh->b_blocknr); BUG(); #else unlock_buffer(bh); continue; #endif } get_bh(bh); /* for end_buffer_read_sync() */ bh->b_end_io = end_buffer_read_sync; submit_bh(REQ_OP_READ, bh); } read_failure: for (i = nr; i > 0; i--) { bh = bhs[i - 1]; if (unlikely(status)) { if (new_bh && bh) { /* If middle bh fails, let previous bh * finish its read and then put it to * avoid bh leak */ if (!buffer_jbd(bh)) wait_on_buffer(bh); put_bh(bh); bhs[i - 1] = NULL; } else if (bh && buffer_uptodate(bh)) { clear_buffer_uptodate(bh); } continue; } /* No need to wait on the buffer if it's managed by JBD. */ if (!buffer_jbd(bh)) wait_on_buffer(bh); if (!buffer_uptodate(bh)) { /* Status won't be cleared from here on out, * so we can safely record this and loop back * to cleanup the other buffers. */ status = -EIO; goto read_failure; } } bail: return status; } /* Caller must provide a bhs[] with all NULL or non-NULL entries, so it * will be easier to handle read failure. */ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, struct buffer_head *bhs[], int flags, int (*validate)(struct super_block *sb, struct buffer_head *bh)) { int status = 0; int i, ignore_cache = 0; struct buffer_head *bh; struct super_block *sb = ocfs2_metadata_cache_get_super(ci); int new_bh = 0; trace_ocfs2_read_blocks_begin(ci, (unsigned long long)block, nr, flags); BUG_ON(!ci); BUG_ON((flags & OCFS2_BH_READAHEAD) && (flags & OCFS2_BH_IGNORE_CACHE)); if (bhs == NULL) { status = -EINVAL; mlog_errno(status); goto bail; } if (nr < 0) { mlog(ML_ERROR, "asked to read %d blocks!\n", nr); status = -EINVAL; mlog_errno(status); goto bail; } if (nr == 0) { status = 0; goto bail; } /* Don't put buffer head and re-assign it to NULL if it is allocated * outside since the caller can't be aware of this alternation! */ new_bh = (bhs[0] == NULL); ocfs2_metadata_cache_io_lock(ci); for (i = 0 ; i < nr ; i++) { if (bhs[i] == NULL) { bhs[i] = sb_getblk(sb, block++); if (bhs[i] == NULL) { status = -ENOMEM; mlog_errno(status); /* Don't forget to put previous bh! */ break; } } bh = bhs[i]; ignore_cache = (flags & OCFS2_BH_IGNORE_CACHE); /* There are three read-ahead cases here which we need to * be concerned with. All three assume a buffer has * previously been submitted with OCFS2_BH_READAHEAD * and it hasn't yet completed I/O. * * 1) The current request is sync to disk. This rarely * happens these days, and never when performance * matters - the code can just wait on the buffer * lock and re-submit. * * 2) The current request is cached, but not * readahead. ocfs2_buffer_uptodate() will return * false anyway, so we'll wind up waiting on the * buffer lock to do I/O. We re-check the request * with after getting the lock to avoid a re-submit. * * 3) The current request is readahead (and so must * also be a caching one). We short circuit if the * buffer is locked (under I/O) and if it's in the * uptodate cache. The re-check from #2 catches the * case that the previous read-ahead completes just * before our is-it-in-flight check. */ if (!ignore_cache && !ocfs2_buffer_uptodate(ci, bh)) { trace_ocfs2_read_blocks_from_disk( (unsigned long long)bh->b_blocknr, (unsigned long long)ocfs2_metadata_cache_owner(ci)); /* We're using ignore_cache here to say * "go to disk" */ ignore_cache = 1; } trace_ocfs2_read_blocks_bh((unsigned long long)bh->b_blocknr, ignore_cache, buffer_jbd(bh), buffer_dirty(bh)); if (buffer_jbd(bh)) { continue; } if (ignore_cache) { if (buffer_dirty(bh)) { /* This should probably be a BUG, or * at least return an error. */ continue; } /* A read-ahead request was made - if the * buffer is already under read-ahead from a * previously submitted request than we are * done here. */ if ((flags & OCFS2_BH_READAHEAD) && ocfs2_buffer_read_ahead(ci, bh)) continue; lock_buffer(bh); if (buffer_jbd(bh)) { #ifdef CATCH_BH_JBD_RACES mlog(ML_ERROR, "block %llu had the JBD bit set " "while I was in lock_buffer!", (unsigned long long)bh->b_blocknr); BUG(); #else unlock_buffer(bh); continue; #endif } /* Re-check ocfs2_buffer_uptodate() as a * previously read-ahead buffer may have * completed I/O while we were waiting for the * buffer lock. */ if (!(flags & OCFS2_BH_IGNORE_CACHE) && !(flags & OCFS2_BH_READAHEAD) && ocfs2_buffer_uptodate(ci, bh)) { unlock_buffer(bh); continue; } get_bh(bh); /* for end_buffer_read_sync() */ if (validate) set_buffer_needs_validate(bh); bh->b_end_io = end_buffer_read_sync; submit_bh(REQ_OP_READ, bh); continue; } } read_failure: for (i = (nr - 1); i >= 0; i--) { bh = bhs[i]; if (!(flags & OCFS2_BH_READAHEAD)) { if (unlikely(status)) { /* Clear the buffers on error including those * ever succeeded in reading */ if (new_bh && bh) { /* If middle bh fails, let previous bh * finish its read and then put it to * avoid bh leak */ if (!buffer_jbd(bh)) wait_on_buffer(bh); put_bh(bh); bhs[i] = NULL; } else if (bh && buffer_uptodate(bh)) { clear_buffer_uptodate(bh); } continue; } /* We know this can't have changed as we hold the * owner sem. Avoid doing any work on the bh if the * journal has it. */ if (!buffer_jbd(bh)) wait_on_buffer(bh); if (!buffer_uptodate(bh)) { /* Status won't be cleared from here on out, * so we can safely record this and loop back * to cleanup the other buffers. Don't need to * remove the clustered uptodate information * for this bh as it's not marked locally * uptodate. */ status = -EIO; clear_buffer_needs_validate(bh); goto read_failure; } if (buffer_needs_validate(bh)) { /* We never set NeedsValidate if the * buffer was held by the journal, so * that better not have changed */ BUG_ON(buffer_jbd(bh)); clear_buffer_needs_validate(bh); status = validate(sb, bh); if (status) goto read_failure; } } /* Always set the buffer in the cache, even if it was * a forced read, or read-ahead which hasn't yet * completed. */ if (bh) ocfs2_set_buffer_uptodate(ci, bh); } ocfs2_metadata_cache_io_unlock(ci); trace_ocfs2_read_blocks_end((unsigned long long)block, nr, flags, ignore_cache); bail: return status; } /* Check whether the blkno is the super block or one of the backups. */ static void ocfs2_check_super_or_backup(struct super_block *sb, sector_t blkno) { int i; u64 backup_blkno; if (blkno == OCFS2_SUPER_BLOCK_BLKNO) return; for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) { backup_blkno = ocfs2_backup_super_blkno(sb, i); if (backup_blkno == blkno) return; } BUG(); } /* * Write super block and backups doesn't need to collaborate with journal, * so we don't need to lock ip_io_mutex and ci doesn't need to bea passed * into this function. */ int ocfs2_write_super_or_backup(struct ocfs2_super *osb, struct buffer_head *bh) { int ret = 0; struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; BUG_ON(buffer_jbd(bh)); ocfs2_check_super_or_backup(osb->sb, bh->b_blocknr); if (unlikely(ocfs2_emergency_state(osb))) { ret = -EROFS; mlog_errno(ret); goto out; } lock_buffer(bh); set_buffer_uptodate(bh); /* remove from dirty list before I/O. */ clear_buffer_dirty(bh); get_bh(bh); /* for end_buffer_write_sync() */ bh->b_end_io = end_buffer_write_sync; ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &di->i_check); submit_bh(REQ_OP_WRITE, bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { ret = -EIO; mlog_errno(ret); } out: return ret; }
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 // SPDX-License-Identifier: GPL-2.0-or-later /* * X.25 Packet Layer release 002 * * This is ALPHA test software. This code may break your machine, * randomly fail to work with new releases, misbehave and/or generally * screw up. It might even work. * * This code REQUIRES 2.1.15 or higher * * History * X.25 001 Jonathan Naylor Started coding. * X.25 002 Jonathan Naylor Centralised disconnect handling. * New timer architecture. * 2000-03-11 Henner Eisen MSG_EOR handling more POSIX compliant. * 2000-03-22 Daniela Squassoni Allowed disabling/enabling of * facilities negotiation and increased * the throughput upper limit. * 2000-08-27 Arnaldo C. Melo s/suser/capable/ + micro cleanups * 2000-09-04 Henner Eisen Set sock->state in x25_accept(). * Fixed x25_output() related skb leakage. * 2000-10-02 Henner Eisen Made x25_kick() single threaded per socket. * 2000-10-27 Henner Eisen MSG_DONTWAIT for fragment allocation. * 2000-11-14 Henner Eisen Closing datalink from NETDEV_GOING_DOWN * 2002-10-06 Arnaldo C. Melo Get rid of cli/sti, move proc stuff to * x25_proc.c, using seq_file * 2005-04-02 Shaun Pereira Selective sub address matching * with call user data * 2005-04-15 Shaun Pereira Fast select with no restriction on * response */ #define pr_fmt(fmt) "X25: " fmt #include <linux/module.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <net/sock.h> #include <net/tcp_states.h> #include <linux/uaccess.h> #include <linux/fcntl.h> #include <linux/termios.h> /* For TIOCINQ/OUTQ */ #include <linux/notifier.h> #include <linux/init.h> #include <linux/compat.h> #include <linux/ctype.h> #include <net/x25.h> #include <net/compat.h> int sysctl_x25_restart_request_timeout = X25_DEFAULT_T20; int sysctl_x25_call_request_timeout = X25_DEFAULT_T21; int sysctl_x25_reset_request_timeout = X25_DEFAULT_T22; int sysctl_x25_clear_request_timeout = X25_DEFAULT_T23; int sysctl_x25_ack_holdback_timeout = X25_DEFAULT_T2; int sysctl_x25_forward = 0; HLIST_HEAD(x25_list); DEFINE_RWLOCK(x25_list_lock); static const struct proto_ops x25_proto_ops; static const struct x25_address null_x25_address = {" "}; #ifdef CONFIG_COMPAT struct compat_x25_subscrip_struct { char device[200-sizeof(compat_ulong_t)]; compat_ulong_t global_facil_mask; compat_uint_t extended; }; #endif int x25_parse_address_block(struct sk_buff *skb, struct x25_address *called_addr, struct x25_address *calling_addr) { unsigned char len; int needed; int rc; if (!pskb_may_pull(skb, 1)) { /* packet has no address block */ rc = 0; goto empty; } len = *skb->data; needed = 1 + ((len >> 4) + (len & 0x0f) + 1) / 2; if (!pskb_may_pull(skb, needed)) { /* packet is too short to hold the addresses it claims to hold */ rc = -1; goto empty; } return x25_addr_ntoa(skb->data, called_addr, calling_addr); empty: *called_addr->x25_addr = 0; *calling_addr->x25_addr = 0; return rc; } int x25_addr_ntoa(unsigned char *p, struct x25_address *called_addr, struct x25_address *calling_addr) { unsigned int called_len, calling_len; char *called, *calling; unsigned int i; called_len = (*p >> 0) & 0x0F; calling_len = (*p >> 4) & 0x0F; called = called_addr->x25_addr; calling = calling_addr->x25_addr; p++; for (i = 0; i < (called_len + calling_len); i++) { if (i < called_len) { if (i % 2 != 0) { *called++ = ((*p >> 0) & 0x0F) + '0'; p++; } else { *called++ = ((*p >> 4) & 0x0F) + '0'; } } else { if (i % 2 != 0) { *calling++ = ((*p >> 0) & 0x0F) + '0'; p++; } else { *calling++ = ((*p >> 4) & 0x0F) + '0'; } } } *called = *calling = '\0'; return 1 + (called_len + calling_len + 1) / 2; } int x25_addr_aton(unsigned char *p, struct x25_address *called_addr, struct x25_address *calling_addr) { unsigned int called_len, calling_len; char *called, *calling; int i; called = called_addr->x25_addr; calling = calling_addr->x25_addr; called_len = strlen(called); calling_len = strlen(calling); *p++ = (calling_len << 4) | (called_len << 0); for (i = 0; i < (called_len + calling_len); i++) { if (i < called_len) { if (i % 2 != 0) { *p |= (*called++ - '0') << 0; p++; } else { *p = 0x00; *p |= (*called++ - '0') << 4; } } else { if (i % 2 != 0) { *p |= (*calling++ - '0') << 0; p++; } else { *p = 0x00; *p |= (*calling++ - '0') << 4; } } } return 1 + (called_len + calling_len + 1) / 2; } /* * Socket removal during an interrupt is now safe. */ static void x25_remove_socket(struct sock *sk) { write_lock_bh(&x25_list_lock); sk_del_node_init(sk); write_unlock_bh(&x25_list_lock); } /* * Handle device status changes. */ static int x25_device_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct x25_neigh *nb; if (!net_eq(dev_net(dev), &init_net)) return NOTIFY_DONE; if (dev->type == ARPHRD_X25) { switch (event) { case NETDEV_REGISTER: case NETDEV_POST_TYPE_CHANGE: x25_link_device_up(dev); break; case NETDEV_DOWN: nb = x25_get_neigh(dev); if (nb) { x25_link_terminated(nb); x25_neigh_put(nb); } x25_route_device_down(dev); break; case NETDEV_PRE_TYPE_CHANGE: case NETDEV_UNREGISTER: x25_link_device_down(dev); break; case NETDEV_CHANGE: if (!netif_carrier_ok(dev)) { nb = x25_get_neigh(dev); if (nb) { x25_link_terminated(nb); x25_neigh_put(nb); } } break; } } return NOTIFY_DONE; } /* * Add a socket to the bound sockets list. */ static void x25_insert_socket(struct sock *sk) { write_lock_bh(&x25_list_lock); sk_add_node(sk, &x25_list); write_unlock_bh(&x25_list_lock); } /* * Find a socket that wants to accept the Call Request we just * received. Check the full list for an address/cud match. * If no cuds match return the next_best thing, an address match. * Note: if a listening socket has cud set it must only get calls * with matching cud. */ static struct sock *x25_find_listener(struct x25_address *addr, struct sk_buff *skb) { struct sock *s; struct sock *next_best; read_lock_bh(&x25_list_lock); next_best = NULL; sk_for_each(s, &x25_list) if ((!strcmp(addr->x25_addr, x25_sk(s)->source_addr.x25_addr) || !strcmp(x25_sk(s)->source_addr.x25_addr, null_x25_address.x25_addr)) && s->sk_state == TCP_LISTEN) { /* * Found a listening socket, now check the incoming * call user data vs this sockets call user data */ if (x25_sk(s)->cudmatchlength > 0 && skb->len >= x25_sk(s)->cudmatchlength) { if((memcmp(x25_sk(s)->calluserdata.cuddata, skb->data, x25_sk(s)->cudmatchlength)) == 0) { sock_hold(s); goto found; } } else next_best = s; } if (next_best) { s = next_best; sock_hold(s); goto found; } s = NULL; found: read_unlock_bh(&x25_list_lock); return s; } /* * Find a connected X.25 socket given my LCI and neighbour. */ static struct sock *__x25_find_socket(unsigned int lci, struct x25_neigh *nb) { struct sock *s; sk_for_each(s, &x25_list) if (x25_sk(s)->lci == lci && x25_sk(s)->neighbour == nb) { sock_hold(s); goto found; } s = NULL; found: return s; } struct sock *x25_find_socket(unsigned int lci, struct x25_neigh *nb) { struct sock *s; read_lock_bh(&x25_list_lock); s = __x25_find_socket(lci, nb); read_unlock_bh(&x25_list_lock); return s; } /* * Find a unique LCI for a given device. */ static unsigned int x25_new_lci(struct x25_neigh *nb) { unsigned int lci = 1; struct sock *sk; while ((sk = x25_find_socket(lci, nb)) != NULL) { sock_put(sk); if (++lci == 4096) { lci = 0; break; } cond_resched(); } return lci; } /* * Deferred destroy. */ static void __x25_destroy_socket(struct sock *); /* * handler for deferred kills. */ static void x25_destroy_timer(struct timer_list *t) { struct sock *sk = timer_container_of(sk, t, sk_timer); x25_destroy_socket_from_timer(sk); } /* * This is called from user mode and the timers. Thus it protects itself * against interrupting users but doesn't worry about being called during * work. Once it is removed from the queue no interrupt or bottom half * will touch it and we are (fairly 8-) ) safe. * Not static as it's used by the timer */ static void __x25_destroy_socket(struct sock *sk) { struct sk_buff *skb; x25_stop_heartbeat(sk); x25_stop_timer(sk); x25_remove_socket(sk); x25_clear_queues(sk); /* Flush the queues */ while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { if (skb->sk != sk) { /* A pending connection */ /* * Queue the unaccepted socket for death */ skb->sk->sk_state = TCP_LISTEN; sock_set_flag(skb->sk, SOCK_DEAD); x25_start_heartbeat(skb->sk); x25_sk(skb->sk)->state = X25_STATE_0; } kfree_skb(skb); } if (sk_has_allocations(sk)) { /* Defer: outstanding buffers */ sk->sk_timer.expires = jiffies + 10 * HZ; sk->sk_timer.function = x25_destroy_timer; add_timer(&sk->sk_timer); } else { /* drop last reference so sock_put will free */ __sock_put(sk); } } void x25_destroy_socket_from_timer(struct sock *sk) { sock_hold(sk); bh_lock_sock(sk); __x25_destroy_socket(sk); bh_unlock_sock(sk); sock_put(sk); } /* * Handling for system calls applied via the various interfaces to a * X.25 socket object. */ static int x25_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { int opt; struct sock *sk = sock->sk; int rc = -ENOPROTOOPT; if (level != SOL_X25 || optname != X25_QBITINCL) goto out; rc = -EINVAL; if (optlen < sizeof(int)) goto out; rc = -EFAULT; if (copy_from_sockptr(&opt, optval, sizeof(int))) goto out; if (opt) set_bit(X25_Q_BIT_FLAG, &x25_sk(sk)->flags); else clear_bit(X25_Q_BIT_FLAG, &x25_sk(sk)->flags); rc = 0; out: return rc; } static int x25_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; int val, len, rc = -ENOPROTOOPT; if (level != SOL_X25 || optname != X25_QBITINCL) goto out; rc = -EFAULT; if (get_user(len, optlen)) goto out; rc = -EINVAL; if (len < 0) goto out; len = min_t(unsigned int, len, sizeof(int)); rc = -EFAULT; if (put_user(len, optlen)) goto out; val = test_bit(X25_Q_BIT_FLAG, &x25_sk(sk)->flags); rc = copy_to_user(optval, &val, len) ? -EFAULT : 0; out: return rc; } static int x25_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; int rc = -EOPNOTSUPP; lock_sock(sk); if (sock->state != SS_UNCONNECTED) { rc = -EINVAL; release_sock(sk); return rc; } if (sk->sk_state != TCP_LISTEN) { memset(&x25_sk(sk)->dest_addr, 0, X25_ADDR_LEN); sk->sk_max_ack_backlog = backlog; sk->sk_state = TCP_LISTEN; rc = 0; } release_sock(sk); return rc; } static struct proto x25_proto = { .name = "X25", .owner = THIS_MODULE, .obj_size = sizeof(struct x25_sock), }; static struct sock *x25_alloc_socket(struct net *net, int kern) { struct x25_sock *x25; struct sock *sk = sk_alloc(net, AF_X25, GFP_ATOMIC, &x25_proto, kern); if (!sk) goto out; sock_init_data(NULL, sk); x25 = x25_sk(sk); skb_queue_head_init(&x25->ack_queue); skb_queue_head_init(&x25->fragment_queue); skb_queue_head_init(&x25->interrupt_in_queue); skb_queue_head_init(&x25->interrupt_out_queue); out: return sk; } static int x25_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; struct x25_sock *x25; int rc = -EAFNOSUPPORT; if (!net_eq(net, &init_net)) goto out; rc = -ESOCKTNOSUPPORT; if (sock->type != SOCK_SEQPACKET) goto out; rc = -EINVAL; if (protocol) goto out; rc = -ENOMEM; if ((sk = x25_alloc_socket(net, kern)) == NULL) goto out; x25 = x25_sk(sk); sock_init_data(sock, sk); x25_init_timers(sk); sock->ops = &x25_proto_ops; sk->sk_protocol = protocol; sk->sk_backlog_rcv = x25_backlog_rcv; x25->t21 = sysctl_x25_call_request_timeout; x25->t22 = sysctl_x25_reset_request_timeout; x25->t23 = sysctl_x25_clear_request_timeout; x25->t2 = sysctl_x25_ack_holdback_timeout; x25->state = X25_STATE_0; x25->cudmatchlength = 0; set_bit(X25_ACCPT_APPRV_FLAG, &x25->flags); /* normally no cud */ /* on call accept */ x25->facilities.winsize_in = X25_DEFAULT_WINDOW_SIZE; x25->facilities.winsize_out = X25_DEFAULT_WINDOW_SIZE; x25->facilities.pacsize_in = X25_DEFAULT_PACKET_SIZE; x25->facilities.pacsize_out = X25_DEFAULT_PACKET_SIZE; x25->facilities.throughput = 0; /* by default don't negotiate throughput */ x25->facilities.reverse = X25_DEFAULT_REVERSE; x25->dte_facilities.calling_len = 0; x25->dte_facilities.called_len = 0; memset(x25->dte_facilities.called_ae, '\0', sizeof(x25->dte_facilities.called_ae)); memset(x25->dte_facilities.calling_ae, '\0', sizeof(x25->dte_facilities.calling_ae)); rc = 0; out: return rc; } static struct sock *x25_make_new(struct sock *osk) { struct sock *sk = NULL; struct x25_sock *x25, *ox25; if (osk->sk_type != SOCK_SEQPACKET) goto out; if ((sk = x25_alloc_socket(sock_net(osk), 0)) == NULL) goto out; x25 = x25_sk(sk); sk->sk_type = osk->sk_type; sk->sk_priority = READ_ONCE(osk->sk_priority); sk->sk_protocol = osk->sk_protocol; sk->sk_rcvbuf = osk->sk_rcvbuf; sk->sk_sndbuf = osk->sk_sndbuf; sk->sk_state = TCP_ESTABLISHED; sk->sk_backlog_rcv = osk->sk_backlog_rcv; sock_copy_flags(sk, osk); ox25 = x25_sk(osk); x25->t21 = ox25->t21; x25->t22 = ox25->t22; x25->t23 = ox25->t23; x25->t2 = ox25->t2; x25->flags = ox25->flags; x25->facilities = ox25->facilities; x25->dte_facilities = ox25->dte_facilities; x25->cudmatchlength = ox25->cudmatchlength; clear_bit(X25_INTERRUPT_FLAG, &x25->flags); x25_init_timers(sk); out: return sk; } static int x25_release(struct socket *sock) { struct sock *sk = sock->sk; struct x25_sock *x25; if (!sk) return 0; x25 = x25_sk(sk); sock_hold(sk); lock_sock(sk); switch (x25->state) { case X25_STATE_0: case X25_STATE_2: x25_disconnect(sk, 0, 0, 0); __x25_destroy_socket(sk); goto out; case X25_STATE_1: case X25_STATE_3: case X25_STATE_4: x25_clear_queues(sk); x25_write_internal(sk, X25_CLEAR_REQUEST); x25_start_t23timer(sk); x25->state = X25_STATE_2; sk->sk_state = TCP_CLOSE; sk->sk_shutdown |= SEND_SHUTDOWN; sk->sk_state_change(sk); sock_set_flag(sk, SOCK_DEAD); sock_set_flag(sk, SOCK_DESTROY); break; case X25_STATE_5: x25_write_internal(sk, X25_CLEAR_REQUEST); x25_disconnect(sk, 0, 0, 0); __x25_destroy_socket(sk); goto out; } sock_orphan(sk); out: release_sock(sk); sock_put(sk); return 0; } static int x25_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len) { struct sock *sk = sock->sk; struct sockaddr_x25 *addr = (struct sockaddr_x25 *)uaddr; int len, i, rc = 0; if (addr_len != sizeof(struct sockaddr_x25) || addr->sx25_family != AF_X25 || strnlen(addr->sx25_addr.x25_addr, X25_ADDR_LEN) == X25_ADDR_LEN) { rc = -EINVAL; goto out; } /* check for the null_x25_address */ if (strcmp(addr->sx25_addr.x25_addr, null_x25_address.x25_addr)) { len = strlen(addr->sx25_addr.x25_addr); for (i = 0; i < len; i++) { if (!isdigit(addr->sx25_addr.x25_addr[i])) { rc = -EINVAL; goto out; } } } lock_sock(sk); if (sock_flag(sk, SOCK_ZAPPED)) { x25_sk(sk)->source_addr = addr->sx25_addr; x25_insert_socket(sk); sock_reset_flag(sk, SOCK_ZAPPED); } else { rc = -EINVAL; } release_sock(sk); net_dbg_ratelimited("x25_bind: socket is bound\n"); out: return rc; } static int x25_wait_for_connection_establishment(struct sock *sk) { DECLARE_WAITQUEUE(wait, current); int rc; add_wait_queue_exclusive(sk_sleep(sk), &wait); for (;;) { __set_current_state(TASK_INTERRUPTIBLE); rc = -ERESTARTSYS; if (signal_pending(current)) break; rc = sock_error(sk); if (rc) { sk->sk_socket->state = SS_UNCONNECTED; break; } rc = -ENOTCONN; if (sk->sk_state == TCP_CLOSE) { sk->sk_socket->state = SS_UNCONNECTED; break; } rc = 0; if (sk->sk_state != TCP_ESTABLISHED) { release_sock(sk); schedule(); lock_sock(sk); } else break; } __set_current_state(TASK_RUNNING); remove_wait_queue(sk_sleep(sk), &wait); return rc; } static int x25_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; struct x25_sock *x25 = x25_sk(sk); struct sockaddr_x25 *addr = (struct sockaddr_x25 *)uaddr; struct x25_route *rt; int rc = 0; lock_sock(sk); if (sk->sk_state == TCP_ESTABLISHED && sock->state == SS_CONNECTING) { sock->state = SS_CONNECTED; goto out; /* Connect completed during a ERESTARTSYS event */ } rc = -ECONNREFUSED; if (sk->sk_state == TCP_CLOSE && sock->state == SS_CONNECTING) { sock->state = SS_UNCONNECTED; goto out; } rc = -EISCONN; /* No reconnect on a seqpacket socket */ if (sk->sk_state == TCP_ESTABLISHED) goto out; rc = -EALREADY; /* Do nothing if call is already in progress */ if (sk->sk_state == TCP_SYN_SENT) goto out; sk->sk_state = TCP_CLOSE; sock->state = SS_UNCONNECTED; rc = -EINVAL; if (addr_len != sizeof(struct sockaddr_x25) || addr->sx25_family != AF_X25 || strnlen(addr->sx25_addr.x25_addr, X25_ADDR_LEN) == X25_ADDR_LEN) goto out; rc = -ENETUNREACH; rt = x25_get_route(&addr->sx25_addr); if (!rt) goto out; x25->neighbour = x25_get_neigh(rt->dev); if (!x25->neighbour) goto out_put_route; x25_limit_facilities(&x25->facilities, x25->neighbour); x25->lci = x25_new_lci(x25->neighbour); if (!x25->lci) goto out_put_neigh; rc = -EINVAL; if (sock_flag(sk, SOCK_ZAPPED)) /* Must bind first - autobinding does not work */ goto out_put_neigh; if (!strcmp(x25->source_addr.x25_addr, null_x25_address.x25_addr)) memset(&x25->source_addr, '\0', X25_ADDR_LEN); x25->dest_addr = addr->sx25_addr; /* Move to connecting socket, start sending Connect Requests */ sock->state = SS_CONNECTING; sk->sk_state = TCP_SYN_SENT; x25->state = X25_STATE_1; x25_write_internal(sk, X25_CALL_REQUEST); x25_start_heartbeat(sk); x25_start_t21timer(sk); /* Now the loop */ rc = -EINPROGRESS; if (sk->sk_state != TCP_ESTABLISHED && (flags & O_NONBLOCK)) goto out; rc = x25_wait_for_connection_establishment(sk); if (rc) goto out_put_neigh; sock->state = SS_CONNECTED; rc = 0; out_put_neigh: if (rc && x25->neighbour) { read_lock_bh(&x25_list_lock); x25_neigh_put(x25->neighbour); x25->neighbour = NULL; read_unlock_bh(&x25_list_lock); x25->state = X25_STATE_0; } out_put_route: x25_route_put(rt); out: release_sock(sk); return rc; } static int x25_wait_for_data(struct sock *sk, long timeout) { DECLARE_WAITQUEUE(wait, current); int rc = 0; add_wait_queue_exclusive(sk_sleep(sk), &wait); for (;;) { __set_current_state(TASK_INTERRUPTIBLE); if (sk->sk_shutdown & RCV_SHUTDOWN) break; rc = -ERESTARTSYS; if (signal_pending(current)) break; rc = -EAGAIN; if (!timeout) break; rc = 0; if (skb_queue_empty(&sk->sk_receive_queue)) { release_sock(sk); timeout = schedule_timeout(timeout); lock_sock(sk); } else break; } __set_current_state(TASK_RUNNING); remove_wait_queue(sk_sleep(sk), &wait); return rc; } static int x25_accept(struct socket *sock, struct socket *newsock, struct proto_accept_arg *arg) { struct sock *sk = sock->sk; struct sock *newsk; struct sk_buff *skb; int rc = -EINVAL; if (!sk) goto out; rc = -EOPNOTSUPP; if (sk->sk_type != SOCK_SEQPACKET) goto out; lock_sock(sk); rc = -EINVAL; if (sk->sk_state != TCP_LISTEN) goto out2; rc = x25_wait_for_data(sk, READ_ONCE(sk->sk_rcvtimeo)); if (rc) goto out2; skb = skb_dequeue(&sk->sk_receive_queue); rc = -EINVAL; if (!skb->sk) goto out2; newsk = skb->sk; sock_graft(newsk, newsock); /* Now attach up the new socket */ skb->sk = NULL; kfree_skb(skb); sk_acceptq_removed(sk); newsock->state = SS_CONNECTED; rc = 0; out2: release_sock(sk); out: return rc; } static int x25_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct sockaddr_x25 *sx25 = (struct sockaddr_x25 *)uaddr; struct sock *sk = sock->sk; struct x25_sock *x25 = x25_sk(sk); int rc = 0; if (peer) { if (sk->sk_state != TCP_ESTABLISHED) { rc = -ENOTCONN; goto out; } sx25->sx25_addr = x25->dest_addr; } else sx25->sx25_addr = x25->source_addr; sx25->sx25_family = AF_X25; rc = sizeof(*sx25); out: return rc; } int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb, unsigned int lci) { struct sock *sk; struct sock *make; struct x25_sock *makex25; struct x25_address source_addr, dest_addr; struct x25_facilities facilities; struct x25_dte_facilities dte_facilities; int len, addr_len, rc; /* * Remove the LCI and frame type. */ skb_pull(skb, X25_STD_MIN_LEN); /* * Extract the X.25 addresses and convert them to ASCII strings, * and remove them. * * Address block is mandatory in call request packets */ addr_len = x25_parse_address_block(skb, &source_addr, &dest_addr); if (addr_len <= 0) goto out_clear_request; skb_pull(skb, addr_len); /* * Get the length of the facilities, skip past them for the moment * get the call user data because this is needed to determine * the correct listener * * Facilities length is mandatory in call request packets */ if (!pskb_may_pull(skb, 1)) goto out_clear_request; len = skb->data[0] + 1; if (!pskb_may_pull(skb, len)) goto out_clear_request; skb_pull(skb,len); /* * Ensure that the amount of call user data is valid. */ if (skb->len > X25_MAX_CUD_LEN) goto out_clear_request; /* * Get all the call user data so it can be used in * x25_find_listener and skb_copy_from_linear_data up ahead. */ if (!pskb_may_pull(skb, skb->len)) goto out_clear_request; /* * Find a listener for the particular address/cud pair. */ sk = x25_find_listener(&source_addr,skb); skb_push(skb,len); if (sk != NULL && sk_acceptq_is_full(sk)) { goto out_sock_put; } /* * We dont have any listeners for this incoming call. * Try forwarding it. */ if (sk == NULL) { skb_push(skb, addr_len + X25_STD_MIN_LEN); if (sysctl_x25_forward && x25_forward_call(&dest_addr, nb, skb, lci) > 0) { /* Call was forwarded, dont process it any more */ kfree_skb(skb); rc = 1; goto out; } else { /* No listeners, can't forward, clear the call */ goto out_clear_request; } } /* * Try to reach a compromise on the requested facilities. */ len = x25_negotiate_facilities(skb, sk, &facilities, &dte_facilities); if (len == -1) goto out_sock_put; /* * current neighbour/link might impose additional limits * on certain facilities */ x25_limit_facilities(&facilities, nb); /* * Try to create a new socket. */ make = x25_make_new(sk); if (!make) goto out_sock_put; /* * Remove the facilities */ skb_pull(skb, len); skb->sk = make; make->sk_state = TCP_ESTABLISHED; makex25 = x25_sk(make); makex25->lci = lci; makex25->dest_addr = dest_addr; makex25->source_addr = source_addr; x25_neigh_hold(nb); makex25->neighbour = nb; makex25->facilities = facilities; makex25->dte_facilities= dte_facilities; makex25->vc_facil_mask = x25_sk(sk)->vc_facil_mask; /* ensure no reverse facil on accept */ makex25->vc_facil_mask &= ~X25_MASK_REVERSE; /* ensure no calling address extension on accept */ makex25->vc_facil_mask &= ~X25_MASK_CALLING_AE; makex25->cudmatchlength = x25_sk(sk)->cudmatchlength; /* Normally all calls are accepted immediately */ if (test_bit(X25_ACCPT_APPRV_FLAG, &makex25->flags)) { x25_write_internal(make, X25_CALL_ACCEPTED); makex25->state = X25_STATE_3; } else { makex25->state = X25_STATE_5; } /* * Incoming Call User Data. */ skb_copy_from_linear_data(skb, makex25->calluserdata.cuddata, skb->len); makex25->calluserdata.cudlength = skb->len; sk_acceptq_added(sk); x25_insert_socket(make); skb_queue_head(&sk->sk_receive_queue, skb); x25_start_heartbeat(make); if (!sock_flag(sk, SOCK_DEAD)) sk->sk_data_ready(sk); rc = 1; sock_put(sk); out: return rc; out_sock_put: sock_put(sk); out_clear_request: rc = 0; x25_transmit_clear_request(nb, lci, 0x01); goto out; } static int x25_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct x25_sock *x25 = x25_sk(sk); DECLARE_SOCKADDR(struct sockaddr_x25 *, usx25, msg->msg_name); struct sockaddr_x25 sx25; struct sk_buff *skb; unsigned char *asmptr; int noblock = msg->msg_flags & MSG_DONTWAIT; size_t size; int qbit = 0, rc = -EINVAL; lock_sock(sk); if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_OOB|MSG_EOR|MSG_CMSG_COMPAT)) goto out; /* we currently don't support segmented records at the user interface */ if (!(msg->msg_flags & (MSG_EOR|MSG_OOB))) goto out; rc = -EADDRNOTAVAIL; if (sock_flag(sk, SOCK_ZAPPED)) goto out; rc = -EPIPE; if (sk->sk_shutdown & SEND_SHUTDOWN) { send_sig(SIGPIPE, current, 0); goto out; } rc = -ENETUNREACH; if (!x25->neighbour) goto out; if (usx25) { rc = -EINVAL; if (msg->msg_namelen < sizeof(sx25)) goto out; memcpy(&sx25, usx25, sizeof(sx25)); rc = -EISCONN; if (strcmp(x25->dest_addr.x25_addr, sx25.sx25_addr.x25_addr)) goto out; rc = -EINVAL; if (sx25.sx25_family != AF_X25) goto out; } else { /* * FIXME 1003.1g - if the socket is like this because * it has become closed (not started closed) we ought * to SIGPIPE, EPIPE; */ rc = -ENOTCONN; if (sk->sk_state != TCP_ESTABLISHED) goto out; sx25.sx25_family = AF_X25; sx25.sx25_addr = x25->dest_addr; } /* Sanity check the packet size */ if (len > 65535) { rc = -EMSGSIZE; goto out; } net_dbg_ratelimited("x25_sendmsg: sendto: Addresses built.\n"); /* Build a packet */ net_dbg_ratelimited("x25_sendmsg: sendto: building packet.\n"); if ((msg->msg_flags & MSG_OOB) && len > 32) len = 32; size = len + X25_MAX_L2_LEN + X25_EXT_MIN_LEN; release_sock(sk); skb = sock_alloc_send_skb(sk, size, noblock, &rc); lock_sock(sk); if (!skb) goto out; X25_SKB_CB(skb)->flags = msg->msg_flags; skb_reserve(skb, X25_MAX_L2_LEN + X25_EXT_MIN_LEN); /* * Put the data on the end */ net_dbg_ratelimited("x25_sendmsg: Copying user data\n"); skb_reset_transport_header(skb); skb_put(skb, len); rc = memcpy_from_msg(skb_transport_header(skb), msg, len); if (rc) goto out_kfree_skb; /* * If the Q BIT Include socket option is in force, the first * byte of the user data is the logical value of the Q Bit. */ if (test_bit(X25_Q_BIT_FLAG, &x25->flags)) { if (!pskb_may_pull(skb, 1)) goto out_kfree_skb; qbit = skb->data[0]; skb_pull(skb, 1); } /* * Push down the X.25 header */ net_dbg_ratelimited("x25_sendmsg: Building X.25 Header.\n"); if (msg->msg_flags & MSG_OOB) { if (x25->neighbour->extended) { asmptr = skb_push(skb, X25_STD_MIN_LEN); *asmptr++ = ((x25->lci >> 8) & 0x0F) | X25_GFI_EXTSEQ; *asmptr++ = (x25->lci >> 0) & 0xFF; *asmptr++ = X25_INTERRUPT; } else { asmptr = skb_push(skb, X25_STD_MIN_LEN); *asmptr++ = ((x25->lci >> 8) & 0x0F) | X25_GFI_STDSEQ; *asmptr++ = (x25->lci >> 0) & 0xFF; *asmptr++ = X25_INTERRUPT; } } else { if (x25->neighbour->extended) { /* Build an Extended X.25 header */ asmptr = skb_push(skb, X25_EXT_MIN_LEN); *asmptr++ = ((x25->lci >> 8) & 0x0F) | X25_GFI_EXTSEQ; *asmptr++ = (x25->lci >> 0) & 0xFF; *asmptr++ = X25_DATA; *asmptr++ = X25_DATA; } else { /* Build an Standard X.25 header */ asmptr = skb_push(skb, X25_STD_MIN_LEN); *asmptr++ = ((x25->lci >> 8) & 0x0F) | X25_GFI_STDSEQ; *asmptr++ = (x25->lci >> 0) & 0xFF; *asmptr++ = X25_DATA; } if (qbit) skb->data[0] |= X25_Q_BIT; } net_dbg_ratelimited("x25_sendmsg: Built header.\n"); net_dbg_ratelimited("x25_sendmsg: Transmitting buffer\n"); rc = -ENOTCONN; if (sk->sk_state != TCP_ESTABLISHED) goto out_kfree_skb; if (msg->msg_flags & MSG_OOB) skb_queue_tail(&x25->interrupt_out_queue, skb); else { rc = x25_output(sk, skb); len = rc; if (rc < 0) kfree_skb(skb); else if (test_bit(X25_Q_BIT_FLAG, &x25->flags)) len++; } x25_kick(sk); rc = len; out: release_sock(sk); return rc; out_kfree_skb: kfree_skb(skb); goto out; } static int x25_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; struct x25_sock *x25 = x25_sk(sk); DECLARE_SOCKADDR(struct sockaddr_x25 *, sx25, msg->msg_name); size_t copied; int qbit, header_len; struct sk_buff *skb; unsigned char *asmptr; int rc = -ENOTCONN; lock_sock(sk); if (x25->neighbour == NULL) goto out; header_len = x25->neighbour->extended ? X25_EXT_MIN_LEN : X25_STD_MIN_LEN; /* * This works for seqpacket too. The receiver has ordered the queue for * us! We do one quick check first though */ if (sk->sk_state != TCP_ESTABLISHED) goto out; if (flags & MSG_OOB) { rc = -EINVAL; if (sock_flag(sk, SOCK_URGINLINE) || !skb_peek(&x25->interrupt_in_queue)) goto out; skb = skb_dequeue(&x25->interrupt_in_queue); if (!pskb_may_pull(skb, X25_STD_MIN_LEN)) goto out_free_dgram; skb_pull(skb, X25_STD_MIN_LEN); /* * No Q bit information on Interrupt data. */ if (test_bit(X25_Q_BIT_FLAG, &x25->flags)) { asmptr = skb_push(skb, 1); *asmptr = 0x00; } msg->msg_flags |= MSG_OOB; } else { /* Now we can treat all alike */ release_sock(sk); skb = skb_recv_datagram(sk, flags, &rc); lock_sock(sk); if (!skb) goto out; if (!pskb_may_pull(skb, header_len)) goto out_free_dgram; qbit = (skb->data[0] & X25_Q_BIT) == X25_Q_BIT; skb_pull(skb, header_len); if (test_bit(X25_Q_BIT_FLAG, &x25->flags)) { asmptr = skb_push(skb, 1); *asmptr = qbit; } } skb_reset_transport_header(skb); copied = skb->len; if (copied > size) { copied = size; msg->msg_flags |= MSG_TRUNC; } /* Currently, each datagram always contains a complete record */ msg->msg_flags |= MSG_EOR; rc = skb_copy_datagram_msg(skb, 0, msg, copied); if (rc) goto out_free_dgram; if (sx25) { sx25->sx25_family = AF_X25; sx25->sx25_addr = x25->dest_addr; msg->msg_namelen = sizeof(*sx25); } x25_check_rbuf(sk); rc = copied; out_free_dgram: skb_free_datagram(sk, skb); out: release_sock(sk); return rc; } static int x25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; struct x25_sock *x25 = x25_sk(sk); void __user *argp = (void __user *)arg; int rc; switch (cmd) { case TIOCOUTQ: { int amount; amount = sk->sk_sndbuf - sk_wmem_alloc_get(sk); if (amount < 0) amount = 0; rc = put_user(amount, (unsigned int __user *)argp); break; } case TIOCINQ: { struct sk_buff *skb; int amount = 0; /* * These two are safe on a single CPU system as * only user tasks fiddle here */ lock_sock(sk); if ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) amount = skb->len; release_sock(sk); rc = put_user(amount, (unsigned int __user *)argp); break; } case SIOCGIFADDR: case SIOCSIFADDR: case SIOCGIFDSTADDR: case SIOCSIFDSTADDR: case SIOCGIFBRDADDR: case SIOCSIFBRDADDR: case SIOCGIFNETMASK: case SIOCSIFNETMASK: case SIOCGIFMETRIC: case SIOCSIFMETRIC: rc = -EINVAL; break; case SIOCADDRT: case SIOCDELRT: rc = -EPERM; if (!capable(CAP_NET_ADMIN)) break; rc = x25_route_ioctl(cmd, argp); break; case SIOCX25GSUBSCRIP: rc = x25_subscr_ioctl(cmd, argp); break; case SIOCX25SSUBSCRIP: rc = -EPERM; if (!capable(CAP_NET_ADMIN)) break; rc = x25_subscr_ioctl(cmd, argp); break; case SIOCX25GFACILITIES: { lock_sock(sk); rc = copy_to_user(argp, &x25->facilities, sizeof(x25->facilities)) ? -EFAULT : 0; release_sock(sk); break; } case SIOCX25SFACILITIES: { struct x25_facilities facilities; rc = -EFAULT; if (copy_from_user(&facilities, argp, sizeof(facilities))) break; rc = -EINVAL; lock_sock(sk); if (sk->sk_state != TCP_LISTEN && sk->sk_state != TCP_CLOSE) goto out_fac_release; if (facilities.pacsize_in < X25_PS16 || facilities.pacsize_in > X25_PS4096) goto out_fac_release; if (facilities.pacsize_out < X25_PS16 || facilities.pacsize_out > X25_PS4096) goto out_fac_release; if (facilities.winsize_in < 1 || facilities.winsize_in > 127) goto out_fac_release; if (facilities.throughput) { int out = facilities.throughput & 0xf0; int in = facilities.throughput & 0x0f; if (!out) facilities.throughput |= X25_DEFAULT_THROUGHPUT << 4; else if (out < 0x30 || out > 0xD0) goto out_fac_release; if (!in) facilities.throughput |= X25_DEFAULT_THROUGHPUT; else if (in < 0x03 || in > 0x0D) goto out_fac_release; } if (facilities.reverse && (facilities.reverse & 0x81) != 0x81) goto out_fac_release; x25->facilities = facilities; rc = 0; out_fac_release: release_sock(sk); break; } case SIOCX25GDTEFACILITIES: { lock_sock(sk); rc = copy_to_user(argp, &x25->dte_facilities, sizeof(x25->dte_facilities)); release_sock(sk); if (rc) rc = -EFAULT; break; } case SIOCX25SDTEFACILITIES: { struct x25_dte_facilities dtefacs; rc = -EFAULT; if (copy_from_user(&dtefacs, argp, sizeof(dtefacs))) break; rc = -EINVAL; lock_sock(sk); if (sk->sk_state != TCP_LISTEN && sk->sk_state != TCP_CLOSE) goto out_dtefac_release; if (dtefacs.calling_len > X25_MAX_AE_LEN) goto out_dtefac_release; if (dtefacs.called_len > X25_MAX_AE_LEN) goto out_dtefac_release; x25->dte_facilities = dtefacs; rc = 0; out_dtefac_release: release_sock(sk); break; } case SIOCX25GCALLUSERDATA: { lock_sock(sk); rc = copy_to_user(argp, &x25->calluserdata, sizeof(x25->calluserdata)) ? -EFAULT : 0; release_sock(sk); break; } case SIOCX25SCALLUSERDATA: { struct x25_calluserdata calluserdata; rc = -EFAULT; if (copy_from_user(&calluserdata, argp, sizeof(calluserdata))) break; rc = -EINVAL; if (calluserdata.cudlength > X25_MAX_CUD_LEN) break; lock_sock(sk); x25->calluserdata = calluserdata; release_sock(sk); rc = 0; break; } case SIOCX25GCAUSEDIAG: { lock_sock(sk); rc = copy_to_user(argp, &x25->causediag, sizeof(x25->causediag)) ? -EFAULT : 0; release_sock(sk); break; } case SIOCX25SCAUSEDIAG: { struct x25_causediag causediag; rc = -EFAULT; if (copy_from_user(&causediag, argp, sizeof(causediag))) break; lock_sock(sk); x25->causediag = causediag; release_sock(sk); rc = 0; break; } case SIOCX25SCUDMATCHLEN: { struct x25_subaddr sub_addr; rc = -EINVAL; lock_sock(sk); if(sk->sk_state != TCP_CLOSE) goto out_cud_release; rc = -EFAULT; if (copy_from_user(&sub_addr, argp, sizeof(sub_addr))) goto out_cud_release; rc = -EINVAL; if (sub_addr.cudmatchlength > X25_MAX_CUD_LEN) goto out_cud_release; x25->cudmatchlength = sub_addr.cudmatchlength; rc = 0; out_cud_release: release_sock(sk); break; } case SIOCX25CALLACCPTAPPRV: { rc = -EINVAL; lock_sock(sk); if (sk->sk_state == TCP_CLOSE) { clear_bit(X25_ACCPT_APPRV_FLAG, &x25->flags); rc = 0; } release_sock(sk); break; } case SIOCX25SENDCALLACCPT: { rc = -EINVAL; lock_sock(sk); if (sk->sk_state != TCP_ESTABLISHED) goto out_sendcallaccpt_release; /* must call accptapprv above */ if (test_bit(X25_ACCPT_APPRV_FLAG, &x25->flags)) goto out_sendcallaccpt_release; x25_write_internal(sk, X25_CALL_ACCEPTED); x25->state = X25_STATE_3; rc = 0; out_sendcallaccpt_release: release_sock(sk); break; } default: rc = -ENOIOCTLCMD; break; } return rc; } static const struct net_proto_family x25_family_ops = { .family = AF_X25, .create = x25_create, .owner = THIS_MODULE, }; #ifdef CONFIG_COMPAT static int compat_x25_subscr_ioctl(unsigned int cmd, struct compat_x25_subscrip_struct __user *x25_subscr32) { struct compat_x25_subscrip_struct x25_subscr; struct x25_neigh *nb; struct net_device *dev; int rc = -EINVAL; rc = -EFAULT; if (copy_from_user(&x25_subscr, x25_subscr32, sizeof(*x25_subscr32))) goto out; rc = -EINVAL; dev = x25_dev_get(x25_subscr.device); if (dev == NULL) goto out; nb = x25_get_neigh(dev); if (nb == NULL) goto out_dev_put; dev_put(dev); if (cmd == SIOCX25GSUBSCRIP) { read_lock_bh(&x25_neigh_list_lock); x25_subscr.extended = nb->extended; x25_subscr.global_facil_mask = nb->global_facil_mask; read_unlock_bh(&x25_neigh_list_lock); rc = copy_to_user(x25_subscr32, &x25_subscr, sizeof(*x25_subscr32)) ? -EFAULT : 0; } else { rc = -EINVAL; if (x25_subscr.extended == 0 || x25_subscr.extended == 1) { rc = 0; write_lock_bh(&x25_neigh_list_lock); nb->extended = x25_subscr.extended; nb->global_facil_mask = x25_subscr.global_facil_mask; write_unlock_bh(&x25_neigh_list_lock); } } x25_neigh_put(nb); out: return rc; out_dev_put: dev_put(dev); goto out; } static int compat_x25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { void __user *argp = compat_ptr(arg); int rc = -ENOIOCTLCMD; switch(cmd) { case TIOCOUTQ: case TIOCINQ: rc = x25_ioctl(sock, cmd, (unsigned long)argp); break; case SIOCGIFADDR: case SIOCSIFADDR: case SIOCGIFDSTADDR: case SIOCSIFDSTADDR: case SIOCGIFBRDADDR: case SIOCSIFBRDADDR: case SIOCGIFNETMASK: case SIOCSIFNETMASK: case SIOCGIFMETRIC: case SIOCSIFMETRIC: rc = -EINVAL; break; case SIOCADDRT: case SIOCDELRT: rc = -EPERM; if (!capable(CAP_NET_ADMIN)) break; rc = x25_route_ioctl(cmd, argp); break; case SIOCX25GSUBSCRIP: rc = compat_x25_subscr_ioctl(cmd, argp); break; case SIOCX25SSUBSCRIP: rc = -EPERM; if (!capable(CAP_NET_ADMIN)) break; rc = compat_x25_subscr_ioctl(cmd, argp); break; case SIOCX25GFACILITIES: case SIOCX25SFACILITIES: case SIOCX25GDTEFACILITIES: case SIOCX25SDTEFACILITIES: case SIOCX25GCALLUSERDATA: case SIOCX25SCALLUSERDATA: case SIOCX25GCAUSEDIAG: case SIOCX25SCAUSEDIAG: case SIOCX25SCUDMATCHLEN: case SIOCX25CALLACCPTAPPRV: case SIOCX25SENDCALLACCPT: rc = x25_ioctl(sock, cmd, (unsigned long)argp); break; default: rc = -ENOIOCTLCMD; break; } return rc; } #endif static const struct proto_ops x25_proto_ops = { .family = AF_X25, .owner = THIS_MODULE, .release = x25_release, .bind = x25_bind, .connect = x25_connect, .socketpair = sock_no_socketpair, .accept = x25_accept, .getname = x25_getname, .poll = datagram_poll, .ioctl = x25_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = compat_x25_ioctl, #endif .gettstamp = sock_gettstamp, .listen = x25_listen, .shutdown = sock_no_shutdown, .setsockopt = x25_setsockopt, .getsockopt = x25_getsockopt, .sendmsg = x25_sendmsg, .recvmsg = x25_recvmsg, .mmap = sock_no_mmap, }; static struct packet_type x25_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_X25), .func = x25_lapb_receive_frame, }; static struct notifier_block x25_dev_notifier = { .notifier_call = x25_device_event, }; void x25_kill_by_neigh(struct x25_neigh *nb) { struct sock *s; write_lock_bh(&x25_list_lock); sk_for_each(s, &x25_list) { if (x25_sk(s)->neighbour == nb) { write_unlock_bh(&x25_list_lock); lock_sock(s); x25_disconnect(s, ENETUNREACH, 0, 0); release_sock(s); write_lock_bh(&x25_list_lock); } } write_unlock_bh(&x25_list_lock); /* Remove any related forwards */ x25_clear_forward_by_dev(nb->dev); } static int __init x25_init(void) { int rc; rc = proto_register(&x25_proto, 0); if (rc) goto out; rc = sock_register(&x25_family_ops); if (rc) goto out_proto; dev_add_pack(&x25_packet_type); rc = register_netdevice_notifier(&x25_dev_notifier); if (rc) goto out_sock; rc = x25_register_sysctl(); if (rc) goto out_dev; rc = x25_proc_init(); if (rc) goto out_sysctl; pr_info("Linux Version 0.2\n"); out: return rc; out_sysctl: x25_unregister_sysctl(); out_dev: unregister_netdevice_notifier(&x25_dev_notifier); out_sock: dev_remove_pack(&x25_packet_type); sock_unregister(AF_X25); out_proto: proto_unregister(&x25_proto); goto out; } module_init(x25_init); static void __exit x25_exit(void) { x25_proc_exit(); x25_link_free(); x25_route_free(); x25_unregister_sysctl(); unregister_netdevice_notifier(&x25_dev_notifier); dev_remove_pack(&x25_packet_type); sock_unregister(AF_X25); proto_unregister(&x25_proto); } module_exit(x25_exit); MODULE_AUTHOR("Jonathan Naylor <g4klx@g4klx.demon.co.uk>"); MODULE_DESCRIPTION("The X.25 Packet Layer network layer protocol"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_X25);
784 783 785 782 1 783 783 783 785 784 782 783 780 1 1 1 784 784 784 5 808 785 19 5 24 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs * * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes * 2000-2002 x86-64 support by Andi Kleen */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/sched.h> #include <linux/sched/task_stack.h> #include <linux/mm.h> #include <linux/smp.h> #include <linux/kernel.h> #include <linux/kstrtox.h> #include <linux/errno.h> #include <linux/wait.h> #include <linux/unistd.h> #include <linux/stddef.h> #include <linux/personality.h> #include <linux/uaccess.h> #include <linux/user-return-notifier.h> #include <linux/uprobes.h> #include <linux/context_tracking.h> #include <linux/entry-common.h> #include <linux/syscalls.h> #include <linux/rseq.h> #include <asm/processor.h> #include <asm/ucontext.h> #include <asm/fpu/signal.h> #include <asm/fpu/xstate.h> #include <asm/vdso.h> #include <asm/mce.h> #include <asm/sighandling.h> #include <asm/vm86.h> #include <asm/syscall.h> #include <asm/sigframe.h> #include <asm/signal.h> #include <asm/shstk.h> static inline int is_ia32_compat_frame(struct ksignal *ksig) { return IS_ENABLED(CONFIG_IA32_EMULATION) && ksig->ka.sa.sa_flags & SA_IA32_ABI; } static inline int is_ia32_frame(struct ksignal *ksig) { return IS_ENABLED(CONFIG_X86_32) || is_ia32_compat_frame(ksig); } static inline int is_x32_frame(struct ksignal *ksig) { return IS_ENABLED(CONFIG_X86_X32_ABI) && ksig->ka.sa.sa_flags & SA_X32_ABI; } /* * Enable all pkeys temporarily, so as to ensure that both the current * execution stack as well as the alternate signal stack are writeable. * The application can use any of the available pkeys to protect the * alternate signal stack, and we don't know which one it is, so enable * all. The PKRU register will be reset to init_pkru later in the flow, * in fpu__clear_user_states(), and it is the application's responsibility * to enable the appropriate pkey as the first step in the signal handler * so that the handler does not segfault. */ static inline u32 sig_prepare_pkru(void) { u32 orig_pkru = read_pkru(); write_pkru(0); return orig_pkru; } /* * Set up a signal frame. */ /* x86 ABI requires 16-byte alignment */ #define FRAME_ALIGNMENT 16UL #define MAX_FRAME_PADDING (FRAME_ALIGNMENT - 1) /* * Determine which stack to use.. */ void __user * get_sigframe(struct ksignal *ksig, struct pt_regs *regs, size_t frame_size, void __user **fpstate) { struct k_sigaction *ka = &ksig->ka; int ia32_frame = is_ia32_frame(ksig); /* Default to using normal stack */ bool nested_altstack = on_sig_stack(regs->sp); bool entering_altstack = false; unsigned long math_size = 0; unsigned long sp = regs->sp; unsigned long buf_fx = 0; u32 pkru; /* redzone */ if (!ia32_frame) sp -= 128; /* This is the X/Open sanctioned signal stack switching. */ if (ka->sa.sa_flags & SA_ONSTACK) { /* * This checks nested_altstack via sas_ss_flags(). Sensible * programs use SS_AUTODISARM, which disables that check, and * programs that don't use SS_AUTODISARM get compatible. */ if (sas_ss_flags(sp) == 0) { sp = current->sas_ss_sp + current->sas_ss_size; entering_altstack = true; } } else if (ia32_frame && !nested_altstack && regs->ss != __USER_DS && !(ka->sa.sa_flags & SA_RESTORER) && ka->sa.sa_restorer) { /* This is the legacy signal stack switching. */ sp = (unsigned long) ka->sa.sa_restorer; entering_altstack = true; } sp = fpu__alloc_mathframe(sp, ia32_frame, &buf_fx, &math_size); *fpstate = (void __user *)sp; sp -= frame_size; if (ia32_frame) /* * Align the stack pointer according to the i386 ABI, * i.e. so that on function entry ((sp + 4) & 15) == 0. */ sp = ((sp + 4) & -FRAME_ALIGNMENT) - 4; else sp = round_down(sp, FRAME_ALIGNMENT) - 8; /* * If we are on the alternate signal stack and would overflow it, don't. * Return an always-bogus address instead so we will die with SIGSEGV. */ if (unlikely((nested_altstack || entering_altstack) && !__on_sig_stack(sp))) { if (show_unhandled_signals && printk_ratelimit()) pr_info("%s[%d] overflowed sigaltstack\n", current->comm, task_pid_nr(current)); return (void __user *)-1L; } /* Update PKRU to enable access to the alternate signal stack. */ pkru = sig_prepare_pkru(); /* save i387 and extended state */ if (!copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size, pkru)) { /* * Restore PKRU to the original, user-defined value; disable * extra pkeys enabled for the alternate signal stack, if any. */ write_pkru(pkru); return (void __user *)-1L; } return (void __user *)sp; } /* * There are four different struct types for signal frame: sigframe_ia32, * rt_sigframe_ia32, rt_sigframe_x32, and rt_sigframe. Use the worst case * -- the largest size. It means the size for 64-bit apps is a bit more * than needed, but this keeps the code simple. */ #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) # define MAX_FRAME_SIGINFO_UCTXT_SIZE sizeof(struct sigframe_ia32) #else # define MAX_FRAME_SIGINFO_UCTXT_SIZE sizeof(struct rt_sigframe) #endif /* * The FP state frame contains an XSAVE buffer which must be 64-byte aligned. * If a signal frame starts at an unaligned address, extra space is required. * This is the max alignment padding, conservatively. */ #define MAX_XSAVE_PADDING 63UL /* * The frame data is composed of the following areas and laid out as: * * ------------------------- * | alignment padding | * ------------------------- * | (f)xsave frame | * ------------------------- * | fsave header | * ------------------------- * | alignment padding | * ------------------------- * | siginfo + ucontext | * ------------------------- */ /* max_frame_size tells userspace the worst case signal stack size. */ static unsigned long __ro_after_init max_frame_size; static unsigned int __ro_after_init fpu_default_state_size; static int __init init_sigframe_size(void) { fpu_default_state_size = fpu__get_fpstate_size(); max_frame_size = MAX_FRAME_SIGINFO_UCTXT_SIZE + MAX_FRAME_PADDING; max_frame_size += fpu_default_state_size + MAX_XSAVE_PADDING; /* Userspace expects an aligned size. */ max_frame_size = round_up(max_frame_size, FRAME_ALIGNMENT); pr_info("max sigframe size: %lu\n", max_frame_size); return 0; } early_initcall(init_sigframe_size); unsigned long get_sigframe_size(void) { return max_frame_size; } static int setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) { /* Perform fixup for the pre-signal frame. */ rseq_signal_deliver(ksig, regs); /* Set up the stack frame */ if (is_ia32_frame(ksig)) { if (ksig->ka.sa.sa_flags & SA_SIGINFO) return ia32_setup_rt_frame(ksig, regs); else return ia32_setup_frame(ksig, regs); } else if (is_x32_frame(ksig)) { return x32_setup_rt_frame(ksig, regs); } else { return x64_setup_rt_frame(ksig, regs); } } static void handle_signal(struct ksignal *ksig, struct pt_regs *regs) { bool stepping, failed; struct fpu *fpu = x86_task_fpu(current); if (v8086_mode(regs)) save_v86_state((struct kernel_vm86_regs *) regs, VM86_SIGNAL); /* Are we from a system call? */ if (syscall_get_nr(current, regs) != -1) { /* If so, check system call restarting.. */ switch (syscall_get_error(current, regs)) { case -ERESTART_RESTARTBLOCK: case -ERESTARTNOHAND: regs->ax = -EINTR; break; case -ERESTARTSYS: if (!(ksig->ka.sa.sa_flags & SA_RESTART)) { regs->ax = -EINTR; break; } fallthrough; case -ERESTARTNOINTR: regs->ax = regs->orig_ax; regs->ip -= 2; break; } } /* * If TF is set due to a debugger (TIF_FORCED_TF), clear TF now * so that register information in the sigcontext is correct and * then notify the tracer before entering the signal handler. */ stepping = test_thread_flag(TIF_SINGLESTEP); if (stepping) user_disable_single_step(current); failed = (setup_rt_frame(ksig, regs) < 0); if (!failed) { /* * Clear the direction flag as per the ABI for function entry. * * Clear RF when entering the signal handler, because * it might disable possible debug exception from the * signal handler. * * Clear TF for the case when it wasn't set by debugger to * avoid the recursive send_sigtrap() in SIGTRAP handler. */ regs->flags &= ~(X86_EFLAGS_DF|X86_EFLAGS_RF|X86_EFLAGS_TF); /* * Ensure the signal handler starts with the new fpu state. */ fpu__clear_user_states(fpu); } signal_setup_done(failed, ksig, stepping); } static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs) { #ifdef CONFIG_IA32_EMULATION if (current->restart_block.arch_data & TS_COMPAT) return __NR_ia32_restart_syscall; #endif #ifdef CONFIG_X86_X32_ABI return __NR_restart_syscall | (regs->orig_ax & __X32_SYSCALL_BIT); #else return __NR_restart_syscall; #endif } /* * Note that 'init' is a special process: it doesn't get signals it doesn't * want to handle. Thus you cannot kill init even with a SIGKILL even by * mistake. */ void arch_do_signal_or_restart(struct pt_regs *regs) { struct ksignal ksig; if (get_signal(&ksig)) { /* Whee! Actually deliver the signal. */ handle_signal(&ksig, regs); return; } /* Did we come from a system call? */ if (syscall_get_nr(current, regs) != -1) { /* Restart the system call - no handlers present */ switch (syscall_get_error(current, regs)) { case -ERESTARTNOHAND: case -ERESTARTSYS: case -ERESTARTNOINTR: regs->ax = regs->orig_ax; regs->ip -= 2; break; case -ERESTART_RESTARTBLOCK: regs->ax = get_nr_restart_syscall(regs); regs->ip -= 2; break; } } /* * If there's no signal to deliver, we just put the saved sigmask * back. */ restore_saved_sigmask(); } void signal_fault(struct pt_regs *regs, void __user *frame, char *where) { struct task_struct *me = current; if (show_unhandled_signals && printk_ratelimit()) { printk("%s" "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx", task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG, me->comm, me->pid, where, frame, regs->ip, regs->sp, regs->orig_ax); print_vma_addr(KERN_CONT " in ", regs->ip); pr_cont("\n"); } force_sig(SIGSEGV); } #ifdef CONFIG_DYNAMIC_SIGFRAME #ifdef CONFIG_STRICT_SIGALTSTACK_SIZE static bool strict_sigaltstack_size __ro_after_init = true; #else static bool strict_sigaltstack_size __ro_after_init = false; #endif static int __init strict_sas_size(char *arg) { return kstrtobool(arg, &strict_sigaltstack_size) == 0; } __setup("strict_sas_size", strict_sas_size); /* * MINSIGSTKSZ is 2048 and can't be changed despite the fact that AVX512 * exceeds that size already. As such programs might never use the * sigaltstack they just continued to work. While always checking against * the real size would be correct, this might be considered a regression. * * Therefore avoid the sanity check, unless enforced by kernel * configuration or command line option. * * When dynamic FPU features are supported, the check is also enforced when * the task has permissions to use dynamic features. Tasks which have no * permission are checked against the size of the non-dynamic feature set * if strict checking is enabled. This avoids forcing all tasks on the * system to allocate large sigaltstacks even if they are never going * to use a dynamic feature. As this is serialized via sighand::siglock * any permission request for a dynamic feature either happened already * or will see the newly install sigaltstack size in the permission checks. */ bool sigaltstack_size_valid(size_t ss_size) { unsigned long fsize = max_frame_size - fpu_default_state_size; u64 mask; lockdep_assert_held(&current->sighand->siglock); if (!fpu_state_size_dynamic() && !strict_sigaltstack_size) return true; fsize += x86_task_fpu(current->group_leader)->perm.__user_state_size; if (likely(ss_size > fsize)) return true; if (strict_sigaltstack_size) return ss_size > fsize; mask = x86_task_fpu(current->group_leader)->perm.__state_perm; if (mask & XFEATURE_MASK_USER_DYNAMIC) return ss_size > fsize; return true; } #endif /* CONFIG_DYNAMIC_SIGFRAME */
16 11 11 11 3 3 3 2 2 2 2 2 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 // SPDX-License-Identifier: GPL-2.0 #include <linux/compat.h> #include <linux/console.h> #include <linux/fb.h> #include <linux/fbcon.h> #include <linux/major.h> #include "fb_internal.h" /* * We hold a reference to the fb_info in file->private_data, * but if the current registered fb has changed, we don't * actually want to use it. * * So look up the fb_info using the inode minor number, * and just verify it against the reference we have. */ static struct fb_info *file_fb_info(struct file *file) { struct inode *inode = file_inode(file); int fbidx = iminor(inode); struct fb_info *info = registered_fb[fbidx]; if (info != file->private_data) info = NULL; return info; } static ssize_t fb_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct fb_info *info = file_fb_info(file); if (!info) return -ENODEV; if (fb_WARN_ON_ONCE(info, !info->fbops->fb_read)) return -EINVAL; if (info->state != FBINFO_STATE_RUNNING) return -EPERM; return info->fbops->fb_read(info, buf, count, ppos); } static ssize_t fb_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct fb_info *info = file_fb_info(file); if (!info) return -ENODEV; if (fb_WARN_ON_ONCE(info, !info->fbops->fb_write)) return -EINVAL; if (info->state != FBINFO_STATE_RUNNING) return -EPERM; return info->fbops->fb_write(info, buf, count, ppos); } static long do_fb_ioctl(struct fb_info *info, unsigned int cmd, unsigned long arg) { const struct fb_ops *fb; struct fb_var_screeninfo var; struct fb_fix_screeninfo fix; struct fb_cmap cmap_from; struct fb_cmap_user cmap; void __user *argp = (void __user *)arg; long ret = 0; switch (cmd) { case FBIOGET_VSCREENINFO: lock_fb_info(info); var = info->var; unlock_fb_info(info); ret = copy_to_user(argp, &var, sizeof(var)) ? -EFAULT : 0; break; case FBIOPUT_VSCREENINFO: if (copy_from_user(&var, argp, sizeof(var))) return -EFAULT; /* only for kernel-internal use */ var.activate &= ~FB_ACTIVATE_KD_TEXT; console_lock(); lock_fb_info(info); ret = fbcon_modechange_possible(info, &var); if (!ret) ret = fb_set_var(info, &var); if (!ret) fbcon_update_vcs(info, var.activate & FB_ACTIVATE_ALL); unlock_fb_info(info); console_unlock(); if (!ret && copy_to_user(argp, &var, sizeof(var))) ret = -EFAULT; break; case FBIOGET_FSCREENINFO: lock_fb_info(info); memcpy(&fix, &info->fix, sizeof(fix)); if (info->flags & FBINFO_HIDE_SMEM_START) fix.smem_start = 0; unlock_fb_info(info); ret = copy_to_user(argp, &fix, sizeof(fix)) ? -EFAULT : 0; break; case FBIOPUTCMAP: if (copy_from_user(&cmap, argp, sizeof(cmap))) return -EFAULT; ret = fb_set_user_cmap(&cmap, info); break; case FBIOGETCMAP: if (copy_from_user(&cmap, argp, sizeof(cmap))) return -EFAULT; lock_fb_info(info); cmap_from = info->cmap; unlock_fb_info(info); ret = fb_cmap_to_user(&cmap_from, &cmap); break; case FBIOPAN_DISPLAY: if (copy_from_user(&var, argp, sizeof(var))) return -EFAULT; console_lock(); lock_fb_info(info); ret = fb_pan_display(info, &var); unlock_fb_info(info); console_unlock(); if (ret == 0 && copy_to_user(argp, &var, sizeof(var))) return -EFAULT; break; case FBIO_CURSOR: ret = -EINVAL; break; case FBIOGET_CON2FBMAP: ret = fbcon_get_con2fb_map_ioctl(argp); break; case FBIOPUT_CON2FBMAP: ret = fbcon_set_con2fb_map_ioctl(argp); break; case FBIOBLANK: if (arg > FB_BLANK_POWERDOWN) return -EINVAL; console_lock(); lock_fb_info(info); ret = fb_blank(info, arg); /* might again call into fb_blank */ fbcon_fb_blanked(info, arg); unlock_fb_info(info); console_unlock(); break; default: lock_fb_info(info); fb = info->fbops; if (fb->fb_ioctl) ret = fb->fb_ioctl(info, cmd, arg); else ret = -ENOTTY; unlock_fb_info(info); } return ret; } static long fb_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct fb_info *info = file_fb_info(file); if (!info) return -ENODEV; return do_fb_ioctl(info, cmd, arg); } #ifdef CONFIG_COMPAT struct fb_fix_screeninfo32 { char id[16]; compat_caddr_t smem_start; u32 smem_len; u32 type; u32 type_aux; u32 visual; u16 xpanstep; u16 ypanstep; u16 ywrapstep; u32 line_length; compat_caddr_t mmio_start; u32 mmio_len; u32 accel; u16 reserved[3]; }; struct fb_cmap32 { u32 start; u32 len; compat_caddr_t red; compat_caddr_t green; compat_caddr_t blue; compat_caddr_t transp; }; static int fb_getput_cmap(struct fb_info *info, unsigned int cmd, unsigned long arg) { struct fb_cmap32 cmap32; struct fb_cmap cmap_from; struct fb_cmap_user cmap; if (copy_from_user(&cmap32, compat_ptr(arg), sizeof(cmap32))) return -EFAULT; cmap = (struct fb_cmap_user) { .start = cmap32.start, .len = cmap32.len, .red = compat_ptr(cmap32.red), .green = compat_ptr(cmap32.green), .blue = compat_ptr(cmap32.blue), .transp = compat_ptr(cmap32.transp), }; if (cmd == FBIOPUTCMAP) return fb_set_user_cmap(&cmap, info); lock_fb_info(info); cmap_from = info->cmap; unlock_fb_info(info); return fb_cmap_to_user(&cmap_from, &cmap); } static int do_fscreeninfo_to_user(struct fb_fix_screeninfo *fix, struct fb_fix_screeninfo32 __user *fix32) { __u32 data; int err; err = copy_to_user(&fix32->id, &fix->id, sizeof(fix32->id)); data = (__u32) (unsigned long) fix->smem_start; err |= put_user(data, &fix32->smem_start); err |= put_user(fix->smem_len, &fix32->smem_len); err |= put_user(fix->type, &fix32->type); err |= put_user(fix->type_aux, &fix32->type_aux); err |= put_user(fix->visual, &fix32->visual); err |= put_user(fix->xpanstep, &fix32->xpanstep); err |= put_user(fix->ypanstep, &fix32->ypanstep); err |= put_user(fix->ywrapstep, &fix32->ywrapstep); err |= put_user(fix->line_length, &fix32->line_length); data = (__u32) (unsigned long) fix->mmio_start; err |= put_user(data, &fix32->mmio_start); err |= put_user(fix->mmio_len, &fix32->mmio_len); err |= put_user(fix->accel, &fix32->accel); err |= copy_to_user(fix32->reserved, fix->reserved, sizeof(fix->reserved)); if (err) return -EFAULT; return 0; } static int fb_get_fscreeninfo(struct fb_info *info, unsigned int cmd, unsigned long arg) { struct fb_fix_screeninfo fix; lock_fb_info(info); fix = info->fix; if (info->flags & FBINFO_HIDE_SMEM_START) fix.smem_start = 0; unlock_fb_info(info); return do_fscreeninfo_to_user(&fix, compat_ptr(arg)); } static long fb_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct fb_info *info = file_fb_info(file); const struct fb_ops *fb; long ret = -ENOIOCTLCMD; if (!info) return -ENODEV; fb = info->fbops; switch (cmd) { case FBIOGET_VSCREENINFO: case FBIOPUT_VSCREENINFO: case FBIOPAN_DISPLAY: case FBIOGET_CON2FBMAP: case FBIOPUT_CON2FBMAP: arg = (unsigned long) compat_ptr(arg); fallthrough; case FBIOBLANK: ret = do_fb_ioctl(info, cmd, arg); break; case FBIOGET_FSCREENINFO: ret = fb_get_fscreeninfo(info, cmd, arg); break; case FBIOGETCMAP: case FBIOPUTCMAP: ret = fb_getput_cmap(info, cmd, arg); break; default: if (fb->fb_compat_ioctl) ret = fb->fb_compat_ioctl(info, cmd, arg); break; } return ret; } #endif static int fb_mmap(struct file *file, struct vm_area_struct *vma) { struct fb_info *info = file_fb_info(file); int res; if (!info) return -ENODEV; if (fb_WARN_ON_ONCE(info, !info->fbops->fb_mmap)) return -ENODEV; mutex_lock(&info->mm_lock); res = info->fbops->fb_mmap(info, vma); mutex_unlock(&info->mm_lock); return res; } static int fb_open(struct inode *inode, struct file *file) __acquires(&info->lock) __releases(&info->lock) { int fbidx = iminor(inode); struct fb_info *info; int res = 0; info = get_fb_info(fbidx); if (!info) { request_module("fb%d", fbidx); info = get_fb_info(fbidx); if (!info) return -ENODEV; } if (IS_ERR(info)) return PTR_ERR(info); lock_fb_info(info); if (!try_module_get(info->fbops->owner)) { res = -ENODEV; goto out; } file->private_data = info; if (info->fbops->fb_open) { res = info->fbops->fb_open(info, 1); if (res) module_put(info->fbops->owner); } #ifdef CONFIG_FB_DEFERRED_IO if (info->fbdefio) fb_deferred_io_open(info, inode, file); #endif out: unlock_fb_info(info); if (res) put_fb_info(info); return res; } static int fb_release(struct inode *inode, struct file *file) __acquires(&info->lock) __releases(&info->lock) { struct fb_info * const info = file->private_data; lock_fb_info(info); #if IS_ENABLED(CONFIG_FB_DEFERRED_IO) if (info->fbdefio) fb_deferred_io_release(info); #endif if (info->fbops->fb_release) info->fbops->fb_release(info, 1); module_put(info->fbops->owner); unlock_fb_info(info); put_fb_info(info); return 0; } #if defined(CONFIG_FB_PROVIDE_GET_FB_UNMAPPED_AREA) && !defined(CONFIG_MMU) static unsigned long get_fb_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct fb_info * const info = filp->private_data; unsigned long fb_size = PAGE_ALIGN(info->fix.smem_len); if (pgoff > fb_size || len > fb_size - pgoff) return -EINVAL; return (unsigned long)info->screen_base + pgoff; } #endif static const struct file_operations fb_fops = { .owner = THIS_MODULE, .read = fb_read, .write = fb_write, .unlocked_ioctl = fb_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = fb_compat_ioctl, #endif .mmap = fb_mmap, .open = fb_open, .release = fb_release, #if defined(HAVE_ARCH_FB_UNMAPPED_AREA) || \ (defined(CONFIG_FB_PROVIDE_GET_FB_UNMAPPED_AREA) && \ !defined(CONFIG_MMU)) .get_unmapped_area = get_fb_unmapped_area, #endif #ifdef CONFIG_FB_DEFERRED_IO .fsync = fb_deferred_io_fsync, #endif .llseek = default_llseek, }; int fb_register_chrdev(void) { int ret; ret = register_chrdev(FB_MAJOR, "fb", &fb_fops); if (ret) { pr_err("Unable to get major %d for fb devs\n", FB_MAJOR); return ret; } return ret; } void fb_unregister_chrdev(void) { unregister_chrdev(FB_MAJOR, "fb"); }
1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2018, Intel Corporation. */ /* A common module to handle registrations and notifications for paravirtual * drivers to enable accelerated datapath and support VF live migration. * * The notifier and event handling code is based on netvsc driver. */ #include <linux/module.h> #include <linux/etherdevice.h> #include <uapi/linux/if_arp.h> #include <linux/rtnetlink.h> #include <linux/if_vlan.h> #include <net/netdev_lock.h> #include <net/failover.h> static LIST_HEAD(failover_list); static DEFINE_SPINLOCK(failover_lock); static struct net_device *failover_get_bymac(u8 *mac, struct failover_ops **ops) { struct net_device *failover_dev; struct failover *failover; spin_lock(&failover_lock); list_for_each_entry(failover, &failover_list, list) { failover_dev = rtnl_dereference(failover->failover_dev); if (ether_addr_equal(failover_dev->perm_addr, mac)) { *ops = rtnl_dereference(failover->ops); spin_unlock(&failover_lock); return failover_dev; } } spin_unlock(&failover_lock); return NULL; } /** * failover_slave_register - Register a slave netdev * * @slave_dev: slave netdev that is being registered * * Registers a slave device to a failover instance. Only ethernet devices * are supported. */ static int failover_slave_register(struct net_device *slave_dev) { struct netdev_lag_upper_info lag_upper_info; struct net_device *failover_dev; struct failover_ops *fops; int err; if (slave_dev->type != ARPHRD_ETHER) goto done; ASSERT_RTNL(); failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops); if (!failover_dev) goto done; if (fops->slave_pre_register && fops->slave_pre_register(slave_dev, failover_dev)) goto done; err = netdev_rx_handler_register(slave_dev, fops->slave_handle_frame, failover_dev); if (err) { netdev_err(slave_dev, "can not register failover rx handler (err = %d)\n", err); goto done; } lag_upper_info.tx_type = NETDEV_LAG_TX_TYPE_ACTIVEBACKUP; err = netdev_master_upper_dev_link(slave_dev, failover_dev, NULL, &lag_upper_info, NULL); if (err) { netdev_err(slave_dev, "can not set failover device %s (err = %d)\n", failover_dev->name, err); goto err_upper_link; } slave_dev->priv_flags |= (IFF_FAILOVER_SLAVE | IFF_NO_ADDRCONF); if (fops->slave_register && !fops->slave_register(slave_dev, failover_dev)) return NOTIFY_OK; netdev_upper_dev_unlink(slave_dev, failover_dev); slave_dev->priv_flags &= ~(IFF_FAILOVER_SLAVE | IFF_NO_ADDRCONF); err_upper_link: netdev_rx_handler_unregister(slave_dev); done: return NOTIFY_DONE; } /** * failover_slave_unregister - Unregister a slave netdev * * @slave_dev: slave netdev that is being unregistered * * Unregisters a slave device from a failover instance. */ int failover_slave_unregister(struct net_device *slave_dev) { struct net_device *failover_dev; struct failover_ops *fops; if (!netif_is_failover_slave(slave_dev)) goto done; ASSERT_RTNL(); failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops); if (!failover_dev) goto done; if (fops->slave_pre_unregister && fops->slave_pre_unregister(slave_dev, failover_dev)) goto done; netdev_rx_handler_unregister(slave_dev); netdev_upper_dev_unlink(slave_dev, failover_dev); slave_dev->priv_flags &= ~(IFF_FAILOVER_SLAVE | IFF_NO_ADDRCONF); if (fops->slave_unregister && !fops->slave_unregister(slave_dev, failover_dev)) return NOTIFY_OK; done: return NOTIFY_DONE; } EXPORT_SYMBOL_GPL(failover_slave_unregister); static int failover_slave_link_change(struct net_device *slave_dev) { struct net_device *failover_dev; struct failover_ops *fops; if (!netif_is_failover_slave(slave_dev)) goto done; ASSERT_RTNL(); failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops); if (!failover_dev) goto done; if (!netif_running(failover_dev)) goto done; if (fops->slave_link_change && !fops->slave_link_change(slave_dev, failover_dev)) return NOTIFY_OK; done: return NOTIFY_DONE; } static int failover_slave_name_change(struct net_device *slave_dev) { struct net_device *failover_dev; struct failover_ops *fops; if (!netif_is_failover_slave(slave_dev)) goto done; ASSERT_RTNL(); failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops); if (!failover_dev) goto done; if (!netif_running(failover_dev)) goto done; if (fops->slave_name_change && !fops->slave_name_change(slave_dev, failover_dev)) return NOTIFY_OK; done: return NOTIFY_DONE; } static int failover_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *event_dev = netdev_notifier_info_to_dev(ptr); /* Skip parent events */ if (netif_is_failover(event_dev)) return NOTIFY_DONE; switch (event) { case NETDEV_REGISTER: return failover_slave_register(event_dev); case NETDEV_UNREGISTER: return failover_slave_unregister(event_dev); case NETDEV_UP: case NETDEV_DOWN: case NETDEV_CHANGE: return failover_slave_link_change(event_dev); case NETDEV_CHANGENAME: return failover_slave_name_change(event_dev); default: return NOTIFY_DONE; } } static struct notifier_block failover_notifier = { .notifier_call = failover_event, }; static void failover_existing_slave_register(struct net_device *failover_dev) { struct net *net = dev_net(failover_dev); struct net_device *dev; rtnl_lock(); for_each_netdev(net, dev) { if (netif_is_failover(dev)) continue; if (ether_addr_equal(failover_dev->perm_addr, dev->perm_addr)) { netdev_lock_ops(dev); failover_slave_register(dev); netdev_unlock_ops(dev); } } rtnl_unlock(); } /** * failover_register - Register a failover instance * * @dev: failover netdev * @ops: failover ops * * Allocate and register a failover instance for a failover netdev. ops * provides handlers for slave device register/unregister/link change/ * name change events. * * Return: pointer to failover instance */ struct failover *failover_register(struct net_device *dev, struct failover_ops *ops) { struct failover *failover; if (dev->type != ARPHRD_ETHER || !ops) return ERR_PTR(-EINVAL); failover = kzalloc_obj(*failover); if (!failover) return ERR_PTR(-ENOMEM); rcu_assign_pointer(failover->ops, ops); netdev_hold(dev, &failover->dev_tracker, GFP_KERNEL); dev->priv_flags |= IFF_FAILOVER; rcu_assign_pointer(failover->failover_dev, dev); spin_lock(&failover_lock); list_add_tail(&failover->list, &failover_list); spin_unlock(&failover_lock); netdev_info(dev, "failover master:%s registered\n", dev->name); failover_existing_slave_register(dev); return failover; } EXPORT_SYMBOL_GPL(failover_register); /** * failover_unregister - Unregister a failover instance * * @failover: pointer to failover instance * * Unregisters and frees a failover instance. */ void failover_unregister(struct failover *failover) { struct net_device *failover_dev; failover_dev = rcu_dereference(failover->failover_dev); netdev_info(failover_dev, "failover master:%s unregistered\n", failover_dev->name); failover_dev->priv_flags &= ~IFF_FAILOVER; netdev_put(failover_dev, &failover->dev_tracker); spin_lock(&failover_lock); list_del(&failover->list); spin_unlock(&failover_lock); kfree(failover); } EXPORT_SYMBOL_GPL(failover_unregister); static __init int failover_init(void) { register_netdevice_notifier(&failover_notifier); return 0; } module_init(failover_init); static __exit void failover_exit(void) { unregister_netdevice_notifier(&failover_notifier); } module_exit(failover_exit); MODULE_DESCRIPTION("Generic failover infrastructure/interface"); MODULE_LICENSE("GPL v2");
70 71 71 2 1 1 5 5 5 5 5 2 2 5 5 5 7 1 5 2 5 1 5 3 3 3 3 1 1 1 11 11 1 10 10 9 10 10 1 9 4 4 12 12 12 12 2 10 10 12 9 3 8 2 3 3 3 3 3 3 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2006 Silicon Graphics, Inc. * Copyright (c) 2012 Red Hat, Inc. * All Rights Reserved. */ #include "xfs_platform.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_mount.h" #include "xfs_defer.h" #include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_trans.h" #include "xfs_alloc.h" #include "xfs_bmap.h" #include "xfs_bmap_util.h" #include "xfs_bmap_btree.h" #include "xfs_rtalloc.h" #include "xfs_error.h" #include "xfs_quota.h" #include "xfs_trans_space.h" #include "xfs_trace.h" #include "xfs_icache.h" #include "xfs_iomap.h" #include "xfs_reflink.h" #include "xfs_rtbitmap.h" #include "xfs_rtgroup.h" #include "xfs_zone_alloc.h" /* Kernel only BMAP related definitions and functions */ /* * Convert the given file system block to a disk block. We have to treat it * differently based on whether the file is a real time file or not, because the * bmap code does. */ xfs_daddr_t xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb) { if (XFS_IS_REALTIME_INODE(ip)) return xfs_rtb_to_daddr(ip->i_mount, fsb); return XFS_FSB_TO_DADDR(ip->i_mount, fsb); } /* * Routine to zero an extent on disk allocated to the specific inode. */ int xfs_zero_extent( struct xfs_inode *ip, xfs_fsblock_t start_fsb, xfs_off_t count_fsb) { return blkdev_issue_zeroout(xfs_inode_buftarg(ip)->bt_bdev, xfs_fsb_to_db(ip, start_fsb), XFS_FSB_TO_BB(ip->i_mount, count_fsb), GFP_KERNEL, 0); } /* * Extent tree block counting routines. */ /* * Count leaf blocks given a range of extent records. Delayed allocation * extents are not counted towards the totals. */ xfs_extnum_t xfs_bmap_count_leaves( struct xfs_ifork *ifp, xfs_filblks_t *count) { struct xfs_iext_cursor icur; struct xfs_bmbt_irec got; xfs_extnum_t numrecs = 0; for_each_xfs_iext(ifp, &icur, &got) { if (!isnullstartblock(got.br_startblock)) { *count += got.br_blockcount; numrecs++; } } return numrecs; } /* * Count fsblocks of the given fork. Delayed allocation extents are * not counted towards the totals. */ int xfs_bmap_count_blocks( struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, xfs_extnum_t *nextents, xfs_filblks_t *count) { struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_btree_cur *cur; xfs_filblks_t btblocks = 0; int error; *nextents = 0; *count = 0; if (!ifp) return 0; switch (ifp->if_format) { case XFS_DINODE_FMT_BTREE: error = xfs_iread_extents(tp, ip, whichfork); if (error) return error; cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); error = xfs_btree_count_blocks(cur, &btblocks); xfs_btree_del_cursor(cur, error); if (error) return error; /* * xfs_btree_count_blocks includes the root block contained in * the inode fork in @btblocks, so subtract one because we're * only interested in allocated disk blocks. */ *count += btblocks - 1; fallthrough; case XFS_DINODE_FMT_EXTENTS: *nextents = xfs_bmap_count_leaves(ifp, count); break; } return 0; } static int xfs_getbmap_report_one( struct xfs_inode *ip, struct getbmapx *bmv, struct kgetbmap *out, int64_t bmv_end, struct xfs_bmbt_irec *got) { struct kgetbmap *p = out + bmv->bmv_entries; bool shared = false; int error; error = xfs_reflink_trim_around_shared(ip, got, &shared); if (error) return error; if (isnullstartblock(got->br_startblock) || got->br_startblock == DELAYSTARTBLOCK) { /* * Take the flush completion as being a point-in-time snapshot * where there are no delalloc extents, and if any new ones * have been created racily, just skip them as being 'after' * the flush and so don't get reported. */ if (!(bmv->bmv_iflags & BMV_IF_DELALLOC)) return 0; p->bmv_oflags |= BMV_OF_DELALLOC; p->bmv_block = -2; } else { p->bmv_block = xfs_fsb_to_db(ip, got->br_startblock); } if (got->br_state == XFS_EXT_UNWRITTEN && (bmv->bmv_iflags & BMV_IF_PREALLOC)) p->bmv_oflags |= BMV_OF_PREALLOC; if (shared) p->bmv_oflags |= BMV_OF_SHARED; p->bmv_offset = XFS_FSB_TO_BB(ip->i_mount, got->br_startoff); p->bmv_length = XFS_FSB_TO_BB(ip->i_mount, got->br_blockcount); bmv->bmv_offset = p->bmv_offset + p->bmv_length; bmv->bmv_length = max(0LL, bmv_end - bmv->bmv_offset); bmv->bmv_entries++; return 0; } static void xfs_getbmap_report_hole( struct xfs_inode *ip, struct getbmapx *bmv, struct kgetbmap *out, int64_t bmv_end, xfs_fileoff_t bno, xfs_fileoff_t end) { struct kgetbmap *p = out + bmv->bmv_entries; if (bmv->bmv_iflags & BMV_IF_NO_HOLES) return; p->bmv_block = -1; p->bmv_offset = XFS_FSB_TO_BB(ip->i_mount, bno); p->bmv_length = XFS_FSB_TO_BB(ip->i_mount, end - bno); bmv->bmv_offset = p->bmv_offset + p->bmv_length; bmv->bmv_length = max(0LL, bmv_end - bmv->bmv_offset); bmv->bmv_entries++; } static inline bool xfs_getbmap_full( struct getbmapx *bmv) { return bmv->bmv_length == 0 || bmv->bmv_entries >= bmv->bmv_count - 1; } static bool xfs_getbmap_next_rec( struct xfs_bmbt_irec *rec, xfs_fileoff_t total_end) { xfs_fileoff_t end = rec->br_startoff + rec->br_blockcount; if (end == total_end) return false; rec->br_startoff += rec->br_blockcount; if (!isnullstartblock(rec->br_startblock) && rec->br_startblock != DELAYSTARTBLOCK) rec->br_startblock += rec->br_blockcount; rec->br_blockcount = total_end - end; return true; } /* * Get inode's extents as described in bmv, and format for output. * Calls formatter to fill the user's buffer until all extents * are mapped, until the passed-in bmv->bmv_count slots have * been filled, or until the formatter short-circuits the loop, * if it is tracking filled-in extents on its own. */ int /* error code */ xfs_getbmap( struct xfs_inode *ip, struct getbmapx *bmv, /* user bmap structure */ struct kgetbmap *out) { struct xfs_mount *mp = ip->i_mount; int iflags = bmv->bmv_iflags; int whichfork, lock, error = 0; int64_t bmv_end, max_len; xfs_fileoff_t bno, first_bno; struct xfs_ifork *ifp; struct xfs_bmbt_irec got, rec; xfs_filblks_t len; struct xfs_iext_cursor icur; if (bmv->bmv_iflags & ~BMV_IF_VALID) return -EINVAL; #ifndef DEBUG /* Only allow CoW fork queries if we're debugging. */ if (iflags & BMV_IF_COWFORK) return -EINVAL; #endif if ((iflags & BMV_IF_ATTRFORK) && (iflags & BMV_IF_COWFORK)) return -EINVAL; if (bmv->bmv_length < -1) return -EINVAL; bmv->bmv_entries = 0; if (bmv->bmv_length == 0) return 0; if (iflags & BMV_IF_ATTRFORK) whichfork = XFS_ATTR_FORK; else if (iflags & BMV_IF_COWFORK) whichfork = XFS_COW_FORK; else whichfork = XFS_DATA_FORK; xfs_ilock(ip, XFS_IOLOCK_SHARED); switch (whichfork) { case XFS_ATTR_FORK: lock = xfs_ilock_attr_map_shared(ip); if (!xfs_inode_has_attr_fork(ip)) goto out_unlock_ilock; max_len = 1LL << 32; break; case XFS_COW_FORK: lock = XFS_ILOCK_SHARED; xfs_ilock(ip, lock); /* No CoW fork? Just return */ if (!xfs_ifork_ptr(ip, whichfork)) goto out_unlock_ilock; if (xfs_get_cowextsz_hint(ip)) max_len = mp->m_super->s_maxbytes; else max_len = XFS_ISIZE(ip); break; case XFS_DATA_FORK: if (!(iflags & BMV_IF_DELALLOC) && (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_disk_size)) { error = filemap_write_and_wait(VFS_I(ip)->i_mapping); if (error) goto out_unlock_iolock; /* * Even after flushing the inode, there can still be * delalloc blocks on the inode beyond EOF due to * speculative preallocation. These are not removed * until the release function is called or the inode * is inactivated. Hence we cannot assert here that * ip->i_delayed_blks == 0. */ } if (xfs_get_extsz_hint(ip) || (ip->i_diflags & XFS_DIFLAG_PREALLOC)) max_len = mp->m_super->s_maxbytes; else max_len = XFS_ISIZE(ip); lock = xfs_ilock_data_map_shared(ip); break; } ifp = xfs_ifork_ptr(ip, whichfork); switch (ifp->if_format) { case XFS_DINODE_FMT_EXTENTS: case XFS_DINODE_FMT_BTREE: break; case XFS_DINODE_FMT_LOCAL: /* Local format inode forks report no extents. */ goto out_unlock_ilock; default: error = -EINVAL; goto out_unlock_ilock; } if (bmv->bmv_length == -1) { max_len = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, max_len)); bmv->bmv_length = max(0LL, max_len - bmv->bmv_offset); } bmv_end = bmv->bmv_offset + bmv->bmv_length; first_bno = bno = XFS_BB_TO_FSBT(mp, bmv->bmv_offset); len = XFS_BB_TO_FSB(mp, bmv->bmv_length); error = xfs_iread_extents(NULL, ip, whichfork); if (error) goto out_unlock_ilock; if (!xfs_iext_lookup_extent(ip, ifp, bno, &icur, &got)) { /* * Report a whole-file hole if the delalloc flag is set to * stay compatible with the old implementation. */ if (iflags & BMV_IF_DELALLOC) xfs_getbmap_report_hole(ip, bmv, out, bmv_end, bno, XFS_B_TO_FSB(mp, XFS_ISIZE(ip))); goto out_unlock_ilock; } while (!xfs_getbmap_full(bmv)) { xfs_trim_extent(&got, first_bno, len); /* * Report an entry for a hole if this extent doesn't directly * follow the previous one. */ if (got.br_startoff > bno) { xfs_getbmap_report_hole(ip, bmv, out, bmv_end, bno, got.br_startoff); if (xfs_getbmap_full(bmv)) break; } /* * In order to report shared extents accurately, we report each * distinct shared / unshared part of a single bmbt record with * an individual getbmapx record. */ bno = got.br_startoff + got.br_blockcount; rec = got; do { error = xfs_getbmap_report_one(ip, bmv, out, bmv_end, &rec); if (error || xfs_getbmap_full(bmv)) goto out_unlock_ilock; } while (xfs_getbmap_next_rec(&rec, bno)); if (!xfs_iext_next_extent(ifp, &icur, &got)) { xfs_fileoff_t end = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)); if (bmv->bmv_entries > 0) out[bmv->bmv_entries - 1].bmv_oflags |= BMV_OF_LAST; if (whichfork != XFS_ATTR_FORK && bno < end && !xfs_getbmap_full(bmv)) { xfs_getbmap_report_hole(ip, bmv, out, bmv_end, bno, end); } break; } if (bno >= first_bno + len) break; } out_unlock_ilock: xfs_iunlock(ip, lock); out_unlock_iolock: xfs_iunlock(ip, XFS_IOLOCK_SHARED); return error; } /* * Dead simple method of punching delalyed allocation blocks from a range in * the inode. This will always punch out both the start and end blocks, even * if the ranges only partially overlap them, so it is up to the caller to * ensure that partial blocks are not passed in. */ void xfs_bmap_punch_delalloc_range( struct xfs_inode *ip, int whichfork, xfs_off_t start_byte, xfs_off_t end_byte, struct xfs_zone_alloc_ctx *ac) { struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, start_byte); xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, end_byte); struct xfs_bmbt_irec got, del; struct xfs_iext_cursor icur; ASSERT(!xfs_need_iread_extents(ifp)); xfs_ilock(ip, XFS_ILOCK_EXCL); if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got)) goto out_unlock; while (got.br_startoff + got.br_blockcount > start_fsb) { del = got; xfs_trim_extent(&del, start_fsb, end_fsb - start_fsb); /* * A delete can push the cursor forward. Step back to the * previous extent on non-delalloc or extents outside the * target range. */ if (!del.br_blockcount || !isnullstartblock(del.br_startblock)) { if (!xfs_iext_prev_extent(ifp, &icur, &got)) break; continue; } if (xfs_is_zoned_inode(ip) && ac) { /* * In a zoned buffered write context we need to return * the punched delalloc allocations to the allocation * context. This allows reusing them in the following * iomap iterations. */ xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, &del, XFS_BMAPI_REMAP); ac->reserved_blocks += del.br_blockcount; } else { xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, &del, 0); } if (!xfs_iext_get_extent(ifp, &icur, &got)) break; } if (whichfork == XFS_COW_FORK && !ifp->if_bytes) xfs_inode_clear_cowblocks_tag(ip); out_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); } /* * Test whether it is appropriate to check an inode for and free post EOF * blocks. */ bool xfs_can_free_eofblocks( struct xfs_inode *ip) { struct xfs_mount *mp = ip->i_mount; bool found_blocks = false; xfs_fileoff_t end_fsb; xfs_fileoff_t last_fsb; struct xfs_bmbt_irec imap; struct xfs_iext_cursor icur; /* * Caller must either hold the exclusive io lock; or be inactivating * the inode, which guarantees there are no other users of the inode. */ if (!(inode_state_read_once(VFS_I(ip)) & I_FREEING)) xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL); /* prealloc/delalloc exists only on regular files */ if (!S_ISREG(VFS_I(ip)->i_mode)) return false; /* * Zero sized files with no cached pages and delalloc blocks will not * have speculative prealloc/delalloc blocks to remove. */ if (VFS_I(ip)->i_size == 0 && VFS_I(ip)->i_mapping->nrpages == 0 && ip->i_delayed_blks == 0) return false; /* If we haven't read in the extent list, then don't do it now. */ if (xfs_need_iread_extents(&ip->i_df)) return false; /* * Do not free real extents in preallocated files unless the file has * delalloc blocks and we are forced to remove them. */ if ((ip->i_diflags & XFS_DIFLAG_PREALLOC) && !ip->i_delayed_blks) return false; /* * Do not try to free post-EOF blocks if EOF is beyond the end of the * range supported by the page cache, because the truncation will loop * forever. */ end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip)); if (xfs_inode_has_bigrtalloc(ip)) end_fsb = xfs_fileoff_roundup_rtx(mp, end_fsb); last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); if (last_fsb <= end_fsb) return false; /* * Check if there is an post-EOF extent to free. If there are any * delalloc blocks attached to the inode (data fork delalloc * reservations or CoW extents of any kind), we need to free them so * that inactivation doesn't fail to erase them. */ xfs_ilock(ip, XFS_ILOCK_SHARED); if (ip->i_delayed_blks || xfs_iext_lookup_extent(ip, &ip->i_df, end_fsb, &icur, &imap)) found_blocks = true; xfs_iunlock(ip, XFS_ILOCK_SHARED); return found_blocks; } /* * This is called to free any blocks beyond eof. The caller must hold * IOLOCK_EXCL unless we are in the inode reclaim path and have the only * reference to the inode. */ int xfs_free_eofblocks( struct xfs_inode *ip) { struct xfs_trans *tp; struct xfs_mount *mp = ip->i_mount; int error; /* Attach the dquots to the inode up front. */ error = xfs_qm_dqattach(ip); if (error) return error; /* Wait on dio to ensure i_size has settled. */ inode_dio_wait(VFS_I(ip)); /* * For preallocated files only free delayed allocations. * * Note that this means we also leave speculative preallocations in * place for preallocated files. */ if (ip->i_diflags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) { if (ip->i_delayed_blks) { xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, round_up(XFS_ISIZE(ip), mp->m_sb.sb_blocksize), LLONG_MAX, NULL); } xfs_inode_clear_eofblocks_tag(ip); return 0; } error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); if (error) { ASSERT(xfs_is_shutdown(mp)); return error; } xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); /* * Do not update the on-disk file size. If we update the on-disk file * size and then the system crashes before the contents of the file are * flushed to disk then the files may be full of holes (ie NULL files * bug). */ error = xfs_itruncate_extents_flags(&tp, ip, XFS_DATA_FORK, XFS_ISIZE(ip), XFS_BMAPI_NODISCARD); if (error) goto err_cancel; error = xfs_trans_commit(tp); if (error) goto out_unlock; xfs_inode_clear_eofblocks_tag(ip); goto out_unlock; err_cancel: /* * If we get an error at this point we simply don't * bother truncating the file. */ xfs_trans_cancel(tp); out_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } int xfs_alloc_file_space( struct xfs_inode *ip, xfs_off_t offset, xfs_off_t len) { xfs_mount_t *mp = ip->i_mount; xfs_off_t count; xfs_filblks_t allocatesize_fsb; xfs_extlen_t extsz, temp; xfs_fileoff_t startoffset_fsb; xfs_fileoff_t endoffset_fsb; int rt; xfs_trans_t *tp; xfs_bmbt_irec_t imaps[1], *imapp; int error; if (xfs_is_always_cow_inode(ip)) return 0; trace_xfs_alloc_file_space(ip); if (xfs_is_shutdown(mp)) return -EIO; error = xfs_qm_dqattach(ip); if (error) return error; if (len <= 0) return -EINVAL; rt = XFS_IS_REALTIME_INODE(ip); extsz = xfs_get_extsz_hint(ip); count = len; imapp = &imaps[0]; startoffset_fsb = XFS_B_TO_FSBT(mp, offset); endoffset_fsb = XFS_B_TO_FSB(mp, offset + count); allocatesize_fsb = endoffset_fsb - startoffset_fsb; /* * Allocate file space until done or until there is an error */ while (allocatesize_fsb && !error) { xfs_fileoff_t s, e; unsigned int dblocks, rblocks, resblks; int nimaps = 1; /* * Determine space reservations for data/realtime. */ if (unlikely(extsz)) { s = startoffset_fsb; do_div(s, extsz); s *= extsz; e = startoffset_fsb + allocatesize_fsb; div_u64_rem(startoffset_fsb, extsz, &temp); if (temp) e += temp; div_u64_rem(e, extsz, &temp); if (temp) e += extsz - temp; } else { s = 0; e = allocatesize_fsb; } /* * The transaction reservation is limited to a 32-bit block * count, hence we need to limit the number of blocks we are * trying to reserve to avoid an overflow. We can't allocate * more than @nimaps extents, and an extent is limited on disk * to XFS_BMBT_MAX_EXTLEN (21 bits), so use that to enforce the * limit. */ resblks = min_t(xfs_fileoff_t, (e - s), (XFS_MAX_BMBT_EXTLEN * nimaps)); if (unlikely(rt)) { dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0); rblocks = resblks; } else { dblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks); rblocks = 0; } error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, dblocks, rblocks, false, &tp); if (error) break; error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK, XFS_IEXT_ADD_NOSPLIT_CNT); if (error) goto error; /* * If the allocator cannot find a single free extent large * enough to cover the start block of the requested range, * xfs_bmapi_write will return -ENOSR. * * In that case we simply need to keep looping with the same * startoffset_fsb so that one of the following allocations * will eventually reach the requested range. */ error = xfs_bmapi_write(tp, ip, startoffset_fsb, allocatesize_fsb, XFS_BMAPI_PREALLOC, 0, imapp, &nimaps); if (error) { if (error != -ENOSR) goto error; error = 0; } else { startoffset_fsb += imapp->br_blockcount; allocatesize_fsb -= imapp->br_blockcount; } ip->i_diflags |= XFS_DIFLAG_PREALLOC; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); } return error; error: xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } static int xfs_unmap_extent( struct xfs_inode *ip, xfs_fileoff_t startoffset_fsb, xfs_filblks_t len_fsb, int *done) { struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; uint resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); int error; error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks, 0, false, &tp); if (error) return error; error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK, XFS_IEXT_PUNCH_HOLE_CNT); if (error) goto out_trans_cancel; error = xfs_bunmapi(tp, ip, startoffset_fsb, len_fsb, 0, 2, done); if (error) goto out_trans_cancel; error = xfs_trans_commit(tp); out_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; out_trans_cancel: xfs_trans_cancel(tp); goto out_unlock; } /* Caller must first wait for the completion of any pending DIOs if required. */ int xfs_flush_unmap_range( struct xfs_inode *ip, xfs_off_t offset, xfs_off_t len) { struct inode *inode = VFS_I(ip); xfs_off_t rounding, start, end; int error; /* * Make sure we extend the flush out to extent alignment * boundaries so any extent range overlapping the start/end * of the modification we are about to do is clean and idle. */ rounding = max_t(xfs_off_t, xfs_inode_alloc_unitsize(ip), PAGE_SIZE); start = rounddown_64(offset, rounding); end = roundup_64(offset + len, rounding) - 1; error = filemap_write_and_wait_range(inode->i_mapping, start, end); if (error) return error; truncate_pagecache_range(inode, start, end); return 0; } int xfs_free_file_space( struct xfs_inode *ip, xfs_off_t offset, xfs_off_t len, struct xfs_zone_alloc_ctx *ac) { struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t startoffset_fsb; xfs_fileoff_t endoffset_fsb; int done = 0, error; trace_xfs_free_file_space(ip); error = xfs_qm_dqattach(ip); if (error) return error; if (len <= 0) /* if nothing being freed */ return 0; /* * Now AIO and DIO has drained we flush and (if necessary) invalidate * the cached range over the first operation we are about to run. */ error = xfs_flush_unmap_range(ip, offset, len); if (error) return error; startoffset_fsb = XFS_B_TO_FSB(mp, offset); endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len); /* We can only free complete realtime extents. */ if (xfs_inode_has_bigrtalloc(ip)) { startoffset_fsb = xfs_fileoff_roundup_rtx(mp, startoffset_fsb); endoffset_fsb = xfs_fileoff_rounddown_rtx(mp, endoffset_fsb); } /* * Need to zero the stuff we're not freeing, on disk. */ if (endoffset_fsb > startoffset_fsb) { while (!done) { error = xfs_unmap_extent(ip, startoffset_fsb, endoffset_fsb - startoffset_fsb, &done); if (error) return error; } } /* * Now that we've unmap all full blocks we'll have to zero out any * partial block at the beginning and/or end. xfs_zero_range is smart * enough to skip any holes, including those we just created, but we * must take care not to zero beyond EOF and enlarge i_size. */ if (offset >= XFS_ISIZE(ip)) return 0; if (offset + len > XFS_ISIZE(ip)) len = XFS_ISIZE(ip) - offset; error = xfs_zero_range(ip, offset, len, ac, NULL); if (error) return error; /* * If we zeroed right up to EOF and EOF straddles a page boundary we * must make sure that the post-EOF area is also zeroed because the * page could be mmap'd and xfs_zero_range doesn't do that for us. * Writeback of the eof page will do this, albeit clumsily. */ if (offset + len >= XFS_ISIZE(ip) && offset_in_page(offset + len) > 0) { error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, round_down(offset + len, PAGE_SIZE), LLONG_MAX); } return error; } static int xfs_prepare_shift( struct xfs_inode *ip, loff_t offset) { unsigned int rounding; int error; /* * Trim eofblocks to avoid shifting uninitialized post-eof preallocation * into the accessible region of the file. */ if (xfs_can_free_eofblocks(ip)) { error = xfs_free_eofblocks(ip); if (error) return error; } /* * Shift operations must stabilize the start block offset boundary along * with the full range of the operation. If we don't, a COW writeback * completion could race with an insert, front merge with the start * extent (after split) during the shift and corrupt the file. Start * with the allocation unit just prior to the start to stabilize the * boundary. */ rounding = xfs_inode_alloc_unitsize(ip); offset = rounddown_64(offset, rounding); if (offset) offset -= rounding; /* * Writeback and invalidate cache for the remainder of the file as we're * about to shift down every extent from offset to EOF. */ error = xfs_flush_unmap_range(ip, offset, XFS_ISIZE(ip)); if (error) return error; /* * Clean out anything hanging around in the cow fork now that * we've flushed all the dirty data out to disk to avoid having * CoW extents at the wrong offsets. */ if (xfs_inode_has_cow_data(ip)) { error = xfs_reflink_cancel_cow_range(ip, offset, NULLFILEOFF, true); if (error) return error; } return 0; } /* * xfs_collapse_file_space() * This routine frees disk space and shift extent for the given file. * The first thing we do is to free data blocks in the specified range * by calling xfs_free_file_space(). It would also sync dirty data * and invalidate page cache over the region on which collapse range * is working. And Shift extent records to the left to cover a hole. * RETURNS: * 0 on success * errno on error * */ int xfs_collapse_file_space( struct xfs_inode *ip, xfs_off_t offset, xfs_off_t len, struct xfs_zone_alloc_ctx *ac) { struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; int error; xfs_fileoff_t next_fsb = XFS_B_TO_FSB(mp, offset + len); xfs_fileoff_t shift_fsb = XFS_B_TO_FSB(mp, len); bool done = false; xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL); trace_xfs_collapse_file_space(ip); error = xfs_free_file_space(ip, offset, len, ac); if (error) return error; error = xfs_prepare_shift(ip, offset); if (error) return error; error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp); if (error) return error; xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); while (!done) { error = xfs_bmap_collapse_extents(tp, ip, &next_fsb, shift_fsb, &done); if (error) goto out_trans_cancel; if (done) break; /* finish any deferred frees and roll the transaction */ error = xfs_defer_finish(&tp); if (error) goto out_trans_cancel; } error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; out_trans_cancel: xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } /* * xfs_insert_file_space() * This routine create hole space by shifting extents for the given file. * The first thing we do is to sync dirty data and invalidate page cache * over the region on which insert range is working. And split an extent * to two extents at given offset by calling xfs_bmap_split_extent. * And shift all extent records which are laying between [offset, * last allocated extent] to the right to reserve hole range. * RETURNS: * 0 on success * errno on error */ int xfs_insert_file_space( struct xfs_inode *ip, loff_t offset, loff_t len) { struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; int error; xfs_fileoff_t stop_fsb = XFS_B_TO_FSB(mp, offset); xfs_fileoff_t next_fsb = NULLFSBLOCK; xfs_fileoff_t shift_fsb = XFS_B_TO_FSB(mp, len); bool done = false; xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL); trace_xfs_insert_file_space(ip); error = xfs_bmap_can_insert_extents(ip, stop_fsb, shift_fsb); if (error) return error; error = xfs_prepare_shift(ip, offset); if (error) return error; error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, 0, &tp); if (error) return error; xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK, XFS_IEXT_PUNCH_HOLE_CNT); if (error) goto out_trans_cancel; /* * The extent shifting code works on extent granularity. So, if stop_fsb * is not the starting block of extent, we need to split the extent at * stop_fsb. */ error = xfs_bmap_split_extent(tp, ip, stop_fsb); if (error) goto out_trans_cancel; do { error = xfs_defer_finish(&tp); if (error) goto out_trans_cancel; error = xfs_bmap_insert_extents(tp, ip, &next_fsb, shift_fsb, &done, stop_fsb); if (error) goto out_trans_cancel; } while (!done); error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; out_trans_cancel: xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } /* * We need to check that the format of the data fork in the temporary inode is * valid for the target inode before doing the swap. This is not a problem with * attr1 because of the fixed fork offset, but attr2 has a dynamically sized * data fork depending on the space the attribute fork is taking so we can get * invalid formats on the target inode. * * E.g. target has space for 7 extents in extent format, temp inode only has * space for 6. If we defragment down to 7 extents, then the tmp format is a * btree, but when swapped it needs to be in extent format. Hence we can't just * blindly swap data forks on attr2 filesystems. * * Note that we check the swap in both directions so that we don't end up with * a corrupt temporary inode, either. * * Note that fixing the way xfs_fsr sets up the attribute fork in the source * inode will prevent this situation from occurring, so all we do here is * reject and log the attempt. basically we are putting the responsibility on * userspace to get this right. */ static int xfs_swap_extents_check_format( struct xfs_inode *ip, /* target inode */ struct xfs_inode *tip) /* tmp inode */ { struct xfs_ifork *ifp = &ip->i_df; struct xfs_ifork *tifp = &tip->i_df; /* User/group/project quota ids must match if quotas are enforced. */ if (XFS_IS_QUOTA_ON(ip->i_mount) && (!uid_eq(VFS_I(ip)->i_uid, VFS_I(tip)->i_uid) || !gid_eq(VFS_I(ip)->i_gid, VFS_I(tip)->i_gid) || ip->i_projid != tip->i_projid)) return -EINVAL; /* Should never get a local format */ if (ifp->if_format == XFS_DINODE_FMT_LOCAL || tifp->if_format == XFS_DINODE_FMT_LOCAL) return -EINVAL; /* * if the target inode has less extents that then temporary inode then * why did userspace call us? */ if (ifp->if_nextents < tifp->if_nextents) return -EINVAL; /* * If we have to use the (expensive) rmap swap method, we can * handle any number of extents and any format. */ if (xfs_has_rmapbt(ip->i_mount)) return 0; /* * if the target inode is in extent form and the temp inode is in btree * form then we will end up with the target inode in the wrong format * as we already know there are less extents in the temp inode. */ if (ifp->if_format == XFS_DINODE_FMT_EXTENTS && tifp->if_format == XFS_DINODE_FMT_BTREE) return -EINVAL; /* Check temp in extent form to max in target */ if (tifp->if_format == XFS_DINODE_FMT_EXTENTS && tifp->if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)) return -EINVAL; /* Check target in extent form to max in temp */ if (ifp->if_format == XFS_DINODE_FMT_EXTENTS && ifp->if_nextents > XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK)) return -EINVAL; /* * If we are in a btree format, check that the temp root block will fit * in the target and that it has enough extents to be in btree format * in the target. * * Note that we have to be careful to allow btree->extent conversions * (a common defrag case) which will occur when the temp inode is in * extent format... */ if (tifp->if_format == XFS_DINODE_FMT_BTREE) { if (xfs_inode_has_attr_fork(ip) && xfs_bmap_bmdr_space(tifp->if_broot) > xfs_inode_fork_boff(ip)) return -EINVAL; if (tifp->if_nextents <= XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)) return -EINVAL; } /* Reciprocal target->temp btree format checks */ if (ifp->if_format == XFS_DINODE_FMT_BTREE) { if (xfs_inode_has_attr_fork(tip) && xfs_bmap_bmdr_space(ip->i_df.if_broot) > xfs_inode_fork_boff(tip)) return -EINVAL; if (ifp->if_nextents <= XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK)) return -EINVAL; } return 0; } static int xfs_swap_extent_flush( struct xfs_inode *ip) { int error; error = filemap_write_and_wait(VFS_I(ip)->i_mapping); if (error) return error; truncate_pagecache_range(VFS_I(ip), 0, -1); /* Verify O_DIRECT for ftmp */ if (VFS_I(ip)->i_mapping->nrpages) return -EINVAL; return 0; } /* * Move extents from one file to another, when rmap is enabled. */ STATIC int xfs_swap_extent_rmap( struct xfs_trans **tpp, struct xfs_inode *ip, struct xfs_inode *tip) { struct xfs_trans *tp = *tpp; struct xfs_bmbt_irec irec; struct xfs_bmbt_irec uirec; struct xfs_bmbt_irec tirec; xfs_fileoff_t offset_fsb; xfs_fileoff_t end_fsb; xfs_filblks_t count_fsb; int error; xfs_filblks_t ilen; xfs_filblks_t rlen; int nimaps; uint64_t tip_flags2; /* * If the source file has shared blocks, we must flag the donor * file as having shared blocks so that we get the shared-block * rmap functions when we go to fix up the rmaps. The flags * will be switch for reals later. */ tip_flags2 = tip->i_diflags2; if (ip->i_diflags2 & XFS_DIFLAG2_REFLINK) tip->i_diflags2 |= XFS_DIFLAG2_REFLINK; offset_fsb = 0; end_fsb = XFS_B_TO_FSB(ip->i_mount, i_size_read(VFS_I(ip))); count_fsb = (xfs_filblks_t)(end_fsb - offset_fsb); while (count_fsb) { /* Read extent from the donor file */ nimaps = 1; error = xfs_bmapi_read(tip, offset_fsb, count_fsb, &tirec, &nimaps, 0); if (error) goto out; ASSERT(nimaps == 1); ASSERT(tirec.br_startblock != DELAYSTARTBLOCK); trace_xfs_swap_extent_rmap_remap(tip, &tirec); ilen = tirec.br_blockcount; /* Unmap the old blocks in the source file. */ while (tirec.br_blockcount) { ASSERT(tp->t_highest_agno == NULLAGNUMBER); trace_xfs_swap_extent_rmap_remap_piece(tip, &tirec); /* Read extent from the source file */ nimaps = 1; error = xfs_bmapi_read(ip, tirec.br_startoff, tirec.br_blockcount, &irec, &nimaps, 0); if (error) goto out; ASSERT(nimaps == 1); ASSERT(tirec.br_startoff == irec.br_startoff); trace_xfs_swap_extent_rmap_remap_piece(ip, &irec); /* Trim the extent. */ uirec = tirec; uirec.br_blockcount = rlen = min_t(xfs_filblks_t, tirec.br_blockcount, irec.br_blockcount); trace_xfs_swap_extent_rmap_remap_piece(tip, &uirec); if (xfs_bmap_is_real_extent(&uirec)) { error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK, XFS_IEXT_SWAP_RMAP_CNT); if (error) goto out; } if (xfs_bmap_is_real_extent(&irec)) { error = xfs_iext_count_extend(tp, tip, XFS_DATA_FORK, XFS_IEXT_SWAP_RMAP_CNT); if (error) goto out; } /* Remove the mapping from the donor file. */ xfs_bmap_unmap_extent(tp, tip, XFS_DATA_FORK, &uirec); /* Remove the mapping from the source file. */ xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &irec); /* Map the donor file's blocks into the source file. */ xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, &uirec); /* Map the source file's blocks into the donor file. */ xfs_bmap_map_extent(tp, tip, XFS_DATA_FORK, &irec); error = xfs_defer_finish(tpp); tp = *tpp; if (error) goto out; tirec.br_startoff += rlen; if (tirec.br_startblock != HOLESTARTBLOCK && tirec.br_startblock != DELAYSTARTBLOCK) tirec.br_startblock += rlen; tirec.br_blockcount -= rlen; } /* Roll on... */ count_fsb -= ilen; offset_fsb += ilen; } tip->i_diflags2 = tip_flags2; return 0; out: trace_xfs_swap_extent_rmap_error(ip, error, _RET_IP_); tip->i_diflags2 = tip_flags2; return error; } /* Swap the extents of two files by swapping data forks. */ STATIC int xfs_swap_extent_forks( struct xfs_trans *tp, struct xfs_inode *ip, struct xfs_inode *tip, int *src_log_flags, int *target_log_flags) { xfs_filblks_t aforkblks = 0; xfs_filblks_t taforkblks = 0; xfs_extnum_t junk; uint64_t tmp; int error; /* * Count the number of extended attribute blocks */ if (xfs_inode_has_attr_fork(ip) && ip->i_af.if_nextents > 0 && ip->i_af.if_format != XFS_DINODE_FMT_LOCAL) { error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &junk, &aforkblks); if (error) return error; } if (xfs_inode_has_attr_fork(tip) && tip->i_af.if_nextents > 0 && tip->i_af.if_format != XFS_DINODE_FMT_LOCAL) { error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK, &junk, &taforkblks); if (error) return error; } /* * Btree format (v3) inodes have the inode number stamped in the bmbt * block headers. We can't start changing the bmbt blocks until the * inode owner change is logged so recovery does the right thing in the * event of a crash. Set the owner change log flags now and leave the * bmbt scan as the last step. */ if (xfs_has_v3inodes(ip->i_mount)) { if (ip->i_df.if_format == XFS_DINODE_FMT_BTREE) (*target_log_flags) |= XFS_ILOG_DOWNER; if (tip->i_df.if_format == XFS_DINODE_FMT_BTREE) (*src_log_flags) |= XFS_ILOG_DOWNER; } /* * Swap the data forks of the inodes */ swap(ip->i_df, tip->i_df); /* * Fix the on-disk inode values */ tmp = (uint64_t)ip->i_nblocks; ip->i_nblocks = tip->i_nblocks - taforkblks + aforkblks; tip->i_nblocks = tmp + taforkblks - aforkblks; /* * The extents in the source inode could still contain speculative * preallocation beyond EOF (e.g. the file is open but not modified * while defrag is in progress). In that case, we need to copy over the * number of delalloc blocks the data fork in the source inode is * tracking beyond EOF so that when the fork is truncated away when the * temporary inode is unlinked we don't underrun the i_delayed_blks * counter on that inode. */ ASSERT(tip->i_delayed_blks == 0); tip->i_delayed_blks = ip->i_delayed_blks; ip->i_delayed_blks = 0; switch (ip->i_df.if_format) { case XFS_DINODE_FMT_EXTENTS: (*src_log_flags) |= XFS_ILOG_DEXT; break; case XFS_DINODE_FMT_BTREE: ASSERT(!xfs_has_v3inodes(ip->i_mount) || (*src_log_flags & XFS_ILOG_DOWNER)); (*src_log_flags) |= XFS_ILOG_DBROOT; break; } switch (tip->i_df.if_format) { case XFS_DINODE_FMT_EXTENTS: (*target_log_flags) |= XFS_ILOG_DEXT; break; case XFS_DINODE_FMT_BTREE: (*target_log_flags) |= XFS_ILOG_DBROOT; ASSERT(!xfs_has_v3inodes(ip->i_mount) || (*target_log_flags & XFS_ILOG_DOWNER)); break; } return 0; } /* * Fix up the owners of the bmbt blocks to refer to the current inode. The * change owner scan attempts to order all modified buffers in the current * transaction. In the event of ordered buffer failure, the offending buffer is * physically logged as a fallback and the scan returns -EAGAIN. We must roll * the transaction in this case to replenish the fallback log reservation and * restart the scan. This process repeats until the scan completes. */ static int xfs_swap_change_owner( struct xfs_trans **tpp, struct xfs_inode *ip, struct xfs_inode *tmpip) { int error; struct xfs_trans *tp = *tpp; do { error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, ip->i_ino, NULL); /* success or fatal error */ if (error != -EAGAIN) break; error = xfs_trans_roll(tpp); if (error) break; tp = *tpp; /* * Redirty both inodes so they can relog and keep the log tail * moving forward. */ xfs_trans_ijoin(tp, ip, 0); xfs_trans_ijoin(tp, tmpip, 0); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); xfs_trans_log_inode(tp, tmpip, XFS_ILOG_CORE); } while (true); return error; } int xfs_swap_extents( struct xfs_inode *ip, /* target inode */ struct xfs_inode *tip, /* tmp inode */ struct xfs_swapext *sxp) { struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; struct xfs_bstat *sbp = &sxp->sx_stat; int src_log_flags, target_log_flags; int error = 0; uint64_t f; int resblks = 0; unsigned int flags = 0; struct timespec64 ctime, mtime; /* * Lock the inodes against other IO, page faults and truncate to * begin with. Then we can ensure the inodes are flushed and have no * page cache safely. Once we have done this we can take the ilocks and * do the rest of the checks. */ lock_two_nondirectories(VFS_I(ip), VFS_I(tip)); filemap_invalidate_lock_two(VFS_I(ip)->i_mapping, VFS_I(tip)->i_mapping); /* Verify that both files have the same format */ if ((VFS_I(ip)->i_mode & S_IFMT) != (VFS_I(tip)->i_mode & S_IFMT)) { error = -EINVAL; goto out_unlock; } /* Verify both files are either real-time or non-realtime */ if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) { error = -EINVAL; goto out_unlock; } /* * The rmapbt implementation is unable to resume a swapext operation * after a crash if the allocation unit size is larger than a block. * This (deprecated) interface will not be upgraded to handle this * situation. Defragmentation must be performed with the commit range * ioctl. */ if (XFS_IS_REALTIME_INODE(ip) && xfs_has_rtgroups(ip->i_mount)) { error = -EOPNOTSUPP; goto out_unlock; } error = xfs_qm_dqattach(ip); if (error) goto out_unlock; error = xfs_qm_dqattach(tip); if (error) goto out_unlock; error = xfs_swap_extent_flush(ip); if (error) goto out_unlock; error = xfs_swap_extent_flush(tip); if (error) goto out_unlock; if (xfs_inode_has_cow_data(tip)) { error = xfs_reflink_cancel_cow_range(tip, 0, NULLFILEOFF, true); if (error) goto out_unlock; } /* * Extent "swapping" with rmap requires a permanent reservation and * a block reservation because it's really just a remap operation * performed with log redo items! */ if (xfs_has_rmapbt(mp)) { int w = XFS_DATA_FORK; uint32_t ipnext = ip->i_df.if_nextents; uint32_t tipnext = tip->i_df.if_nextents; /* * Conceptually this shouldn't affect the shape of either bmbt, * but since we atomically move extents one by one, we reserve * enough space to rebuild both trees. */ resblks = XFS_SWAP_RMAP_SPACE_RES(mp, ipnext, w); resblks += XFS_SWAP_RMAP_SPACE_RES(mp, tipnext, w); /* * If either inode straddles a bmapbt block allocation boundary, * the rmapbt algorithm triggers repeated allocs and frees as * extents are remapped. This can exhaust the block reservation * prematurely and cause shutdown. Return freed blocks to the * transaction reservation to counter this behavior. */ flags |= XFS_TRANS_RES_FDBLKS; } error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, flags, &tp); if (error) goto out_unlock; /* * Lock and join the inodes to the tansaction so that transaction commit * or cancel will unlock the inodes from this point onwards. */ xfs_lock_two_inodes(ip, XFS_ILOCK_EXCL, tip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); xfs_trans_ijoin(tp, tip, 0); /* Verify all data are being swapped */ if (sxp->sx_offset != 0 || sxp->sx_length != ip->i_disk_size || sxp->sx_length != tip->i_disk_size) { error = -EFAULT; goto out_trans_cancel; } trace_xfs_swap_extent_before(ip, 0); trace_xfs_swap_extent_before(tip, 1); /* check inode formats now that data is flushed */ error = xfs_swap_extents_check_format(ip, tip); if (error) { xfs_notice(mp, "%s: inode 0x%llx format is incompatible for exchanging.", __func__, ip->i_ino); goto out_trans_cancel; } /* * Compare the current change & modify times with that * passed in. If they differ, we abort this swap. * This is the mechanism used to ensure the calling * process that the file was not changed out from * under it. */ ctime = inode_get_ctime(VFS_I(ip)); mtime = inode_get_mtime(VFS_I(ip)); if ((sbp->bs_ctime.tv_sec != ctime.tv_sec) || (sbp->bs_ctime.tv_nsec != ctime.tv_nsec) || (sbp->bs_mtime.tv_sec != mtime.tv_sec) || (sbp->bs_mtime.tv_nsec != mtime.tv_nsec)) { error = -EBUSY; goto out_trans_cancel; } /* * Note the trickiness in setting the log flags - we set the owner log * flag on the opposite inode (i.e. the inode we are setting the new * owner to be) because once we swap the forks and log that, log * recovery is going to see the fork as owned by the swapped inode, * not the pre-swapped inodes. */ src_log_flags = XFS_ILOG_CORE; target_log_flags = XFS_ILOG_CORE; if (xfs_has_rmapbt(mp)) error = xfs_swap_extent_rmap(&tp, ip, tip); else error = xfs_swap_extent_forks(tp, ip, tip, &src_log_flags, &target_log_flags); if (error) goto out_trans_cancel; /* Do we have to swap reflink flags? */ if ((ip->i_diflags2 & XFS_DIFLAG2_REFLINK) ^ (tip->i_diflags2 & XFS_DIFLAG2_REFLINK)) { f = ip->i_diflags2 & XFS_DIFLAG2_REFLINK; ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; ip->i_diflags2 |= tip->i_diflags2 & XFS_DIFLAG2_REFLINK; tip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; tip->i_diflags2 |= f & XFS_DIFLAG2_REFLINK; } /* Swap the cow forks. */ if (xfs_has_reflink(mp)) { ASSERT(!ip->i_cowfp || ip->i_cowfp->if_format == XFS_DINODE_FMT_EXTENTS); ASSERT(!tip->i_cowfp || tip->i_cowfp->if_format == XFS_DINODE_FMT_EXTENTS); swap(ip->i_cowfp, tip->i_cowfp); if (ip->i_cowfp && ip->i_cowfp->if_bytes) xfs_inode_set_cowblocks_tag(ip); else xfs_inode_clear_cowblocks_tag(ip); if (tip->i_cowfp && tip->i_cowfp->if_bytes) xfs_inode_set_cowblocks_tag(tip); else xfs_inode_clear_cowblocks_tag(tip); } xfs_trans_log_inode(tp, ip, src_log_flags); xfs_trans_log_inode(tp, tip, target_log_flags); /* * The extent forks have been swapped, but crc=1,rmapbt=0 filesystems * have inode number owner values in the bmbt blocks that still refer to * the old inode. Scan each bmbt to fix up the owner values with the * inode number of the current inode. */ if (src_log_flags & XFS_ILOG_DOWNER) { error = xfs_swap_change_owner(&tp, ip, tip); if (error) goto out_trans_cancel; } if (target_log_flags & XFS_ILOG_DOWNER) { error = xfs_swap_change_owner(&tp, tip, ip); if (error) goto out_trans_cancel; } /* * If this is a synchronous mount, make sure that the * transaction goes to disk before returning to the user. */ if (xfs_has_wsync(mp)) xfs_trans_set_sync(tp); error = xfs_trans_commit(tp); trace_xfs_swap_extent_after(ip, 0); trace_xfs_swap_extent_after(tip, 1); out_unlock_ilock: xfs_iunlock(ip, XFS_ILOCK_EXCL); xfs_iunlock(tip, XFS_ILOCK_EXCL); out_unlock: filemap_invalidate_unlock_two(VFS_I(ip)->i_mapping, VFS_I(tip)->i_mapping); unlock_two_nondirectories(VFS_I(ip), VFS_I(tip)); return error; out_trans_cancel: xfs_trans_cancel(tp); goto out_unlock_ilock; }
5 5 2 2 2 1 2 4 1 1 1 1 4 5 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 // SPDX-License-Identifier: GPL-2.0-only /* * Optimized MPEG FS - inode and super operations. * Copyright (C) 2006 Bob Copeland <me@bobcopeland.com> */ #include <linux/module.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/fs.h> #include <linux/vfs.h> #include <linux/cred.h> #include <linux/buffer_head.h> #include <linux/vmalloc.h> #include <linux/writeback.h> #include <linux/seq_file.h> #include <linux/crc-itu-t.h> #include <linux/fs_struct.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include "omfs.h" MODULE_AUTHOR("Bob Copeland <me@bobcopeland.com>"); MODULE_DESCRIPTION("OMFS (ReplayTV/Karma) Filesystem for Linux"); MODULE_LICENSE("GPL"); struct buffer_head *omfs_bread(struct super_block *sb, sector_t block) { struct omfs_sb_info *sbi = OMFS_SB(sb); if (block >= sbi->s_num_blocks) return NULL; return sb_bread(sb, clus_to_blk(sbi, block)); } struct inode *omfs_new_inode(struct inode *dir, umode_t mode) { struct inode *inode; u64 new_block; int err; int len; struct omfs_sb_info *sbi = OMFS_SB(dir->i_sb); inode = new_inode(dir->i_sb); if (!inode) return ERR_PTR(-ENOMEM); err = omfs_allocate_range(dir->i_sb, sbi->s_mirrors, sbi->s_mirrors, &new_block, &len); if (err) goto fail; inode->i_ino = new_block; inode_init_owner(&nop_mnt_idmap, inode, NULL, mode); inode->i_mapping->a_ops = &omfs_aops; simple_inode_init_ts(inode); switch (mode & S_IFMT) { case S_IFDIR: inode->i_op = &omfs_dir_inops; inode->i_fop = &omfs_dir_operations; inode->i_size = sbi->s_sys_blocksize; inc_nlink(inode); break; case S_IFREG: inode->i_op = &omfs_file_inops; inode->i_fop = &omfs_file_operations; inode->i_size = 0; break; } insert_inode_hash(inode); mark_inode_dirty(inode); return inode; fail: make_bad_inode(inode); iput(inode); return ERR_PTR(err); } /* * Update the header checksums for a dirty inode based on its contents. * Caller is expected to hold the buffer head underlying oi and mark it * dirty. */ static void omfs_update_checksums(struct omfs_inode *oi) { int xor, i, ofs = 0, count; u16 crc = 0; unsigned char *ptr = (unsigned char *) oi; count = be32_to_cpu(oi->i_head.h_body_size); ofs = sizeof(struct omfs_header); crc = crc_itu_t(crc, ptr + ofs, count); oi->i_head.h_crc = cpu_to_be16(crc); xor = ptr[0]; for (i = 1; i < OMFS_XOR_COUNT; i++) xor ^= ptr[i]; oi->i_head.h_check_xor = xor; } static int __omfs_write_inode(struct inode *inode, int wait) { struct omfs_inode *oi; struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb); struct buffer_head *bh, *bh2; u64 ctime; int i; int ret = -EIO; int sync_failed = 0; /* get current inode since we may have written sibling ptrs etc. */ bh = omfs_bread(inode->i_sb, inode->i_ino); if (!bh) goto out; oi = (struct omfs_inode *) bh->b_data; oi->i_head.h_self = cpu_to_be64(inode->i_ino); if (S_ISDIR(inode->i_mode)) oi->i_type = OMFS_DIR; else if (S_ISREG(inode->i_mode)) oi->i_type = OMFS_FILE; else { printk(KERN_WARNING "omfs: unknown file type: %d\n", inode->i_mode); goto out_brelse; } oi->i_head.h_body_size = cpu_to_be32(sbi->s_sys_blocksize - sizeof(struct omfs_header)); oi->i_head.h_version = 1; oi->i_head.h_type = OMFS_INODE_NORMAL; oi->i_head.h_magic = OMFS_IMAGIC; oi->i_size = cpu_to_be64(inode->i_size); ctime = inode_get_ctime_sec(inode) * 1000LL + ((inode_get_ctime_nsec(inode) + 999)/1000); oi->i_ctime = cpu_to_be64(ctime); omfs_update_checksums(oi); mark_buffer_dirty(bh); if (wait) { sync_dirty_buffer(bh); if (buffer_req(bh) && !buffer_uptodate(bh)) sync_failed = 1; } /* if mirroring writes, copy to next fsblock */ for (i = 1; i < sbi->s_mirrors; i++) { bh2 = omfs_bread(inode->i_sb, inode->i_ino + i); if (!bh2) goto out_brelse; memcpy(bh2->b_data, bh->b_data, bh->b_size); mark_buffer_dirty(bh2); if (wait) { sync_dirty_buffer(bh2); if (buffer_req(bh2) && !buffer_uptodate(bh2)) sync_failed = 1; } brelse(bh2); } ret = (sync_failed) ? -EIO : 0; out_brelse: brelse(bh); out: return ret; } static int omfs_write_inode(struct inode *inode, struct writeback_control *wbc) { return __omfs_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); } int omfs_sync_inode(struct inode *inode) { return __omfs_write_inode(inode, 1); } /* * called when an entry is deleted, need to clear the bits in the * bitmaps. */ static void omfs_evict_inode(struct inode *inode) { truncate_inode_pages_final(&inode->i_data); clear_inode(inode); if (inode->i_nlink) return; if (S_ISREG(inode->i_mode)) { inode->i_size = 0; omfs_shrink_inode(inode); } omfs_clear_range(inode->i_sb, inode->i_ino, 2); } struct inode *omfs_iget(struct super_block *sb, ino_t ino) { struct omfs_sb_info *sbi = OMFS_SB(sb); struct omfs_inode *oi; struct buffer_head *bh; u64 ctime; unsigned long nsecs; struct inode *inode; inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); if (!(inode_state_read_once(inode) & I_NEW)) return inode; bh = omfs_bread(inode->i_sb, ino); if (!bh) goto iget_failed; oi = (struct omfs_inode *)bh->b_data; /* check self */ if (ino != be64_to_cpu(oi->i_head.h_self)) goto fail_bh; inode->i_uid = sbi->s_uid; inode->i_gid = sbi->s_gid; ctime = be64_to_cpu(oi->i_ctime); nsecs = do_div(ctime, 1000) * 1000L; inode_set_atime(inode, ctime, nsecs); inode_set_mtime(inode, ctime, nsecs); inode_set_ctime(inode, ctime, nsecs); inode->i_mapping->a_ops = &omfs_aops; switch (oi->i_type) { case OMFS_DIR: inode->i_mode = S_IFDIR | (S_IRWXUGO & ~sbi->s_dmask); inode->i_op = &omfs_dir_inops; inode->i_fop = &omfs_dir_operations; inode->i_size = sbi->s_sys_blocksize; inc_nlink(inode); break; case OMFS_FILE: inode->i_mode = S_IFREG | (S_IRWXUGO & ~sbi->s_fmask); inode->i_fop = &omfs_file_operations; inode->i_size = be64_to_cpu(oi->i_size); break; } brelse(bh); unlock_new_inode(inode); return inode; fail_bh: brelse(bh); iget_failed: iget_failed(inode); return ERR_PTR(-EIO); } static void omfs_put_super(struct super_block *sb) { struct omfs_sb_info *sbi = OMFS_SB(sb); kfree(sbi->s_imap); kfree(sbi); sb->s_fs_info = NULL; } static int omfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *s = dentry->d_sb; struct omfs_sb_info *sbi = OMFS_SB(s); u64 id = huge_encode_dev(s->s_bdev->bd_dev); buf->f_type = OMFS_MAGIC; buf->f_bsize = sbi->s_blocksize; buf->f_blocks = sbi->s_num_blocks; buf->f_files = sbi->s_num_blocks; buf->f_namelen = OMFS_NAMELEN; buf->f_fsid = u64_to_fsid(id); buf->f_bfree = buf->f_bavail = buf->f_ffree = omfs_count_free(s); return 0; } /* * Display the mount options in /proc/mounts. */ static int omfs_show_options(struct seq_file *m, struct dentry *root) { struct omfs_sb_info *sbi = OMFS_SB(root->d_sb); umode_t cur_umask = current_umask(); if (!uid_eq(sbi->s_uid, current_uid())) seq_printf(m, ",uid=%u", from_kuid_munged(&init_user_ns, sbi->s_uid)); if (!gid_eq(sbi->s_gid, current_gid())) seq_printf(m, ",gid=%u", from_kgid_munged(&init_user_ns, sbi->s_gid)); if (sbi->s_dmask == sbi->s_fmask) { if (sbi->s_fmask != cur_umask) seq_printf(m, ",umask=%o", sbi->s_fmask); } else { if (sbi->s_dmask != cur_umask) seq_printf(m, ",dmask=%o", sbi->s_dmask); if (sbi->s_fmask != cur_umask) seq_printf(m, ",fmask=%o", sbi->s_fmask); } return 0; } static const struct super_operations omfs_sops = { .write_inode = omfs_write_inode, .evict_inode = omfs_evict_inode, .put_super = omfs_put_super, .statfs = omfs_statfs, .show_options = omfs_show_options, }; /* * For Rio Karma, there is an on-disk free bitmap whose location is * stored in the root block. For ReplayTV, there is no such free bitmap * so we have to walk the tree. Both inodes and file data are allocated * from the same map. This array can be big (300k) so we allocate * in units of the blocksize. */ static int omfs_get_imap(struct super_block *sb) { unsigned int bitmap_size, array_size; int count; struct omfs_sb_info *sbi = OMFS_SB(sb); struct buffer_head *bh; unsigned long **ptr; sector_t block; bitmap_size = DIV_ROUND_UP(sbi->s_num_blocks, 8); array_size = DIV_ROUND_UP(bitmap_size, sb->s_blocksize); if (sbi->s_bitmap_ino == ~0ULL) goto out; sbi->s_imap_size = array_size; sbi->s_imap = kcalloc(array_size, sizeof(unsigned long *), GFP_KERNEL); if (!sbi->s_imap) goto nomem; block = clus_to_blk(sbi, sbi->s_bitmap_ino); if (block >= sbi->s_num_blocks) goto nomem; ptr = sbi->s_imap; for (count = bitmap_size; count > 0; count -= sb->s_blocksize) { bh = sb_bread(sb, block++); if (!bh) goto nomem_free; *ptr = kmemdup(bh->b_data, sb->s_blocksize, GFP_KERNEL); if (!*ptr) { brelse(bh); goto nomem_free; } if (count < sb->s_blocksize) memset((void *)*ptr + count, 0xff, sb->s_blocksize - count); brelse(bh); ptr++; } out: return 0; nomem_free: for (count = 0; count < array_size; count++) kfree(sbi->s_imap[count]); kfree(sbi->s_imap); nomem: sbi->s_imap = NULL; sbi->s_imap_size = 0; return -ENOMEM; } struct omfs_mount_options { kuid_t s_uid; kgid_t s_gid; int s_dmask; int s_fmask; }; enum { Opt_uid, Opt_gid, Opt_umask, Opt_dmask, Opt_fmask, }; static const struct fs_parameter_spec omfs_param_spec[] = { fsparam_uid ("uid", Opt_uid), fsparam_gid ("gid", Opt_gid), fsparam_u32oct ("umask", Opt_umask), fsparam_u32oct ("dmask", Opt_dmask), fsparam_u32oct ("fmask", Opt_fmask), {} }; static int omfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct omfs_mount_options *opts = fc->fs_private; int token; struct fs_parse_result result; /* All options are ignored on remount */ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) return 0; token = fs_parse(fc, omfs_param_spec, param, &result); if (token < 0) return token; switch (token) { case Opt_uid: opts->s_uid = result.uid; break; case Opt_gid: opts->s_gid = result.gid; break; case Opt_umask: opts->s_fmask = opts->s_dmask = result.uint_32; break; case Opt_dmask: opts->s_dmask = result.uint_32; break; case Opt_fmask: opts->s_fmask = result.uint_32; break; default: return -EINVAL; } return 0; } static void omfs_set_options(struct omfs_sb_info *sbi, struct omfs_mount_options *opts) { sbi->s_uid = opts->s_uid; sbi->s_gid = opts->s_gid; sbi->s_dmask = opts->s_dmask; sbi->s_fmask = opts->s_fmask; } static int omfs_fill_super(struct super_block *sb, struct fs_context *fc) { struct buffer_head *bh, *bh2; struct omfs_super_block *omfs_sb; struct omfs_root_block *omfs_rb; struct omfs_sb_info *sbi; struct inode *root; struct omfs_mount_options *parsed_opts = fc->fs_private; int ret = -EINVAL; int silent = fc->sb_flags & SB_SILENT; sbi = kzalloc_obj(struct omfs_sb_info); if (!sbi) return -ENOMEM; sb->s_fs_info = sbi; omfs_set_options(sbi, parsed_opts); sb->s_maxbytes = 0xffffffff; sb->s_time_gran = NSEC_PER_MSEC; sb->s_time_min = 0; sb->s_time_max = U64_MAX / MSEC_PER_SEC; sb_set_blocksize(sb, 0x200); bh = sb_bread(sb, 0); if (!bh) goto end; omfs_sb = (struct omfs_super_block *)bh->b_data; if (omfs_sb->s_magic != cpu_to_be32(OMFS_MAGIC)) { if (!silent) printk(KERN_ERR "omfs: Invalid superblock (%x)\n", omfs_sb->s_magic); goto out_brelse_bh; } sb->s_magic = OMFS_MAGIC; sbi->s_num_blocks = be64_to_cpu(omfs_sb->s_num_blocks); sbi->s_blocksize = be32_to_cpu(omfs_sb->s_blocksize); sbi->s_mirrors = be32_to_cpu(omfs_sb->s_mirrors); sbi->s_root_ino = be64_to_cpu(omfs_sb->s_root_block); sbi->s_sys_blocksize = be32_to_cpu(omfs_sb->s_sys_blocksize); mutex_init(&sbi->s_bitmap_lock); if (sbi->s_num_blocks > OMFS_MAX_BLOCKS) { printk(KERN_ERR "omfs: sysblock number (%llx) is out of range\n", (unsigned long long)sbi->s_num_blocks); goto out_brelse_bh; } if (sbi->s_sys_blocksize > PAGE_SIZE) { printk(KERN_ERR "omfs: sysblock size (%d) is out of range\n", sbi->s_sys_blocksize); goto out_brelse_bh; } if (sbi->s_sys_blocksize < OMFS_DIR_START) { printk(KERN_ERR "omfs: sysblock size (%d) is too small\n", sbi->s_sys_blocksize); goto out_brelse_bh; } if (sbi->s_blocksize < sbi->s_sys_blocksize || sbi->s_blocksize > OMFS_MAX_BLOCK_SIZE) { printk(KERN_ERR "omfs: block size (%d) is out of range\n", sbi->s_blocksize); goto out_brelse_bh; } /* * Use sys_blocksize as the fs block since it is smaller than a * page while the fs blocksize can be larger. */ sb_set_blocksize(sb, sbi->s_sys_blocksize); /* * ...and the difference goes into a shift. sys_blocksize is always * a power of two factor of blocksize. */ sbi->s_block_shift = get_bitmask_order(sbi->s_blocksize) - get_bitmask_order(sbi->s_sys_blocksize); bh2 = omfs_bread(sb, be64_to_cpu(omfs_sb->s_root_block)); if (!bh2) goto out_brelse_bh; omfs_rb = (struct omfs_root_block *)bh2->b_data; sbi->s_bitmap_ino = be64_to_cpu(omfs_rb->r_bitmap); sbi->s_clustersize = be32_to_cpu(omfs_rb->r_clustersize); if (sbi->s_num_blocks != be64_to_cpu(omfs_rb->r_num_blocks)) { printk(KERN_ERR "omfs: block count discrepancy between " "super and root blocks (%llx, %llx)\n", (unsigned long long)sbi->s_num_blocks, (unsigned long long)be64_to_cpu(omfs_rb->r_num_blocks)); goto out_brelse_bh2; } if (sbi->s_bitmap_ino != ~0ULL && sbi->s_bitmap_ino > sbi->s_num_blocks) { printk(KERN_ERR "omfs: free space bitmap location is corrupt " "(%llx, total blocks %llx)\n", (unsigned long long) sbi->s_bitmap_ino, (unsigned long long) sbi->s_num_blocks); goto out_brelse_bh2; } if (sbi->s_clustersize < 1 || sbi->s_clustersize > OMFS_MAX_CLUSTER_SIZE) { printk(KERN_ERR "omfs: cluster size out of range (%d)", sbi->s_clustersize); goto out_brelse_bh2; } ret = omfs_get_imap(sb); if (ret) goto out_brelse_bh2; sb->s_op = &omfs_sops; root = omfs_iget(sb, be64_to_cpu(omfs_rb->r_root_dir)); if (IS_ERR(root)) { ret = PTR_ERR(root); goto out_brelse_bh2; } sb->s_root = d_make_root(root); if (!sb->s_root) { ret = -ENOMEM; goto out_brelse_bh2; } printk(KERN_DEBUG "omfs: Mounted volume %s\n", omfs_rb->r_name); ret = 0; out_brelse_bh2: brelse(bh2); out_brelse_bh: brelse(bh); end: if (ret) kfree(sbi); return ret; } static int omfs_get_tree(struct fs_context *fc) { return get_tree_bdev(fc, omfs_fill_super); } static void omfs_free_fc(struct fs_context *fc); static const struct fs_context_operations omfs_context_ops = { .parse_param = omfs_parse_param, .get_tree = omfs_get_tree, .free = omfs_free_fc, }; static int omfs_init_fs_context(struct fs_context *fc) { struct omfs_mount_options *opts; opts = kzalloc_obj(*opts); if (!opts) return -ENOMEM; /* Set mount options defaults */ opts->s_uid = current_uid(); opts->s_gid = current_gid(); opts->s_dmask = opts->s_fmask = current_umask(); fc->fs_private = opts; fc->ops = &omfs_context_ops; return 0; } static void omfs_free_fc(struct fs_context *fc) { kfree(fc->fs_private); } static struct file_system_type omfs_fs_type = { .owner = THIS_MODULE, .name = "omfs", .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, .init_fs_context = omfs_init_fs_context, .parameters = omfs_param_spec, }; MODULE_ALIAS_FS("omfs"); static int __init init_omfs_fs(void) { return register_filesystem(&omfs_fs_type); } static void __exit exit_omfs_fs(void) { unregister_filesystem(&omfs_fs_type); } module_init(init_omfs_fs); module_exit(exit_omfs_fs);
58 1 15 16 13 57 2 16 40 12 12 14 11 4 2 3 9 14 23 17 23 17 14 4 1 19 22 60 22 74 74 47 31 75 1 3 17 59 7 7 1 7 7 2 7 19 19 2 15 8 31 3 29 3 3 14 38 37 37 17 1 7 15 1 16 34 1 2 2 19 28 47 2 13 38 3 1 15 19 2 32 27 7 4 2 2 100 5 97 48 57 5 5 3 3 3 35 3 107 8 139 119 136 107 1 131 4 19 115 131 12 12 4 4 4 5 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/file.c * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) * * from * * linux/fs/minix/file.c * * Copyright (C) 1991, 1992 Linus Torvalds * * ext4 fs regular file handling primitives * * 64-bit file support on 64-bit platforms by Jakub Jelinek * (jj@sunsite.ms.mff.cuni.cz) */ #include <linux/time.h> #include <linux/fs.h> #include <linux/iomap.h> #include <linux/mount.h> #include <linux/path.h> #include <linux/dax.h> #include <linux/filelock.h> #include <linux/quotaops.h> #include <linux/uio.h> #include <linux/mman.h> #include <linux/backing-dev.h> #include "ext4.h" #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" #include "truncate.h" /* * Returns %true if the given DIO request should be attempted with DIO, or * %false if it should fall back to buffered I/O. * * DIO isn't well specified; when it's unsupported (either due to the request * being misaligned, or due to the file not supporting DIO at all), filesystems * either fall back to buffered I/O or return EINVAL. For files that don't use * any special features like encryption or verity, ext4 has traditionally * returned EINVAL for misaligned DIO. iomap_dio_rw() uses this convention too. * In this case, we should attempt the DIO, *not* fall back to buffered I/O. * * In contrast, in cases where DIO is unsupported due to ext4 features, ext4 * traditionally falls back to buffered I/O. * * This function implements the traditional ext4 behavior in all these cases. */ static bool ext4_should_use_dio(struct kiocb *iocb, struct iov_iter *iter) { struct inode *inode = file_inode(iocb->ki_filp); u32 dio_align = ext4_dio_alignment(inode); if (dio_align == 0) return false; if (dio_align == 1) return true; return IS_ALIGNED(iocb->ki_pos | iov_iter_alignment(iter), dio_align); } static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) { ssize_t ret; struct inode *inode = file_inode(iocb->ki_filp); if (iocb->ki_flags & IOCB_NOWAIT) { if (!inode_trylock_shared(inode)) return -EAGAIN; } else { inode_lock_shared(inode); } if (!ext4_should_use_dio(iocb, to)) { inode_unlock_shared(inode); /* * Fallback to buffered I/O if the operation being performed on * the inode is not supported by direct I/O. The IOCB_DIRECT * flag needs to be cleared here in order to ensure that the * direct I/O path within generic_file_read_iter() is not * taken. */ iocb->ki_flags &= ~IOCB_DIRECT; return generic_file_read_iter(iocb, to); } ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0, NULL, 0); inode_unlock_shared(inode); file_accessed(iocb->ki_filp); return ret; } #ifdef CONFIG_FS_DAX static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct inode *inode = file_inode(iocb->ki_filp); ssize_t ret; if (iocb->ki_flags & IOCB_NOWAIT) { if (!inode_trylock_shared(inode)) return -EAGAIN; } else { inode_lock_shared(inode); } /* * Recheck under inode lock - at this point we are sure it cannot * change anymore */ if (!IS_DAX(inode)) { inode_unlock_shared(inode); /* Fallback to buffered IO in case we cannot support DAX */ return generic_file_read_iter(iocb, to); } ret = dax_iomap_rw(iocb, to, &ext4_iomap_ops); inode_unlock_shared(inode); file_accessed(iocb->ki_filp); return ret; } #endif static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct inode *inode = file_inode(iocb->ki_filp); if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; if (!iov_iter_count(to)) return 0; /* skip atime */ #ifdef CONFIG_FS_DAX if (IS_DAX(inode)) return ext4_dax_read_iter(iocb, to); #endif if (iocb->ki_flags & IOCB_DIRECT) return ext4_dio_read_iter(iocb, to); return generic_file_read_iter(iocb, to); } static ssize_t ext4_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { struct inode *inode = file_inode(in); if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; return filemap_splice_read(in, ppos, pipe, len, flags); } /* * Called when an inode is released. Note that this is different * from ext4_file_open: open gets called at every open, but release * gets called only when /all/ the files are closed. */ static int ext4_release_file(struct inode *inode, struct file *filp) { if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) { ext4_alloc_da_blocks(inode); ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); } /* if we are the last writer on the inode, drop the block reservation */ if ((filp->f_mode & FMODE_WRITE) && (atomic_read(&inode->i_writecount) == 1) && !EXT4_I(inode)->i_reserved_data_blocks) { down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); up_write(&EXT4_I(inode)->i_data_sem); } if (is_dx(inode) && filp->private_data) ext4_htree_free_dir_info(filp->private_data); return 0; } /* * This tests whether the IO in question is block-aligned or not. * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they * are converted to written only after the IO is complete. Until they are * mapped, these blocks appear as holes, so dio_zero_block() will assume that * it needs to zero out portions of the start and/or end block. If 2 AIO * threads are at work on the same unwritten block, they must be synchronized * or one thread will zero the other's data, causing corruption. */ static bool ext4_unaligned_io(struct inode *inode, struct iov_iter *from, loff_t pos) { struct super_block *sb = inode->i_sb; unsigned long blockmask = sb->s_blocksize - 1; if ((pos | iov_iter_alignment(from)) & blockmask) return true; return false; } static bool ext4_extending_io(struct inode *inode, loff_t offset, size_t len) { if (offset + len > i_size_read(inode) || offset + len > EXT4_I(inode)->i_disksize) return true; return false; } /* Is IO overwriting allocated or initialized blocks? */ static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len, bool *unwritten) { struct ext4_map_blocks map; unsigned int blkbits = inode->i_blkbits; int err, blklen; if (pos + len > i_size_read(inode)) return false; map.m_lblk = pos >> blkbits; map.m_len = EXT4_MAX_BLOCKS(len, pos, blkbits); blklen = map.m_len; err = ext4_map_blocks(NULL, inode, &map, 0); if (err != blklen) return false; /* * 'err==len' means that all of the blocks have been preallocated, * regardless of whether they have been initialized or not. We need to * check m_flags to distinguish the unwritten extents. */ *unwritten = !(map.m_flags & EXT4_MAP_MAPPED); return true; } static ssize_t ext4_generic_write_checks(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); ssize_t ret; if (unlikely(IS_IMMUTABLE(inode))) return -EPERM; ret = generic_write_checks(iocb, from); if (ret <= 0) return ret; /* * If we have encountered a bitmap-format file, the size limit * is smaller than s_maxbytes, which is for extent-mapped files. */ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); if (iocb->ki_pos >= sbi->s_bitmap_maxbytes) return -EFBIG; iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos); } return iov_iter_count(from); } static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); loff_t old_size = i_size_read(inode); ssize_t ret, count; count = ext4_generic_write_checks(iocb, from); if (count <= 0) return count; ret = file_modified(iocb->ki_filp); if (ret) return ret; /* * If the position is beyond the EOF, it is necessary to zero out the * partial block that beyond the existing EOF, as it may contains * stale data written through mmap. */ if (iocb->ki_pos > old_size && !ext4_verity_in_progress(inode)) { if (iocb->ki_flags & IOCB_NOWAIT) return -EAGAIN; ret = ext4_block_zero_eof(inode, old_size, iocb->ki_pos); if (ret) return ret; } return count; } static ssize_t ext4_buffered_write_iter(struct kiocb *iocb, struct iov_iter *from) { ssize_t ret; struct inode *inode = file_inode(iocb->ki_filp); if (iocb->ki_flags & IOCB_NOWAIT) return -EOPNOTSUPP; inode_lock(inode); ret = ext4_write_checks(iocb, from); if (ret <= 0) goto out; ret = generic_perform_write(iocb, from); out: inode_unlock(inode); if (unlikely(ret <= 0)) return ret; return generic_write_sync(iocb, ret); } static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset, ssize_t written, ssize_t count) { handle_t *handle; lockdep_assert_held_write(&inode->i_rwsem); handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) return PTR_ERR(handle); if (ext4_update_inode_size(inode, offset + written)) { int ret = ext4_mark_inode_dirty(handle, inode); if (unlikely(ret)) { ext4_journal_stop(handle); return ret; } } if ((written == count) && inode->i_nlink) ext4_orphan_del(handle, inode); ext4_journal_stop(handle); return written; } /* * Clean up the inode after DIO or DAX extending write has completed and the * inode size has been updated using ext4_handle_inode_extension(). */ static void ext4_inode_extension_cleanup(struct inode *inode, bool need_trunc) { lockdep_assert_held_write(&inode->i_rwsem); if (need_trunc) { ext4_truncate_failed_write(inode); /* * If the truncate operation failed early, then the inode may * still be on the orphan list. In that case, we need to try * remove the inode from the in-memory linked list. */ if (inode->i_nlink) ext4_orphan_del(NULL, inode); return; } /* * If i_disksize got extended either due to writeback of delalloc * blocks or extending truncate while the DIO was running we could fail * to cleanup the orphan list in ext4_handle_inode_extension(). Do it * now. */ if (ext4_inode_orphan_tracked(inode) && inode->i_nlink) { handle_t *handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) { /* * The write has successfully completed. Not much to * do with the error here so just cleanup the orphan * list and hope for the best. */ ext4_orphan_del(NULL, inode); return; } ext4_orphan_del(handle, inode); ext4_journal_stop(handle); } } static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size, int error, unsigned int flags) { loff_t pos = iocb->ki_pos; struct inode *inode = file_inode(iocb->ki_filp); if (!error && size && (flags & IOMAP_DIO_UNWRITTEN) && (iocb->ki_flags & IOCB_ATOMIC)) error = ext4_convert_unwritten_extents_atomic(NULL, inode, pos, size); else if (!error && size && flags & IOMAP_DIO_UNWRITTEN) error = ext4_convert_unwritten_extents(NULL, inode, pos, size); if (error) return error; /* * Note that EXT4_I(inode)->i_disksize can get extended up to * inode->i_size while the I/O was running due to writeback of delalloc * blocks. But the code in ext4_iomap_alloc() is careful to use * zeroed/unwritten extents if this is possible; thus we won't leave * uninitialized blocks in a file even if we didn't succeed in writing * as much as we intended. Also we can race with truncate or write * expanding the file so we have to be a bit careful here. */ if (pos + size <= READ_ONCE(EXT4_I(inode)->i_disksize) && pos + size <= i_size_read(inode)) return 0; error = ext4_handle_inode_extension(inode, pos, size, size); return error < 0 ? error : 0; } static const struct iomap_dio_ops ext4_dio_write_ops = { .end_io = ext4_dio_write_end_io, }; /* * The intention here is to start with shared lock acquired then see if any * condition requires an exclusive inode lock. If yes, then we restart the * whole operation by releasing the shared lock and acquiring exclusive lock. * * - For unaligned_io we never take shared lock as it may cause data corruption * when two unaligned IO tries to modify the same block e.g. while zeroing. * * - For extending writes case we don't take the shared lock, since it requires * updating inode i_disksize and/or orphan handling with exclusive lock. * * - shared locking will only be true mostly with overwrites, including * initialized blocks and unwritten blocks. * * - Otherwise we will switch to exclusive i_rwsem lock. */ static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from, bool *ilock_shared, bool *extend, int *dio_flags) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); loff_t offset; size_t count; ssize_t ret; bool overwrite, unaligned_io, unwritten; restart: ret = ext4_generic_write_checks(iocb, from); if (ret <= 0) goto out; offset = iocb->ki_pos; count = ret; unaligned_io = ext4_unaligned_io(inode, from, offset); *extend = ext4_extending_io(inode, offset, count); overwrite = ext4_overwrite_io(inode, offset, count, &unwritten); /* * Determine whether we need to upgrade to an exclusive lock. This is * required to change security info in file_modified(), for extending * I/O, any form of non-overwrite I/O, and unaligned I/O to unwritten * extents (as partial block zeroing may be required). * * Note that unaligned writes are allowed under shared lock so long as * they are pure overwrites. Otherwise, concurrent unaligned writes risk * data corruption due to partial block zeroing in the dio layer, and so * the I/O must occur exclusively. */ if (*ilock_shared && ((!IS_NOSEC(inode) || *extend || !overwrite || (unaligned_io && unwritten)))) { if (iocb->ki_flags & IOCB_NOWAIT) { ret = -EAGAIN; goto out; } inode_unlock_shared(inode); *ilock_shared = false; inode_lock(inode); goto restart; } /* * Now that locking is settled, determine dio flags and exclusivity * requirements. We don't use DIO_OVERWRITE_ONLY because we enforce * behavior already. The inode lock is already held exclusive if the * write is non-overwrite or extending, so drain all outstanding dio and * set the force wait dio flag. */ if (!*ilock_shared && (unaligned_io || *extend)) { if (iocb->ki_flags & IOCB_NOWAIT) { ret = -EAGAIN; goto out; } if (unaligned_io && (!overwrite || unwritten)) inode_dio_wait(inode); *dio_flags = IOMAP_DIO_FORCE_WAIT; } ret = file_modified(file); if (ret < 0) goto out; return count; out: if (*ilock_shared) inode_unlock_shared(inode); else inode_unlock(inode); return ret; } static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) { ssize_t ret; handle_t *handle; struct inode *inode = file_inode(iocb->ki_filp); loff_t offset = iocb->ki_pos; size_t count = iov_iter_count(from); bool extend = false; bool ilock_shared = true; int dio_flags = 0; /* * Quick check here without any i_rwsem lock to see if it is extending * IO. A more reliable check is done in ext4_dio_write_checks() with * proper locking in place. */ if (offset + count > i_size_read(inode)) ilock_shared = false; if (iocb->ki_flags & IOCB_NOWAIT) { if (ilock_shared) { if (!inode_trylock_shared(inode)) return -EAGAIN; } else { if (!inode_trylock(inode)) return -EAGAIN; } } else { if (ilock_shared) inode_lock_shared(inode); else inode_lock(inode); } /* Fallback to buffered I/O if the inode does not support direct I/O. */ if (!ext4_should_use_dio(iocb, from)) { if (ilock_shared) inode_unlock_shared(inode); else inode_unlock(inode); return ext4_buffered_write_iter(iocb, from); } /* * Prevent inline data from being created since we are going to allocate * blocks for DIO. We know the inode does not currently have inline data * because ext4_should_use_dio() checked for it, but we have to clear * the state flag before the write checks because a lock cycle could * introduce races with other writers. */ ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend, &dio_flags); if (ret <= 0) return ret; offset = iocb->ki_pos; count = ret; if (extend) { handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out; } ret = ext4_orphan_add(handle, inode); ext4_journal_stop(handle); if (ret) goto out; } ret = iomap_dio_rw(iocb, from, &ext4_iomap_ops, &ext4_dio_write_ops, dio_flags, NULL, 0); if (ret == -ENOTBLK) ret = 0; if (extend) { /* * We always perform extending DIO write synchronously so by * now the IO is completed and ext4_handle_inode_extension() * was called. Cleanup the inode in case of error or race with * writeback of delalloc blocks. */ WARN_ON_ONCE(ret == -EIOCBQUEUED); ext4_inode_extension_cleanup(inode, ret < 0); } out: if (ilock_shared) inode_unlock_shared(inode); else inode_unlock(inode); if (ret >= 0 && iov_iter_count(from)) { ssize_t err; loff_t endbyte; /* * There is no support for atomic writes on buffered-io yet, * we should never fallback to buffered-io for DIO atomic * writes. */ WARN_ON_ONCE(iocb->ki_flags & IOCB_ATOMIC); offset = iocb->ki_pos; err = ext4_buffered_write_iter(iocb, from); if (err < 0) return err; /* * We need to ensure that the pages within the page cache for * the range covered by this I/O are written to disk and * invalidated. This is in attempt to preserve the expected * direct I/O semantics in the case we fallback to buffered I/O * to complete off the I/O request. */ ret += err; endbyte = offset + err - 1; err = filemap_write_and_wait_range(iocb->ki_filp->f_mapping, offset, endbyte); if (!err) invalidate_mapping_pages(iocb->ki_filp->f_mapping, offset >> PAGE_SHIFT, endbyte >> PAGE_SHIFT); } return ret; } #ifdef CONFIG_FS_DAX static ssize_t ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) { ssize_t ret; size_t count; loff_t offset; handle_t *handle; bool extend = false; struct inode *inode = file_inode(iocb->ki_filp); if (iocb->ki_flags & IOCB_NOWAIT) { if (!inode_trylock(inode)) return -EAGAIN; } else { inode_lock(inode); } ret = ext4_write_checks(iocb, from); if (ret <= 0) goto out; offset = iocb->ki_pos; count = iov_iter_count(from); if (offset + count > EXT4_I(inode)->i_disksize) { handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out; } ret = ext4_orphan_add(handle, inode); if (ret) { ext4_journal_stop(handle); goto out; } extend = true; ext4_journal_stop(handle); } ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops); if (extend) { ret = ext4_handle_inode_extension(inode, offset, ret, count); ext4_inode_extension_cleanup(inode, ret < (ssize_t)count); } out: inode_unlock(inode); if (ret > 0) ret = generic_write_sync(iocb, ret); return ret; } #endif static ssize_t ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { int ret; struct inode *inode = file_inode(iocb->ki_filp); ret = ext4_emergency_state(inode->i_sb); if (unlikely(ret)) return ret; #ifdef CONFIG_FS_DAX if (IS_DAX(inode)) return ext4_dax_write_iter(iocb, from); #endif if (iocb->ki_flags & IOCB_ATOMIC) { size_t len = iov_iter_count(from); if (len < EXT4_SB(inode->i_sb)->s_awu_min || len > EXT4_SB(inode->i_sb)->s_awu_max) return -EINVAL; ret = generic_atomic_write_valid(iocb, from); if (ret) return ret; } if (iocb->ki_flags & IOCB_DIRECT) return ext4_dio_write_iter(iocb, from); else return ext4_buffered_write_iter(iocb, from); } #ifdef CONFIG_FS_DAX static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf, unsigned int order) { int error = 0; vm_fault_t result; int retries = 0; handle_t *handle = NULL; struct inode *inode = file_inode(vmf->vma->vm_file); struct super_block *sb = inode->i_sb; /* * We have to distinguish real writes from writes which will result in a * COW page; COW writes should *not* poke the journal (the file will not * be changed). Doing so would cause unintended failures when mounted * read-only. * * We check for VM_SHARED rather than vmf->cow_page since the latter is * unset for order != 0 (i.e. only in do_cow_fault); for * other sizes, dax_iomap_fault will handle splitting / fallback so that * we eventually come back with a COW page. */ bool write = (vmf->flags & FAULT_FLAG_WRITE) && (vmf->vma->vm_flags & VM_SHARED); struct address_space *mapping = vmf->vma->vm_file->f_mapping; unsigned long pfn; if (write) { sb_start_pagefault(sb); file_update_time(vmf->vma->vm_file); filemap_invalidate_lock_shared(mapping); retry: handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE, EXT4_DATA_TRANS_BLOCKS(sb)); if (IS_ERR(handle)) { filemap_invalidate_unlock_shared(mapping); sb_end_pagefault(sb); return VM_FAULT_SIGBUS; } } else { filemap_invalidate_lock_shared(mapping); } result = dax_iomap_fault(vmf, order, &pfn, &error, &ext4_iomap_ops); if (write) { ext4_journal_stop(handle); if ((result & VM_FAULT_ERROR) && error == -ENOSPC && ext4_should_retry_alloc(sb, &retries)) goto retry; /* Handling synchronous page fault? */ if (result & VM_FAULT_NEEDDSYNC) result = dax_finish_sync_fault(vmf, order, pfn); filemap_invalidate_unlock_shared(mapping); sb_end_pagefault(sb); } else { filemap_invalidate_unlock_shared(mapping); } return result; } static vm_fault_t ext4_dax_fault(struct vm_fault *vmf) { return ext4_dax_huge_fault(vmf, 0); } static const struct vm_operations_struct ext4_dax_vm_ops = { .fault = ext4_dax_fault, .huge_fault = ext4_dax_huge_fault, .page_mkwrite = ext4_dax_fault, .pfn_mkwrite = ext4_dax_fault, }; #else #define ext4_dax_vm_ops ext4_file_vm_ops #endif static const struct vm_operations_struct ext4_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = ext4_page_mkwrite, }; static int ext4_file_mmap_prepare(struct vm_area_desc *desc) { int ret; struct file *file = desc->file; struct inode *inode = file->f_mapping->host; struct dax_device *dax_dev = EXT4_SB(inode->i_sb)->s_daxdev; if (file->f_mode & FMODE_WRITE) ret = ext4_emergency_state(inode->i_sb); else ret = ext4_forced_shutdown(inode->i_sb) ? -EIO : 0; if (unlikely(ret)) return ret; /* * We don't support synchronous mappings for non-DAX files and * for DAX files if underneath dax_device is not synchronous. */ if (!daxdev_mapping_supported(desc, file_inode(file), dax_dev)) return -EOPNOTSUPP; file_accessed(file); if (IS_DAX(file_inode(file))) { desc->vm_ops = &ext4_dax_vm_ops; vma_desc_set_flags(desc, VMA_HUGEPAGE_BIT); } else { desc->vm_ops = &ext4_file_vm_ops; } return 0; } static int ext4_sample_last_mounted(struct super_block *sb, struct vfsmount *mnt) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct path path; char buf[64], *cp; handle_t *handle; int err; if (likely(ext4_test_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED))) return 0; if (ext4_emergency_state(sb) || sb_rdonly(sb) || !sb_start_intwrite_trylock(sb)) return 0; ext4_set_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED); /* * Sample where the filesystem has been mounted and * store it in the superblock for sysadmin convenience * when trying to sort through large numbers of block * devices or filesystem images. */ path.mnt = mnt; path.dentry = mnt->mnt_root; cp = d_path(&path, buf, sizeof(buf)); err = 0; if (IS_ERR(cp)) goto out; handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1); err = PTR_ERR(handle); if (IS_ERR(handle)) goto out; BUFFER_TRACE(sbi->s_sbh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh, EXT4_JTR_NONE); if (err) goto out_journal; lock_buffer(sbi->s_sbh); strtomem_pad(sbi->s_es->s_last_mounted, cp, 0); ext4_superblock_csum_set(sb); unlock_buffer(sbi->s_sbh); ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); out_journal: ext4_journal_stop(handle); out: sb_end_intwrite(sb); return err; } static int ext4_file_open(struct inode *inode, struct file *filp) { int ret; if (filp->f_mode & FMODE_WRITE) ret = ext4_emergency_state(inode->i_sb); else ret = ext4_forced_shutdown(inode->i_sb) ? -EIO : 0; if (unlikely(ret)) return ret; ret = ext4_sample_last_mounted(inode->i_sb, filp->f_path.mnt); if (ret) return ret; ret = fscrypt_file_open(inode, filp); if (ret) return ret; ret = fsverity_file_open(inode, filp); if (ret) return ret; /* * Set up the jbd2_inode if we are opening the inode for * writing and the journal is present */ if (filp->f_mode & FMODE_WRITE) { ret = ext4_inode_attach_jinode(inode); if (ret < 0) return ret; } if (ext4_inode_can_atomic_write(inode)) filp->f_mode |= FMODE_CAN_ATOMIC_WRITE; filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT; return dquot_file_open(inode, filp); } /* * ext4_llseek() handles both block-mapped and extent-mapped maxbytes values * by calling generic_file_llseek_size() with the appropriate maxbytes * value for each. */ loff_t ext4_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; loff_t maxbytes = ext4_get_maxbytes(inode); switch (whence) { default: return generic_file_llseek_size(file, offset, whence, maxbytes, i_size_read(inode)); case SEEK_HOLE: inode_lock_shared(inode); offset = iomap_seek_hole(inode, offset, &ext4_iomap_report_ops); inode_unlock_shared(inode); break; case SEEK_DATA: inode_lock_shared(inode); offset = iomap_seek_data(inode, offset, &ext4_iomap_report_ops); inode_unlock_shared(inode); break; } if (offset < 0) return offset; return vfs_setpos(file, offset, maxbytes); } const struct file_operations ext4_file_operations = { .llseek = ext4_llseek, .read_iter = ext4_file_read_iter, .write_iter = ext4_file_write_iter, .iopoll = iocb_bio_iopoll, .unlocked_ioctl = ext4_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext4_compat_ioctl, #endif .mmap_prepare = ext4_file_mmap_prepare, .open = ext4_file_open, .release = ext4_release_file, .fsync = ext4_sync_file, .get_unmapped_area = thp_get_unmapped_area, .splice_read = ext4_file_splice_read, .splice_write = iter_file_splice_write, .fallocate = ext4_fallocate, .fop_flags = FOP_MMAP_SYNC | FOP_BUFFER_RASYNC | FOP_DIO_PARALLEL_WRITE | FOP_DONTCACHE, .setlease = generic_setlease, }; const struct inode_operations ext4_file_inode_operations = { .setattr = ext4_setattr, .getattr = ext4_file_getattr, .listxattr = ext4_listxattr, .get_inode_acl = ext4_get_acl, .set_acl = ext4_set_acl, .fiemap = ext4_fiemap, .fileattr_get = ext4_fileattr_get, .fileattr_set = ext4_fileattr_set, };
2 1 2 2 2 2 2 1 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 // SPDX-License-Identifier: GPL-2.0 #include <linux/in.h> #include <linux/inet.h> #include <linux/list.h> #include <linux/module.h> #include <linux/net.h> #include <linux/proc_fs.h> #include <linux/rculist.h> #include <linux/seq_file.h> #include <linux/socket.h> #include <net/inet_sock.h> #include <net/kcm.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/tcp.h> #ifdef CONFIG_PROC_FS static struct kcm_mux *kcm_get_first(struct seq_file *seq) { struct net *net = seq_file_net(seq); struct kcm_net *knet = net_generic(net, kcm_net_id); return list_first_or_null_rcu(&knet->mux_list, struct kcm_mux, kcm_mux_list); } static struct kcm_mux *kcm_get_next(struct kcm_mux *mux) { struct kcm_net *knet = mux->knet; return list_next_or_null_rcu(&knet->mux_list, &mux->kcm_mux_list, struct kcm_mux, kcm_mux_list); } static struct kcm_mux *kcm_get_idx(struct seq_file *seq, loff_t pos) { struct net *net = seq_file_net(seq); struct kcm_net *knet = net_generic(net, kcm_net_id); struct kcm_mux *m; list_for_each_entry_rcu(m, &knet->mux_list, kcm_mux_list) { if (!pos) return m; --pos; } return NULL; } static void *kcm_seq_next(struct seq_file *seq, void *v, loff_t *pos) { void *p; if (v == SEQ_START_TOKEN) p = kcm_get_first(seq); else p = kcm_get_next(v); ++*pos; return p; } static void *kcm_seq_start(struct seq_file *seq, loff_t *pos) __acquires(rcu) { rcu_read_lock(); if (!*pos) return SEQ_START_TOKEN; else return kcm_get_idx(seq, *pos - 1); } static void kcm_seq_stop(struct seq_file *seq, void *v) __releases(rcu) { rcu_read_unlock(); } struct kcm_proc_mux_state { struct seq_net_private p; int idx; }; static void kcm_format_mux_header(struct seq_file *seq) { struct net *net = seq_file_net(seq); struct kcm_net *knet = net_generic(net, kcm_net_id); seq_printf(seq, "*** KCM statistics (%d MUX) ****\n", knet->count); seq_printf(seq, "%-14s %-10s %-16s %-10s %-16s %-8s %-8s %-8s %-8s %s", "Object", "RX-Msgs", "RX-Bytes", "TX-Msgs", "TX-Bytes", "Recv-Q", "Rmem", "Send-Q", "Smem", "Status"); /* XXX: pdsts header stuff here */ seq_puts(seq, "\n"); } static void kcm_format_sock(struct kcm_sock *kcm, struct seq_file *seq, int i, int *len) { seq_printf(seq, " kcm-%-7u %-10llu %-16llu %-10llu %-16llu %-8d %-8d %-8d %-8s ", kcm->index, kcm->stats.rx_msgs, kcm->stats.rx_bytes, kcm->stats.tx_msgs, kcm->stats.tx_bytes, kcm->sk.sk_receive_queue.qlen, sk_rmem_alloc_get(&kcm->sk), kcm->sk.sk_write_queue.qlen, "-"); if (kcm->tx_psock) seq_printf(seq, "Psck-%u ", kcm->tx_psock->index); if (kcm->tx_wait) seq_puts(seq, "TxWait "); if (kcm->tx_wait_more) seq_puts(seq, "WMore "); if (kcm->rx_wait) seq_puts(seq, "RxWait "); seq_puts(seq, "\n"); } static void kcm_format_psock(struct kcm_psock *psock, struct seq_file *seq, int i, int *len) { seq_printf(seq, " psock-%-5u %-10llu %-16llu %-10llu %-16llu %-8d %-8d %-8d %-8d ", psock->index, psock->strp.stats.msgs, psock->strp.stats.bytes, psock->stats.tx_msgs, psock->stats.tx_bytes, psock->sk->sk_receive_queue.qlen, atomic_read(&psock->sk->sk_rmem_alloc), psock->sk->sk_write_queue.qlen, refcount_read(&psock->sk->sk_wmem_alloc)); if (psock->done) seq_puts(seq, "Done "); if (psock->tx_stopped) seq_puts(seq, "TxStop "); if (psock->strp.stopped) seq_puts(seq, "RxStop "); if (psock->tx_kcm) seq_printf(seq, "Rsvd-%d ", psock->tx_kcm->index); if (!psock->strp.paused && !psock->ready_rx_msg) { if (psock->sk->sk_receive_queue.qlen) { if (psock->strp.need_bytes) seq_printf(seq, "RxWait=%u ", psock->strp.need_bytes); else seq_printf(seq, "RxWait "); } } else { if (psock->strp.paused) seq_puts(seq, "RxPause "); if (psock->ready_rx_msg) seq_puts(seq, "RdyRx "); } seq_puts(seq, "\n"); } static void kcm_format_mux(struct kcm_mux *mux, loff_t idx, struct seq_file *seq) { int i, len; struct kcm_sock *kcm; struct kcm_psock *psock; /* mux information */ seq_printf(seq, "%-6s%-8s %-10llu %-16llu %-10llu %-16llu %-8s %-8s %-8s %-8s ", "mux", "", mux->stats.rx_msgs, mux->stats.rx_bytes, mux->stats.tx_msgs, mux->stats.tx_bytes, "-", "-", "-", "-"); seq_printf(seq, "KCMs: %d, Psocks %d\n", mux->kcm_socks_cnt, mux->psocks_cnt); /* kcm sock information */ i = 0; spin_lock_bh(&mux->lock); list_for_each_entry(kcm, &mux->kcm_socks, kcm_sock_list) { kcm_format_sock(kcm, seq, i, &len); i++; } i = 0; list_for_each_entry(psock, &mux->psocks, psock_list) { kcm_format_psock(psock, seq, i, &len); i++; } spin_unlock_bh(&mux->lock); } static int kcm_seq_show(struct seq_file *seq, void *v) { struct kcm_proc_mux_state *mux_state; mux_state = seq->private; if (v == SEQ_START_TOKEN) { mux_state->idx = 0; kcm_format_mux_header(seq); } else { kcm_format_mux(v, mux_state->idx, seq); mux_state->idx++; } return 0; } static const struct seq_operations kcm_seq_ops = { .show = kcm_seq_show, .start = kcm_seq_start, .next = kcm_seq_next, .stop = kcm_seq_stop, }; static int kcm_stats_seq_show(struct seq_file *seq, void *v) { struct kcm_psock_stats psock_stats; struct kcm_mux_stats mux_stats; struct strp_aggr_stats strp_stats; struct kcm_mux *mux; struct kcm_psock *psock; struct net *net = seq->private; struct kcm_net *knet = net_generic(net, kcm_net_id); memset(&mux_stats, 0, sizeof(mux_stats)); memset(&psock_stats, 0, sizeof(psock_stats)); memset(&strp_stats, 0, sizeof(strp_stats)); mutex_lock(&knet->mutex); aggregate_mux_stats(&knet->aggregate_mux_stats, &mux_stats); aggregate_psock_stats(&knet->aggregate_psock_stats, &psock_stats); aggregate_strp_stats(&knet->aggregate_strp_stats, &strp_stats); list_for_each_entry(mux, &knet->mux_list, kcm_mux_list) { spin_lock_bh(&mux->lock); aggregate_mux_stats(&mux->stats, &mux_stats); aggregate_psock_stats(&mux->aggregate_psock_stats, &psock_stats); aggregate_strp_stats(&mux->aggregate_strp_stats, &strp_stats); list_for_each_entry(psock, &mux->psocks, psock_list) { aggregate_psock_stats(&psock->stats, &psock_stats); save_strp_stats(&psock->strp, &strp_stats); } spin_unlock_bh(&mux->lock); } mutex_unlock(&knet->mutex); seq_printf(seq, "%-8s %-10s %-16s %-10s %-16s %-10s %-10s %-10s %-10s %-10s\n", "MUX", "RX-Msgs", "RX-Bytes", "TX-Msgs", "TX-Bytes", "TX-Retries", "Attach", "Unattach", "UnattchRsvd", "RX-RdyDrops"); seq_printf(seq, "%-8s %-10llu %-16llu %-10llu %-16llu %-10u %-10u %-10u %-10u %-10u\n", "", mux_stats.rx_msgs, mux_stats.rx_bytes, mux_stats.tx_msgs, mux_stats.tx_bytes, mux_stats.tx_retries, mux_stats.psock_attach, mux_stats.psock_unattach_rsvd, mux_stats.psock_unattach, mux_stats.rx_ready_drops); seq_printf(seq, "%-8s %-10s %-16s %-10s %-16s %-10s %-10s %-10s %-10s %-10s %-10s %-10s %-10s %-10s %-10s %-10s\n", "Psock", "RX-Msgs", "RX-Bytes", "TX-Msgs", "TX-Bytes", "Reserved", "Unreserved", "RX-Aborts", "RX-Intr", "RX-Unrecov", "RX-MemFail", "RX-NeedMor", "RX-BadLen", "RX-TooBig", "RX-Timeout", "TX-Aborts"); seq_printf(seq, "%-8s %-10llu %-16llu %-10llu %-16llu %-10llu %-10llu %-10u %-10u %-10u %-10u %-10u %-10u %-10u %-10u %-10u\n", "", strp_stats.msgs, strp_stats.bytes, psock_stats.tx_msgs, psock_stats.tx_bytes, psock_stats.reserved, psock_stats.unreserved, strp_stats.aborts, strp_stats.interrupted, strp_stats.unrecov_intr, strp_stats.mem_fail, strp_stats.need_more_hdr, strp_stats.bad_hdr_len, strp_stats.msg_too_big, strp_stats.msg_timeouts, psock_stats.tx_aborts); return 0; } static int kcm_proc_init_net(struct net *net) { if (!proc_create_net_single("kcm_stats", 0444, net->proc_net, kcm_stats_seq_show, NULL)) goto out_kcm_stats; if (!proc_create_net("kcm", 0444, net->proc_net, &kcm_seq_ops, sizeof(struct kcm_proc_mux_state))) goto out_kcm; return 0; out_kcm: remove_proc_entry("kcm_stats", net->proc_net); out_kcm_stats: return -ENOMEM; } static void kcm_proc_exit_net(struct net *net) { remove_proc_entry("kcm", net->proc_net); remove_proc_entry("kcm_stats", net->proc_net); } static struct pernet_operations kcm_net_ops = { .init = kcm_proc_init_net, .exit = kcm_proc_exit_net, }; int __init kcm_proc_init(void) { return register_pernet_subsys(&kcm_net_ops); } void __exit kcm_proc_exit(void) { unregister_pernet_subsys(&kcm_net_ops); } #endif /* CONFIG_PROC_FS */
218 1406 3950 1392 1324 1324 7954 7152 3885 8 3885 5335 176 5334 3071 6595 5406 5236 5193 189 28 189 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 /* SPDX-License-Identifier: GPL-2.0 */ /* * Variant of atomic_t specialized for reference counts. * * The interface matches the atomic_t interface (to aid in porting) but only * provides the few functions one should use for reference counting. * * Saturation semantics * ==================== * * refcount_t differs from atomic_t in that the counter saturates at * REFCOUNT_SATURATED and will not move once there. This avoids wrapping the * counter and causing 'spurious' use-after-free issues. In order to avoid the * cost associated with introducing cmpxchg() loops into all of the saturating * operations, we temporarily allow the counter to take on an unchecked value * and then explicitly set it to REFCOUNT_SATURATED on detecting that underflow * or overflow has occurred. Although this is racy when multiple threads * access the refcount concurrently, by placing REFCOUNT_SATURATED roughly * equidistant from 0 and INT_MAX we minimise the scope for error: * * INT_MAX REFCOUNT_SATURATED UINT_MAX * 0 (0x7fff_ffff) (0xc000_0000) (0xffff_ffff) * +--------------------------------+----------------+----------------+ * <---------- bad value! ----------> * * (in a signed view of the world, the "bad value" range corresponds to * a negative counter value). * * As an example, consider a refcount_inc() operation that causes the counter * to overflow: * * int old = atomic_fetch_add_relaxed(r); * // old is INT_MAX, refcount now INT_MIN (0x8000_0000) * if (old < 0) * atomic_set(r, REFCOUNT_SATURATED); * * If another thread also performs a refcount_inc() operation between the two * atomic operations, then the count will continue to edge closer to 0. If it * reaches a value of 1 before /any/ of the threads reset it to the saturated * value, then a concurrent refcount_dec_and_test() may erroneously free the * underlying object. * Linux limits the maximum number of tasks to PID_MAX_LIMIT, which is currently * 0x400000 (and can't easily be raised in the future beyond FUTEX_TID_MASK). * With the current PID limit, if no batched refcounting operations are used and * the attacker can't repeatedly trigger kernel oopses in the middle of refcount * operations, this makes it impossible for a saturated refcount to leave the * saturation range, even if it is possible for multiple uses of the same * refcount to nest in the context of a single task: * * (UINT_MAX+1-REFCOUNT_SATURATED) / PID_MAX_LIMIT = * 0x40000000 / 0x400000 = 0x100 = 256 * * If hundreds of references are added/removed with a single refcounting * operation, it may potentially be possible to leave the saturation range; but * given the precise timing details involved with the round-robin scheduling of * each thread manipulating the refcount and the need to hit the race multiple * times in succession, there doesn't appear to be a practical avenue of attack * even if using refcount_add() operations with larger increments. * * Memory ordering * =============== * * Memory ordering rules are slightly relaxed wrt regular atomic_t functions * and provide only what is strictly required for refcounts. * * The increments are fully relaxed; these will not provide ordering. The * rationale is that whatever is used to obtain the object we're increasing the * reference count on will provide the ordering. For locked data structures, * its the lock acquire, for RCU/lockless data structures its the dependent * load. * * Do note that inc_not_zero() provides a control dependency which will order * future stores against the inc, this ensures we'll never modify the object * if we did not in fact acquire a reference. * * The decrements will provide release order, such that all the prior loads and * stores will be issued before, it also provides a control dependency, which * will order us against the subsequent free(). * * The control dependency is against the load of the cmpxchg (ll/sc) that * succeeded. This means the stores aren't fully ordered, but this is fine * because the 1->0 transition indicates no concurrency. * * Note that the allocator is responsible for ordering things between free() * and alloc(). * * The decrements dec_and_test() and sub_and_test() also provide acquire * ordering on success. * * refcount_{add|inc}_not_zero_acquire() and refcount_set_release() provide * acquire and release ordering for cases when the memory occupied by the * object might be reused to store another object. This is important for the * cases where secondary validation is required to detect such reuse, e.g. * SLAB_TYPESAFE_BY_RCU. The secondary validation checks have to happen after * the refcount is taken, hence acquire order is necessary. Similarly, when the * object is initialized, all stores to its attributes should be visible before * the refcount is set, otherwise a stale attribute value might be used by * another task which succeeds in taking a refcount to the new object. */ #ifndef _LINUX_REFCOUNT_H #define _LINUX_REFCOUNT_H #include <linux/atomic.h> #include <linux/bug.h> #include <linux/compiler.h> #include <linux/limits.h> #include <linux/refcount_types.h> #include <linux/spinlock_types.h> struct mutex; #define REFCOUNT_INIT(n) { .refs = ATOMIC_INIT(n), } #define REFCOUNT_MAX INT_MAX #define REFCOUNT_SATURATED (INT_MIN / 2) enum refcount_saturation_type { REFCOUNT_ADD_NOT_ZERO_OVF, REFCOUNT_ADD_OVF, REFCOUNT_ADD_UAF, REFCOUNT_SUB_UAF, REFCOUNT_DEC_LEAK, }; void refcount_warn_saturate(refcount_t *r, enum refcount_saturation_type t); /** * refcount_set - set a refcount's value * @r: the refcount * @n: value to which the refcount will be set */ static inline void refcount_set(refcount_t *r, int n) { atomic_set(&r->refs, n); } /** * refcount_set_release - set a refcount's value with release ordering * @r: the refcount * @n: value to which the refcount will be set * * This function should be used when memory occupied by the object might be * reused to store another object -- consider SLAB_TYPESAFE_BY_RCU. * * Provides release memory ordering which will order previous memory operations * against this store. This ensures all updates to this object are visible * once the refcount is set and stale values from the object previously * occupying this memory are overwritten with new ones. * * This function should be called only after new object is fully initialized. * After this call the object should be considered visible to other tasks even * if it was not yet added into an object collection normally used to discover * it. This is because other tasks might have discovered the object previously * occupying the same memory and after memory reuse they can succeed in taking * refcount to the new object and start using it. */ static inline void refcount_set_release(refcount_t *r, int n) { atomic_set_release(&r->refs, n); } /** * refcount_read - get a refcount's value * @r: the refcount * * Return: the refcount's value */ static inline unsigned int refcount_read(const refcount_t *r) { return atomic_read(&r->refs); } static inline __must_check bool __refcount_add_not_zero(int i, refcount_t *r, int *oldp) { int old = refcount_read(r); do { if (!old) break; } while (!atomic_try_cmpxchg_relaxed(&r->refs, &old, old + i)); if (oldp) *oldp = old; if (unlikely(old < 0 || old + i < 0)) refcount_warn_saturate(r, REFCOUNT_ADD_NOT_ZERO_OVF); return old; } /** * refcount_add_not_zero - add a value to a refcount unless it is 0 * @i: the value to add to the refcount * @r: the refcount * * Will saturate at REFCOUNT_SATURATED and WARN. * * Provides no memory ordering, it is assumed the caller has guaranteed the * object memory to be stable (RCU, etc.). It does provide a control dependency * and thereby orders future stores. See the comment on top. * * Use of this function is not recommended for the normal reference counting * use case in which references are taken and released one at a time. In these * cases, refcount_inc(), or one of its variants, should instead be used to * increment a reference count. * * Return: false if the passed refcount is 0, true otherwise */ static inline __must_check bool refcount_add_not_zero(int i, refcount_t *r) { return __refcount_add_not_zero(i, r, NULL); } static inline __must_check bool __refcount_add_not_zero_limited_acquire(int i, refcount_t *r, int *oldp, int limit) { int old = refcount_read(r); do { if (!old) break; if (i > limit - old) { if (oldp) *oldp = old; return false; } } while (!atomic_try_cmpxchg_acquire(&r->refs, &old, old + i)); if (oldp) *oldp = old; if (unlikely(old < 0 || old + i < 0)) refcount_warn_saturate(r, REFCOUNT_ADD_NOT_ZERO_OVF); return old; } static inline __must_check bool __refcount_inc_not_zero_limited_acquire(refcount_t *r, int *oldp, int limit) { return __refcount_add_not_zero_limited_acquire(1, r, oldp, limit); } static inline __must_check bool __refcount_add_not_zero_acquire(int i, refcount_t *r, int *oldp) { return __refcount_add_not_zero_limited_acquire(i, r, oldp, INT_MAX); } /** * refcount_add_not_zero_acquire - add a value to a refcount with acquire ordering unless it is 0 * * @i: the value to add to the refcount * @r: the refcount * * Will saturate at REFCOUNT_SATURATED and WARN. * * This function should be used when memory occupied by the object might be * reused to store another object -- consider SLAB_TYPESAFE_BY_RCU. * * Provides acquire memory ordering on success, it is assumed the caller has * guaranteed the object memory to be stable (RCU, etc.). It does provide a * control dependency and thereby orders future stores. See the comment on top. * * Use of this function is not recommended for the normal reference counting * use case in which references are taken and released one at a time. In these * cases, refcount_inc_not_zero_acquire() should instead be used to increment a * reference count. * * Return: false if the passed refcount is 0, true otherwise */ static inline __must_check bool refcount_add_not_zero_acquire(int i, refcount_t *r) { return __refcount_add_not_zero_acquire(i, r, NULL); } static inline void __refcount_add(int i, refcount_t *r, int *oldp) { int old = atomic_fetch_add_relaxed(i, &r->refs); if (oldp) *oldp = old; if (unlikely(!old)) refcount_warn_saturate(r, REFCOUNT_ADD_UAF); else if (unlikely(old < 0 || old + i < 0)) refcount_warn_saturate(r, REFCOUNT_ADD_OVF); } /** * refcount_add - add a value to a refcount * @i: the value to add to the refcount * @r: the refcount * * Similar to atomic_add(), but will saturate at REFCOUNT_SATURATED and WARN. * * Provides no memory ordering, it is assumed the caller has guaranteed the * object memory to be stable (RCU, etc.). It does provide a control dependency * and thereby orders future stores. See the comment on top. * * Use of this function is not recommended for the normal reference counting * use case in which references are taken and released one at a time. In these * cases, refcount_inc(), or one of its variants, should instead be used to * increment a reference count. */ static inline void refcount_add(int i, refcount_t *r) { __refcount_add(i, r, NULL); } static inline __must_check bool __refcount_inc_not_zero(refcount_t *r, int *oldp) { return __refcount_add_not_zero(1, r, oldp); } /** * refcount_inc_not_zero - increment a refcount unless it is 0 * @r: the refcount to increment * * Similar to atomic_inc_not_zero(), but will saturate at REFCOUNT_SATURATED * and WARN. * * Provides no memory ordering, it is assumed the caller has guaranteed the * object memory to be stable (RCU, etc.). It does provide a control dependency * and thereby orders future stores. See the comment on top. * * Return: true if the increment was successful, false otherwise */ static inline __must_check bool refcount_inc_not_zero(refcount_t *r) { return __refcount_inc_not_zero(r, NULL); } static inline __must_check bool __refcount_inc_not_zero_acquire(refcount_t *r, int *oldp) { return __refcount_add_not_zero_acquire(1, r, oldp); } /** * refcount_inc_not_zero_acquire - increment a refcount with acquire ordering unless it is 0 * @r: the refcount to increment * * Similar to refcount_inc_not_zero(), but provides acquire memory ordering on * success. * * This function should be used when memory occupied by the object might be * reused to store another object -- consider SLAB_TYPESAFE_BY_RCU. * * Provides acquire memory ordering on success, it is assumed the caller has * guaranteed the object memory to be stable (RCU, etc.). It does provide a * control dependency and thereby orders future stores. See the comment on top. * * Return: true if the increment was successful, false otherwise */ static inline __must_check bool refcount_inc_not_zero_acquire(refcount_t *r) { return __refcount_inc_not_zero_acquire(r, NULL); } static inline void __refcount_inc(refcount_t *r, int *oldp) { __refcount_add(1, r, oldp); } /** * refcount_inc - increment a refcount * @r: the refcount to increment * * Similar to atomic_inc(), but will saturate at REFCOUNT_SATURATED and WARN. * * Provides no memory ordering, it is assumed the caller already has a * reference on the object. * * Will WARN if the refcount is 0, as this represents a possible use-after-free * condition. */ static inline void refcount_inc(refcount_t *r) { __refcount_inc(r, NULL); } static inline __must_check bool __refcount_sub_and_test(int i, refcount_t *r, int *oldp) { int old = atomic_fetch_sub_release(i, &r->refs); if (oldp) *oldp = old; if (old > 0 && old == i) { smp_acquire__after_ctrl_dep(); return true; } if (unlikely(old <= 0 || old - i < 0)) refcount_warn_saturate(r, REFCOUNT_SUB_UAF); return false; } /** * refcount_sub_and_test - subtract from a refcount and test if it is 0 * @i: amount to subtract from the refcount * @r: the refcount * * Similar to atomic_dec_and_test(), but it will WARN, return false and * ultimately leak on underflow and will fail to decrement when saturated * at REFCOUNT_SATURATED. * * Provides release memory ordering, such that prior loads and stores are done * before, and provides an acquire ordering on success such that free() * must come after. * * Use of this function is not recommended for the normal reference counting * use case in which references are taken and released one at a time. In these * cases, refcount_dec(), or one of its variants, should instead be used to * decrement a reference count. * * Return: true if the resulting refcount is 0, false otherwise */ static inline __must_check bool refcount_sub_and_test(int i, refcount_t *r) { return __refcount_sub_and_test(i, r, NULL); } static inline __must_check bool __refcount_dec_and_test(refcount_t *r, int *oldp) { return __refcount_sub_and_test(1, r, oldp); } /** * refcount_dec_and_test - decrement a refcount and test if it is 0 * @r: the refcount * * Similar to atomic_dec_and_test(), it will WARN on underflow and fail to * decrement when saturated at REFCOUNT_SATURATED. * * Provides release memory ordering, such that prior loads and stores are done * before, and provides an acquire ordering on success such that free() * must come after. * * Return: true if the resulting refcount is 0, false otherwise */ static inline __must_check bool refcount_dec_and_test(refcount_t *r) { return __refcount_dec_and_test(r, NULL); } static inline void __refcount_dec(refcount_t *r, int *oldp) { int old = atomic_fetch_sub_release(1, &r->refs); if (oldp) *oldp = old; if (unlikely(old <= 1)) refcount_warn_saturate(r, REFCOUNT_DEC_LEAK); } /** * refcount_dec - decrement a refcount * @r: the refcount * * Similar to atomic_dec(), it will WARN on underflow and fail to decrement * when saturated at REFCOUNT_SATURATED. * * Provides release memory ordering, such that prior loads and stores are done * before. */ static inline void refcount_dec(refcount_t *r) { __refcount_dec(r, NULL); } extern __must_check bool refcount_dec_if_one(refcount_t *r); extern __must_check bool refcount_dec_not_one(refcount_t *r); extern __must_check bool refcount_dec_and_mutex_lock(refcount_t *r, struct mutex *lock) __cond_acquires(true, lock); extern __must_check bool refcount_dec_and_lock(refcount_t *r, spinlock_t *lock) __cond_acquires(true, lock); extern __must_check bool refcount_dec_and_lock_irqsave(refcount_t *r, spinlock_t *lock, unsigned long *flags) __cond_acquires(true, lock); #endif /* _LINUX_REFCOUNT_H */
73 3 47 47 48 48 50 50 25 25 6 36 25 3 273 263 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2007 Oracle. All rights reserved. */ #ifndef BTRFS_INODE_H #define BTRFS_INODE_H #include <linux/hash.h> #include <linux/refcount.h> #include <linux/spinlock.h> #include <linux/mutex.h> #include <linux/rwsem.h> #include <linux/fs.h> #include <linux/mm.h> #include <linux/compiler.h> #include <linux/fscrypt.h> #include <linux/lockdep.h> #include <uapi/linux/btrfs_tree.h> #include <trace/events/btrfs.h> #include "ctree.h" #include "block-rsv.h" #include "extent_map.h" #include "extent-io-tree.h" struct posix_acl; struct iov_iter; struct writeback_control; struct btrfs_root; struct btrfs_fs_info; struct btrfs_trans_handle; struct btrfs_bio; struct btrfs_file_extent; struct btrfs_delayed_node; /* * Since we search a directory based on f_pos (struct dir_context::pos) we have * to start at 2 since '.' and '..' have f_pos of 0 and 1 respectively, so * everybody else has to start at 2 (see btrfs_real_readdir() and dir_emit_dots()). */ #define BTRFS_DIR_START_INDEX 2 /* * ordered_data_close is set by truncate when a file that used * to have good data has been truncated to zero. When it is set * the btrfs file release call will add this inode to the * ordered operations list so that we make sure to flush out any * new data the application may have written before commit. */ enum { BTRFS_INODE_FLUSH_ON_CLOSE, BTRFS_INODE_DUMMY, BTRFS_INODE_IN_DEFRAG, BTRFS_INODE_HAS_ASYNC_EXTENT, /* * Always set under the VFS' inode lock, otherwise it can cause races * during fsync (we start as a fast fsync and then end up in a full * fsync racing with ordered extent completion). */ BTRFS_INODE_NEEDS_FULL_SYNC, BTRFS_INODE_COPY_EVERYTHING, BTRFS_INODE_HAS_PROPS, BTRFS_INODE_SNAPSHOT_FLUSH, /* * Set and used when logging an inode and it serves to signal that an * inode does not have xattrs, so subsequent fsyncs can avoid searching * for xattrs to log. This bit must be cleared whenever a xattr is added * to an inode. */ BTRFS_INODE_NO_XATTRS, /* * Set when we are in a context where we need to start a transaction and * have dirty pages with the respective file range locked. This is to * ensure that when reserving space for the transaction, if we are low * on available space and need to flush delalloc, we will not flush * delalloc for this inode, because that could result in a deadlock (on * the file range, inode's io_tree). */ BTRFS_INODE_NO_DELALLOC_FLUSH, /* * Set when we are working on enabling verity for a file. Computing and * writing the whole Merkle tree can take a while so we want to prevent * races where two separate tasks attempt to simultaneously start verity * on the same file. */ BTRFS_INODE_VERITY_IN_PROGRESS, /* Set when this inode is a free space inode. */ BTRFS_INODE_FREE_SPACE_INODE, /* Set when there are no capabilities in XATTs for the inode. */ BTRFS_INODE_NO_CAP_XATTR, /* * Set if an error happened when doing a COW write before submitting a * bio or during writeback. Used for both buffered writes and direct IO * writes. This is to signal a fast fsync that it has to wait for * ordered extents to complete and therefore not log extent maps that * point to unwritten extents (when an ordered extent completes and it * has the BTRFS_ORDERED_IOERR flag set, it drops extent maps in its * range). */ BTRFS_INODE_COW_WRITE_ERROR, /* * Indicate this is a directory that points to a subvolume for which * there is no root reference item. That's a case like the following: * * $ btrfs subvolume create /mnt/parent * $ btrfs subvolume create /mnt/parent/child * $ btrfs subvolume snapshot /mnt/parent /mnt/snap * * If subvolume "parent" is root 256, subvolume "child" is root 257 and * snapshot "snap" is root 258, then there's no root reference item (key * BTRFS_ROOT_REF_KEY in the root tree) for the subvolume "child" * associated to root 258 (the snapshot) - there's only for the root * of the "parent" subvolume (root 256). In the chunk root we have a * (256 BTRFS_ROOT_REF_KEY 257) key but we don't have a * (258 BTRFS_ROOT_REF_KEY 257) key - the sames goes for backrefs, we * have a (257 BTRFS_ROOT_BACKREF_KEY 256) but we don't have a * (257 BTRFS_ROOT_BACKREF_KEY 258) key. * * So when opening the "child" dentry from the snapshot's directory, * we don't find a root ref item and we create a stub inode. This is * done at new_simple_dir(), called from btrfs_lookup_dentry(). */ BTRFS_INODE_ROOT_STUB, }; /* in memory btrfs inode */ struct btrfs_inode { /* which subvolume this inode belongs to */ struct btrfs_root *root; #if BITS_PER_LONG == 32 /* * The objectid of the corresponding BTRFS_INODE_ITEM_KEY. * On 64 bits platforms we can get it from vfs_inode.i_ino, which is an * unsigned long and therefore 64 bits on such platforms. */ u64 objectid; #endif /* Cached value of inode property 'compression'. */ u8 prop_compress; /* * Force compression on the file using the defrag ioctl, could be * different from prop_compress and takes precedence if set. */ u8 defrag_compress; s8 defrag_compress_level; /* * Lock for counters and all fields used to determine if the inode is in * the log or not (last_trans, last_sub_trans, last_log_commit, * logged_trans), to access/update delalloc_bytes, new_delalloc_bytes, * defrag_bytes, disk_i_size, outstanding_extents, csum_bytes and to * update the VFS' inode number of bytes used. * Also protects setting struct file::private_data. */ spinlock_t lock; /* the extent_tree has caches of all the extent mappings to disk */ struct extent_map_tree extent_tree; /* the io_tree does range state (DIRTY, LOCKED etc) */ struct extent_io_tree io_tree; /* * Keep track of where the inode has extent items mapped in order to * make sure the i_size adjustments are accurate. Not required when the * filesystem is NO_HOLES, the status can't be set while mounted as * it's a mkfs-time feature. */ struct extent_io_tree *file_extent_tree; /* held while logging the inode in tree-log.c */ struct mutex log_mutex; /* * Counters to keep track of the number of extent item's we may use due * to delalloc and such. outstanding_extents is the number of extent * items we think we'll end up using, and reserved_extents is the number * of extent items we've reserved metadata for. Protected by 'lock'. */ unsigned outstanding_extents; /* used to order data wrt metadata */ spinlock_t ordered_tree_lock; struct rb_root ordered_tree; struct rb_node *ordered_tree_last; /* list of all the delalloc inodes in the FS. There are times we need * to write all the delalloc pages to disk, and this list is used * to walk them all. */ struct list_head delalloc_inodes; unsigned long runtime_flags; /* full 64 bit generation number, struct vfs_inode doesn't have a big * enough field for this. */ u64 generation; /* * ID of the transaction handle that last modified this inode. * Protected by 'lock'. */ u64 last_trans; /* * ID of the transaction that last logged this inode. * Protected by 'lock'. */ u64 logged_trans; /* * Log transaction ID when this inode was last modified. * Protected by 'lock'. */ int last_sub_trans; /* A local copy of root's last_log_commit. Protected by 'lock'. */ int last_log_commit; union { /* * Total number of bytes pending delalloc, used by stat to * calculate the real block usage of the file. This is used * only for files. Protected by 'lock'. */ u64 delalloc_bytes; /* * The lowest possible index of the next dir index key which * points to an inode that needs to be logged. * This is used only for directories. * Use the helpers btrfs_get_first_dir_index_to_log() and * btrfs_set_first_dir_index_to_log() to access this field. */ u64 first_dir_index_to_log; }; union { /* * Total number of bytes pending delalloc that fall within a file * range that is either a hole or beyond EOF (and no prealloc extent * exists in the range). This is always <= delalloc_bytes and this * is used only for files. Protected by 'lock'. */ u64 new_delalloc_bytes; /* * The offset of the last dir index key that was logged. * This is used only for directories. Protected by 'log_mutex'. */ u64 last_dir_index_offset; }; union { /* * Total number of bytes pending defrag, used by stat to check whether * it needs COW. Protected by 'lock'. * Used by inodes other than the data relocation inode. */ u64 defrag_bytes; /* * Logical address of the block group being relocated. * Used only by the data relocation inode. */ u64 reloc_block_group_start; }; /* * The size of the file stored in the metadata on disk. data=ordered * means the in-memory i_size might be larger than the size on disk * because not all the blocks are written yet. Protected by 'lock'. */ u64 disk_i_size; union { /* * If this is a directory then index_cnt is the counter for the * index number for new files that are created. For an empty * directory, this must be initialized to BTRFS_DIR_START_INDEX. */ u64 index_cnt; /* * If this is not a directory, this is the number of bytes * outstanding that are going to need csums. This is used in * ENOSPC accounting. Protected by 'lock'. */ u64 csum_bytes; }; /* Cache the directory index number to speed the dir/file remove */ u64 dir_index; /* the fsync log has some corner cases that mean we have to check * directories to see if any unlinks have been done before * the directory was logged. See tree-log.c for all the * details */ u64 last_unlink_trans; union { /* * The id/generation of the last transaction where this inode * was either the source or the destination of a clone/dedupe * operation. Used when logging an inode to know if there are * shared extents that need special care when logging checksum * items, to avoid duplicate checksum items in a log (which can * lead to a corruption where we end up with missing checksum * ranges after log replay). Protected by the VFS inode lock. * Used for regular files only. */ u64 last_reflink_trans; /* * In case this a root stub inode (BTRFS_INODE_ROOT_STUB flag set), * the ID of that root. */ u64 ref_root_id; }; /* Backwards incompatible flags, lower half of inode_item::flags */ u32 flags; /* Read-only compatibility flags, upper half of inode_item::flags */ u32 ro_flags; struct btrfs_block_rsv block_rsv; struct btrfs_delayed_node *delayed_node; /* File creation time. */ u64 i_otime_sec; u32 i_otime_nsec; /* Hook into fs_info->delayed_iputs */ struct list_head delayed_iput; struct rw_semaphore i_mmap_lock; struct inode vfs_inode; }; static inline u64 btrfs_get_first_dir_index_to_log(const struct btrfs_inode *inode) { return READ_ONCE(inode->first_dir_index_to_log); } static inline void btrfs_set_first_dir_index_to_log(struct btrfs_inode *inode, u64 index) { WRITE_ONCE(inode->first_dir_index_to_log, index); } /* Type checked and const-preserving VFS inode -> btrfs inode. */ #define BTRFS_I(_inode) \ _Generic(_inode, \ struct inode *: container_of(_inode, struct btrfs_inode, vfs_inode), \ const struct inode *: (const struct btrfs_inode *)container_of( \ _inode, const struct btrfs_inode, vfs_inode)) static inline unsigned long btrfs_inode_hash(u64 objectid, const struct btrfs_root *root) { u64 h = objectid ^ (root->root_key.objectid * GOLDEN_RATIO_PRIME); #if BITS_PER_LONG == 32 h = (h >> 32) ^ (h & 0xffffffff); #endif return (unsigned long)h; } #if BITS_PER_LONG == 32 /* * On 32 bit systems the i_ino of struct inode is 32 bits (unsigned long), so * we use the inode's location objectid which is a u64 to avoid truncation. */ static inline u64 btrfs_ino(const struct btrfs_inode *inode) { u64 ino = inode->objectid; if (test_bit(BTRFS_INODE_ROOT_STUB, &inode->runtime_flags)) ino = inode->vfs_inode.i_ino; return ino; } #else static inline u64 btrfs_ino(const struct btrfs_inode *inode) { return inode->vfs_inode.i_ino; } #endif static inline void btrfs_get_inode_key(const struct btrfs_inode *inode, struct btrfs_key *key) { key->objectid = btrfs_ino(inode); key->type = BTRFS_INODE_ITEM_KEY; key->offset = 0; } static inline void btrfs_set_inode_number(struct btrfs_inode *inode, u64 ino) { #if BITS_PER_LONG == 32 inode->objectid = ino; #endif inode->vfs_inode.i_ino = ino; } static inline void btrfs_i_size_write(struct btrfs_inode *inode, u64 size) { i_size_write(&inode->vfs_inode, size); inode->disk_i_size = size; } static inline bool btrfs_is_free_space_inode(const struct btrfs_inode *inode) { return test_bit(BTRFS_INODE_FREE_SPACE_INODE, &inode->runtime_flags); } static inline bool is_data_inode(const struct btrfs_inode *inode) { return btrfs_ino(inode) != BTRFS_BTREE_INODE_OBJECTID; } static inline void btrfs_mod_outstanding_extents(struct btrfs_inode *inode, int mod) { lockdep_assert_held(&inode->lock); inode->outstanding_extents += mod; if (btrfs_is_free_space_inode(inode)) return; trace_btrfs_inode_mod_outstanding_extents(inode->root, btrfs_ino(inode), mod, inode->outstanding_extents); } /* * Called every time after doing a buffered, direct IO or memory mapped write. * * This is to ensure that if we write to a file that was previously fsynced in * the current transaction, then try to fsync it again in the same transaction, * we will know that there were changes in the file and that it needs to be * logged. */ static inline void btrfs_set_inode_last_sub_trans(struct btrfs_inode *inode) { spin_lock(&inode->lock); inode->last_sub_trans = inode->root->log_transid; spin_unlock(&inode->lock); } /* * Should be called while holding the inode's VFS lock in exclusive mode, or * while holding the inode's mmap lock (struct btrfs_inode::i_mmap_lock) in * either shared or exclusive mode, or in a context where no one else can access * the inode concurrently (during inode creation or when loading an inode from * disk). */ static inline void btrfs_set_inode_full_sync(struct btrfs_inode *inode) { set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); /* * The inode may have been part of a reflink operation in the last * transaction that modified it, and then a fsync has reset the * last_reflink_trans to avoid subsequent fsyncs in the same * transaction to do unnecessary work. So update last_reflink_trans * to the last_trans value (we have to be pessimistic and assume a * reflink happened). * * The ->last_trans is protected by the inode's spinlock and we can * have a concurrent ordered extent completion update it. Also set * last_reflink_trans to ->last_trans only if the former is less than * the later, because we can be called in a context where * last_reflink_trans was set to the current transaction generation * while ->last_trans was not yet updated in the current transaction, * and therefore has a lower value. */ spin_lock(&inode->lock); if (inode->last_reflink_trans < inode->last_trans) inode->last_reflink_trans = inode->last_trans; spin_unlock(&inode->lock); } static inline bool btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation) { bool ret = false; spin_lock(&inode->lock); if (inode->logged_trans == generation && inode->last_sub_trans <= inode->last_log_commit && inode->last_sub_trans <= btrfs_get_root_last_log_commit(inode->root)) ret = true; spin_unlock(&inode->lock); return ret; } /* * Check if the inode has flags compatible with compression */ static inline bool btrfs_inode_can_compress(const struct btrfs_inode *inode) { if (inode->flags & BTRFS_INODE_NODATACOW || inode->flags & BTRFS_INODE_NODATASUM) return false; return true; } static inline void btrfs_assert_inode_locked(struct btrfs_inode *inode) { /* Immediately trigger a crash if the inode is not locked. */ ASSERT(inode_is_locked(&inode->vfs_inode)); /* Trigger a splat in dmesg if this task is not holding the lock. */ lockdep_assert_held(&inode->vfs_inode.i_rwsem); } static inline void btrfs_update_inode_mapping_flags(struct btrfs_inode *inode) { if (inode->flags & BTRFS_INODE_NODATASUM) mapping_clear_stable_writes(inode->vfs_inode.i_mapping); else mapping_set_stable_writes(inode->vfs_inode.i_mapping); } static inline void btrfs_set_inode_mapping_order(struct btrfs_inode *inode) { /* Metadata inode should not reach here. */ ASSERT(is_data_inode(inode)); /* We only allow BITS_PER_LONGS blocks for each bitmap. */ #ifdef CONFIG_BTRFS_EXPERIMENTAL mapping_set_folio_order_range(inode->vfs_inode.i_mapping, inode->root->fs_info->block_min_order, inode->root->fs_info->block_max_order); #endif } void btrfs_calculate_block_csum_folio(struct btrfs_fs_info *fs_info, const phys_addr_t paddr, u8 *dest); void btrfs_calculate_block_csum_pages(struct btrfs_fs_info *fs_info, const phys_addr_t paddrs[], u8 *dest); int btrfs_check_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr, u8 *csum, const u8 * const csum_expected); bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, u32 bio_offset, const phys_addr_t paddrs[]); noinline int can_nocow_extent(struct btrfs_inode *inode, u64 offset, u64 *len, struct btrfs_file_extent *file_extent, bool nowait); void btrfs_del_delalloc_inode(struct btrfs_inode *inode); struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry); int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index); int btrfs_unlink_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, const struct fscrypt_str *name); int btrfs_add_link(struct btrfs_trans_handle *trans, struct btrfs_inode *parent_inode, struct btrfs_inode *inode, const struct fscrypt_str *name, bool add_backref, u64 index); int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry); int btrfs_truncate_block(struct btrfs_inode *inode, u64 offset, u64 start, u64 end); int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context); int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, bool in_reclaim_context); int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, unsigned int extra_bits, struct extent_state **cached_state); struct btrfs_new_inode_args { /* Input */ struct inode *dir; struct dentry *dentry; struct inode *inode; bool orphan; bool subvol; /* Output from btrfs_new_inode_prepare(), input to btrfs_create_new_inode(). */ struct posix_acl *default_acl; struct posix_acl *acl; struct fscrypt_name fname; }; int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args, unsigned int *trans_num_items); int btrfs_create_new_inode(struct btrfs_trans_handle *trans, struct btrfs_new_inode_args *args); void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args); struct inode *btrfs_new_subvol_inode(struct mnt_idmap *idmap, struct inode *dir); void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state, u32 bits); void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state, u32 bits); void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state *new, struct extent_state *other); void btrfs_split_delalloc_extent(struct btrfs_inode *inode, struct extent_state *orig, u64 split); void btrfs_evict_inode(struct inode *inode); struct inode *btrfs_alloc_inode(struct super_block *sb); void btrfs_destroy_inode(struct inode *inode); void btrfs_free_inode(struct inode *inode); int btrfs_drop_inode(struct inode *inode); int __init btrfs_init_cachep(void); void __cold btrfs_destroy_cachep(void); struct btrfs_inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, struct btrfs_path *path); struct btrfs_inode *btrfs_iget(u64 ino, struct btrfs_root *root); struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, struct folio *folio, u64 start, u64 len); int btrfs_update_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *inode); int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, struct btrfs_inode *inode); int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct btrfs_inode *inode); int btrfs_orphan_cleanup(struct btrfs_root *root); int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size); void btrfs_add_delayed_iput(struct btrfs_inode *inode); void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info); int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info); int btrfs_prealloc_file_range(struct inode *inode, int mode, u64 start, u64 num_bytes, u64 min_size, loff_t actual_len, u64 *alloc_hint); int btrfs_prealloc_file_range_trans(struct inode *inode, struct btrfs_trans_handle *trans, int mode, u64 start, u64 num_bytes, u64 min_size, loff_t actual_len, u64 *alloc_hint); int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_folio, u64 start, u64 end, struct writeback_control *wbc); int btrfs_writepage_cow_fixup(struct folio *folio); int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info, int compress_type); int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, u64 disk_bytenr, u64 disk_io_size, struct page **pages, void *uring_ctx); ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, struct btrfs_ioctl_encoded_io_args *encoded, struct extent_state **cached_state, u64 *disk_bytenr, u64 *disk_io_size); ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, struct iov_iter *iter, u64 start, u64 lockend, struct extent_state **cached_state, u64 disk_bytenr, u64 disk_io_size, size_t count, bool compressed, bool *unlocked); ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, const struct btrfs_ioctl_encoded_io_args *encoded); struct btrfs_inode *btrfs_find_first_inode(struct btrfs_root *root, u64 min_ino); extern const struct dentry_operations btrfs_dentry_operations; /* Inode locking type flags, by default the exclusive lock is taken. */ enum btrfs_ilock_type { ENUM_BIT(BTRFS_ILOCK_SHARED), ENUM_BIT(BTRFS_ILOCK_TRY), ENUM_BIT(BTRFS_ILOCK_MMAP), }; int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags); void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags); void btrfs_update_inode_bytes(struct btrfs_inode *inode, const u64 add_bytes, const u64 del_bytes); void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end); u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, u64 num_bytes); struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start, const struct btrfs_file_extent *file_extent, int type); #endif
294 294 20 20 20 20 20 20 314 306 60 21 48 31 32 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 // SPDX-License-Identifier: GPL-2.0 #include <linux/slab.h> #include "messages.h" #include "subpage.h" #include "btrfs_inode.h" /* * Subpage (block size < folio size) support overview: * * Limitations: * * - Only support 64K page size for now * This is to make metadata handling easier, as 64K page would ensure * all nodesize would fit inside one page, thus we don't need to handle * cases where a tree block crosses several pages. * * - Only metadata read-write for now * The data read-write part is in development. * * - Metadata can't cross 64K page boundary * btrfs-progs and kernel have done that for a while, thus only ancient * filesystems could have such problem. For such case, do a graceful * rejection. * * Special behavior: * * - Metadata * Metadata read is fully supported. * Meaning when reading one tree block will only trigger the read for the * needed range, other unrelated range in the same page will not be touched. * * Metadata write support is partial. * The writeback is still for the full page, but we will only submit * the dirty extent buffers in the page. * * This means, if we have a metadata page like this: * * Page offset * 0 16K 32K 48K 64K * |/////////| |///////////| * \- Tree block A \- Tree block B * * Even if we just want to writeback tree block A, we will also writeback * tree block B if it's also dirty. * * This may cause extra metadata writeback which results more COW. * * Implementation: * * - Common * Both metadata and data will use a new structure, btrfs_folio_state, to * record the status of each sector inside a page. This provides the extra * granularity needed. * * - Metadata * Since we have multiple tree blocks inside one page, we can't rely on page * locking anymore, or we will have greatly reduced concurrency or even * deadlocks (hold one tree lock while trying to lock another tree lock in * the same page). * * Thus for metadata locking, subpage support relies on io_tree locking only. * This means a slightly higher tree locking latency. */ int btrfs_attach_folio_state(const struct btrfs_fs_info *fs_info, struct folio *folio, enum btrfs_folio_type type) { struct btrfs_folio_state *bfs; /* For metadata we don't support large folio yet. */ if (type == BTRFS_SUBPAGE_METADATA) ASSERT(!folio_test_large(folio)); /* * We have cases like a dummy extent buffer page, which is not mapped * and doesn't need to be locked. */ if (folio->mapping) ASSERT(folio_test_locked(folio)); /* Either not subpage, or the folio already has private attached. */ if (folio_test_private(folio)) return 0; if (type == BTRFS_SUBPAGE_METADATA && !btrfs_meta_is_subpage(fs_info)) return 0; if (type == BTRFS_SUBPAGE_DATA && !btrfs_is_subpage(fs_info, folio)) return 0; bfs = btrfs_alloc_folio_state(fs_info, folio_size(folio), type); if (IS_ERR(bfs)) return PTR_ERR(bfs); folio_attach_private(folio, bfs); return 0; } void btrfs_detach_folio_state(const struct btrfs_fs_info *fs_info, struct folio *folio, enum btrfs_folio_type type) { struct btrfs_folio_state *bfs; /* Either not subpage, or the folio already has private attached. */ if (!folio_test_private(folio)) return; if (type == BTRFS_SUBPAGE_METADATA && !btrfs_meta_is_subpage(fs_info)) return; if (type == BTRFS_SUBPAGE_DATA && !btrfs_is_subpage(fs_info, folio)) return; bfs = folio_detach_private(folio); ASSERT(bfs); btrfs_free_folio_state(bfs); } struct btrfs_folio_state *btrfs_alloc_folio_state(const struct btrfs_fs_info *fs_info, size_t fsize, enum btrfs_folio_type type) { struct btrfs_folio_state *ret; unsigned int real_size; ASSERT(fs_info->sectorsize < fsize); real_size = struct_size(ret, bitmaps, BITS_TO_LONGS(btrfs_bitmap_nr_max * (fsize >> fs_info->sectorsize_bits))); ret = kzalloc(real_size, GFP_NOFS); if (!ret) return ERR_PTR(-ENOMEM); spin_lock_init(&ret->lock); if (type == BTRFS_SUBPAGE_METADATA) atomic_set(&ret->eb_refs, 0); else atomic_set(&ret->nr_locked, 0); return ret; } /* * Increase the eb_refs of current subpage. * * This is important for eb allocation, to prevent race with last eb freeing * of the same page. * With the eb_refs increased before the eb inserted into radix tree, * detach_extent_buffer_page() won't detach the folio private while we're still * allocating the extent buffer. */ void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio) { struct btrfs_folio_state *bfs; if (!btrfs_meta_is_subpage(fs_info)) return; ASSERT(folio_test_private(folio) && folio->mapping); lockdep_assert_held(&folio->mapping->i_private_lock); bfs = folio_get_private(folio); atomic_inc(&bfs->eb_refs); } void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio) { struct btrfs_folio_state *bfs; if (!btrfs_meta_is_subpage(fs_info)) return; ASSERT(folio_test_private(folio) && folio->mapping); lockdep_assert_held(&folio->mapping->i_private_lock); bfs = folio_get_private(folio); ASSERT(atomic_read(&bfs->eb_refs)); atomic_dec(&bfs->eb_refs); } static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { /* Basic checks */ ASSERT(folio_test_private(folio) && folio_get_private(folio)); ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && IS_ALIGNED(len, fs_info->sectorsize), "start=%llu len=%u", start, len); /* * The range check only works for mapped page, we can still have * unmapped page like dummy extent buffer pages. */ if (folio->mapping) ASSERT(folio_pos(folio) <= start && start + len <= folio_next_pos(folio), "start=%llu len=%u folio_pos=%llu folio_size=%zu", start, len, folio_pos(folio), folio_size(folio)); } #define subpage_calc_start_bit(fs_info, folio, name, start, len) \ ({ \ unsigned int __start_bit; \ const unsigned int __bpf = btrfs_blocks_per_folio(fs_info, folio); \ \ btrfs_subpage_assert(fs_info, folio, start, len); \ __start_bit = offset_in_folio(folio, start) >> fs_info->sectorsize_bits; \ __start_bit += __bpf * btrfs_bitmap_nr_##name; \ __start_bit; \ }) static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len) { u64 orig_start = *start; u32 orig_len = *len; *start = max_t(u64, folio_pos(folio), orig_start); /* * For certain call sites like btrfs_drop_pages(), we may have pages * beyond the target range. In that case, just set @len to 0, subpage * helpers can handle @len == 0 without any problem. */ if (folio_pos(folio) >= orig_start + orig_len) *len = 0; else *len = min_t(u64, folio_next_pos(folio), orig_start + orig_len) - *start; } static bool btrfs_subpage_end_and_test_lock(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { struct btrfs_folio_state *bfs = folio_get_private(folio); const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len); const int nbits = (len >> fs_info->sectorsize_bits); unsigned long flags; unsigned int cleared = 0; int bit = start_bit; bool last; btrfs_subpage_assert(fs_info, folio, start, len); spin_lock_irqsave(&bfs->lock, flags); /* * We have call sites passing @lock_page into * extent_clear_unlock_delalloc() for compression path. * * This @locked_page is locked by plain lock_page(), thus its * subpage::locked is 0. Handle them in a special way. */ if (atomic_read(&bfs->nr_locked) == 0) { spin_unlock_irqrestore(&bfs->lock, flags); return true; } for_each_set_bit_from(bit, bfs->bitmaps, start_bit + nbits) { clear_bit(bit, bfs->bitmaps); cleared++; } ASSERT(atomic_read(&bfs->nr_locked) >= cleared, "atomic_read(&bfs->nr_locked)=%d cleared=%d", atomic_read(&bfs->nr_locked), cleared); last = atomic_sub_and_test(cleared, &bfs->nr_locked); spin_unlock_irqrestore(&bfs->lock, flags); return last; } /* * Handle different locked folios: * * - Non-subpage folio * Just unlock it. * * - folio locked but without any subpage locked * This happens either before writepage_delalloc() or the delalloc range is * already handled by previous folio. * We can simple unlock it. * * - folio locked with subpage range locked. * We go through the locked sectors inside the range and clear their locked * bitmap, reduce the writer lock number, and unlock the page if that's * the last locked range. */ void btrfs_folio_end_lock(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { struct btrfs_folio_state *bfs = folio_get_private(folio); ASSERT(folio_test_locked(folio)); if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio)) { folio_unlock(folio); return; } /* * For subpage case, there are two types of locked page. With or * without locked number. * * Since we own the page lock, no one else could touch subpage::locked * and we are safe to do several atomic operations without spinlock. */ if (atomic_read(&bfs->nr_locked) == 0) { /* No subpage lock, locked by plain lock_page(). */ folio_unlock(folio); return; } btrfs_subpage_clamp_range(folio, &start, &len); if (btrfs_subpage_end_and_test_lock(fs_info, folio, start, len)) folio_unlock(folio); } void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info, struct folio *folio, unsigned long bitmap) { struct btrfs_folio_state *bfs = folio_get_private(folio); const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); const int start_bit = blocks_per_folio * btrfs_bitmap_nr_locked; unsigned long flags; bool last = false; int cleared = 0; int bit; if (!btrfs_is_subpage(fs_info, folio)) { folio_unlock(folio); return; } if (atomic_read(&bfs->nr_locked) == 0) { /* No subpage lock, locked by plain lock_page(). */ folio_unlock(folio); return; } spin_lock_irqsave(&bfs->lock, flags); for_each_set_bit(bit, &bitmap, blocks_per_folio) { if (test_and_clear_bit(bit + start_bit, bfs->bitmaps)) cleared++; } ASSERT(atomic_read(&bfs->nr_locked) >= cleared, "atomic_read(&bfs->nr_locked)=%d cleared=%d", atomic_read(&bfs->nr_locked), cleared); last = atomic_sub_and_test(cleared, &bfs->nr_locked); spin_unlock_irqrestore(&bfs->lock, flags); if (last) folio_unlock(folio); } #define subpage_test_bitmap_all_set(fs_info, folio, name) \ ({ \ struct btrfs_folio_state *__bfs = folio_get_private(folio); \ const unsigned int __bpf = btrfs_blocks_per_folio(fs_info, folio); \ \ bitmap_test_range_all_set(__bfs->bitmaps, \ __bpf * btrfs_bitmap_nr_##name, __bpf); \ }) #define subpage_test_bitmap_all_zero(fs_info, folio, name) \ ({ \ struct btrfs_folio_state *__bfs = folio_get_private(folio); \ const unsigned int __bpf = btrfs_blocks_per_folio(fs_info, folio); \ \ bitmap_test_range_all_zero(__bfs->bitmaps, \ __bpf * btrfs_bitmap_nr_##name, __bpf); \ }) void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { struct btrfs_folio_state *bfs = folio_get_private(folio); unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, uptodate, start, len); unsigned long flags; spin_lock_irqsave(&bfs->lock, flags); bitmap_set(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits); if (subpage_test_bitmap_all_set(fs_info, folio, uptodate)) folio_mark_uptodate(folio); spin_unlock_irqrestore(&bfs->lock, flags); } void btrfs_subpage_clear_uptodate(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { struct btrfs_folio_state *bfs = folio_get_private(folio); unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, uptodate, start, len); unsigned long flags; spin_lock_irqsave(&bfs->lock, flags); bitmap_clear(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits); folio_clear_uptodate(folio); spin_unlock_irqrestore(&bfs->lock, flags); } void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { struct btrfs_folio_state *bfs = folio_get_private(folio); unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, dirty, start, len); unsigned long flags; spin_lock_irqsave(&bfs->lock, flags); bitmap_set(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits); spin_unlock_irqrestore(&bfs->lock, flags); folio_mark_dirty(folio); } /* * Extra clear_and_test function for subpage dirty bitmap. * * Return true if we're the last bits in the dirty_bitmap and clear the * dirty_bitmap. * Return false otherwise. * * NOTE: Callers should manually clear page dirty for true case, as we have * extra handling for tree blocks. */ bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { struct btrfs_folio_state *bfs = folio_get_private(folio); unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, dirty, start, len); unsigned long flags; bool last = false; spin_lock_irqsave(&bfs->lock, flags); bitmap_clear(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits); if (subpage_test_bitmap_all_zero(fs_info, folio, dirty)) last = true; spin_unlock_irqrestore(&bfs->lock, flags); return last; } void btrfs_subpage_clear_dirty(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { bool last; last = btrfs_subpage_clear_and_test_dirty(fs_info, folio, start, len); if (last) folio_clear_dirty_for_io(folio); } void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { struct btrfs_folio_state *bfs = folio_get_private(folio); unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, writeback, start, len); unsigned long flags; bool keep_write; spin_lock_irqsave(&bfs->lock, flags); bitmap_set(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits); /* * Don't clear the TOWRITE tag when starting writeback on a still-dirty * folio. Doing so can cause WB_SYNC_ALL writepages() to overlook it, * assume writeback is complete, and exit too early — violating sync * ordering guarantees. */ keep_write = folio_test_dirty(folio); if (!folio_test_writeback(folio)) __folio_start_writeback(folio, keep_write); spin_unlock_irqrestore(&bfs->lock, flags); } void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { struct btrfs_folio_state *bfs = folio_get_private(folio); unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, writeback, start, len); unsigned long flags; spin_lock_irqsave(&bfs->lock, flags); bitmap_clear(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits); if (subpage_test_bitmap_all_zero(fs_info, folio, writeback)) { ASSERT(folio_test_writeback(folio)); folio_end_writeback(folio); } spin_unlock_irqrestore(&bfs->lock, flags); } void btrfs_subpage_set_ordered(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { struct btrfs_folio_state *bfs = folio_get_private(folio); unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, ordered, start, len); unsigned long flags; spin_lock_irqsave(&bfs->lock, flags); bitmap_set(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits); folio_set_ordered(folio); spin_unlock_irqrestore(&bfs->lock, flags); } void btrfs_subpage_clear_ordered(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { struct btrfs_folio_state *bfs = folio_get_private(folio); unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, ordered, start, len); unsigned long flags; spin_lock_irqsave(&bfs->lock, flags); bitmap_clear(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits); if (subpage_test_bitmap_all_zero(fs_info, folio, ordered)) folio_clear_ordered(folio); spin_unlock_irqrestore(&bfs->lock, flags); } void btrfs_subpage_set_checked(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { struct btrfs_folio_state *bfs = folio_get_private(folio); unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, checked, start, len); unsigned long flags; spin_lock_irqsave(&bfs->lock, flags); bitmap_set(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits); if (subpage_test_bitmap_all_set(fs_info, folio, checked)) folio_set_checked(folio); spin_unlock_irqrestore(&bfs->lock, flags); } void btrfs_subpage_clear_checked(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { struct btrfs_folio_state *bfs = folio_get_private(folio); unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, checked, start, len); unsigned long flags; spin_lock_irqsave(&bfs->lock, flags); bitmap_clear(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits); folio_clear_checked(folio); spin_unlock_irqrestore(&bfs->lock, flags); } /* * Unlike set/clear which is dependent on each page status, for test all bits * are tested in the same way. */ #define IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(name) \ bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \ struct folio *folio, u64 start, u32 len) \ { \ struct btrfs_folio_state *bfs = folio_get_private(folio); \ unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, \ name, start, len); \ unsigned long flags; \ bool ret; \ \ spin_lock_irqsave(&bfs->lock, flags); \ ret = bitmap_test_range_all_set(bfs->bitmaps, start_bit, \ len >> fs_info->sectorsize_bits); \ spin_unlock_irqrestore(&bfs->lock, flags); \ return ret; \ } IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(uptodate); IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(dirty); IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(writeback); IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(ordered); IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(checked); /* * Note that, in selftests (extent-io-tests), we can have empty fs_info passed * in. We only test sectorsize == PAGE_SIZE cases so far, thus we can fall * back to regular sectorsize branch. */ #define IMPLEMENT_BTRFS_PAGE_OPS(name, folio_set_func, \ folio_clear_func, folio_test_func) \ void btrfs_folio_set_##name(const struct btrfs_fs_info *fs_info, \ struct folio *folio, u64 start, u32 len) \ { \ if (unlikely(!fs_info) || \ !btrfs_is_subpage(fs_info, folio)) { \ folio_set_func(folio); \ return; \ } \ btrfs_subpage_set_##name(fs_info, folio, start, len); \ } \ void btrfs_folio_clear_##name(const struct btrfs_fs_info *fs_info, \ struct folio *folio, u64 start, u32 len) \ { \ if (unlikely(!fs_info) || \ !btrfs_is_subpage(fs_info, folio)) { \ folio_clear_func(folio); \ return; \ } \ btrfs_subpage_clear_##name(fs_info, folio, start, len); \ } \ bool btrfs_folio_test_##name(const struct btrfs_fs_info *fs_info, \ struct folio *folio, u64 start, u32 len) \ { \ if (unlikely(!fs_info) || \ !btrfs_is_subpage(fs_info, folio)) \ return folio_test_func(folio); \ return btrfs_subpage_test_##name(fs_info, folio, start, len); \ } \ void btrfs_folio_clamp_set_##name(const struct btrfs_fs_info *fs_info, \ struct folio *folio, u64 start, u32 len) \ { \ if (unlikely(!fs_info) || \ !btrfs_is_subpage(fs_info, folio)) { \ folio_set_func(folio); \ return; \ } \ btrfs_subpage_clamp_range(folio, &start, &len); \ btrfs_subpage_set_##name(fs_info, folio, start, len); \ } \ void btrfs_folio_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \ struct folio *folio, u64 start, u32 len) \ { \ if (unlikely(!fs_info) || \ !btrfs_is_subpage(fs_info, folio)) { \ folio_clear_func(folio); \ return; \ } \ btrfs_subpage_clamp_range(folio, &start, &len); \ btrfs_subpage_clear_##name(fs_info, folio, start, len); \ } \ bool btrfs_folio_clamp_test_##name(const struct btrfs_fs_info *fs_info, \ struct folio *folio, u64 start, u32 len) \ { \ if (unlikely(!fs_info) || \ !btrfs_is_subpage(fs_info, folio)) \ return folio_test_func(folio); \ btrfs_subpage_clamp_range(folio, &start, &len); \ return btrfs_subpage_test_##name(fs_info, folio, start, len); \ } \ void btrfs_meta_folio_set_##name(struct folio *folio, const struct extent_buffer *eb) \ { \ if (!btrfs_meta_is_subpage(eb->fs_info)) { \ folio_set_func(folio); \ return; \ } \ btrfs_subpage_set_##name(eb->fs_info, folio, eb->start, eb->len); \ } \ void btrfs_meta_folio_clear_##name(struct folio *folio, const struct extent_buffer *eb) \ { \ if (!btrfs_meta_is_subpage(eb->fs_info)) { \ folio_clear_func(folio); \ return; \ } \ btrfs_subpage_clear_##name(eb->fs_info, folio, eb->start, eb->len); \ } \ bool btrfs_meta_folio_test_##name(struct folio *folio, const struct extent_buffer *eb) \ { \ if (!btrfs_meta_is_subpage(eb->fs_info)) \ return folio_test_func(folio); \ return btrfs_subpage_test_##name(eb->fs_info, folio, eb->start, eb->len); \ } IMPLEMENT_BTRFS_PAGE_OPS(uptodate, folio_mark_uptodate, folio_clear_uptodate, folio_test_uptodate); IMPLEMENT_BTRFS_PAGE_OPS(dirty, folio_mark_dirty, folio_clear_dirty_for_io, folio_test_dirty); IMPLEMENT_BTRFS_PAGE_OPS(writeback, folio_start_writeback, folio_end_writeback, folio_test_writeback); IMPLEMENT_BTRFS_PAGE_OPS(ordered, folio_set_ordered, folio_clear_ordered, folio_test_ordered); IMPLEMENT_BTRFS_PAGE_OPS(checked, folio_set_checked, folio_clear_checked, folio_test_checked); #define GET_SUBPAGE_BITMAP(fs_info, folio, name, dst) \ { \ const unsigned int __bpf = btrfs_blocks_per_folio(fs_info, folio); \ const struct btrfs_folio_state *__bfs = folio_get_private(folio); \ \ ASSERT(__bpf <= BITS_PER_LONG); \ *dst = bitmap_read(__bfs->bitmaps, \ __bpf * btrfs_bitmap_nr_##name, __bpf); \ } #define SUBPAGE_DUMP_BITMAP(fs_info, folio, name, start, len) \ { \ unsigned long bitmap; \ const unsigned int __bpf = btrfs_blocks_per_folio(fs_info, folio); \ \ GET_SUBPAGE_BITMAP(fs_info, folio, name, &bitmap); \ btrfs_warn(fs_info, \ "dumping bitmap start=%llu len=%u folio=%llu " #name "_bitmap=%*pbl", \ start, len, folio_pos(folio), __bpf, &bitmap); \ } /* * Make sure not only the page dirty bit is cleared, but also subpage dirty bit * is cleared. */ void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { struct btrfs_folio_state *bfs; unsigned int start_bit; unsigned int nbits; unsigned long flags; if (!IS_ENABLED(CONFIG_BTRFS_ASSERT)) return; if (!btrfs_is_subpage(fs_info, folio)) { ASSERT(!folio_test_dirty(folio)); return; } start_bit = subpage_calc_start_bit(fs_info, folio, dirty, start, len); nbits = len >> fs_info->sectorsize_bits; bfs = folio_get_private(folio); ASSERT(bfs); spin_lock_irqsave(&bfs->lock, flags); if (unlikely(!bitmap_test_range_all_zero(bfs->bitmaps, start_bit, nbits))) { SUBPAGE_DUMP_BITMAP(fs_info, folio, dirty, start, len); ASSERT(bitmap_test_range_all_zero(bfs->bitmaps, start_bit, nbits)); } ASSERT(bitmap_test_range_all_zero(bfs->bitmaps, start_bit, nbits)); spin_unlock_irqrestore(&bfs->lock, flags); } /* * This is for folio already locked by plain lock_page()/folio_lock(), which * doesn't have any subpage awareness. * * This populates the involved subpage ranges so that subpage helpers can * properly unlock them. */ void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { struct btrfs_folio_state *bfs; unsigned long flags; unsigned int start_bit; unsigned int nbits; int ret; ASSERT(folio_test_locked(folio)); if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio)) return; bfs = folio_get_private(folio); start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len); nbits = len >> fs_info->sectorsize_bits; spin_lock_irqsave(&bfs->lock, flags); /* Target range should not yet be locked. */ if (unlikely(!bitmap_test_range_all_zero(bfs->bitmaps, start_bit, nbits))) { SUBPAGE_DUMP_BITMAP(fs_info, folio, locked, start, len); ASSERT(bitmap_test_range_all_zero(bfs->bitmaps, start_bit, nbits)); } bitmap_set(bfs->bitmaps, start_bit, nbits); ret = atomic_add_return(nbits, &bfs->nr_locked); ASSERT(ret <= btrfs_blocks_per_folio(fs_info, folio)); spin_unlock_irqrestore(&bfs->lock, flags); } /* * Clear the dirty flag for the folio. * * If the affected folio is no longer dirty, return true. Otherwise return false. */ bool btrfs_meta_folio_clear_and_test_dirty(struct folio *folio, const struct extent_buffer *eb) { bool last; if (!btrfs_meta_is_subpage(eb->fs_info)) { folio_clear_dirty_for_io(folio); return true; } last = btrfs_subpage_clear_and_test_dirty(eb->fs_info, folio, eb->start, eb->len); if (last) { folio_clear_dirty_for_io(folio); return true; } return false; } void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { struct btrfs_folio_state *bfs; const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); unsigned long uptodate_bitmap; unsigned long dirty_bitmap; unsigned long writeback_bitmap; unsigned long ordered_bitmap; unsigned long checked_bitmap; unsigned long locked_bitmap; unsigned long flags; ASSERT(folio_test_private(folio) && folio_get_private(folio)); ASSERT(blocks_per_folio > 1); bfs = folio_get_private(folio); spin_lock_irqsave(&bfs->lock, flags); GET_SUBPAGE_BITMAP(fs_info, folio, uptodate, &uptodate_bitmap); GET_SUBPAGE_BITMAP(fs_info, folio, dirty, &dirty_bitmap); GET_SUBPAGE_BITMAP(fs_info, folio, writeback, &writeback_bitmap); GET_SUBPAGE_BITMAP(fs_info, folio, ordered, &ordered_bitmap); GET_SUBPAGE_BITMAP(fs_info, folio, checked, &checked_bitmap); GET_SUBPAGE_BITMAP(fs_info, folio, locked, &locked_bitmap); spin_unlock_irqrestore(&bfs->lock, flags); dump_page(folio_page(folio, 0), "btrfs folio state dump"); btrfs_warn(fs_info, "start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl dirty=%*pbl locked=%*pbl writeback=%*pbl ordered=%*pbl checked=%*pbl", start, len, folio_pos(folio), blocks_per_folio, &uptodate_bitmap, blocks_per_folio, &dirty_bitmap, blocks_per_folio, &locked_bitmap, blocks_per_folio, &writeback_bitmap, blocks_per_folio, &ordered_bitmap, blocks_per_folio, &checked_bitmap); } void btrfs_get_subpage_dirty_bitmap(struct btrfs_fs_info *fs_info, struct folio *folio, unsigned long *ret_bitmap) { struct btrfs_folio_state *bfs; unsigned long flags; ASSERT(folio_test_private(folio) && folio_get_private(folio)); ASSERT(btrfs_blocks_per_folio(fs_info, folio) > 1); bfs = folio_get_private(folio); spin_lock_irqsave(&bfs->lock, flags); GET_SUBPAGE_BITMAP(fs_info, folio, dirty, ret_bitmap); spin_unlock_irqrestore(&bfs->lock, flags); }
115 5747 5166 1220 4894 5687 1 14 313 563 5017 5099 17 1136 14 31 592 114 115 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_FS_SUPER_H #define _LINUX_FS_SUPER_H #include <linux/fs/super_types.h> #include <linux/unicode.h> /* * These are internal functions, please use sb_start_{write,pagefault,intwrite} * instead. */ static inline void __sb_end_write(struct super_block *sb, int level) { percpu_up_read(sb->s_writers.rw_sem + level - 1); } static inline void __sb_start_write(struct super_block *sb, int level) { percpu_down_read_freezable(sb->s_writers.rw_sem + level - 1, true); } static inline bool __sb_start_write_trylock(struct super_block *sb, int level) { return percpu_down_read_trylock(sb->s_writers.rw_sem + level - 1); } #define __sb_writers_acquired(sb, lev) \ percpu_rwsem_acquire(&(sb)->s_writers.rw_sem[(lev) - 1], 1, _THIS_IP_) #define __sb_writers_release(sb, lev) \ percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev) - 1], _THIS_IP_) /** * __sb_write_started - check if sb freeze level is held * @sb: the super we write to * @level: the freeze level * * * > 0 - sb freeze level is held * * 0 - sb freeze level is not held * * < 0 - !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN */ static inline int __sb_write_started(const struct super_block *sb, int level) { return lockdep_is_held_type(sb->s_writers.rw_sem + level - 1, 1); } /** * sb_write_started - check if SB_FREEZE_WRITE is held * @sb: the super we write to * * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN. */ static inline bool sb_write_started(const struct super_block *sb) { return __sb_write_started(sb, SB_FREEZE_WRITE); } /** * sb_write_not_started - check if SB_FREEZE_WRITE is not held * @sb: the super we write to * * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN. */ static inline bool sb_write_not_started(const struct super_block *sb) { return __sb_write_started(sb, SB_FREEZE_WRITE) <= 0; } /** * sb_end_write - drop write access to a superblock * @sb: the super we wrote to * * Decrement number of writers to the filesystem. Wake up possible waiters * wanting to freeze the filesystem. */ static inline void sb_end_write(struct super_block *sb) { __sb_end_write(sb, SB_FREEZE_WRITE); } /** * sb_end_pagefault - drop write access to a superblock from a page fault * @sb: the super we wrote to * * Decrement number of processes handling write page fault to the filesystem. * Wake up possible waiters wanting to freeze the filesystem. */ static inline void sb_end_pagefault(struct super_block *sb) { __sb_end_write(sb, SB_FREEZE_PAGEFAULT); } /** * sb_end_intwrite - drop write access to a superblock for internal fs purposes * @sb: the super we wrote to * * Decrement fs-internal number of writers to the filesystem. Wake up possible * waiters wanting to freeze the filesystem. */ static inline void sb_end_intwrite(struct super_block *sb) { __sb_end_write(sb, SB_FREEZE_FS); } /** * sb_start_write - get write access to a superblock * @sb: the super we write to * * When a process wants to write data or metadata to a file system (i.e. dirty * a page or an inode), it should embed the operation in a sb_start_write() - * sb_end_write() pair to get exclusion against file system freezing. This * function increments number of writers preventing freezing. If the file * system is already frozen, the function waits until the file system is * thawed. * * Since freeze protection behaves as a lock, users have to preserve * ordering of freeze protection and other filesystem locks. Generally, * freeze protection should be the outermost lock. In particular, we have: * * sb_start_write * -> i_rwsem (write path, truncate, directory ops, ...) * -> s_umount (freeze_super, thaw_super) */ static inline void sb_start_write(struct super_block *sb) { __sb_start_write(sb, SB_FREEZE_WRITE); } DEFINE_GUARD(super_write, struct super_block *, sb_start_write(_T), sb_end_write(_T)) static inline bool sb_start_write_trylock(struct super_block *sb) { return __sb_start_write_trylock(sb, SB_FREEZE_WRITE); } /** * sb_start_pagefault - get write access to a superblock from a page fault * @sb: the super we write to * * When a process starts handling write page fault, it should embed the * operation into sb_start_pagefault() - sb_end_pagefault() pair to get * exclusion against file system freezing. This is needed since the page fault * is going to dirty a page. This function increments number of running page * faults preventing freezing. If the file system is already frozen, the * function waits until the file system is thawed. * * Since page fault freeze protection behaves as a lock, users have to preserve * ordering of freeze protection and other filesystem locks. It is advised to * put sb_start_pagefault() close to mmap_lock in lock ordering. Page fault * handling code implies lock dependency: * * mmap_lock * -> sb_start_pagefault */ static inline void sb_start_pagefault(struct super_block *sb) { __sb_start_write(sb, SB_FREEZE_PAGEFAULT); } /** * sb_start_intwrite - get write access to a superblock for internal fs purposes * @sb: the super we write to * * This is the third level of protection against filesystem freezing. It is * free for use by a filesystem. The only requirement is that it must rank * below sb_start_pagefault. * * For example filesystem can call sb_start_intwrite() when starting a * transaction which somewhat eases handling of freezing for internal sources * of filesystem changes (internal fs threads, discarding preallocation on file * close, etc.). */ static inline void sb_start_intwrite(struct super_block *sb) { __sb_start_write(sb, SB_FREEZE_FS); } static inline bool sb_start_intwrite_trylock(struct super_block *sb) { return __sb_start_write_trylock(sb, SB_FREEZE_FS); } static inline bool sb_rdonly(const struct super_block *sb) { return sb->s_flags & SB_RDONLY; } static inline bool sb_is_blkdev_sb(struct super_block *sb) { return IS_ENABLED(CONFIG_BLOCK) && sb == blockdev_superblock; } #if IS_ENABLED(CONFIG_UNICODE) static inline struct unicode_map *sb_encoding(const struct super_block *sb) { return sb->s_encoding; } /* Compare if two super blocks have the same encoding and flags */ static inline bool sb_same_encoding(const struct super_block *sb1, const struct super_block *sb2) { if (sb1->s_encoding == sb2->s_encoding) return true; return (sb1->s_encoding && sb2->s_encoding && (sb1->s_encoding->version == sb2->s_encoding->version) && (sb1->s_encoding_flags == sb2->s_encoding_flags)); } #else static inline struct unicode_map *sb_encoding(const struct super_block *sb) { return NULL; } static inline bool sb_same_encoding(const struct super_block *sb1, const struct super_block *sb2) { return true; } #endif static inline bool sb_has_encoding(const struct super_block *sb) { return !!sb_encoding(sb); } int sb_set_blocksize(struct super_block *sb, int size); int __must_check sb_min_blocksize(struct super_block *sb, int size); int freeze_super(struct super_block *super, enum freeze_holder who, const void *freeze_owner); int thaw_super(struct super_block *super, enum freeze_holder who, const void *freeze_owner); #endif /* _LINUX_FS_SUPER_H */
46 3 45 46 47 45 46 2 44 46 45 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 // SPDX-License-Identifier: GPL-2.0-only /* * umh - the kernel usermode helper */ #include <linux/module.h> #include <linux/sched.h> #include <linux/sched/task.h> #include <linux/binfmts.h> #include <linux/syscalls.h> #include <linux/unistd.h> #include <linux/kmod.h> #include <linux/slab.h> #include <linux/completion.h> #include <linux/cred.h> #include <linux/file.h> #include <linux/fs_struct.h> #include <linux/workqueue.h> #include <linux/security.h> #include <linux/mount.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/resource.h> #include <linux/notifier.h> #include <linux/suspend.h> #include <linux/rwsem.h> #include <linux/ptrace.h> #include <linux/async.h> #include <linux/uaccess.h> #include <linux/initrd.h> #include <linux/freezer.h> #include <trace/events/module.h> static kernel_cap_t usermodehelper_bset = CAP_FULL_SET; static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET; static DEFINE_SPINLOCK(umh_sysctl_lock); static DECLARE_RWSEM(umhelper_sem); static void call_usermodehelper_freeinfo(struct subprocess_info *info) { if (info->cleanup) (*info->cleanup)(info); kfree(info); } static void umh_complete(struct subprocess_info *sub_info) { struct completion *comp = xchg(&sub_info->complete, NULL); /* * See call_usermodehelper_exec(). If xchg() returns NULL * we own sub_info, the UMH_KILLABLE caller has gone away * or the caller used UMH_NO_WAIT. */ if (comp) complete(comp); else call_usermodehelper_freeinfo(sub_info); } /* * This is the task which runs the usermode application */ static int call_usermodehelper_exec_async(void *data) { struct subprocess_info *sub_info = data; struct cred *new; int retval; spin_lock_irq(&current->sighand->siglock); flush_signal_handlers(current, 1); spin_unlock_irq(&current->sighand->siglock); /* * Initial kernel threads share ther FS with init, in order to * get the init root directory. But we've now created a new * thread that is going to execve a user process and has its own * 'struct fs_struct'. Reset umask to the default. */ current->fs->umask = 0022; /* * Our parent (unbound workqueue) runs with elevated scheduling * priority. Avoid propagating that into the userspace child. */ set_user_nice(current, 0); retval = -ENOMEM; new = prepare_kernel_cred(current); if (!new) goto out; spin_lock(&umh_sysctl_lock); new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset); new->cap_inheritable = cap_intersect(usermodehelper_inheritable, new->cap_inheritable); spin_unlock(&umh_sysctl_lock); if (sub_info->init) { retval = sub_info->init(sub_info, new); if (retval) { abort_creds(new); goto out; } } commit_creds(new); wait_for_initramfs(); retval = kernel_execve(sub_info->path, (const char *const *)sub_info->argv, (const char *const *)sub_info->envp); out: sub_info->retval = retval; /* * call_usermodehelper_exec_sync() will call umh_complete * if UHM_WAIT_PROC. */ if (!(sub_info->wait & UMH_WAIT_PROC)) umh_complete(sub_info); if (!retval) return 0; do_exit(0); } /* Handles UMH_WAIT_PROC. */ static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info) { pid_t pid; /* If SIGCLD is ignored do_wait won't populate the status. */ kernel_sigaction(SIGCHLD, SIG_DFL); pid = user_mode_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD); if (pid < 0) sub_info->retval = pid; else kernel_wait(pid, &sub_info->retval); /* Restore default kernel sig handler */ kernel_sigaction(SIGCHLD, SIG_IGN); umh_complete(sub_info); } /* * We need to create the usermodehelper kernel thread from a task that is affine * to an optimized set of CPUs (or nohz housekeeping ones) such that they * inherit a widest affinity irrespective of call_usermodehelper() callers with * possibly reduced affinity (eg: per-cpu workqueues). We don't want * usermodehelper targets to contend a busy CPU. * * Unbound workqueues provide such wide affinity and allow to block on * UMH_WAIT_PROC requests without blocking pending request (up to some limit). * * Besides, workqueues provide the privilege level that caller might not have * to perform the usermodehelper request. * */ static void call_usermodehelper_exec_work(struct work_struct *work) { struct subprocess_info *sub_info = container_of(work, struct subprocess_info, work); if (sub_info->wait & UMH_WAIT_PROC) { call_usermodehelper_exec_sync(sub_info); } else { pid_t pid; /* * Use CLONE_PARENT to reparent it to kthreadd; we do not * want to pollute current->children, and we need a parent * that always ignores SIGCHLD to ensure auto-reaping. */ pid = user_mode_thread(call_usermodehelper_exec_async, sub_info, CLONE_PARENT | SIGCHLD); if (pid < 0) { sub_info->retval = pid; umh_complete(sub_info); } } } /* * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY * (used for preventing user land processes from being created after the user * land has been frozen during a system-wide hibernation or suspend operation). * Should always be manipulated under umhelper_sem acquired for write. */ static enum umh_disable_depth usermodehelper_disabled = UMH_DISABLED; /* Number of helpers running */ static atomic_t running_helpers = ATOMIC_INIT(0); /* * Wait queue head used by usermodehelper_disable() to wait for all running * helpers to finish. */ static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq); /* * Used by usermodehelper_read_lock_wait() to wait for usermodehelper_disabled * to become 'false'. */ static DECLARE_WAIT_QUEUE_HEAD(usermodehelper_disabled_waitq); /* * Time to wait for running_helpers to become zero before the setting of * usermodehelper_disabled in usermodehelper_disable() fails */ #define RUNNING_HELPERS_TIMEOUT (5 * HZ) int usermodehelper_read_trylock(void) { DEFINE_WAIT(wait); int ret = 0; down_read(&umhelper_sem); for (;;) { prepare_to_wait(&usermodehelper_disabled_waitq, &wait, TASK_INTERRUPTIBLE); if (!usermodehelper_disabled) break; if (usermodehelper_disabled == UMH_DISABLED) ret = -EAGAIN; up_read(&umhelper_sem); if (ret) break; schedule(); try_to_freeze(); down_read(&umhelper_sem); } finish_wait(&usermodehelper_disabled_waitq, &wait); return ret; } EXPORT_SYMBOL_GPL(usermodehelper_read_trylock); long usermodehelper_read_lock_wait(long timeout) { DEFINE_WAIT(wait); if (timeout < 0) return -EINVAL; down_read(&umhelper_sem); for (;;) { prepare_to_wait(&usermodehelper_disabled_waitq, &wait, TASK_UNINTERRUPTIBLE); if (!usermodehelper_disabled) break; up_read(&umhelper_sem); timeout = schedule_timeout(timeout); if (!timeout) break; down_read(&umhelper_sem); } finish_wait(&usermodehelper_disabled_waitq, &wait); return timeout; } EXPORT_SYMBOL_GPL(usermodehelper_read_lock_wait); void usermodehelper_read_unlock(void) { up_read(&umhelper_sem); } EXPORT_SYMBOL_GPL(usermodehelper_read_unlock); /** * __usermodehelper_set_disable_depth - Modify usermodehelper_disabled. * @depth: New value to assign to usermodehelper_disabled. * * Change the value of usermodehelper_disabled (under umhelper_sem locked for * writing) and wakeup tasks waiting for it to change. */ void __usermodehelper_set_disable_depth(enum umh_disable_depth depth) { down_write(&umhelper_sem); usermodehelper_disabled = depth; wake_up(&usermodehelper_disabled_waitq); up_write(&umhelper_sem); } /** * __usermodehelper_disable - Prevent new helpers from being started. * @depth: New value to assign to usermodehelper_disabled. * * Set usermodehelper_disabled to @depth and wait for running helpers to exit. */ int __usermodehelper_disable(enum umh_disable_depth depth) { long retval; if (!depth) return -EINVAL; down_write(&umhelper_sem); usermodehelper_disabled = depth; up_write(&umhelper_sem); /* * From now on call_usermodehelper_exec() won't start any new * helpers, so it is sufficient if running_helpers turns out to * be zero at one point (it may be increased later, but that * doesn't matter). */ retval = wait_event_timeout(running_helpers_waitq, atomic_read(&running_helpers) == 0, RUNNING_HELPERS_TIMEOUT); if (retval) return 0; __usermodehelper_set_disable_depth(UMH_ENABLED); return -EAGAIN; } static void helper_lock(void) { atomic_inc(&running_helpers); smp_mb__after_atomic(); } static void helper_unlock(void) { if (atomic_dec_and_test(&running_helpers)) wake_up(&running_helpers_waitq); } /** * call_usermodehelper_setup - prepare to call a usermode helper * @path: path to usermode executable * @argv: arg vector for process * @envp: environment for process * @gfp_mask: gfp mask for memory allocation * @init: an init function * @cleanup: a cleanup function * @data: arbitrary context sensitive data * * Returns either %NULL on allocation failure, or a subprocess_info * structure. This should be passed to call_usermodehelper_exec to * exec the process and free the structure. * * The init function is used to customize the helper process prior to * exec. A non-zero return code causes the process to error out, exit, * and return the failure to the calling process * * The cleanup function is just before the subprocess_info is about to * be freed. This can be used for freeing the argv and envp. The * Function must be runnable in either a process context or the * context in which call_usermodehelper_exec is called. */ struct subprocess_info *call_usermodehelper_setup(const char *path, char **argv, char **envp, gfp_t gfp_mask, int (*init)(struct subprocess_info *info, struct cred *new), void (*cleanup)(struct subprocess_info *info), void *data) { struct subprocess_info *sub_info; sub_info = kzalloc_obj(struct subprocess_info, gfp_mask); if (!sub_info) goto out; INIT_WORK(&sub_info->work, call_usermodehelper_exec_work); #ifdef CONFIG_STATIC_USERMODEHELPER sub_info->path = CONFIG_STATIC_USERMODEHELPER_PATH; #else sub_info->path = path; #endif sub_info->argv = argv; sub_info->envp = envp; sub_info->cleanup = cleanup; sub_info->init = init; sub_info->data = data; out: return sub_info; } EXPORT_SYMBOL(call_usermodehelper_setup); /** * call_usermodehelper_exec - start a usermode application * @sub_info: information about the subprocess * @wait: wait for the application to finish and return status. * when UMH_NO_WAIT don't wait at all, but you get no useful error back * when the program couldn't be exec'ed. This makes it safe to call * from interrupt context. * * Runs a user-space application. The application is started * asynchronously if wait is not set, and runs as a child of system workqueues. * (ie. it runs with full root capabilities and optimized affinity). * * Note: successful return value does not guarantee the helper was called at * all. You can't rely on sub_info->{init,cleanup} being called even for * UMH_WAIT_* wait modes as STATIC_USERMODEHELPER_PATH="" turns all helpers * into a successful no-op. */ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) { unsigned int state = TASK_UNINTERRUPTIBLE; DECLARE_COMPLETION_ONSTACK(done); int retval = 0; if (!sub_info->path) { call_usermodehelper_freeinfo(sub_info); return -EINVAL; } helper_lock(); if (usermodehelper_disabled) { retval = -EBUSY; goto out; } /* * If there is no binary for us to call, then just return and get out of * here. This allows us to set STATIC_USERMODEHELPER_PATH to "" and * disable all call_usermodehelper() calls. */ if (strlen(sub_info->path) == 0) goto out; /* * Set the completion pointer only if there is a waiter. * This makes it possible to use umh_complete to free * the data structure in case of UMH_NO_WAIT. */ sub_info->complete = (wait == UMH_NO_WAIT) ? NULL : &done; sub_info->wait = wait; queue_work(system_unbound_wq, &sub_info->work); if (wait == UMH_NO_WAIT) /* task has freed sub_info */ goto unlock; if (wait & UMH_FREEZABLE) state |= TASK_FREEZABLE; if (wait & UMH_KILLABLE) { retval = wait_for_completion_state(&done, state | TASK_KILLABLE); if (!retval) goto wait_done; /* umh_complete() will see NULL and free sub_info */ if (xchg(&sub_info->complete, NULL)) goto unlock; /* * fallthrough; in case of -ERESTARTSYS now do uninterruptible * wait_for_completion_state(). Since umh_complete() shall call * complete() in a moment if xchg() above returned NULL, this * uninterruptible wait_for_completion_state() will not block * SIGKILL'ed processes for long. */ } wait_for_completion_state(&done, state); wait_done: retval = sub_info->retval; out: call_usermodehelper_freeinfo(sub_info); unlock: helper_unlock(); return retval; } EXPORT_SYMBOL(call_usermodehelper_exec); /** * call_usermodehelper() - prepare and start a usermode application * @path: path to usermode executable * @argv: arg vector for process * @envp: environment for process * @wait: wait for the application to finish and return status. * when UMH_NO_WAIT don't wait at all, but you get no useful error back * when the program couldn't be exec'ed. This makes it safe to call * from interrupt context. * * This function is the equivalent to use call_usermodehelper_setup() and * call_usermodehelper_exec(). */ int call_usermodehelper(const char *path, char **argv, char **envp, int wait) { struct subprocess_info *info; gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL; info = call_usermodehelper_setup(path, argv, envp, gfp_mask, NULL, NULL, NULL); if (info == NULL) return -ENOMEM; return call_usermodehelper_exec(info, wait); } EXPORT_SYMBOL(call_usermodehelper); #if defined(CONFIG_SYSCTL) static int proc_cap_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; unsigned long cap_array[2]; kernel_cap_t new_cap, *cap; int err; if (write && (!capable(CAP_SETPCAP) || !capable(CAP_SYS_MODULE))) return -EPERM; /* * convert from the global kernel_cap_t to the ulong array to print to * userspace if this is a read. * * Legacy format: capabilities are exposed as two 32-bit values */ cap = table->data; spin_lock(&umh_sysctl_lock); cap_array[0] = (u32) cap->val; cap_array[1] = cap->val >> 32; spin_unlock(&umh_sysctl_lock); t = *table; t.data = &cap_array; /* * actually read or write and array of ulongs from userspace. Remember * these are least significant 32 bits first */ err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos); if (err < 0) return err; new_cap.val = (u32)cap_array[0]; new_cap.val += (u64)cap_array[1] << 32; /* * Drop everything not in the new_cap (but don't add things) */ if (write) { spin_lock(&umh_sysctl_lock); *cap = cap_intersect(*cap, new_cap); spin_unlock(&umh_sysctl_lock); } return 0; } static const struct ctl_table usermodehelper_table[] = { { .procname = "bset", .data = &usermodehelper_bset, .maxlen = 2 * sizeof(unsigned long), .mode = 0600, .proc_handler = proc_cap_handler, }, { .procname = "inheritable", .data = &usermodehelper_inheritable, .maxlen = 2 * sizeof(unsigned long), .mode = 0600, .proc_handler = proc_cap_handler, }, }; static int __init init_umh_sysctls(void) { register_sysctl_init("kernel/usermodehelper", usermodehelper_table); return 0; } early_initcall(init_umh_sysctls); #endif /* CONFIG_SYSCTL */
1 1 6 1 4 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 // SPDX-License-Identifier: GPL-2.0-or-later /* * Squashfs - a compressed read only filesystem for Linux * * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 * Phillip Lougher <phillip@squashfs.org.uk> * * fragment.c */ /* * This file implements code to handle compressed fragments (tail-end packed * datablocks). * * Regular files contain a fragment index which is mapped to a fragment * location on disk and compressed size using a fragment lookup table. * Like everything in Squashfs this fragment lookup table is itself stored * compressed into metadata blocks. A second index table is used to locate * these. This second index table for speed of access (and because it * is small) is read at mount time and cached in memory. */ #include <linux/fs.h> #include <linux/vfs.h> #include <linux/slab.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" #include "squashfs.h" /* * Look-up fragment using the fragment index table. Return the on disk * location of the fragment and its compressed size */ int squashfs_frag_lookup(struct super_block *sb, unsigned int fragment, u64 *fragment_block) { struct squashfs_sb_info *msblk = sb->s_fs_info; int block, offset, size; struct squashfs_fragment_entry fragment_entry; u64 start_block; if (fragment >= msblk->fragments) return -EIO; block = SQUASHFS_FRAGMENT_INDEX(fragment); offset = SQUASHFS_FRAGMENT_INDEX_OFFSET(fragment); start_block = le64_to_cpu(msblk->fragment_index[block]); size = squashfs_read_metadata(sb, &fragment_entry, &start_block, &offset, sizeof(fragment_entry)); if (size < 0) return size; *fragment_block = le64_to_cpu(fragment_entry.start_block); return squashfs_block_size(fragment_entry.size); } /* * Read the uncompressed fragment lookup table indexes off disk into memory */ __le64 *squashfs_read_fragment_index_table(struct super_block *sb, u64 fragment_table_start, u64 next_table, unsigned int fragments) { unsigned int length = SQUASHFS_FRAGMENT_INDEX_BYTES(fragments); __le64 *table; /* * Sanity check, length bytes should not extend into the next table - * this check also traps instances where fragment_table_start is * incorrectly larger than the next table start */ if (fragment_table_start + length > next_table) return ERR_PTR(-EINVAL); table = squashfs_read_table(sb, fragment_table_start, length); /* * table[0] points to the first fragment table metadata block, this * should be less than fragment_table_start */ if (!IS_ERR(table) && le64_to_cpu(table[0]) >= fragment_table_start) { kfree(table); return ERR_PTR(-EINVAL); } return table; }
7 6 2 49 49 20 2 29 29 2 1 1 26 1 25 19 25 20 3 10 15 22 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) International Business Machines Corp., 2000-2002 * Portions Copyright (C) Christoph Hellwig, 2001-2002 */ #include <linux/mm.h> #include <linux/fs.h> #include <linux/filelock.h> #include <linux/posix_acl.h> #include <linux/quotaops.h> #include "jfs_incore.h" #include "jfs_inode.h" #include "jfs_dmap.h" #include "jfs_txnmgr.h" #include "jfs_xattr.h" #include "jfs_acl.h" #include "jfs_debug.h" int jfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; int rc = 0; rc = file_write_and_wait_range(file, start, end); if (rc) return rc; inode_lock(inode); if (!(inode_state_read_once(inode) & I_DIRTY_ALL) || (datasync && !(inode_state_read_once(inode) & I_DIRTY_DATASYNC))) { /* Make sure committed changes hit the disk */ jfs_flush_journal(JFS_SBI(inode->i_sb)->log, 1); inode_unlock(inode); return rc; } rc |= jfs_commit_inode(inode, 1); inode_unlock(inode); return rc ? -EIO : 0; } static int jfs_open(struct inode *inode, struct file *file) { int rc; if (S_ISREG(inode->i_mode) && inode->i_size < 0) return -EIO; if ((rc = dquot_file_open(inode, file))) return rc; /* * We attempt to allow only one "active" file open per aggregate * group. Otherwise, appending to files in parallel can cause * fragmentation within the files. * * If the file is empty, it was probably just created and going * to be written to. If it has a size, we'll hold off until the * file is actually grown. */ if (S_ISREG(inode->i_mode) && file->f_mode & FMODE_WRITE && (inode->i_size == 0)) { struct jfs_inode_info *ji = JFS_IP(inode); spin_lock_irq(&ji->ag_lock); if (ji->active_ag == -1) { struct jfs_sb_info *jfs_sb = JFS_SBI(inode->i_sb); ji->active_ag = BLKTOAG(addressPXD(&ji->ixpxd), jfs_sb); atomic_inc(&jfs_sb->bmap->db_active[ji->active_ag]); } spin_unlock_irq(&ji->ag_lock); } return 0; } static int jfs_release(struct inode *inode, struct file *file) { struct jfs_inode_info *ji = JFS_IP(inode); spin_lock_irq(&ji->ag_lock); if (ji->active_ag != -1) { struct bmap *bmap = JFS_SBI(inode->i_sb)->bmap; atomic_dec(&bmap->db_active[ji->active_ag]); ji->active_ag = -1; } spin_unlock_irq(&ji->ag_lock); return 0; } int jfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { struct inode *inode = d_inode(dentry); int rc; rc = setattr_prepare(&nop_mnt_idmap, dentry, iattr); if (rc) return rc; if (is_quota_modification(&nop_mnt_idmap, inode, iattr)) { rc = dquot_initialize(inode); if (rc) return rc; } if ((iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)) || (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid))) { rc = dquot_transfer(&nop_mnt_idmap, inode, iattr); if (rc) return rc; } if ((iattr->ia_valid & ATTR_SIZE) && iattr->ia_size != i_size_read(inode)) { inode_dio_wait(inode); rc = inode_newsize_ok(inode, iattr->ia_size); if (rc) return rc; truncate_setsize(inode, iattr->ia_size); jfs_truncate(inode); } setattr_copy(&nop_mnt_idmap, inode, iattr); mark_inode_dirty(inode); if (iattr->ia_valid & ATTR_MODE) rc = posix_acl_chmod(&nop_mnt_idmap, dentry, inode->i_mode); return rc; } const struct inode_operations jfs_file_inode_operations = { .listxattr = jfs_listxattr, .setattr = jfs_setattr, .fileattr_get = jfs_fileattr_get, .fileattr_set = jfs_fileattr_set, #ifdef CONFIG_JFS_POSIX_ACL .get_inode_acl = jfs_get_acl, .set_acl = jfs_set_acl, #endif }; const struct file_operations jfs_file_operations = { .open = jfs_open, .llseek = generic_file_llseek, .read_iter = generic_file_read_iter, .write_iter = generic_file_write_iter, .mmap_prepare = generic_file_mmap_prepare, .splice_read = filemap_splice_read, .splice_write = iter_file_splice_write, .fsync = jfs_fsync, .release = jfs_release, .unlocked_ioctl = jfs_ioctl, .compat_ioctl = compat_ptr_ioctl, .setlease = generic_setlease, };
4 6 6 1 4 1 11 11 11 1 1 10 3 4 5 3 3 3 3 38 28 2 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hpfs/dir.c * * Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999 * * directory VFS functions */ #include <linux/slab.h> #include "hpfs_fn.h" static int hpfs_dir_release(struct inode *inode, struct file *filp) { hpfs_lock(inode->i_sb); hpfs_del_pos(inode, &filp->f_pos); /*hpfs_write_if_changed(inode);*/ hpfs_unlock(inode->i_sb); return 0; } /* This is slow, but it's not used often */ static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence) { loff_t new_off = off + (whence == 1 ? filp->f_pos : 0); loff_t pos; struct quad_buffer_head qbh; struct inode *i = file_inode(filp); struct hpfs_inode_info *hpfs_inode = hpfs_i(i); struct super_block *s = i->i_sb; /* Somebody else will have to figure out what to do here */ if (whence == SEEK_DATA || whence == SEEK_HOLE) return -EINVAL; inode_lock(i); hpfs_lock(s); /*pr_info("dir lseek\n");*/ if (new_off == 0 || new_off == 1 || new_off == 11 || new_off == 12 || new_off == 13) goto ok; pos = ((loff_t) hpfs_de_as_down_as_possible(s, hpfs_inode->i_dno) << 4) + 1; while (pos != new_off) { if (map_pos_dirent(i, &pos, &qbh)) hpfs_brelse4(&qbh); else goto fail; if (pos == 12) goto fail; } if (unlikely(hpfs_add_pos(i, &filp->f_pos) < 0)) { hpfs_unlock(s); inode_unlock(i); return -ENOMEM; } ok: filp->f_pos = new_off; hpfs_unlock(s); inode_unlock(i); return new_off; fail: /*pr_warn("illegal lseek: %016llx\n", new_off);*/ hpfs_unlock(s); inode_unlock(i); return -ESPIPE; } static int hpfs_readdir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); struct hpfs_inode_info *hpfs_inode = hpfs_i(inode); struct quad_buffer_head qbh; struct hpfs_dirent *de; int lc; loff_t next_pos; unsigned char *tempname; int c1, c2 = 0; int ret = 0; hpfs_lock(inode->i_sb); if (hpfs_sb(inode->i_sb)->sb_chk) { if (hpfs_chk_sectors(inode->i_sb, inode->i_ino, 1, "dir_fnode")) { ret = -EFSERROR; goto out; } if (hpfs_chk_sectors(inode->i_sb, hpfs_inode->i_dno, 4, "dir_dnode")) { ret = -EFSERROR; goto out; } } if (hpfs_sb(inode->i_sb)->sb_chk >= 2) { struct buffer_head *bh; struct fnode *fno; int e = 0; if (!(fno = hpfs_map_fnode(inode->i_sb, inode->i_ino, &bh))) { ret = -EIOERROR; goto out; } if (!fnode_is_dir(fno)) { e = 1; hpfs_error(inode->i_sb, "not a directory, fnode %08llx", inode->i_ino); } if (hpfs_inode->i_dno != le32_to_cpu(fno->u.external[0].disk_secno)) { e = 1; hpfs_error(inode->i_sb, "corrupted inode: i_dno == %08x, fnode -> dnode == %08x", hpfs_inode->i_dno, le32_to_cpu(fno->u.external[0].disk_secno)); } brelse(bh); if (e) { ret = -EFSERROR; goto out; } } lc = hpfs_sb(inode->i_sb)->sb_lowercase; if (ctx->pos == 12) { /* diff -r requires this (note, that diff -r */ ctx->pos = 13; /* also fails on msdos filesystem in 2.0) */ goto out; } if (ctx->pos == 13) { ret = -ENOENT; goto out; } while (1) { again: /* This won't work when cycle is longer than number of dirents accepted by filldir, but what can I do? maybe killall -9 ls helps */ if (hpfs_sb(inode->i_sb)->sb_chk) if (hpfs_stop_cycles(inode->i_sb, ctx->pos, &c1, &c2, "hpfs_readdir")) { ret = -EFSERROR; goto out; } if (ctx->pos == 12) goto out; if (ctx->pos == 3 || ctx->pos == 4 || ctx->pos == 5) { pr_err("pos==%d\n", (int)ctx->pos); goto out; } if (ctx->pos == 0) { if (!dir_emit_dot(file, ctx)) goto out; ctx->pos = 11; } if (ctx->pos == 11) { if (!dir_emit(ctx, "..", 2, hpfs_inode->i_parent_dir, DT_DIR)) goto out; ctx->pos = 1; } if (ctx->pos == 1) { ret = hpfs_add_pos(inode, &file->f_pos); if (unlikely(ret < 0)) goto out; ctx->pos = ((loff_t) hpfs_de_as_down_as_possible(inode->i_sb, hpfs_inode->i_dno) << 4) + 1; } next_pos = ctx->pos; if (!(de = map_pos_dirent(inode, &next_pos, &qbh))) { ctx->pos = next_pos; ret = -EIOERROR; goto out; } if (de->first || de->last) { if (hpfs_sb(inode->i_sb)->sb_chk) { if (de->first && !de->last && (de->namelen != 2 || de ->name[0] != 1 || de->name[1] != 1)) hpfs_error(inode->i_sb, "hpfs_readdir: bad ^A^A entry; pos = %08lx", (unsigned long)ctx->pos); if (de->last && (de->namelen != 1 || de ->name[0] != 255)) hpfs_error(inode->i_sb, "hpfs_readdir: bad \\377 entry; pos = %08lx", (unsigned long)ctx->pos); } hpfs_brelse4(&qbh); ctx->pos = next_pos; goto again; } tempname = hpfs_translate_name(inode->i_sb, de->name, de->namelen, lc, de->not_8x3); if (!dir_emit(ctx, tempname, de->namelen, le32_to_cpu(de->fnode), DT_UNKNOWN)) { if (tempname != de->name) kfree(tempname); hpfs_brelse4(&qbh); goto out; } ctx->pos = next_pos; if (tempname != de->name) kfree(tempname); hpfs_brelse4(&qbh); } out: hpfs_unlock(inode->i_sb); return ret; } /* * lookup. Search the specified directory for the specified name, set * *result to the corresponding inode. * * lookup uses the inode number to tell read_inode whether it is reading * the inode of a directory or a file -- file ino's are odd, directory * ino's are even. read_inode avoids i/o for file inodes; everything * needed is up here in the directory. (And file fnodes are out in * the boondocks.) * * - M.P.: this is over, sometimes we've got to read file's fnode for eas * inode numbers are just fnode sector numbers; iget lock is used * to tell read_inode to read fnode or not. */ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { const unsigned char *name = dentry->d_name.name; unsigned len = dentry->d_name.len; struct quad_buffer_head qbh; struct hpfs_dirent *de; ino_t ino; int err; struct inode *result = NULL; struct hpfs_inode_info *hpfs_result; hpfs_lock(dir->i_sb); if ((err = hpfs_chk_name(name, &len))) { if (err == -ENAMETOOLONG) { hpfs_unlock(dir->i_sb); return ERR_PTR(-ENAMETOOLONG); } goto end_add; } /* * '.' and '..' will never be passed here. */ de = map_dirent(dir, hpfs_i(dir)->i_dno, name, len, NULL, &qbh); /* * This is not really a bailout, just means file not found. */ if (!de) goto end; /* * Get inode number, what we're after. */ ino = le32_to_cpu(de->fnode); /* * Go find or make an inode. */ result = iget_locked(dir->i_sb, ino); if (!result) { hpfs_error(dir->i_sb, "hpfs_lookup: can't get inode"); result = ERR_PTR(-ENOMEM); goto bail1; } if (inode_state_read_once(result) & I_NEW) { hpfs_init_inode(result); if (de->directory) hpfs_read_inode(result); else if (le32_to_cpu(de->ea_size) && hpfs_sb(dir->i_sb)->sb_eas) hpfs_read_inode(result); else { result->i_mode |= S_IFREG; result->i_mode &= ~0111; result->i_op = &hpfs_file_iops; result->i_fop = &hpfs_file_ops; set_nlink(result, 1); } unlock_new_inode(result); } hpfs_result = hpfs_i(result); if (!de->directory) hpfs_result->i_parent_dir = dir->i_ino; if (de->has_acl || de->has_xtd_perm) if (!sb_rdonly(dir->i_sb)) { hpfs_error(result->i_sb, "ACLs or XPERM found. This is probably HPFS386. This driver doesn't support it now. Send me some info on these structures"); iput(result); result = ERR_PTR(-EINVAL); goto bail1; } /* * Fill in the info from the directory if this is a newly created * inode. */ if (!inode_get_ctime_sec(result)) { time64_t csec = local_to_gmt(dir->i_sb, le32_to_cpu(de->creation_date)); inode_set_ctime(result, csec ? csec : 1, 0); inode_set_mtime(result, local_to_gmt(dir->i_sb, le32_to_cpu(de->write_date)), 0); inode_set_atime(result, local_to_gmt(dir->i_sb, le32_to_cpu(de->read_date)), 0); hpfs_result->i_ea_size = le32_to_cpu(de->ea_size); if (!hpfs_result->i_ea_mode && de->read_only) result->i_mode &= ~0222; if (!de->directory) { if (result->i_size == -1) { result->i_size = le32_to_cpu(de->file_size); result->i_data.a_ops = &hpfs_aops; hpfs_i(result)->mmu_private = result->i_size; /* * i_blocks should count the fnode and any anodes. * We count 1 for the fnode and don't bother about * anodes -- the disk heads are on the directory band * and we want them to stay there. */ result->i_blocks = 1 + ((result->i_size + 511) >> 9); } } } bail1: hpfs_brelse4(&qbh); /* * Made it. */ end: end_add: hpfs_unlock(dir->i_sb); return d_splice_alias(result, dentry); } const struct file_operations hpfs_dir_ops = { .llseek = hpfs_dir_lseek, .read = generic_read_dir, .iterate_shared = hpfs_readdir, .release = hpfs_dir_release, .fsync = hpfs_file_fsync, .unlocked_ioctl = hpfs_ioctl, .compat_ioctl = compat_ptr_ioctl, };
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 // SPDX-License-Identifier: GPL-2.0-only /* * kernel/freezer.c - Function to freeze a process * * Originally from kernel/power/process.c */ #include <linux/interrupt.h> #include <linux/suspend.h> #include <linux/export.h> #include <linux/syscalls.h> #include <linux/freezer.h> #include <linux/oom.h> #include <linux/kthread.h> /* total number of freezing conditions in effect */ DEFINE_STATIC_KEY_FALSE(freezer_active); EXPORT_SYMBOL(freezer_active); /* * indicate whether PM freezing is in effect, protected by * system_transition_mutex */ bool pm_freezing; bool pm_nosig_freezing; /* protects freezing and frozen transitions */ static DEFINE_SPINLOCK(freezer_lock); /** * freezing_slow_path - slow path for testing whether a task needs to be frozen * @p: task to be tested * * This function is called by freezing() if freezer_active isn't zero * and tests whether @p needs to enter and stay in frozen state. Can be * called under any context. The freezers are responsible for ensuring the * target tasks see the updated state. */ bool freezing_slow_path(struct task_struct *p) { if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK)) return false; if (tsk_is_oom_victim(p)) return false; if (pm_nosig_freezing || cgroup1_freezing(p)) return true; if (pm_freezing && !(p->flags & PF_KTHREAD)) return true; return false; } EXPORT_SYMBOL(freezing_slow_path); bool frozen(struct task_struct *p) { return READ_ONCE(p->__state) & TASK_FROZEN; } /* Refrigerator is place where frozen processes are stored :-). */ bool __refrigerator(bool check_kthr_stop) { unsigned int state = get_current_state(); bool was_frozen = false; pr_debug("%s entered refrigerator\n", current->comm); WARN_ON_ONCE(state && !(state & TASK_NORMAL)); for (;;) { bool freeze; raw_spin_lock_irq(&current->pi_lock); WRITE_ONCE(current->__state, TASK_FROZEN); /* unstale saved_state so that __thaw_task() will wake us up */ current->saved_state = TASK_RUNNING; raw_spin_unlock_irq(&current->pi_lock); spin_lock_irq(&freezer_lock); freeze = freezing(current) && !(check_kthr_stop && kthread_should_stop()); spin_unlock_irq(&freezer_lock); if (!freeze) break; was_frozen = true; schedule(); } __set_current_state(TASK_RUNNING); pr_debug("%s left refrigerator\n", current->comm); return was_frozen; } EXPORT_SYMBOL(__refrigerator); static void fake_signal_wake_up(struct task_struct *p) { unsigned long flags; if (lock_task_sighand(p, &flags)) { signal_wake_up(p, 0); unlock_task_sighand(p, &flags); } } static int __set_task_frozen(struct task_struct *p, void *arg) { unsigned int state = READ_ONCE(p->__state); /* * Allow freezing the sched_delayed tasks; they will not execute until * ttwu() fixes them up, so it is safe to swap their state now, instead * of waiting for them to get fully dequeued. */ if (task_is_runnable(p)) return 0; if (p != current && task_curr(p)) return 0; if (!(state & (TASK_FREEZABLE | __TASK_STOPPED | __TASK_TRACED))) return 0; /* * Only TASK_NORMAL can be augmented with TASK_FREEZABLE, since they * can suffer spurious wakeups. */ if (state & TASK_FREEZABLE) WARN_ON_ONCE(!(state & TASK_NORMAL)); #ifdef CONFIG_LOCKDEP /* * It's dangerous to freeze with locks held; there be dragons there. */ if (!(state & __TASK_FREEZABLE_UNSAFE)) WARN_ON_ONCE(debug_locks && p->lockdep_depth); #endif p->saved_state = p->__state; WRITE_ONCE(p->__state, TASK_FROZEN); return TASK_FROZEN; } static bool __freeze_task(struct task_struct *p) { /* TASK_FREEZABLE|TASK_STOPPED|TASK_TRACED -> TASK_FROZEN */ return task_call_func(p, __set_task_frozen, NULL); } /** * freeze_task - send a freeze request to given task * @p: task to send the request to * * If @p is freezing, the freeze request is sent either by sending a fake * signal (if it's not a kernel thread) or waking it up (if it's a kernel * thread). * * RETURNS: * %false, if @p is not freezing or already frozen; %true, otherwise */ bool freeze_task(struct task_struct *p) { unsigned long flags; spin_lock_irqsave(&freezer_lock, flags); if (!freezing(p) || frozen(p) || __freeze_task(p)) { spin_unlock_irqrestore(&freezer_lock, flags); return false; } if (!(p->flags & PF_KTHREAD)) fake_signal_wake_up(p); else wake_up_state(p, TASK_NORMAL); spin_unlock_irqrestore(&freezer_lock, flags); return true; } /* * Restore the saved_state before the task entered freezer. For typical task * in the __refrigerator(), saved_state == TASK_RUNNING so nothing happens * here. For tasks which were TASK_NORMAL | TASK_FREEZABLE, their initial state * is restored unless they got an expected wakeup (see ttwu_state_match()). * Returns 1 if the task state was restored. */ static int __restore_freezer_state(struct task_struct *p, void *arg) { unsigned int state = p->saved_state; if (state != TASK_RUNNING) { WRITE_ONCE(p->__state, state); p->saved_state = TASK_RUNNING; return 1; } return 0; } void __thaw_task(struct task_struct *p) { guard(spinlock_irqsave)(&freezer_lock); if (frozen(p) && !task_call_func(p, __restore_freezer_state, NULL)) wake_up_state(p, TASK_FROZEN); } /* * thaw_process - Thaw a frozen process * @p: the process to be thawed * * Iterate over all threads of @p and call __thaw_task() on each. */ void thaw_process(struct task_struct *p) { struct task_struct *t; rcu_read_lock(); for_each_thread(p, t) { __thaw_task(t); } rcu_read_unlock(); } /** * set_freezable - make %current freezable * * Mark %current freezable and enter refrigerator if necessary. */ bool set_freezable(void) { might_sleep(); /* * Modify flags while holding freezer_lock. This ensures the * freezer notices that we aren't frozen yet or the freezing * condition is visible to try_to_freeze() below. */ spin_lock_irq(&freezer_lock); current->flags &= ~PF_NOFREEZE; spin_unlock_irq(&freezer_lock); return try_to_freeze(); } EXPORT_SYMBOL(set_freezable);
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the * LICENSE file in the root directory of this source tree) and the GPLv2 (found * in the COPYING file in the root directory of this source tree). * You may select, at your option, one of the above-listed licenses. */ #ifndef ZSTD_COMMON_CPU_H #define ZSTD_COMMON_CPU_H /* * Implementation taken from folly/CpuId.h * https://github.com/facebook/folly/blob/master/folly/CpuId.h */ #include "mem.h" typedef struct { U32 f1c; U32 f1d; U32 f7b; U32 f7c; } ZSTD_cpuid_t; MEM_STATIC ZSTD_cpuid_t ZSTD_cpuid(void) { U32 f1c = 0; U32 f1d = 0; U32 f7b = 0; U32 f7c = 0; #if defined(__i386__) && defined(__PIC__) && !defined(__clang__) && defined(__GNUC__) /* The following block like the normal cpuid branch below, but gcc * reserves ebx for use of its pic register so we must specially * handle the save and restore to avoid clobbering the register */ U32 n; __asm__( "pushl %%ebx\n\t" "cpuid\n\t" "popl %%ebx\n\t" : "=a"(n) : "a"(0) : "ecx", "edx"); if (n >= 1) { U32 f1a; __asm__( "pushl %%ebx\n\t" "cpuid\n\t" "popl %%ebx\n\t" : "=a"(f1a), "=c"(f1c), "=d"(f1d) : "a"(1)); } if (n >= 7) { __asm__( "pushl %%ebx\n\t" "cpuid\n\t" "movl %%ebx, %%eax\n\t" "popl %%ebx" : "=a"(f7b), "=c"(f7c) : "a"(7), "c"(0) : "edx"); } #elif defined(__x86_64__) || defined(_M_X64) || defined(__i386__) U32 n; __asm__("cpuid" : "=a"(n) : "a"(0) : "ebx", "ecx", "edx"); if (n >= 1) { U32 f1a; __asm__("cpuid" : "=a"(f1a), "=c"(f1c), "=d"(f1d) : "a"(1) : "ebx"); } if (n >= 7) { U32 f7a; __asm__("cpuid" : "=a"(f7a), "=b"(f7b), "=c"(f7c) : "a"(7), "c"(0) : "edx"); } #endif { ZSTD_cpuid_t cpuid; cpuid.f1c = f1c; cpuid.f1d = f1d; cpuid.f7b = f7b; cpuid.f7c = f7c; return cpuid; } } #define X(name, r, bit) \ MEM_STATIC int ZSTD_cpuid_##name(ZSTD_cpuid_t const cpuid) { \ return ((cpuid.r) & (1U << bit)) != 0; \ } /* cpuid(1): Processor Info and Feature Bits. */ #define C(name, bit) X(name, f1c, bit) C(sse3, 0) C(pclmuldq, 1) C(dtes64, 2) C(monitor, 3) C(dscpl, 4) C(vmx, 5) C(smx, 6) C(eist, 7) C(tm2, 8) C(ssse3, 9) C(cnxtid, 10) C(fma, 12) C(cx16, 13) C(xtpr, 14) C(pdcm, 15) C(pcid, 17) C(dca, 18) C(sse41, 19) C(sse42, 20) C(x2apic, 21) C(movbe, 22) C(popcnt, 23) C(tscdeadline, 24) C(aes, 25) C(xsave, 26) C(osxsave, 27) C(avx, 28) C(f16c, 29) C(rdrand, 30) #undef C #define D(name, bit) X(name, f1d, bit) D(fpu, 0) D(vme, 1) D(de, 2) D(pse, 3) D(tsc, 4) D(msr, 5) D(pae, 6) D(mce, 7) D(cx8, 8) D(apic, 9) D(sep, 11) D(mtrr, 12) D(pge, 13) D(mca, 14) D(cmov, 15) D(pat, 16) D(pse36, 17) D(psn, 18) D(clfsh, 19) D(ds, 21) D(acpi, 22) D(mmx, 23) D(fxsr, 24) D(sse, 25) D(sse2, 26) D(ss, 27) D(htt, 28) D(tm, 29) D(pbe, 31) #undef D /* cpuid(7): Extended Features. */ #define B(name, bit) X(name, f7b, bit) B(bmi1, 3) B(hle, 4) B(avx2, 5) B(smep, 7) B(bmi2, 8) B(erms, 9) B(invpcid, 10) B(rtm, 11) B(mpx, 14) B(avx512f, 16) B(avx512dq, 17) B(rdseed, 18) B(adx, 19) B(smap, 20) B(avx512ifma, 21) B(pcommit, 22) B(clflushopt, 23) B(clwb, 24) B(avx512pf, 26) B(avx512er, 27) B(avx512cd, 28) B(sha, 29) B(avx512bw, 30) B(avx512vl, 31) #undef B #define C(name, bit) X(name, f7c, bit) C(prefetchwt1, 0) C(avx512vbmi, 1) #undef C #undef X #endif /* ZSTD_COMMON_CPU_H */
59 3641 3641 3641 3590 1978 457 3616 3594 1719 663 663 662 457 3633 2717 3632 294 477 2559 2718 3638 2716 1 3635 3630 3635 1 3631 3632 205 3229 3234 61 3223 1115 885 1581 2964 2077 1089 237 1832 1967 1502 1332 1916 662 345 501 663 3577 3003 3482 3585 3631 3590 3631 2253 2605 3583 44 3517 101 3631 74 252 252 79 295 3632 204 3635 380 3632 656 134 113 82 133 135 16 136 11 16 135 346 3633 3637 3556 346 3628 1918 1094 3638 3628 3630 3638 3624 1916 2816 3457 136 132 136 136 136 136 135 3631 3 3633 3629 3 3 3 3 135 1 1 1 1 3631 3636 3636 3632 347 3558 3625 3641 3633 3638 3633 3635 59 3552 3641 3640 135 4 136 136 135 136 766 1915 2 1370 1371 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 // SPDX-License-Identifier: GPL-2.0-only /* * mm/percpu.c - percpu memory allocator * * Copyright (C) 2009 SUSE Linux Products GmbH * Copyright (C) 2009 Tejun Heo <tj@kernel.org> * * Copyright (C) 2017 Facebook Inc. * Copyright (C) 2017 Dennis Zhou <dennis@kernel.org> * * The percpu allocator handles both static and dynamic areas. Percpu * areas are allocated in chunks which are divided into units. There is * a 1-to-1 mapping for units to possible cpus. These units are grouped * based on NUMA properties of the machine. * * c0 c1 c2 * ------------------- ------------------- ------------ * | u0 | u1 | u2 | u3 | | u0 | u1 | u2 | u3 | | u0 | u1 | u * ------------------- ...... ------------------- .... ------------ * * Allocation is done by offsets into a unit's address space. Ie., an * area of 512 bytes at 6k in c1 occupies 512 bytes at 6k in c1:u0, * c1:u1, c1:u2, etc. On NUMA machines, the mapping may be non-linear * and even sparse. Access is handled by configuring percpu base * registers according to the cpu to unit mappings and offsetting the * base address using pcpu_unit_size. * * There is special consideration for the first chunk which must handle * the static percpu variables in the kernel image as allocation services * are not online yet. In short, the first chunk is structured like so: * * <Static | [Reserved] | Dynamic> * * The static data is copied from the original section managed by the * linker. The reserved section, if non-zero, primarily manages static * percpu variables from kernel modules. Finally, the dynamic section * takes care of normal allocations. * * The allocator organizes chunks into lists according to free size and * memcg-awareness. To make a percpu allocation memcg-aware the __GFP_ACCOUNT * flag should be passed. All memcg-aware allocations are sharing one set * of chunks and all unaccounted allocations and allocations performed * by processes belonging to the root memory cgroup are using the second set. * * The allocator tries to allocate from the fullest chunk first. Each chunk * is managed by a bitmap with metadata blocks. The allocation map is updated * on every allocation and free to reflect the current state while the boundary * map is only updated on allocation. Each metadata block contains * information to help mitigate the need to iterate over large portions * of the bitmap. The reverse mapping from page to chunk is stored in * the page's index. Lastly, units are lazily backed and grow in unison. * * There is a unique conversion that goes on here between bytes and bits. * Each bit represents a fragment of size PCPU_MIN_ALLOC_SIZE. The chunk * tracks the number of pages it is responsible for in nr_pages. Helper * functions are used to convert from between the bytes, bits, and blocks. * All hints are managed in bits unless explicitly stated. * * To use this allocator, arch code should do the following: * * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate * regular address to percpu pointer and back if they need to be * different from the default * * - use pcpu_setup_first_chunk() during percpu area initialization to * setup the first chunk containing the kernel static percpu area */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/bitmap.h> #include <linux/cpumask.h> #include <linux/memblock.h> #include <linux/err.h> #include <linux/list.h> #include <linux/log2.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/percpu.h> #include <linux/pfn.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/vmalloc.h> #include <linux/workqueue.h> #include <linux/kmemleak.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/memcontrol.h> #include <asm/cacheflush.h> #include <asm/sections.h> #include <asm/tlbflush.h> #include <asm/io.h> #define CREATE_TRACE_POINTS #include <trace/events/percpu.h> #include "percpu-internal.h" /* * The slots are sorted by the size of the biggest continuous free area. * 1-31 bytes share the same slot. */ #define PCPU_SLOT_BASE_SHIFT 5 /* chunks in slots below this are subject to being sidelined on failed alloc */ #define PCPU_SLOT_FAIL_THRESHOLD 3 #define PCPU_EMPTY_POP_PAGES_LOW 2 #define PCPU_EMPTY_POP_PAGES_HIGH 4 #ifdef CONFIG_SMP /* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ #ifndef __addr_to_pcpu_ptr #define __addr_to_pcpu_ptr(addr) \ (void __percpu *)((unsigned long)(addr) - \ (unsigned long)pcpu_base_addr + \ (unsigned long)__per_cpu_start) #endif #ifndef __pcpu_ptr_to_addr #define __pcpu_ptr_to_addr(ptr) \ (void __force *)((unsigned long)(ptr) + \ (unsigned long)pcpu_base_addr - \ (unsigned long)__per_cpu_start) #endif #else /* CONFIG_SMP */ /* on UP, it's always identity mapped */ #define __addr_to_pcpu_ptr(addr) (void __percpu *)(addr) #define __pcpu_ptr_to_addr(ptr) (void __force *)(ptr) #endif /* CONFIG_SMP */ static int pcpu_unit_pages __ro_after_init; static int pcpu_unit_size __ro_after_init; static int pcpu_nr_units __ro_after_init; static int pcpu_atom_size __ro_after_init; int pcpu_nr_slots __ro_after_init; static int pcpu_free_slot __ro_after_init; int pcpu_sidelined_slot __ro_after_init; int pcpu_to_depopulate_slot __ro_after_init; static size_t pcpu_chunk_struct_size __ro_after_init; /* cpus with the lowest and highest unit addresses */ static unsigned int pcpu_low_unit_cpu __ro_after_init; static unsigned int pcpu_high_unit_cpu __ro_after_init; /* the address of the first chunk which starts with the kernel static area */ void *pcpu_base_addr __ro_after_init; static const int *pcpu_unit_map __ro_after_init; /* cpu -> unit */ const unsigned long *pcpu_unit_offsets __ro_after_init; /* cpu -> unit offset */ /* group information, used for vm allocation */ static int pcpu_nr_groups __ro_after_init; static const unsigned long *pcpu_group_offsets __ro_after_init; static const size_t *pcpu_group_sizes __ro_after_init; /* * The first chunk which always exists. Note that unlike other * chunks, this one can be allocated and mapped in several different * ways and thus often doesn't live in the vmalloc area. */ struct pcpu_chunk *pcpu_first_chunk __ro_after_init; /* * Optional reserved chunk. This chunk reserves part of the first * chunk and serves it for reserved allocations. When the reserved * region doesn't exist, the following variable is NULL. */ struct pcpu_chunk *pcpu_reserved_chunk __ro_after_init; DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */ static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop, map ext */ struct list_head *pcpu_chunk_lists __ro_after_init; /* chunk list slots */ /* * The number of empty populated pages, protected by pcpu_lock. * The reserved chunk doesn't contribute to the count. */ int pcpu_nr_empty_pop_pages; /* * The number of populated pages in use by the allocator, protected by * pcpu_lock. This number is kept per a unit per chunk (i.e. when a page gets * allocated/deallocated, it is allocated/deallocated in all units of a chunk * and increments/decrements this count by 1). */ static unsigned long pcpu_nr_populated; /* * Balance work is used to populate or destroy chunks asynchronously. We * try to keep the number of populated free pages between * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one * empty chunk. */ static void pcpu_balance_workfn(struct work_struct *work); static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn); static bool pcpu_async_enabled __read_mostly; static bool pcpu_atomic_alloc_failed; static void pcpu_schedule_balance_work(void) { if (pcpu_async_enabled) schedule_work(&pcpu_balance_work); } /** * pcpu_addr_in_chunk - check if the address is served from this chunk * @chunk: chunk of interest * @addr: percpu address * * RETURNS: * True if the address is served from this chunk. */ static bool pcpu_addr_in_chunk(struct pcpu_chunk *chunk, void *addr) { void *start_addr, *end_addr; if (!chunk) return false; start_addr = chunk->base_addr + chunk->start_offset; end_addr = chunk->base_addr + chunk->nr_pages * PAGE_SIZE - chunk->end_offset; return addr >= start_addr && addr < end_addr; } static int __pcpu_size_to_slot(int size) { int highbit = fls(size); /* size is in bytes */ return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1); } static int pcpu_size_to_slot(int size) { if (size == pcpu_unit_size) return pcpu_free_slot; return __pcpu_size_to_slot(size); } static int pcpu_chunk_slot(const struct pcpu_chunk *chunk) { const struct pcpu_block_md *chunk_md = &chunk->chunk_md; if (chunk->free_bytes < PCPU_MIN_ALLOC_SIZE || chunk_md->contig_hint == 0) return 0; return pcpu_size_to_slot(chunk_md->contig_hint * PCPU_MIN_ALLOC_SIZE); } /* set the pointer to a chunk in a page struct */ static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu) { page->private = (unsigned long)pcpu; } /* obtain pointer to a chunk from a page struct */ static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page) { return (struct pcpu_chunk *)page->private; } static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx) { return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx; } static unsigned long pcpu_unit_page_offset(unsigned int cpu, int page_idx) { return pcpu_unit_offsets[cpu] + (page_idx << PAGE_SHIFT); } static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, unsigned int cpu, int page_idx) { return (unsigned long)chunk->base_addr + pcpu_unit_page_offset(cpu, page_idx); } /* * The following are helper functions to help access bitmaps and convert * between bitmap offsets to address offsets. */ static unsigned long *pcpu_index_alloc_map(struct pcpu_chunk *chunk, int index) { return chunk->alloc_map + (index * PCPU_BITMAP_BLOCK_BITS / BITS_PER_LONG); } static unsigned long pcpu_off_to_block_index(int off) { return off / PCPU_BITMAP_BLOCK_BITS; } static unsigned long pcpu_off_to_block_off(int off) { return off & (PCPU_BITMAP_BLOCK_BITS - 1); } static unsigned long pcpu_block_off_to_off(int index, int off) { return index * PCPU_BITMAP_BLOCK_BITS + off; } /** * pcpu_check_block_hint - check against the contig hint * @block: block of interest * @bits: size of allocation * @align: alignment of area (max PAGE_SIZE) * * Check to see if the allocation can fit in the block's contig hint. * Note, a chunk uses the same hints as a block so this can also check against * the chunk's contig hint. */ static bool pcpu_check_block_hint(struct pcpu_block_md *block, int bits, size_t align) { int bit_off = ALIGN(block->contig_hint_start, align) - block->contig_hint_start; return bit_off + bits <= block->contig_hint; } /* * pcpu_next_hint - determine which hint to use * @block: block of interest * @alloc_bits: size of allocation * * This determines if we should scan based on the scan_hint or first_free. * In general, we want to scan from first_free to fulfill allocations by * first fit. However, if we know a scan_hint at position scan_hint_start * cannot fulfill an allocation, we can begin scanning from there knowing * the contig_hint will be our fallback. */ static int pcpu_next_hint(struct pcpu_block_md *block, int alloc_bits) { /* * The three conditions below determine if we can skip past the * scan_hint. First, does the scan hint exist. Second, is the * contig_hint after the scan_hint (possibly not true iff * contig_hint == scan_hint). Third, is the allocation request * larger than the scan_hint. */ if (block->scan_hint && block->contig_hint_start > block->scan_hint_start && alloc_bits > block->scan_hint) return block->scan_hint_start + block->scan_hint; return block->first_free; } /** * pcpu_next_md_free_region - finds the next hint free area * @chunk: chunk of interest * @bit_off: chunk offset * @bits: size of free area * * Helper function for pcpu_for_each_md_free_region. It checks * block->contig_hint and performs aggregation across blocks to find the * next hint. It modifies bit_off and bits in-place to be consumed in the * loop. */ static void pcpu_next_md_free_region(struct pcpu_chunk *chunk, int *bit_off, int *bits) { int i = pcpu_off_to_block_index(*bit_off); int block_off = pcpu_off_to_block_off(*bit_off); struct pcpu_block_md *block; *bits = 0; for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk); block++, i++) { /* handles contig area across blocks */ if (*bits) { *bits += block->left_free; if (block->left_free == PCPU_BITMAP_BLOCK_BITS) continue; return; } /* * This checks three things. First is there a contig_hint to * check. Second, have we checked this hint before by * comparing the block_off. Third, is this the same as the * right contig hint. In the last case, it spills over into * the next block and should be handled by the contig area * across blocks code. */ *bits = block->contig_hint; if (*bits && block->contig_hint_start >= block_off && *bits + block->contig_hint_start < PCPU_BITMAP_BLOCK_BITS) { *bit_off = pcpu_block_off_to_off(i, block->contig_hint_start); return; } /* reset to satisfy the second predicate above */ block_off = 0; *bits = block->right_free; *bit_off = (i + 1) * PCPU_BITMAP_BLOCK_BITS - block->right_free; } } /** * pcpu_next_fit_region - finds fit areas for a given allocation request * @chunk: chunk of interest * @alloc_bits: size of allocation * @align: alignment of area (max PAGE_SIZE) * @bit_off: chunk offset * @bits: size of free area * * Finds the next free region that is viable for use with a given size and * alignment. This only returns if there is a valid area to be used for this * allocation. block->first_free is returned if the allocation request fits * within the block to see if the request can be fulfilled prior to the contig * hint. */ static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits, int align, int *bit_off, int *bits) { int i = pcpu_off_to_block_index(*bit_off); int block_off = pcpu_off_to_block_off(*bit_off); struct pcpu_block_md *block; *bits = 0; for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk); block++, i++) { /* handles contig area across blocks */ if (*bits) { *bits += block->left_free; if (*bits >= alloc_bits) return; if (block->left_free == PCPU_BITMAP_BLOCK_BITS) continue; } /* check block->contig_hint */ *bits = ALIGN(block->contig_hint_start, align) - block->contig_hint_start; /* * This uses the block offset to determine if this has been * checked in the prior iteration. */ if (block->contig_hint && block->contig_hint_start >= block_off && block->contig_hint >= *bits + alloc_bits) { int start = pcpu_next_hint(block, alloc_bits); *bits += alloc_bits + block->contig_hint_start - start; *bit_off = pcpu_block_off_to_off(i, start); return; } /* reset to satisfy the second predicate above */ block_off = 0; *bit_off = ALIGN(PCPU_BITMAP_BLOCK_BITS - block->right_free, align); *bits = PCPU_BITMAP_BLOCK_BITS - *bit_off; *bit_off = pcpu_block_off_to_off(i, *bit_off); if (*bits >= alloc_bits) return; } /* no valid offsets were found - fail condition */ *bit_off = pcpu_chunk_map_bits(chunk); } /* * Metadata free area iterators. These perform aggregation of free areas * based on the metadata blocks and return the offset @bit_off and size in * bits of the free area @bits. pcpu_for_each_fit_region only returns when * a fit is found for the allocation request. */ #define pcpu_for_each_md_free_region(chunk, bit_off, bits) \ for (pcpu_next_md_free_region((chunk), &(bit_off), &(bits)); \ (bit_off) < pcpu_chunk_map_bits((chunk)); \ (bit_off) += (bits) + 1, \ pcpu_next_md_free_region((chunk), &(bit_off), &(bits))) #define pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits) \ for (pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \ &(bits)); \ (bit_off) < pcpu_chunk_map_bits((chunk)); \ (bit_off) += (bits), \ pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \ &(bits))) /** * pcpu_mem_zalloc - allocate memory * @size: bytes to allocate * @gfp: allocation flags * * Allocate @size bytes. If @size is smaller than PAGE_SIZE, * kzalloc() is used; otherwise, the equivalent of vzalloc() is used. * This is to facilitate passing through whitelisted flags. The * returned memory is always zeroed. * * RETURNS: * Pointer to the allocated area on success, NULL on failure. */ static void *pcpu_mem_zalloc(size_t size, gfp_t gfp) { if (WARN_ON_ONCE(!slab_is_available())) return NULL; if (size <= PAGE_SIZE) return kzalloc(size, gfp); else return __vmalloc(size, gfp | __GFP_ZERO); } /** * pcpu_mem_free - free memory * @ptr: memory to free * * Free @ptr. @ptr should have been allocated using pcpu_mem_zalloc(). */ static void pcpu_mem_free(void *ptr) { kvfree(ptr); } static void __pcpu_chunk_move(struct pcpu_chunk *chunk, int slot, bool move_front) { if (chunk != pcpu_reserved_chunk) { if (move_front) list_move(&chunk->list, &pcpu_chunk_lists[slot]); else list_move_tail(&chunk->list, &pcpu_chunk_lists[slot]); } } static void pcpu_chunk_move(struct pcpu_chunk *chunk, int slot) { __pcpu_chunk_move(chunk, slot, true); } /** * pcpu_chunk_relocate - put chunk in the appropriate chunk slot * @chunk: chunk of interest * @oslot: the previous slot it was on * * This function is called after an allocation or free changed @chunk. * New slot according to the changed state is determined and @chunk is * moved to the slot. Note that the reserved chunk is never put on * chunk slots. * * CONTEXT: * pcpu_lock. */ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) { int nslot = pcpu_chunk_slot(chunk); /* leave isolated chunks in-place */ if (chunk->isolated) return; if (oslot != nslot) __pcpu_chunk_move(chunk, nslot, oslot < nslot); } static void pcpu_isolate_chunk(struct pcpu_chunk *chunk) { lockdep_assert_held(&pcpu_lock); if (!chunk->isolated) { chunk->isolated = true; pcpu_nr_empty_pop_pages -= chunk->nr_empty_pop_pages; } list_move(&chunk->list, &pcpu_chunk_lists[pcpu_to_depopulate_slot]); } static void pcpu_reintegrate_chunk(struct pcpu_chunk *chunk) { lockdep_assert_held(&pcpu_lock); if (chunk->isolated) { chunk->isolated = false; pcpu_nr_empty_pop_pages += chunk->nr_empty_pop_pages; pcpu_chunk_relocate(chunk, -1); } } /* * pcpu_update_empty_pages - update empty page counters * @chunk: chunk of interest * @nr: nr of empty pages * * This is used to keep track of the empty pages now based on the premise * a md_block covers a page. The hint update functions recognize if a block * is made full or broken to calculate deltas for keeping track of free pages. */ static inline void pcpu_update_empty_pages(struct pcpu_chunk *chunk, int nr) { chunk->nr_empty_pop_pages += nr; if (chunk != pcpu_reserved_chunk && !chunk->isolated) pcpu_nr_empty_pop_pages += nr; } /* * pcpu_region_overlap - determines if two regions overlap * @a: start of first region, inclusive * @b: end of first region, exclusive * @x: start of second region, inclusive * @y: end of second region, exclusive * * This is used to determine if the hint region [a, b) overlaps with the * allocated region [x, y). */ static inline bool pcpu_region_overlap(int a, int b, int x, int y) { return (a < y) && (x < b); } /** * pcpu_block_update - updates a block given a free area * @block: block of interest * @start: start offset in block * @end: end offset in block * * Updates a block given a known free area. The region [start, end) is * expected to be the entirety of the free area within a block. Chooses * the best starting offset if the contig hints are equal. */ static void pcpu_block_update(struct pcpu_block_md *block, int start, int end) { int contig = end - start; block->first_free = min(block->first_free, start); if (start == 0) block->left_free = contig; if (end == block->nr_bits) block->right_free = contig; if (contig > block->contig_hint) { /* promote the old contig_hint to be the new scan_hint */ if (start > block->contig_hint_start) { if (block->contig_hint > block->scan_hint) { block->scan_hint_start = block->contig_hint_start; block->scan_hint = block->contig_hint; } else if (start < block->scan_hint_start) { /* * The old contig_hint == scan_hint. But, the * new contig is larger so hold the invariant * scan_hint_start < contig_hint_start. */ block->scan_hint = 0; } } else { block->scan_hint = 0; } block->contig_hint_start = start; block->contig_hint = contig; } else if (contig == block->contig_hint) { if (block->contig_hint_start && (!start || __ffs(start) > __ffs(block->contig_hint_start))) { /* start has a better alignment so use it */ block->contig_hint_start = start; if (start < block->scan_hint_start && block->contig_hint > block->scan_hint) block->scan_hint = 0; } else if (start > block->scan_hint_start || block->contig_hint > block->scan_hint) { /* * Knowing contig == contig_hint, update the scan_hint * if it is farther than or larger than the current * scan_hint. */ block->scan_hint_start = start; block->scan_hint = contig; } } else { /* * The region is smaller than the contig_hint. So only update * the scan_hint if it is larger than or equal and farther than * the current scan_hint. */ if ((start < block->contig_hint_start && (contig > block->scan_hint || (contig == block->scan_hint && start > block->scan_hint_start)))) { block->scan_hint_start = start; block->scan_hint = contig; } } } /* * pcpu_block_update_scan - update a block given a free area from a scan * @chunk: chunk of interest * @bit_off: chunk offset * @bits: size of free area * * Finding the final allocation spot first goes through pcpu_find_block_fit() * to find a block that can hold the allocation and then pcpu_alloc_area() * where a scan is used. When allocations require specific alignments, * we can inadvertently create holes which will not be seen in the alloc * or free paths. * * This takes a given free area hole and updates a block as it may change the * scan_hint. We need to scan backwards to ensure we don't miss free bits * from alignment. */ static void pcpu_block_update_scan(struct pcpu_chunk *chunk, int bit_off, int bits) { int s_off = pcpu_off_to_block_off(bit_off); int e_off = s_off + bits; int s_index, l_bit; struct pcpu_block_md *block; if (e_off > PCPU_BITMAP_BLOCK_BITS) return; s_index = pcpu_off_to_block_index(bit_off); block = chunk->md_blocks + s_index; /* scan backwards in case of alignment skipping free bits */ l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index), s_off); s_off = (s_off == l_bit) ? 0 : l_bit + 1; pcpu_block_update(block, s_off, e_off); } /** * pcpu_chunk_refresh_hint - updates metadata about a chunk * @chunk: chunk of interest * @full_scan: if we should scan from the beginning * * Iterates over the metadata blocks to find the largest contig area. * A full scan can be avoided on the allocation path as this is triggered * if we broke the contig_hint. In doing so, the scan_hint will be before * the contig_hint or after if the scan_hint == contig_hint. This cannot * be prevented on freeing as we want to find the largest area possibly * spanning blocks. */ static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk, bool full_scan) { struct pcpu_block_md *chunk_md = &chunk->chunk_md; int bit_off, bits; /* promote scan_hint to contig_hint */ if (!full_scan && chunk_md->scan_hint) { bit_off = chunk_md->scan_hint_start + chunk_md->scan_hint; chunk_md->contig_hint_start = chunk_md->scan_hint_start; chunk_md->contig_hint = chunk_md->scan_hint; chunk_md->scan_hint = 0; } else { bit_off = chunk_md->first_free; chunk_md->contig_hint = 0; } bits = 0; pcpu_for_each_md_free_region(chunk, bit_off, bits) pcpu_block_update(chunk_md, bit_off, bit_off + bits); } /** * pcpu_block_refresh_hint * @chunk: chunk of interest * @index: index of the metadata block * * Scans over the block beginning at first_free and updates the block * metadata accordingly. */ static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index) { struct pcpu_block_md *block = chunk->md_blocks + index; unsigned long *alloc_map = pcpu_index_alloc_map(chunk, index); unsigned int start, end; /* region start, region end */ /* promote scan_hint to contig_hint */ if (block->scan_hint) { start = block->scan_hint_start + block->scan_hint; block->contig_hint_start = block->scan_hint_start; block->contig_hint = block->scan_hint; block->scan_hint = 0; } else { start = block->first_free; block->contig_hint = 0; } block->right_free = 0; /* iterate over free areas and update the contig hints */ for_each_clear_bitrange_from(start, end, alloc_map, PCPU_BITMAP_BLOCK_BITS) pcpu_block_update(block, start, end); } /** * pcpu_block_update_hint_alloc - update hint on allocation path * @chunk: chunk of interest * @bit_off: chunk offset * @bits: size of request * * Updates metadata for the allocation path. The metadata only has to be * refreshed by a full scan iff the chunk's contig hint is broken. Block level * scans are required if the block's contig hint is broken. */ static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off, int bits) { struct pcpu_block_md *chunk_md = &chunk->chunk_md; int nr_empty_pages = 0; struct pcpu_block_md *s_block, *e_block, *block; int s_index, e_index; /* block indexes of the freed allocation */ int s_off, e_off; /* block offsets of the freed allocation */ /* * Calculate per block offsets. * The calculation uses an inclusive range, but the resulting offsets * are [start, end). e_index always points to the last block in the * range. */ s_index = pcpu_off_to_block_index(bit_off); e_index = pcpu_off_to_block_index(bit_off + bits - 1); s_off = pcpu_off_to_block_off(bit_off); e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1; s_block = chunk->md_blocks + s_index; e_block = chunk->md_blocks + e_index; /* * Update s_block. */ if (s_block->contig_hint == PCPU_BITMAP_BLOCK_BITS) nr_empty_pages++; /* * block->first_free must be updated if the allocation takes its place. * If the allocation breaks the contig_hint, a scan is required to * restore this hint. */ if (s_off == s_block->first_free) s_block->first_free = find_next_zero_bit( pcpu_index_alloc_map(chunk, s_index), PCPU_BITMAP_BLOCK_BITS, s_off + bits); if (pcpu_region_overlap(s_block->scan_hint_start, s_block->scan_hint_start + s_block->scan_hint, s_off, s_off + bits)) s_block->scan_hint = 0; if (pcpu_region_overlap(s_block->contig_hint_start, s_block->contig_hint_start + s_block->contig_hint, s_off, s_off + bits)) { /* block contig hint is broken - scan to fix it */ if (!s_off) s_block->left_free = 0; pcpu_block_refresh_hint(chunk, s_index); } else { /* update left and right contig manually */ s_block->left_free = min(s_block->left_free, s_off); if (s_index == e_index) s_block->right_free = min_t(int, s_block->right_free, PCPU_BITMAP_BLOCK_BITS - e_off); else s_block->right_free = 0; } /* * Update e_block. */ if (s_index != e_index) { if (e_block->contig_hint == PCPU_BITMAP_BLOCK_BITS) nr_empty_pages++; /* * When the allocation is across blocks, the end is along * the left part of the e_block. */ e_block->first_free = find_next_zero_bit( pcpu_index_alloc_map(chunk, e_index), PCPU_BITMAP_BLOCK_BITS, e_off); if (e_off == PCPU_BITMAP_BLOCK_BITS) { /* reset the block */ e_block++; } else { if (e_off > e_block->scan_hint_start) e_block->scan_hint = 0; e_block->left_free = 0; if (e_off > e_block->contig_hint_start) { /* contig hint is broken - scan to fix it */ pcpu_block_refresh_hint(chunk, e_index); } else { e_block->right_free = min_t(int, e_block->right_free, PCPU_BITMAP_BLOCK_BITS - e_off); } } /* update in-between md_blocks */ nr_empty_pages += (e_index - s_index - 1); for (block = s_block + 1; block < e_block; block++) { block->scan_hint = 0; block->contig_hint = 0; block->left_free = 0; block->right_free = 0; } } /* * If the allocation is not atomic, some blocks may not be * populated with pages, while we account it here. The number * of pages will be added back with pcpu_chunk_populated() * when populating pages. */ if (nr_empty_pages) pcpu_update_empty_pages(chunk, -nr_empty_pages); if (pcpu_region_overlap(chunk_md->scan_hint_start, chunk_md->scan_hint_start + chunk_md->scan_hint, bit_off, bit_off + bits)) chunk_md->scan_hint = 0; /* * The only time a full chunk scan is required is if the chunk * contig hint is broken. Otherwise, it means a smaller space * was used and therefore the chunk contig hint is still correct. */ if (pcpu_region_overlap(chunk_md->contig_hint_start, chunk_md->contig_hint_start + chunk_md->contig_hint, bit_off, bit_off + bits)) pcpu_chunk_refresh_hint(chunk, false); } /** * pcpu_block_update_hint_free - updates the block hints on the free path * @chunk: chunk of interest * @bit_off: chunk offset * @bits: size of request * * Updates metadata for the allocation path. This avoids a blind block * refresh by making use of the block contig hints. If this fails, it scans * forward and backward to determine the extent of the free area. This is * capped at the boundary of blocks. * * A chunk update is triggered if a page becomes free, a block becomes free, * or the free spans across blocks. This tradeoff is to minimize iterating * over the block metadata to update chunk_md->contig_hint. * chunk_md->contig_hint may be off by up to a page, but it will never be more * than the available space. If the contig hint is contained in one block, it * will be accurate. */ static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off, int bits) { int nr_empty_pages = 0; struct pcpu_block_md *s_block, *e_block, *block; int s_index, e_index; /* block indexes of the freed allocation */ int s_off, e_off; /* block offsets of the freed allocation */ int start, end; /* start and end of the whole free area */ /* * Calculate per block offsets. * The calculation uses an inclusive range, but the resulting offsets * are [start, end). e_index always points to the last block in the * range. */ s_index = pcpu_off_to_block_index(bit_off); e_index = pcpu_off_to_block_index(bit_off + bits - 1); s_off = pcpu_off_to_block_off(bit_off); e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1; s_block = chunk->md_blocks + s_index; e_block = chunk->md_blocks + e_index; /* * Check if the freed area aligns with the block->contig_hint. * If it does, then the scan to find the beginning/end of the * larger free area can be avoided. * * start and end refer to beginning and end of the free area * within each their respective blocks. This is not necessarily * the entire free area as it may span blocks past the beginning * or end of the block. */ start = s_off; if (s_off == s_block->contig_hint + s_block->contig_hint_start) { start = s_block->contig_hint_start; } else { /* * Scan backwards to find the extent of the free area. * find_last_bit returns the starting bit, so if the start bit * is returned, that means there was no last bit and the * remainder of the chunk is free. */ int l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index), start); start = (start == l_bit) ? 0 : l_bit + 1; } end = e_off; if (e_off == e_block->contig_hint_start) end = e_block->contig_hint_start + e_block->contig_hint; else end = find_next_bit(pcpu_index_alloc_map(chunk, e_index), PCPU_BITMAP_BLOCK_BITS, end); /* update s_block */ e_off = (s_index == e_index) ? end : PCPU_BITMAP_BLOCK_BITS; if (!start && e_off == PCPU_BITMAP_BLOCK_BITS) nr_empty_pages++; pcpu_block_update(s_block, start, e_off); /* freeing in the same block */ if (s_index != e_index) { /* update e_block */ if (end == PCPU_BITMAP_BLOCK_BITS) nr_empty_pages++; pcpu_block_update(e_block, 0, end); /* reset md_blocks in the middle */ nr_empty_pages += (e_index - s_index - 1); for (block = s_block + 1; block < e_block; block++) { block->first_free = 0; block->scan_hint = 0; block->contig_hint_start = 0; block->contig_hint = PCPU_BITMAP_BLOCK_BITS; block->left_free = PCPU_BITMAP_BLOCK_BITS; block->right_free = PCPU_BITMAP_BLOCK_BITS; } } if (nr_empty_pages) pcpu_update_empty_pages(chunk, nr_empty_pages); /* * Refresh chunk metadata when the free makes a block free or spans * across blocks. The contig_hint may be off by up to a page, but if * the contig_hint is contained in a block, it will be accurate with * the else condition below. */ if (((end - start) >= PCPU_BITMAP_BLOCK_BITS) || s_index != e_index) pcpu_chunk_refresh_hint(chunk, true); else pcpu_block_update(&chunk->chunk_md, pcpu_block_off_to_off(s_index, start), end); } /** * pcpu_is_populated - determines if the region is populated * @chunk: chunk of interest * @bit_off: chunk offset * @bits: size of area * @next_off: return value for the next offset to start searching * * For atomic allocations, check if the backing pages are populated. * * RETURNS: * Bool if the backing pages are populated. * next_index is to skip over unpopulated blocks in pcpu_find_block_fit. */ static bool pcpu_is_populated(struct pcpu_chunk *chunk, int bit_off, int bits, int *next_off) { unsigned int start, end; start = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE); end = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE); start = find_next_zero_bit(chunk->populated, end, start); if (start >= end) return true; end = find_next_bit(chunk->populated, end, start + 1); *next_off = end * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE; return false; } /** * pcpu_find_block_fit - finds the block index to start searching * @chunk: chunk of interest * @alloc_bits: size of request in allocation units * @align: alignment of area (max PAGE_SIZE bytes) * @pop_only: use populated regions only * * Given a chunk and an allocation spec, find the offset to begin searching * for a free region. This iterates over the bitmap metadata blocks to * find an offset that will be guaranteed to fit the requirements. It is * not quite first fit as if the allocation does not fit in the contig hint * of a block or chunk, it is skipped. This errs on the side of caution * to prevent excess iteration. Poor alignment can cause the allocator to * skip over blocks and chunks that have valid free areas. * * RETURNS: * The offset in the bitmap to begin searching. * -1 if no offset is found. */ static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits, size_t align, bool pop_only) { struct pcpu_block_md *chunk_md = &chunk->chunk_md; int bit_off, bits, next_off; /* * This is an optimization to prevent scanning by assuming if the * allocation cannot fit in the global hint, there is memory pressure * and creating a new chunk would happen soon. */ if (!pcpu_check_block_hint(chunk_md, alloc_bits, align)) return -1; bit_off = pcpu_next_hint(chunk_md, alloc_bits); bits = 0; pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits) { if (!pop_only || pcpu_is_populated(chunk, bit_off, bits, &next_off)) break; bit_off = next_off; bits = 0; } if (bit_off == pcpu_chunk_map_bits(chunk)) return -1; return bit_off; } /* * pcpu_find_zero_area - modified from bitmap_find_next_zero_area_off() * @map: the address to base the search on * @size: the bitmap size in bits * @start: the bitnumber to start searching at * @nr: the number of zeroed bits we're looking for * @align_mask: alignment mask for zero area * @largest_off: offset of the largest area skipped * @largest_bits: size of the largest area skipped * * The @align_mask should be one less than a power of 2. * * This is a modified version of bitmap_find_next_zero_area_off() to remember * the largest area that was skipped. This is imperfect, but in general is * good enough. The largest remembered region is the largest failed region * seen. This does not include anything we possibly skipped due to alignment. * pcpu_block_update_scan() does scan backwards to try and recover what was * lost to alignment. While this can cause scanning to miss earlier possible * free areas, smaller allocations will eventually fill those holes. */ static unsigned long pcpu_find_zero_area(unsigned long *map, unsigned long size, unsigned long start, unsigned long nr, unsigned long align_mask, unsigned long *largest_off, unsigned long *largest_bits) { unsigned long index, end, i, area_off, area_bits; again: index = find_next_zero_bit(map, size, start); /* Align allocation */ index = __ALIGN_MASK(index, align_mask); area_off = index; end = index + nr; if (end > size) return end; i = find_next_bit(map, end, index); if (i < end) { area_bits = i - area_off; /* remember largest unused area with best alignment */ if (area_bits > *largest_bits || (area_bits == *largest_bits && *largest_off && (!area_off || __ffs(area_off) > __ffs(*largest_off)))) { *largest_off = area_off; *largest_bits = area_bits; } start = i + 1; goto again; } return index; } /** * pcpu_alloc_area - allocates an area from a pcpu_chunk * @chunk: chunk of interest * @alloc_bits: size of request in allocation units * @align: alignment of area (max PAGE_SIZE) * @start: bit_off to start searching * * This function takes in a @start offset to begin searching to fit an * allocation of @alloc_bits with alignment @align. It needs to scan * the allocation map because if it fits within the block's contig hint, * @start will be block->first_free. This is an attempt to fill the * allocation prior to breaking the contig hint. The allocation and * boundary maps are updated accordingly if it confirms a valid * free area. * * RETURNS: * Allocated addr offset in @chunk on success. * -1 if no matching area is found. */ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits, size_t align, int start) { struct pcpu_block_md *chunk_md = &chunk->chunk_md; size_t align_mask = (align) ? (align - 1) : 0; unsigned long area_off = 0, area_bits = 0; int bit_off, end, oslot; lockdep_assert_held(&pcpu_lock); oslot = pcpu_chunk_slot(chunk); /* * Search to find a fit. */ end = min_t(int, start + alloc_bits + PCPU_BITMAP_BLOCK_BITS, pcpu_chunk_map_bits(chunk)); bit_off = pcpu_find_zero_area(chunk->alloc_map, end, start, alloc_bits, align_mask, &area_off, &area_bits); if (bit_off >= end) return -1; if (area_bits) pcpu_block_update_scan(chunk, area_off, area_bits); /* update alloc map */ bitmap_set(chunk->alloc_map, bit_off, alloc_bits); /* update boundary map */ set_bit(bit_off, chunk->bound_map); bitmap_clear(chunk->bound_map, bit_off + 1, alloc_bits - 1); set_bit(bit_off + alloc_bits, chunk->bound_map); chunk->free_bytes -= alloc_bits * PCPU_MIN_ALLOC_SIZE; /* update first free bit */ if (bit_off == chunk_md->first_free) chunk_md->first_free = find_next_zero_bit( chunk->alloc_map, pcpu_chunk_map_bits(chunk), bit_off + alloc_bits); pcpu_block_update_hint_alloc(chunk, bit_off, alloc_bits); pcpu_chunk_relocate(chunk, oslot); return bit_off * PCPU_MIN_ALLOC_SIZE; } /** * pcpu_free_area - frees the corresponding offset * @chunk: chunk of interest * @off: addr offset into chunk * * This function determines the size of an allocation to free using * the boundary bitmap and clears the allocation map. * * RETURNS: * Number of freed bytes. */ static int pcpu_free_area(struct pcpu_chunk *chunk, int off) { struct pcpu_block_md *chunk_md = &chunk->chunk_md; int bit_off, bits, end, oslot, freed; lockdep_assert_held(&pcpu_lock); oslot = pcpu_chunk_slot(chunk); bit_off = off / PCPU_MIN_ALLOC_SIZE; /* check invalid free */ if (!test_bit(bit_off, chunk->alloc_map) || !test_bit(bit_off, chunk->bound_map)) return 0; /* find end index */ end = find_next_bit(chunk->bound_map, pcpu_chunk_map_bits(chunk), bit_off + 1); bits = end - bit_off; bitmap_clear(chunk->alloc_map, bit_off, bits); freed = bits * PCPU_MIN_ALLOC_SIZE; /* update metadata */ chunk->free_bytes += freed; /* update first free bit */ chunk_md->first_free = min(chunk_md->first_free, bit_off); pcpu_block_update_hint_free(chunk, bit_off, bits); pcpu_chunk_relocate(chunk, oslot); pcpu_stats_area_dealloc(chunk); return freed; } static void pcpu_init_md_block(struct pcpu_block_md *block, int nr_bits) { block->scan_hint = 0; block->contig_hint = nr_bits; block->left_free = nr_bits; block->right_free = nr_bits; block->first_free = 0; block->nr_bits = nr_bits; } static void pcpu_init_md_blocks(struct pcpu_chunk *chunk) { struct pcpu_block_md *md_block; /* init the chunk's block */ pcpu_init_md_block(&chunk->chunk_md, pcpu_chunk_map_bits(chunk)); for (md_block = chunk->md_blocks; md_block != chunk->md_blocks + pcpu_chunk_nr_blocks(chunk); md_block++) pcpu_init_md_block(md_block, PCPU_BITMAP_BLOCK_BITS); } /** * pcpu_alloc_first_chunk - creates chunks that serve the first chunk * @tmp_addr: the start of the region served * @map_size: size of the region served * * This is responsible for creating the chunks that serve the first chunk. The * base_addr is page aligned down of @tmp_addr while the region end is page * aligned up. Offsets are kept track of to determine the region served. All * this is done to appease the bitmap allocator in avoiding partial blocks. * * RETURNS: * Chunk serving the region at @tmp_addr of @map_size. */ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, int map_size) { struct pcpu_chunk *chunk; unsigned long aligned_addr; int start_offset, offset_bits, region_size, region_bits; size_t alloc_size; /* region calculations */ aligned_addr = tmp_addr & PAGE_MASK; start_offset = tmp_addr - aligned_addr; region_size = ALIGN(start_offset + map_size, PAGE_SIZE); /* allocate chunk */ alloc_size = struct_size(chunk, populated, BITS_TO_LONGS(region_size >> PAGE_SHIFT)); chunk = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES); INIT_LIST_HEAD(&chunk->list); chunk->base_addr = (void *)aligned_addr; chunk->start_offset = start_offset; chunk->end_offset = region_size - chunk->start_offset - map_size; chunk->nr_pages = region_size >> PAGE_SHIFT; region_bits = pcpu_chunk_map_bits(chunk); alloc_size = BITS_TO_LONGS(region_bits) * sizeof(chunk->alloc_map[0]); chunk->alloc_map = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES); alloc_size = BITS_TO_LONGS(region_bits + 1) * sizeof(chunk->bound_map[0]); chunk->bound_map = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES); alloc_size = pcpu_chunk_nr_blocks(chunk) * sizeof(chunk->md_blocks[0]); chunk->md_blocks = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES); #ifdef NEED_PCPUOBJ_EXT /* first chunk is free to use */ chunk->obj_exts = NULL; #endif pcpu_init_md_blocks(chunk); /* manage populated page bitmap */ chunk->immutable = true; bitmap_fill(chunk->populated, chunk->nr_pages); chunk->nr_populated = chunk->nr_pages; chunk->nr_empty_pop_pages = chunk->nr_pages; chunk->free_bytes = map_size; if (chunk->start_offset) { /* hide the beginning of the bitmap */ offset_bits = chunk->start_offset / PCPU_MIN_ALLOC_SIZE; bitmap_set(chunk->alloc_map, 0, offset_bits); set_bit(0, chunk->bound_map); set_bit(offset_bits, chunk->bound_map); chunk->chunk_md.first_free = offset_bits; pcpu_block_update_hint_alloc(chunk, 0, offset_bits); } if (chunk->end_offset) { /* hide the end of the bitmap */ offset_bits = chunk->end_offset / PCPU_MIN_ALLOC_SIZE; bitmap_set(chunk->alloc_map, pcpu_chunk_map_bits(chunk) - offset_bits, offset_bits); set_bit((start_offset + map_size) / PCPU_MIN_ALLOC_SIZE, chunk->bound_map); set_bit(region_bits, chunk->bound_map); pcpu_block_update_hint_alloc(chunk, pcpu_chunk_map_bits(chunk) - offset_bits, offset_bits); } return chunk; } static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp) { struct pcpu_chunk *chunk; int region_bits; chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size, gfp); if (!chunk) return NULL; INIT_LIST_HEAD(&chunk->list); chunk->nr_pages = pcpu_unit_pages; region_bits = pcpu_chunk_map_bits(chunk); chunk->alloc_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits) * sizeof(chunk->alloc_map[0]), gfp); if (!chunk->alloc_map) goto alloc_map_fail; chunk->bound_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits + 1) * sizeof(chunk->bound_map[0]), gfp); if (!chunk->bound_map) goto bound_map_fail; chunk->md_blocks = pcpu_mem_zalloc(pcpu_chunk_nr_blocks(chunk) * sizeof(chunk->md_blocks[0]), gfp); if (!chunk->md_blocks) goto md_blocks_fail; #ifdef NEED_PCPUOBJ_EXT if (need_pcpuobj_ext()) { chunk->obj_exts = pcpu_mem_zalloc(pcpu_chunk_map_bits(chunk) * sizeof(struct pcpuobj_ext), gfp); if (!chunk->obj_exts) goto objcg_fail; } #endif pcpu_init_md_blocks(chunk); /* init metadata */ chunk->free_bytes = chunk->nr_pages * PAGE_SIZE; return chunk; #ifdef NEED_PCPUOBJ_EXT objcg_fail: pcpu_mem_free(chunk->md_blocks); #endif md_blocks_fail: pcpu_mem_free(chunk->bound_map); bound_map_fail: pcpu_mem_free(chunk->alloc_map); alloc_map_fail: pcpu_mem_free(chunk); return NULL; } static void pcpu_free_chunk(struct pcpu_chunk *chunk) { if (!chunk) return; #ifdef NEED_PCPUOBJ_EXT pcpu_mem_free(chunk->obj_exts); #endif pcpu_mem_free(chunk->md_blocks); pcpu_mem_free(chunk->bound_map); pcpu_mem_free(chunk->alloc_map); pcpu_mem_free(chunk); } /** * pcpu_chunk_populated - post-population bookkeeping * @chunk: pcpu_chunk which got populated * @page_start: the start page * @page_end: the end page * * Pages in [@page_start,@page_end) have been populated to @chunk. Update * the bookkeeping information accordingly. Must be called after each * successful population. */ static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start, int page_end) { int nr = page_end - page_start; lockdep_assert_held(&pcpu_lock); bitmap_set(chunk->populated, page_start, nr); chunk->nr_populated += nr; pcpu_nr_populated += nr; pcpu_update_empty_pages(chunk, nr); } /** * pcpu_chunk_depopulated - post-depopulation bookkeeping * @chunk: pcpu_chunk which got depopulated * @page_start: the start page * @page_end: the end page * * Pages in [@page_start,@page_end) have been depopulated from @chunk. * Update the bookkeeping information accordingly. Must be called after * each successful depopulation. */ static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk, int page_start, int page_end) { int nr = page_end - page_start; lockdep_assert_held(&pcpu_lock); bitmap_clear(chunk->populated, page_start, nr); chunk->nr_populated -= nr; pcpu_nr_populated -= nr; pcpu_update_empty_pages(chunk, -nr); } /* * Chunk management implementation. * * To allow different implementations, chunk alloc/free and * [de]population are implemented in a separate file which is pulled * into this file and compiled together. The following functions * should be implemented. * * pcpu_populate_chunk - populate the specified range of a chunk * pcpu_depopulate_chunk - depopulate the specified range of a chunk * pcpu_post_unmap_tlb_flush - flush tlb for the specified range of a chunk * pcpu_create_chunk - create a new chunk * pcpu_destroy_chunk - destroy a chunk, always preceded by full depop * pcpu_addr_to_page - translate address to physical address * pcpu_verify_alloc_info - check alloc_info is acceptable during init */ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int page_start, int page_end, gfp_t gfp); static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int page_start, int page_end); static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk, int page_start, int page_end); static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp); static void pcpu_destroy_chunk(struct pcpu_chunk *chunk); static struct page *pcpu_addr_to_page(void *addr); static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai); #ifdef CONFIG_NEED_PER_CPU_KM #include "percpu-km.c" #else #include "percpu-vm.c" #endif /** * pcpu_chunk_addr_search - determine chunk containing specified address * @addr: address for which the chunk needs to be determined. * * This is an internal function that handles all but static allocations. * Static percpu address values should never be passed into the allocator. * * RETURNS: * The address of the found chunk. */ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) { /* is it in the dynamic region (first chunk)? */ if (pcpu_addr_in_chunk(pcpu_first_chunk, addr)) return pcpu_first_chunk; /* is it in the reserved region? */ if (pcpu_addr_in_chunk(pcpu_reserved_chunk, addr)) return pcpu_reserved_chunk; /* * The address is relative to unit0 which might be unused and * thus unmapped. Offset the address to the unit space of the * current processor before looking it up in the vmalloc * space. Note that any possible cpu id can be used here, so * there's no need to worry about preemption or cpu hotplug. */ addr += pcpu_unit_offsets[raw_smp_processor_id()]; return pcpu_get_page_chunk(pcpu_addr_to_page(addr)); } #ifdef CONFIG_MEMCG static bool pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp, struct obj_cgroup **objcgp) { struct obj_cgroup *objcg; if (!memcg_kmem_online() || !(gfp & __GFP_ACCOUNT)) return true; objcg = current_obj_cgroup(); if (!objcg || obj_cgroup_is_root(objcg)) return true; if (obj_cgroup_charge(objcg, gfp, pcpu_obj_full_size(size))) return false; *objcgp = objcg; return true; } static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg, struct pcpu_chunk *chunk, int off, size_t size) { if (!objcg) return; if (likely(chunk && chunk->obj_exts)) { obj_cgroup_get(objcg); chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].cgroup = objcg; rcu_read_lock(); mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B, pcpu_obj_full_size(size)); rcu_read_unlock(); } else { obj_cgroup_uncharge(objcg, pcpu_obj_full_size(size)); } } static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size) { struct obj_cgroup *objcg; if (unlikely(!chunk->obj_exts)) return; objcg = chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].cgroup; if (!objcg) return; chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].cgroup = NULL; obj_cgroup_uncharge(objcg, pcpu_obj_full_size(size)); rcu_read_lock(); mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B, -pcpu_obj_full_size(size)); rcu_read_unlock(); obj_cgroup_put(objcg); } #else /* CONFIG_MEMCG */ static bool pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp, struct obj_cgroup **objcgp) { return true; } static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg, struct pcpu_chunk *chunk, int off, size_t size) { } static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size) { } #endif /* CONFIG_MEMCG */ #ifdef CONFIG_MEM_ALLOC_PROFILING static void pcpu_alloc_tag_alloc_hook(struct pcpu_chunk *chunk, int off, size_t size) { if (mem_alloc_profiling_enabled() && likely(chunk->obj_exts)) { alloc_tag_add(&chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].tag, current->alloc_tag, size); } } static void pcpu_alloc_tag_free_hook(struct pcpu_chunk *chunk, int off, size_t size) { if (mem_alloc_profiling_enabled() && likely(chunk->obj_exts)) alloc_tag_sub(&chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].tag, size); } #else static void pcpu_alloc_tag_alloc_hook(struct pcpu_chunk *chunk, int off, size_t size) { } static void pcpu_alloc_tag_free_hook(struct pcpu_chunk *chunk, int off, size_t size) { } #endif /** * pcpu_alloc - the percpu allocator * @size: size of area to allocate in bytes * @align: alignment of area (max PAGE_SIZE) * @reserved: allocate from the reserved chunk if available * @gfp: allocation flags * * Allocate percpu area of @size bytes aligned at @align. If @gfp doesn't * contain %GFP_KERNEL, the allocation is atomic. If @gfp has __GFP_NOWARN * then no warning will be triggered on invalid or failed allocation * requests. * * RETURNS: * Percpu pointer to the allocated area on success, NULL on failure. */ void __percpu *pcpu_alloc_noprof(size_t size, size_t align, bool reserved, gfp_t gfp) { gfp_t pcpu_gfp; bool is_atomic; bool do_warn; struct obj_cgroup *objcg = NULL; static atomic_t warn_limit = ATOMIC_INIT(10); struct pcpu_chunk *chunk, *next; const char *err; int slot, off, cpu, ret; unsigned long flags; void __percpu *ptr; size_t bits, bit_align; gfp = current_gfp_context(gfp); /* whitelisted flags that can be passed to the backing allocators */ pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN); is_atomic = !gfpflags_allow_blocking(gfp); do_warn = !(gfp & __GFP_NOWARN); /* * There is now a minimum allocation size of PCPU_MIN_ALLOC_SIZE, * therefore alignment must be a minimum of that many bytes. * An allocation may have internal fragmentation from rounding up * of up to PCPU_MIN_ALLOC_SIZE - 1 bytes. */ if (unlikely(align < PCPU_MIN_ALLOC_SIZE)) align = PCPU_MIN_ALLOC_SIZE; size = ALIGN(size, PCPU_MIN_ALLOC_SIZE); bits = size >> PCPU_MIN_ALLOC_SHIFT; bit_align = align >> PCPU_MIN_ALLOC_SHIFT; if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE || !is_power_of_2(align))) { WARN(do_warn, "illegal size (%zu) or align (%zu) for percpu allocation\n", size, align); return NULL; } if (unlikely(!pcpu_memcg_pre_alloc_hook(size, gfp, &objcg))) return NULL; if (!is_atomic) { /* * pcpu_balance_workfn() allocates memory under this mutex, * and it may wait for memory reclaim. Allow current task * to become OOM victim, in case of memory pressure. */ if (gfp & __GFP_NOFAIL) { mutex_lock(&pcpu_alloc_mutex); } else if (mutex_lock_killable(&pcpu_alloc_mutex)) { pcpu_memcg_post_alloc_hook(objcg, NULL, 0, size); return NULL; } } spin_lock_irqsave(&pcpu_lock, flags); /* serve reserved allocations from the reserved chunk if available */ if (reserved && pcpu_reserved_chunk) { chunk = pcpu_reserved_chunk; off = pcpu_find_block_fit(chunk, bits, bit_align, is_atomic); if (off < 0) { err = "alloc from reserved chunk failed"; goto fail_unlock; } off = pcpu_alloc_area(chunk, bits, bit_align, off); if (off >= 0) goto area_found; err = "alloc from reserved chunk failed"; goto fail_unlock; } restart: /* search through normal chunks */ for (slot = pcpu_size_to_slot(size); slot <= pcpu_free_slot; slot++) { list_for_each_entry_safe(chunk, next, &pcpu_chunk_lists[slot], list) { off = pcpu_find_block_fit(chunk, bits, bit_align, is_atomic); if (off < 0) { if (slot < PCPU_SLOT_FAIL_THRESHOLD) pcpu_chunk_move(chunk, 0); continue; } off = pcpu_alloc_area(chunk, bits, bit_align, off); if (off >= 0) { pcpu_reintegrate_chunk(chunk); goto area_found; } } } spin_unlock_irqrestore(&pcpu_lock, flags); if (is_atomic) { err = "atomic alloc failed, no space left"; goto fail; } /* No space left. Create a new chunk. */ if (list_empty(&pcpu_chunk_lists[pcpu_free_slot])) { chunk = pcpu_create_chunk(pcpu_gfp); if (!chunk) { err = "failed to allocate new chunk"; goto fail; } spin_lock_irqsave(&pcpu_lock, flags); pcpu_chunk_relocate(chunk, -1); } else { spin_lock_irqsave(&pcpu_lock, flags); } goto restart; area_found: pcpu_stats_area_alloc(chunk, size); if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW) pcpu_schedule_balance_work(); spin_unlock_irqrestore(&pcpu_lock, flags); /* populate if not all pages are already there */ if (!is_atomic) { unsigned int page_end, rs, re; rs = PFN_DOWN(off); page_end = PFN_UP(off + size); for_each_clear_bitrange_from(rs, re, chunk->populated, page_end) { WARN_ON(chunk->immutable); ret = pcpu_populate_chunk(chunk, rs, re, pcpu_gfp); spin_lock_irqsave(&pcpu_lock, flags); if (ret) { pcpu_free_area(chunk, off); err = "failed to populate"; goto fail_unlock; } pcpu_chunk_populated(chunk, rs, re); spin_unlock_irqrestore(&pcpu_lock, flags); } mutex_unlock(&pcpu_alloc_mutex); } /* clear the areas and return address relative to base address */ for_each_possible_cpu(cpu) memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); ptr = __addr_to_pcpu_ptr(chunk->base_addr + off); kmemleak_alloc_percpu(ptr, size, gfp); trace_percpu_alloc_percpu(_RET_IP_, reserved, is_atomic, size, align, chunk->base_addr, off, ptr, pcpu_obj_full_size(size), gfp); pcpu_memcg_post_alloc_hook(objcg, chunk, off, size); pcpu_alloc_tag_alloc_hook(chunk, off, size); return ptr; fail_unlock: spin_unlock_irqrestore(&pcpu_lock, flags); fail: trace_percpu_alloc_percpu_fail(reserved, is_atomic, size, align); if (do_warn) { int remaining = atomic_dec_if_positive(&warn_limit); if (remaining >= 0) { pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n", size, align, is_atomic, err); if (!is_atomic) dump_stack(); if (remaining == 0) pr_info("limit reached, disable warning\n"); } } if (is_atomic) { /* see the flag handling in pcpu_balance_workfn() */ pcpu_atomic_alloc_failed = true; pcpu_schedule_balance_work(); } else { mutex_unlock(&pcpu_alloc_mutex); } pcpu_memcg_post_alloc_hook(objcg, NULL, 0, size); return NULL; } EXPORT_SYMBOL_GPL(pcpu_alloc_noprof); /** * pcpu_balance_free - manage the amount of free chunks * @empty_only: free chunks only if there are no populated pages * * If empty_only is %false, reclaim all fully free chunks regardless of the * number of populated pages. Otherwise, only reclaim chunks that have no * populated pages. * * CONTEXT: * pcpu_lock (can be dropped temporarily) */ static void pcpu_balance_free(bool empty_only) { LIST_HEAD(to_free); struct list_head *free_head = &pcpu_chunk_lists[pcpu_free_slot]; struct pcpu_chunk *chunk, *next; lockdep_assert_held(&pcpu_lock); /* * There's no reason to keep around multiple unused chunks and VM * areas can be scarce. Destroy all free chunks except for one. */ list_for_each_entry_safe(chunk, next, free_head, list) { WARN_ON(chunk->immutable); /* spare the first one */ if (chunk == list_first_entry(free_head, struct pcpu_chunk, list)) continue; if (!empty_only || chunk->nr_empty_pop_pages == 0) list_move(&chunk->list, &to_free); } if (list_empty(&to_free)) return; spin_unlock_irq(&pcpu_lock); list_for_each_entry_safe(chunk, next, &to_free, list) { unsigned int rs, re; for_each_set_bitrange(rs, re, chunk->populated, chunk->nr_pages) { pcpu_depopulate_chunk(chunk, rs, re); spin_lock_irq(&pcpu_lock); pcpu_chunk_depopulated(chunk, rs, re); spin_unlock_irq(&pcpu_lock); } pcpu_destroy_chunk(chunk); cond_resched(); } spin_lock_irq(&pcpu_lock); } /** * pcpu_balance_populated - manage the amount of populated pages * * Maintain a certain amount of populated pages to satisfy atomic allocations. * It is possible that this is called when physical memory is scarce causing * OOM killer to be triggered. We should avoid doing so until an actual * allocation causes the failure as it is possible that requests can be * serviced from already backed regions. * * CONTEXT: * pcpu_lock (can be dropped temporarily) */ static void pcpu_balance_populated(void) { /* gfp flags passed to underlying allocators */ const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN; struct pcpu_chunk *chunk; int slot, nr_to_pop, ret; lockdep_assert_held(&pcpu_lock); /* * Ensure there are certain number of free populated pages for * atomic allocs. Fill up from the most packed so that atomic * allocs don't increase fragmentation. If atomic allocation * failed previously, always populate the maximum amount. This * should prevent atomic allocs larger than PAGE_SIZE from keeping * failing indefinitely; however, large atomic allocs are not * something we support properly and can be highly unreliable and * inefficient. */ retry_pop: if (pcpu_atomic_alloc_failed) { nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH; /* best effort anyway, don't worry about synchronization */ pcpu_atomic_alloc_failed = false; } else { nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH - pcpu_nr_empty_pop_pages, 0, PCPU_EMPTY_POP_PAGES_HIGH); } for (slot = pcpu_size_to_slot(PAGE_SIZE); slot <= pcpu_free_slot; slot++) { unsigned int nr_unpop = 0, rs, re; if (!nr_to_pop) break; list_for_each_entry(chunk, &pcpu_chunk_lists[slot], list) { nr_unpop = chunk->nr_pages - chunk->nr_populated; if (nr_unpop) break; } if (!nr_unpop) continue; /* @chunk can't go away while pcpu_alloc_mutex is held */ for_each_clear_bitrange(rs, re, chunk->populated, chunk->nr_pages) { int nr = min_t(int, re - rs, nr_to_pop); spin_unlock_irq(&pcpu_lock); ret = pcpu_populate_chunk(chunk, rs, rs + nr, gfp); cond_resched(); spin_lock_irq(&pcpu_lock); if (!ret) { nr_to_pop -= nr; pcpu_chunk_populated(chunk, rs, rs + nr); } else { nr_to_pop = 0; } if (!nr_to_pop) break; } } if (nr_to_pop) { /* ran out of chunks to populate, create a new one and retry */ spin_unlock_irq(&pcpu_lock); chunk = pcpu_create_chunk(gfp); cond_resched(); spin_lock_irq(&pcpu_lock); if (chunk) { pcpu_chunk_relocate(chunk, -1); goto retry_pop; } } } /** * pcpu_reclaim_populated - scan over to_depopulate chunks and free empty pages * * Scan over chunks in the depopulate list and try to release unused populated * pages back to the system. Depopulated chunks are sidelined to prevent * repopulating these pages unless required. Fully free chunks are reintegrated * and freed accordingly (1 is kept around). If we drop below the empty * populated pages threshold, reintegrate the chunk if it has empty free pages. * Each chunk is scanned in the reverse order to keep populated pages close to * the beginning of the chunk. * * CONTEXT: * pcpu_lock (can be dropped temporarily) * */ static void pcpu_reclaim_populated(void) { struct pcpu_chunk *chunk; struct pcpu_block_md *block; int freed_page_start, freed_page_end; int i, end; bool reintegrate; lockdep_assert_held(&pcpu_lock); /* * Once a chunk is isolated to the to_depopulate list, the chunk is no * longer discoverable to allocations whom may populate pages. The only * other accessor is the free path which only returns area back to the * allocator not touching the populated bitmap. */ while ((chunk = list_first_entry_or_null( &pcpu_chunk_lists[pcpu_to_depopulate_slot], struct pcpu_chunk, list))) { WARN_ON(chunk->immutable); /* * Scan chunk's pages in the reverse order to keep populated * pages close to the beginning of the chunk. */ freed_page_start = chunk->nr_pages; freed_page_end = 0; reintegrate = false; for (i = chunk->nr_pages - 1, end = -1; i >= 0; i--) { /* no more work to do */ if (chunk->nr_empty_pop_pages == 0) break; /* reintegrate chunk to prevent atomic alloc failures */ if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_HIGH) { reintegrate = true; break; } /* * If the page is empty and populated, start or * extend the (i, end) range. If i == 0, decrease * i and perform the depopulation to cover the last * (first) page in the chunk. */ block = chunk->md_blocks + i; if (block->contig_hint == PCPU_BITMAP_BLOCK_BITS && test_bit(i, chunk->populated)) { if (end == -1) end = i; if (i > 0) continue; i--; } /* depopulate if there is an active range */ if (end == -1) continue; spin_unlock_irq(&pcpu_lock); pcpu_depopulate_chunk(chunk, i + 1, end + 1); cond_resched(); spin_lock_irq(&pcpu_lock); pcpu_chunk_depopulated(chunk, i + 1, end + 1); freed_page_start = min(freed_page_start, i + 1); freed_page_end = max(freed_page_end, end + 1); /* reset the range and continue */ end = -1; } /* batch tlb flush per chunk to amortize cost */ if (freed_page_start < freed_page_end) { spin_unlock_irq(&pcpu_lock); pcpu_post_unmap_tlb_flush(chunk, freed_page_start, freed_page_end); cond_resched(); spin_lock_irq(&pcpu_lock); } if (reintegrate || chunk->free_bytes == pcpu_unit_size) pcpu_reintegrate_chunk(chunk); else list_move_tail(&chunk->list, &pcpu_chunk_lists[pcpu_sidelined_slot]); } } /** * pcpu_balance_workfn - manage the amount of free chunks and populated pages * @work: unused * * For each chunk type, manage the number of fully free chunks and the number of * populated pages. An important thing to consider is when pages are freed and * how they contribute to the global counts. */ static void pcpu_balance_workfn(struct work_struct *work) { /* * pcpu_balance_free() is called twice because the first time we may * trim pages in the active pcpu_nr_empty_pop_pages which may cause us * to grow other chunks. This then gives pcpu_reclaim_populated() time * to move fully free chunks to the active list to be freed if * appropriate. * * Enforce GFP_NOIO allocations because we have pcpu_alloc users * constrained to GFP_NOIO/NOFS contexts and they could form lock * dependency through pcpu_alloc_mutex */ unsigned int flags = memalloc_noio_save(); mutex_lock(&pcpu_alloc_mutex); spin_lock_irq(&pcpu_lock); pcpu_balance_free(false); pcpu_reclaim_populated(); pcpu_balance_populated(); pcpu_balance_free(true); spin_unlock_irq(&pcpu_lock); mutex_unlock(&pcpu_alloc_mutex); memalloc_noio_restore(flags); } /** * free_percpu - free percpu area * @ptr: pointer to area to free * * Free percpu area @ptr. * * CONTEXT: * Can be called from atomic context. */ void free_percpu(void __percpu *ptr) { void *addr; struct pcpu_chunk *chunk; unsigned long flags; int size, off; bool need_balance = false; if (!ptr) return; kmemleak_free_percpu(ptr); addr = __pcpu_ptr_to_addr(ptr); chunk = pcpu_chunk_addr_search(addr); off = addr - chunk->base_addr; spin_lock_irqsave(&pcpu_lock, flags); size = pcpu_free_area(chunk, off); if (size == 0) { spin_unlock_irqrestore(&pcpu_lock, flags); /* invalid percpu free */ WARN_ON_ONCE(1); return; } pcpu_alloc_tag_free_hook(chunk, off, size); pcpu_memcg_free_hook(chunk, off, size); /* * If there are more than one fully free chunks, wake up grim reaper. * If the chunk is isolated, it may be in the process of being * reclaimed. Let reclaim manage cleaning up of that chunk. */ if (!chunk->isolated && chunk->free_bytes == pcpu_unit_size) { struct pcpu_chunk *pos; list_for_each_entry(pos, &pcpu_chunk_lists[pcpu_free_slot], list) if (pos != chunk) { need_balance = true; break; } } else if (pcpu_should_reclaim_chunk(chunk)) { pcpu_isolate_chunk(chunk); need_balance = true; } trace_percpu_free_percpu(chunk->base_addr, off, ptr); spin_unlock_irqrestore(&pcpu_lock, flags); if (need_balance) pcpu_schedule_balance_work(); } EXPORT_SYMBOL_GPL(free_percpu); bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr) { #ifdef CONFIG_SMP const size_t static_size = __per_cpu_end - __per_cpu_start; void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr); unsigned int cpu; for_each_possible_cpu(cpu) { void *start = per_cpu_ptr(base, cpu); void *va = (void *)addr; if (va >= start && va < start + static_size) { if (can_addr) { *can_addr = (unsigned long) (va - start); *can_addr += (unsigned long) per_cpu_ptr(base, get_boot_cpu_id()); } return true; } } #endif /* on UP, can't distinguish from other static vars, always false */ return false; } /** * is_kernel_percpu_address - test whether address is from static percpu area * @addr: address to test * * Test whether @addr belongs to in-kernel static percpu area. Module * static percpu areas are not considered. For those, use * is_module_percpu_address(). * * RETURNS: * %true if @addr is from in-kernel static percpu area, %false otherwise. */ bool is_kernel_percpu_address(unsigned long addr) { return __is_kernel_percpu_address(addr, NULL); } /** * per_cpu_ptr_to_phys - convert translated percpu address to physical address * @addr: the address to be converted to physical address * * Given @addr which is dereferenceable address obtained via one of * percpu access macros, this function translates it into its physical * address. The caller is responsible for ensuring @addr stays valid * until this function finishes. * * percpu allocator has special setup for the first chunk, which currently * supports either embedding in linear address space or vmalloc mapping, * and, from the second one, the backing allocator (currently either vm or * km) provides translation. * * The addr can be translated simply without checking if it falls into the * first chunk. But the current code reflects better how percpu allocator * actually works, and the verification can discover both bugs in percpu * allocator itself and per_cpu_ptr_to_phys() callers. So we keep current * code. * * RETURNS: * The physical address for @addr. */ phys_addr_t per_cpu_ptr_to_phys(void *addr) { void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr); bool in_first_chunk = false; unsigned long first_low, first_high; unsigned int cpu; /* * The following test on unit_low/high isn't strictly * necessary but will speed up lookups of addresses which * aren't in the first chunk. * * The address check is against full chunk sizes. pcpu_base_addr * points to the beginning of the first chunk including the * static region. Assumes good intent as the first chunk may * not be full (ie. < pcpu_unit_pages in size). */ first_low = (unsigned long)pcpu_base_addr + pcpu_unit_page_offset(pcpu_low_unit_cpu, 0); first_high = (unsigned long)pcpu_base_addr + pcpu_unit_page_offset(pcpu_high_unit_cpu, pcpu_unit_pages); if ((unsigned long)addr >= first_low && (unsigned long)addr < first_high) { for_each_possible_cpu(cpu) { void *start = per_cpu_ptr(base, cpu); if (addr >= start && addr < start + pcpu_unit_size) { in_first_chunk = true; break; } } } if (in_first_chunk) { if (!is_vmalloc_addr(addr)) return __pa(addr); else return page_to_phys(vmalloc_to_page(addr)) + offset_in_page(addr); } else return page_to_phys(pcpu_addr_to_page(addr)) + offset_in_page(addr); } /** * pcpu_alloc_alloc_info - allocate percpu allocation info * @nr_groups: the number of groups * @nr_units: the number of units * * Allocate ai which is large enough for @nr_groups groups containing * @nr_units units. The returned ai's groups[0].cpu_map points to the * cpu_map array which is long enough for @nr_units and filled with * NR_CPUS. It's the caller's responsibility to initialize cpu_map * pointer of other groups. * * RETURNS: * Pointer to the allocated pcpu_alloc_info on success, NULL on * failure. */ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, int nr_units) { struct pcpu_alloc_info *ai; size_t base_size, ai_size; void *ptr; int unit; base_size = ALIGN(struct_size(ai, groups, nr_groups), __alignof__(ai->groups[0].cpu_map[0])); ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]); ptr = memblock_alloc(PFN_ALIGN(ai_size), PAGE_SIZE); if (!ptr) return NULL; ai = ptr; ptr += base_size; ai->groups[0].cpu_map = ptr; for (unit = 0; unit < nr_units; unit++) ai->groups[0].cpu_map[unit] = NR_CPUS; ai->nr_groups = nr_groups; ai->__ai_size = PFN_ALIGN(ai_size); return ai; } /** * pcpu_free_alloc_info - free percpu allocation info * @ai: pcpu_alloc_info to free * * Free @ai which was allocated by pcpu_alloc_alloc_info(). */ void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai) { memblock_free(ai, ai->__ai_size); } /** * pcpu_dump_alloc_info - print out information about pcpu_alloc_info * @lvl: loglevel * @ai: allocation info to dump * * Print out information about @ai using loglevel @lvl. */ static void pcpu_dump_alloc_info(const char *lvl, const struct pcpu_alloc_info *ai) { int group_width = 1, cpu_width = 1, width; char empty_str[] = "--------"; int alloc = 0, alloc_end = 0; int group, v; int upa, apl; /* units per alloc, allocs per line */ v = ai->nr_groups; while (v /= 10) group_width++; v = num_possible_cpus(); while (v /= 10) cpu_width++; empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0'; upa = ai->alloc_size / ai->unit_size; width = upa * (cpu_width + 1) + group_width + 3; apl = rounddown_pow_of_two(max(60 / width, 1)); printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu", lvl, ai->static_size, ai->reserved_size, ai->dyn_size, ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size); for (group = 0; group < ai->nr_groups; group++) { const struct pcpu_group_info *gi = &ai->groups[group]; int unit = 0, unit_end = 0; BUG_ON(gi->nr_units % upa); for (alloc_end += gi->nr_units / upa; alloc < alloc_end; alloc++) { if (!(alloc % apl)) { pr_cont("\n"); printk("%spcpu-alloc: ", lvl); } pr_cont("[%0*d] ", group_width, group); for (unit_end += upa; unit < unit_end; unit++) if (gi->cpu_map[unit] != NR_CPUS) pr_cont("%0*d ", cpu_width, gi->cpu_map[unit]); else pr_cont("%s ", empty_str); } } pr_cont("\n"); } /** * pcpu_setup_first_chunk - initialize the first percpu chunk * @ai: pcpu_alloc_info describing how to percpu area is shaped * @base_addr: mapped address * * Initialize the first percpu chunk which contains the kernel static * percpu area. This function is to be called from arch percpu area * setup path. * * @ai contains all information necessary to initialize the first * chunk and prime the dynamic percpu allocator. * * @ai->static_size is the size of static percpu area. * * @ai->reserved_size, if non-zero, specifies the amount of bytes to * reserve after the static area in the first chunk. This reserves * the first chunk such that it's available only through reserved * percpu allocation. This is primarily used to serve module percpu * static areas on architectures where the addressing model has * limited offset range for symbol relocations to guarantee module * percpu symbols fall inside the relocatable range. * * @ai->dyn_size determines the number of bytes available for dynamic * allocation in the first chunk. The area between @ai->static_size + * @ai->reserved_size + @ai->dyn_size and @ai->unit_size is unused. * * @ai->unit_size specifies unit size and must be aligned to PAGE_SIZE * and equal to or larger than @ai->static_size + @ai->reserved_size + * @ai->dyn_size. * * @ai->atom_size is the allocation atom size and used as alignment * for vm areas. * * @ai->alloc_size is the allocation size and always multiple of * @ai->atom_size. This is larger than @ai->atom_size if * @ai->unit_size is larger than @ai->atom_size. * * @ai->nr_groups and @ai->groups describe virtual memory layout of * percpu areas. Units which should be colocated are put into the * same group. Dynamic VM areas will be allocated according to these * groupings. If @ai->nr_groups is zero, a single group containing * all units is assumed. * * The caller should have mapped the first chunk at @base_addr and * copied static data to each unit. * * The first chunk will always contain a static and a dynamic region. * However, the static region is not managed by any chunk. If the first * chunk also contains a reserved region, it is served by two chunks - * one for the reserved region and one for the dynamic region. They * share the same vm, but use offset regions in the area allocation map. * The chunk serving the dynamic region is circulated in the chunk slots * and available for dynamic allocation like any other chunk. */ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, void *base_addr) { size_t size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; size_t static_size, dyn_size; unsigned long *group_offsets; size_t *group_sizes; unsigned long *unit_off; unsigned int cpu; int *unit_map; int group, unit, i; unsigned long tmp_addr; size_t alloc_size; #define PCPU_SETUP_BUG_ON(cond) do { \ if (unlikely(cond)) { \ pr_emerg("failed to initialize, %s\n", #cond); \ pr_emerg("cpu_possible_mask=%*pb\n", \ cpumask_pr_args(cpu_possible_mask)); \ pcpu_dump_alloc_info(KERN_EMERG, ai); \ BUG(); \ } \ } while (0) /* sanity checks */ PCPU_SETUP_BUG_ON(ai->nr_groups <= 0); #ifdef CONFIG_SMP PCPU_SETUP_BUG_ON(!ai->static_size); PCPU_SETUP_BUG_ON(offset_in_page(__per_cpu_start)); #endif PCPU_SETUP_BUG_ON(!base_addr); PCPU_SETUP_BUG_ON(offset_in_page(base_addr)); PCPU_SETUP_BUG_ON(ai->unit_size < size_sum); PCPU_SETUP_BUG_ON(offset_in_page(ai->unit_size)); PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE); PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->unit_size, PCPU_BITMAP_BLOCK_SIZE)); PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE); PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->reserved_size, PCPU_MIN_ALLOC_SIZE)); PCPU_SETUP_BUG_ON(!(IS_ALIGNED(PCPU_BITMAP_BLOCK_SIZE, PAGE_SIZE) || IS_ALIGNED(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE))); PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0); /* process group information and build config tables accordingly */ alloc_size = ai->nr_groups * sizeof(group_offsets[0]); group_offsets = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES); alloc_size = ai->nr_groups * sizeof(group_sizes[0]); group_sizes = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES); alloc_size = nr_cpu_ids * sizeof(unit_map[0]); unit_map = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES); alloc_size = nr_cpu_ids * sizeof(unit_off[0]); unit_off = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES); for (cpu = 0; cpu < nr_cpu_ids; cpu++) unit_map[cpu] = UINT_MAX; pcpu_low_unit_cpu = NR_CPUS; pcpu_high_unit_cpu = NR_CPUS; for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) { const struct pcpu_group_info *gi = &ai->groups[group]; group_offsets[group] = gi->base_offset; group_sizes[group] = gi->nr_units * ai->unit_size; for (i = 0; i < gi->nr_units; i++) { cpu = gi->cpu_map[i]; if (cpu == NR_CPUS) continue; PCPU_SETUP_BUG_ON(cpu >= nr_cpu_ids); PCPU_SETUP_BUG_ON(!cpu_possible(cpu)); PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX); unit_map[cpu] = unit + i; unit_off[cpu] = gi->base_offset + i * ai->unit_size; /* determine low/high unit_cpu */ if (pcpu_low_unit_cpu == NR_CPUS || unit_off[cpu] < unit_off[pcpu_low_unit_cpu]) pcpu_low_unit_cpu = cpu; if (pcpu_high_unit_cpu == NR_CPUS || unit_off[cpu] > unit_off[pcpu_high_unit_cpu]) pcpu_high_unit_cpu = cpu; } } pcpu_nr_units = unit; for_each_possible_cpu(cpu) PCPU_SETUP_BUG_ON(unit_map[cpu] == UINT_MAX); /* we're done parsing the input, undefine BUG macro and dump config */ #undef PCPU_SETUP_BUG_ON pcpu_dump_alloc_info(KERN_DEBUG, ai); pcpu_nr_groups = ai->nr_groups; pcpu_group_offsets = group_offsets; pcpu_group_sizes = group_sizes; pcpu_unit_map = unit_map; pcpu_unit_offsets = unit_off; /* determine basic parameters */ pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT; pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; pcpu_atom_size = ai->atom_size; pcpu_chunk_struct_size = struct_size((struct pcpu_chunk *)0, populated, BITS_TO_LONGS(pcpu_unit_pages)); pcpu_stats_save_ai(ai); /* * Allocate chunk slots. The slots after the active slots are: * sidelined_slot - isolated, depopulated chunks * free_slot - fully free chunks * to_depopulate_slot - isolated, chunks to depopulate */ pcpu_sidelined_slot = __pcpu_size_to_slot(pcpu_unit_size) + 1; pcpu_free_slot = pcpu_sidelined_slot + 1; pcpu_to_depopulate_slot = pcpu_free_slot + 1; pcpu_nr_slots = pcpu_to_depopulate_slot + 1; pcpu_chunk_lists = memblock_alloc_or_panic(pcpu_nr_slots * sizeof(pcpu_chunk_lists[0]), SMP_CACHE_BYTES); for (i = 0; i < pcpu_nr_slots; i++) INIT_LIST_HEAD(&pcpu_chunk_lists[i]); /* * The end of the static region needs to be aligned with the * minimum allocation size as this offsets the reserved and * dynamic region. The first chunk ends page aligned by * expanding the dynamic region, therefore the dynamic region * can be shrunk to compensate while still staying above the * configured sizes. */ static_size = ALIGN(ai->static_size, PCPU_MIN_ALLOC_SIZE); dyn_size = ai->dyn_size - (static_size - ai->static_size); /* * Initialize first chunk: * This chunk is broken up into 3 parts: * < static | [reserved] | dynamic > * - static - there is no backing chunk because these allocations can * never be freed. * - reserved (pcpu_reserved_chunk) - exists primarily to serve * allocations from module load. * - dynamic (pcpu_first_chunk) - serves the dynamic part of the first * chunk. */ tmp_addr = (unsigned long)base_addr + static_size; if (ai->reserved_size) pcpu_reserved_chunk = pcpu_alloc_first_chunk(tmp_addr, ai->reserved_size); tmp_addr = (unsigned long)base_addr + static_size + ai->reserved_size; pcpu_first_chunk = pcpu_alloc_first_chunk(tmp_addr, dyn_size); pcpu_nr_empty_pop_pages = pcpu_first_chunk->nr_empty_pop_pages; pcpu_chunk_relocate(pcpu_first_chunk, -1); /* include all regions of the first chunk */ pcpu_nr_populated += PFN_DOWN(size_sum); pcpu_stats_chunk_alloc(); trace_percpu_create_chunk(base_addr); /* we're done */ pcpu_base_addr = base_addr; } #ifdef CONFIG_SMP const char * const pcpu_fc_names[PCPU_FC_NR] __initconst = { [PCPU_FC_AUTO] = "auto", [PCPU_FC_EMBED] = "embed", [PCPU_FC_PAGE] = "page", }; enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO; static int __init percpu_alloc_setup(char *str) { if (!str) return -EINVAL; if (0) /* nada */; #ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK else if (!strcmp(str, "embed")) pcpu_chosen_fc = PCPU_FC_EMBED; #endif #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK else if (!strcmp(str, "page")) pcpu_chosen_fc = PCPU_FC_PAGE; #endif else pr_warn("unknown allocator %s specified\n", str); return 0; } early_param("percpu_alloc", percpu_alloc_setup); /* * pcpu_embed_first_chunk() is used by the generic percpu setup. * Build it if needed by the arch config or the generic setup is going * to be used. */ #if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \ !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) #define BUILD_EMBED_FIRST_CHUNK #endif /* build pcpu_page_first_chunk() iff needed by the arch config */ #if defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK) #define BUILD_PAGE_FIRST_CHUNK #endif /* pcpu_build_alloc_info() is used by both embed and page first chunk */ #if defined(BUILD_EMBED_FIRST_CHUNK) || defined(BUILD_PAGE_FIRST_CHUNK) /** * pcpu_build_alloc_info - build alloc_info considering distances between CPUs * @reserved_size: the size of reserved percpu area in bytes * @dyn_size: minimum free size for dynamic allocation in bytes * @atom_size: allocation atom size * @cpu_distance_fn: callback to determine distance between cpus, optional * * This function determines grouping of units, their mappings to cpus * and other parameters considering needed percpu size, allocation * atom size and distances between CPUs. * * Groups are always multiples of atom size and CPUs which are of * LOCAL_DISTANCE both ways are grouped together and share space for * units in the same group. The returned configuration is guaranteed * to have CPUs on different nodes on different groups and >=75% usage * of allocated virtual address space. * * RETURNS: * On success, pointer to the new allocation_info is returned. On * failure, ERR_PTR value is returned. */ static struct pcpu_alloc_info * __init __flatten pcpu_build_alloc_info( size_t reserved_size, size_t dyn_size, size_t atom_size, pcpu_fc_cpu_distance_fn_t cpu_distance_fn) { static int group_map[NR_CPUS] __initdata; static int group_cnt[NR_CPUS] __initdata; static struct cpumask mask __initdata; const size_t static_size = __per_cpu_end - __per_cpu_start; int nr_groups = 1, nr_units = 0; size_t size_sum, min_unit_size, alloc_size; int upa, max_upa, best_upa; /* units_per_alloc */ int last_allocs, group, unit; unsigned int cpu, tcpu; struct pcpu_alloc_info *ai; unsigned int *cpu_map; /* this function may be called multiple times */ memset(group_map, 0, sizeof(group_map)); memset(group_cnt, 0, sizeof(group_cnt)); cpumask_clear(&mask); /* calculate size_sum and ensure dyn_size is enough for early alloc */ size_sum = PFN_ALIGN(static_size + reserved_size + max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE)); dyn_size = size_sum - static_size - reserved_size; /* * Determine min_unit_size, alloc_size and max_upa such that * alloc_size is multiple of atom_size and is the smallest * which can accommodate 4k aligned segments which are equal to * or larger than min_unit_size. */ min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); /* determine the maximum # of units that can fit in an allocation */ alloc_size = roundup(min_unit_size, atom_size); upa = alloc_size / min_unit_size; while (alloc_size % upa || (offset_in_page(alloc_size / upa))) upa--; max_upa = upa; cpumask_copy(&mask, cpu_possible_mask); /* group cpus according to their proximity */ for (group = 0; !cpumask_empty(&mask); group++) { /* pop the group's first cpu */ cpu = cpumask_first(&mask); group_map[cpu] = group; group_cnt[group]++; cpumask_clear_cpu(cpu, &mask); for_each_cpu(tcpu, &mask) { if (!cpu_distance_fn || (cpu_distance_fn(cpu, tcpu) == LOCAL_DISTANCE && cpu_distance_fn(tcpu, cpu) == LOCAL_DISTANCE)) { group_map[tcpu] = group; group_cnt[group]++; cpumask_clear_cpu(tcpu, &mask); } } } nr_groups = group; /* * Wasted space is caused by a ratio imbalance of upa to group_cnt. * Expand the unit_size until we use >= 75% of the units allocated. * Related to atom_size, which could be much larger than the unit_size. */ last_allocs = INT_MAX; best_upa = 0; for (upa = max_upa; upa; upa--) { int allocs = 0, wasted = 0; if (alloc_size % upa || (offset_in_page(alloc_size / upa))) continue; for (group = 0; group < nr_groups; group++) { int this_allocs = DIV_ROUND_UP(group_cnt[group], upa); allocs += this_allocs; wasted += this_allocs * upa - group_cnt[group]; } /* * Don't accept if wastage is over 1/3. The * greater-than comparison ensures upa==1 always * passes the following check. */ if (wasted > num_possible_cpus() / 3) continue; /* and then don't consume more memory */ if (allocs > last_allocs) break; last_allocs = allocs; best_upa = upa; } BUG_ON(!best_upa); upa = best_upa; /* allocate and fill alloc_info */ for (group = 0; group < nr_groups; group++) nr_units += roundup(group_cnt[group], upa); ai = pcpu_alloc_alloc_info(nr_groups, nr_units); if (!ai) return ERR_PTR(-ENOMEM); cpu_map = ai->groups[0].cpu_map; for (group = 0; group < nr_groups; group++) { ai->groups[group].cpu_map = cpu_map; cpu_map += roundup(group_cnt[group], upa); } ai->static_size = static_size; ai->reserved_size = reserved_size; ai->dyn_size = dyn_size; ai->unit_size = alloc_size / upa; ai->atom_size = atom_size; ai->alloc_size = alloc_size; for (group = 0, unit = 0; group < nr_groups; group++) { struct pcpu_group_info *gi = &ai->groups[group]; /* * Initialize base_offset as if all groups are located * back-to-back. The caller should update this to * reflect actual allocation. */ gi->base_offset = unit * ai->unit_size; for_each_possible_cpu(cpu) if (group_map[cpu] == group) gi->cpu_map[gi->nr_units++] = cpu; gi->nr_units = roundup(gi->nr_units, upa); unit += gi->nr_units; } BUG_ON(unit != nr_units); return ai; } static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align, pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn) { const unsigned long goal = __pa(MAX_DMA_ADDRESS); #ifdef CONFIG_NUMA int node = NUMA_NO_NODE; void *ptr; if (cpu_to_nd_fn) node = cpu_to_nd_fn(cpu); if (node == NUMA_NO_NODE || !node_online(node) || !NODE_DATA(node)) { ptr = memblock_alloc_from(size, align, goal); pr_info("cpu %d has no node %d or node-local memory\n", cpu, node); pr_debug("per cpu data for cpu%d %zu bytes at 0x%llx\n", cpu, size, (u64)__pa(ptr)); } else { ptr = memblock_alloc_try_nid(size, align, goal, MEMBLOCK_ALLOC_ACCESSIBLE, node); pr_debug("per cpu data for cpu%d %zu bytes on node%d at 0x%llx\n", cpu, size, node, (u64)__pa(ptr)); } return ptr; #else return memblock_alloc_from(size, align, goal); #endif } static void __init pcpu_fc_free(void *ptr, size_t size) { memblock_free(ptr, size); } #endif /* BUILD_EMBED_FIRST_CHUNK || BUILD_PAGE_FIRST_CHUNK */ #if defined(BUILD_EMBED_FIRST_CHUNK) /** * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem * @reserved_size: the size of reserved percpu area in bytes * @dyn_size: minimum free size for dynamic allocation in bytes * @atom_size: allocation atom size * @cpu_distance_fn: callback to determine distance between cpus, optional * @cpu_to_nd_fn: callback to convert cpu to it's node, optional * * This is a helper to ease setting up embedded first percpu chunk and * can be called where pcpu_setup_first_chunk() is expected. * * If this function is used to setup the first chunk, it is allocated * by calling pcpu_fc_alloc and used as-is without being mapped into * vmalloc area. Allocations are always whole multiples of @atom_size * aligned to @atom_size. * * This enables the first chunk to piggy back on the linear physical * mapping which often uses larger page size. Please note that this * can result in very sparse cpu->unit mapping on NUMA machines thus * requiring large vmalloc address space. Don't use this allocator if * vmalloc space is not orders of magnitude larger than distances * between node memory addresses (ie. 32bit NUMA machines). * * @dyn_size specifies the minimum dynamic area size. * * If the needed size is smaller than the minimum or specified unit * size, the leftover is returned using pcpu_fc_free. * * RETURNS: * 0 on success, -errno on failure. */ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, size_t atom_size, pcpu_fc_cpu_distance_fn_t cpu_distance_fn, pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn) { void *base = (void *)ULONG_MAX; void **areas = NULL; struct pcpu_alloc_info *ai; size_t size_sum, areas_size; unsigned long max_distance; int group, i, highest_group, rc = 0; ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size, cpu_distance_fn); if (IS_ERR(ai)) return PTR_ERR(ai); size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *)); areas = memblock_alloc(areas_size, SMP_CACHE_BYTES); if (!areas) { rc = -ENOMEM; goto out_free; } /* allocate, copy and determine base address & max_distance */ highest_group = 0; for (group = 0; group < ai->nr_groups; group++) { struct pcpu_group_info *gi = &ai->groups[group]; unsigned int cpu = NR_CPUS; void *ptr; for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++) cpu = gi->cpu_map[i]; BUG_ON(cpu == NR_CPUS); /* allocate space for the whole group */ ptr = pcpu_fc_alloc(cpu, gi->nr_units * ai->unit_size, atom_size, cpu_to_nd_fn); if (!ptr) { rc = -ENOMEM; goto out_free_areas; } /* kmemleak tracks the percpu allocations separately */ kmemleak_ignore_phys(__pa(ptr)); areas[group] = ptr; base = min(ptr, base); if (ptr > areas[highest_group]) highest_group = group; } max_distance = areas[highest_group] - base; max_distance += ai->unit_size * ai->groups[highest_group].nr_units; /* warn if maximum distance is further than 75% of vmalloc space */ if (max_distance > VMALLOC_TOTAL * 3 / 4) { pr_warn("max_distance=0x%lx too large for vmalloc space 0x%lx\n", max_distance, VMALLOC_TOTAL); #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK /* and fail if we have fallback */ rc = -EINVAL; goto out_free_areas; #endif } /* * Copy data and free unused parts. This should happen after all * allocations are complete; otherwise, we may end up with * overlapping groups. */ for (group = 0; group < ai->nr_groups; group++) { struct pcpu_group_info *gi = &ai->groups[group]; void *ptr = areas[group]; for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) { if (gi->cpu_map[i] == NR_CPUS) { /* unused unit, free whole */ pcpu_fc_free(ptr, ai->unit_size); continue; } /* copy and return the unused part */ memcpy(ptr, __per_cpu_start, ai->static_size); pcpu_fc_free(ptr + size_sum, ai->unit_size - size_sum); } } /* base address is now known, determine group base offsets */ for (group = 0; group < ai->nr_groups; group++) { ai->groups[group].base_offset = areas[group] - base; } pr_info("Embedded %zu pages/cpu s%zu r%zu d%zu u%zu\n", PFN_DOWN(size_sum), ai->static_size, ai->reserved_size, ai->dyn_size, ai->unit_size); pcpu_setup_first_chunk(ai, base); goto out_free; out_free_areas: for (group = 0; group < ai->nr_groups; group++) if (areas[group]) pcpu_fc_free(areas[group], ai->groups[group].nr_units * ai->unit_size); out_free: pcpu_free_alloc_info(ai); if (areas) memblock_free(areas, areas_size); return rc; } #endif /* BUILD_EMBED_FIRST_CHUNK */ #ifdef BUILD_PAGE_FIRST_CHUNK #include <linux/pgalloc.h> #ifndef P4D_TABLE_SIZE #define P4D_TABLE_SIZE PAGE_SIZE #endif #ifndef PUD_TABLE_SIZE #define PUD_TABLE_SIZE PAGE_SIZE #endif #ifndef PMD_TABLE_SIZE #define PMD_TABLE_SIZE PAGE_SIZE #endif #ifndef PTE_TABLE_SIZE #define PTE_TABLE_SIZE PAGE_SIZE #endif void __init __weak pcpu_populate_pte(unsigned long addr) { pgd_t *pgd = pgd_offset_k(addr); p4d_t *p4d; pud_t *pud; pmd_t *pmd; if (pgd_none(*pgd)) { p4d = memblock_alloc_or_panic(P4D_TABLE_SIZE, P4D_TABLE_SIZE); pgd_populate_kernel(addr, pgd, p4d); } p4d = p4d_offset(pgd, addr); if (p4d_none(*p4d)) { pud = memblock_alloc_or_panic(PUD_TABLE_SIZE, PUD_TABLE_SIZE); p4d_populate_kernel(addr, p4d, pud); } pud = pud_offset(p4d, addr); if (pud_none(*pud)) { pmd = memblock_alloc_or_panic(PMD_TABLE_SIZE, PMD_TABLE_SIZE); pud_populate(&init_mm, pud, pmd); } pmd = pmd_offset(pud, addr); if (!pmd_present(*pmd)) { pte_t *new; new = memblock_alloc_or_panic(PTE_TABLE_SIZE, PTE_TABLE_SIZE); pmd_populate_kernel(&init_mm, pmd, new); } return; } /** * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages * @reserved_size: the size of reserved percpu area in bytes * @cpu_to_nd_fn: callback to convert cpu to it's node, optional * * This is a helper to ease setting up page-remapped first percpu * chunk and can be called where pcpu_setup_first_chunk() is expected. * * This is the basic allocator. Static percpu area is allocated * page-by-page into vmalloc area. * * RETURNS: * 0 on success, -errno on failure. */ int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn) { static struct vm_struct vm; struct pcpu_alloc_info *ai; char psize_str[16]; int unit_pages; size_t pages_size; struct page **pages; int unit, i, j, rc = 0; int upa; int nr_g0_units; snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10); ai = pcpu_build_alloc_info(reserved_size, 0, PAGE_SIZE, NULL); if (IS_ERR(ai)) return PTR_ERR(ai); BUG_ON(ai->nr_groups != 1); upa = ai->alloc_size/ai->unit_size; nr_g0_units = roundup(num_possible_cpus(), upa); if (WARN_ON(ai->groups[0].nr_units != nr_g0_units)) { pcpu_free_alloc_info(ai); return -EINVAL; } unit_pages = ai->unit_size >> PAGE_SHIFT; /* unaligned allocations can't be freed, round up to page size */ pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() * sizeof(pages[0])); pages = memblock_alloc_or_panic(pages_size, SMP_CACHE_BYTES); /* allocate pages */ j = 0; for (unit = 0; unit < num_possible_cpus(); unit++) { unsigned int cpu = ai->groups[0].cpu_map[unit]; for (i = 0; i < unit_pages; i++) { void *ptr; ptr = pcpu_fc_alloc(cpu, PAGE_SIZE, PAGE_SIZE, cpu_to_nd_fn); if (!ptr) { pr_warn("failed to allocate %s page for cpu%u\n", psize_str, cpu); goto enomem; } /* kmemleak tracks the percpu allocations separately */ kmemleak_ignore_phys(__pa(ptr)); pages[j++] = virt_to_page(ptr); } } /* allocate vm area, map the pages and copy static data */ vm.flags = VM_ALLOC; vm.size = num_possible_cpus() * ai->unit_size; vm_area_register_early(&vm, PAGE_SIZE); for (unit = 0; unit < num_possible_cpus(); unit++) { unsigned long unit_addr = (unsigned long)vm.addr + unit * ai->unit_size; for (i = 0; i < unit_pages; i++) pcpu_populate_pte(unit_addr + (i << PAGE_SHIFT)); /* pte already populated, the following shouldn't fail */ rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages], unit_pages); if (rc < 0) panic("failed to map percpu area, err=%d\n", rc); flush_cache_vmap_early(unit_addr, unit_addr + ai->unit_size); /* copy static data */ memcpy((void *)unit_addr, __per_cpu_start, ai->static_size); } /* we're ready, commit */ pr_info("%d %s pages/cpu s%zu r%zu d%zu\n", unit_pages, psize_str, ai->static_size, ai->reserved_size, ai->dyn_size); pcpu_setup_first_chunk(ai, vm.addr); goto out_free_ar; enomem: while (--j >= 0) pcpu_fc_free(page_address(pages[j]), PAGE_SIZE); rc = -ENOMEM; out_free_ar: memblock_free(pages, pages_size); pcpu_free_alloc_info(ai); return rc; } #endif /* BUILD_PAGE_FIRST_CHUNK */ #ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA /* * Generic SMP percpu area setup. * * The embedding helper is used because its behavior closely resembles * the original non-dynamic generic percpu area setup. This is * important because many archs have addressing restrictions and might * fail if the percpu area is located far away from the previous * location. As an added bonus, in non-NUMA cases, embedding is * generally a good idea TLB-wise because percpu area can piggy back * on the physical linear memory mapping which uses large page * mappings on applicable archs. */ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; EXPORT_SYMBOL(__per_cpu_offset); void __init setup_per_cpu_areas(void) { unsigned long delta; unsigned int cpu; int rc; /* * Always reserve area for module percpu variables. That's * what the legacy allocator did. */ rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL, NULL); if (rc < 0) panic("Failed to initialize percpu areas."); delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; for_each_possible_cpu(cpu) __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu]; } #endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */ #else /* CONFIG_SMP */ /* * UP percpu area setup. * * UP always uses km-based percpu allocator with identity mapping. * Static percpu variables are indistinguishable from the usual static * variables and don't require any special preparation. */ void __init setup_per_cpu_areas(void) { const size_t unit_size = roundup_pow_of_two(max_t(size_t, PCPU_MIN_UNIT_SIZE, PERCPU_DYNAMIC_RESERVE)); struct pcpu_alloc_info *ai; void *fc; ai = pcpu_alloc_alloc_info(1, 1); fc = memblock_alloc_from(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); if (!ai || !fc) panic("Failed to allocate memory for percpu areas."); /* kmemleak tracks the percpu allocations separately */ kmemleak_ignore_phys(__pa(fc)); ai->dyn_size = unit_size; ai->unit_size = unit_size; ai->atom_size = unit_size; ai->alloc_size = unit_size; ai->groups[0].nr_units = 1; ai->groups[0].cpu_map[0] = 0; pcpu_setup_first_chunk(ai, fc); pcpu_free_alloc_info(ai); } #endif /* CONFIG_SMP */ /* * pcpu_nr_pages - calculate total number of populated backing pages * * This reflects the number of pages populated to back chunks. Metadata is * excluded in the number exposed in meminfo as the number of backing pages * scales with the number of cpus and can quickly outweigh the memory used for * metadata. It also keeps this calculation nice and simple. * * RETURNS: * Total number of populated backing pages in use by the allocator. */ unsigned long pcpu_nr_pages(void) { return data_race(READ_ONCE(pcpu_nr_populated)) * pcpu_nr_units; } /* * Percpu allocator is initialized early during boot when neither slab or * workqueue is available. Plug async management until everything is up * and running. */ static int __init percpu_enable_async(void) { pcpu_async_enabled = true; return 0; } subsys_initcall(percpu_enable_async);
2 2 2 2 2 2 2 2 2 2 35 32 13 13 12 7 9 9 13 25 1 1 1 25 25 1 25 25 1 1 2 1 1 26 2 25 27 27 26 27 2 25 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 // SPDX-License-Identifier: GPL-2.0-or-later /* Generic associative array implementation. * * See Documentation/core-api/assoc_array.rst for information. * * Copyright (C) 2013 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ //#define DEBUG #include <linux/rcupdate.h> #include <linux/slab.h> #include <linux/err.h> #include <linux/assoc_array_priv.h> /* * Iterate over an associative array. The caller must hold the RCU read lock * or better. */ static int assoc_array_subtree_iterate(const struct assoc_array_ptr *root, const struct assoc_array_ptr *stop, int (*iterator)(const void *leaf, void *iterator_data), void *iterator_data) { const struct assoc_array_shortcut *shortcut; const struct assoc_array_node *node; const struct assoc_array_ptr *cursor, *ptr, *parent; unsigned long has_meta; int slot, ret; cursor = root; begin_node: if (assoc_array_ptr_is_shortcut(cursor)) { /* Descend through a shortcut */ shortcut = assoc_array_ptr_to_shortcut(cursor); cursor = READ_ONCE(shortcut->next_node); /* Address dependency. */ } node = assoc_array_ptr_to_node(cursor); slot = 0; /* We perform two passes of each node. * * The first pass does all the leaves in this node. This means we * don't miss any leaves if the node is split up by insertion whilst * we're iterating over the branches rooted here (we may, however, see * some leaves twice). */ has_meta = 0; for (; slot < ASSOC_ARRAY_FAN_OUT; slot++) { ptr = READ_ONCE(node->slots[slot]); /* Address dependency. */ has_meta |= (unsigned long)ptr; if (ptr && assoc_array_ptr_is_leaf(ptr)) { /* We need a barrier between the read of the pointer, * which is supplied by the above READ_ONCE(). */ /* Invoke the callback */ ret = iterator(assoc_array_ptr_to_leaf(ptr), iterator_data); if (ret) return ret; } } /* The second pass attends to all the metadata pointers. If we follow * one of these we may find that we don't come back here, but rather go * back to a replacement node with the leaves in a different layout. * * We are guaranteed to make progress, however, as the slot number for * a particular portion of the key space cannot change - and we * continue at the back pointer + 1. */ if (!(has_meta & ASSOC_ARRAY_PTR_META_TYPE)) goto finished_node; slot = 0; continue_node: node = assoc_array_ptr_to_node(cursor); for (; slot < ASSOC_ARRAY_FAN_OUT; slot++) { ptr = READ_ONCE(node->slots[slot]); /* Address dependency. */ if (assoc_array_ptr_is_meta(ptr)) { cursor = ptr; goto begin_node; } } finished_node: /* Move up to the parent (may need to skip back over a shortcut) */ parent = READ_ONCE(node->back_pointer); /* Address dependency. */ slot = node->parent_slot; if (parent == stop) return 0; if (assoc_array_ptr_is_shortcut(parent)) { shortcut = assoc_array_ptr_to_shortcut(parent); cursor = parent; parent = READ_ONCE(shortcut->back_pointer); /* Address dependency. */ slot = shortcut->parent_slot; if (parent == stop) return 0; } /* Ascend to next slot in parent node */ cursor = parent; slot++; goto continue_node; } /** * assoc_array_iterate - Pass all objects in the array to a callback * @array: The array to iterate over. * @iterator: The callback function. * @iterator_data: Private data for the callback function. * * Iterate over all the objects in an associative array. Each one will be * presented to the iterator function. * * If the array is being modified concurrently with the iteration then it is * possible that some objects in the array will be passed to the iterator * callback more than once - though every object should be passed at least * once. If this is undesirable then the caller must lock against modification * for the duration of this function. * * The function will return 0 if no objects were in the array or else it will * return the result of the last iterator function called. Iteration stops * immediately if any call to the iteration function results in a non-zero * return. * * The caller should hold the RCU read lock or better if concurrent * modification is possible. */ int assoc_array_iterate(const struct assoc_array *array, int (*iterator)(const void *object, void *iterator_data), void *iterator_data) { struct assoc_array_ptr *root = READ_ONCE(array->root); /* Address dependency. */ if (!root) return 0; return assoc_array_subtree_iterate(root, NULL, iterator, iterator_data); } enum assoc_array_walk_status { assoc_array_walk_tree_empty, assoc_array_walk_found_terminal_node, assoc_array_walk_found_wrong_shortcut, }; struct assoc_array_walk_result { struct { struct assoc_array_node *node; /* Node in which leaf might be found */ int level; int slot; } terminal_node; struct { struct assoc_array_shortcut *shortcut; int level; int sc_level; unsigned long sc_segments; unsigned long dissimilarity; } wrong_shortcut; }; /* * Navigate through the internal tree looking for the closest node to the key. */ static enum assoc_array_walk_status assoc_array_walk(const struct assoc_array *array, const struct assoc_array_ops *ops, const void *index_key, struct assoc_array_walk_result *result) { struct assoc_array_shortcut *shortcut; struct assoc_array_node *node; struct assoc_array_ptr *cursor, *ptr; unsigned long sc_segments, dissimilarity; unsigned long segments; int level, sc_level, next_sc_level; int slot; pr_devel("-->%s()\n", __func__); cursor = READ_ONCE(array->root); /* Address dependency. */ if (!cursor) return assoc_array_walk_tree_empty; level = 0; /* Use segments from the key for the new leaf to navigate through the * internal tree, skipping through nodes and shortcuts that are on * route to the destination. Eventually we'll come to a slot that is * either empty or contains a leaf at which point we've found a node in * which the leaf we're looking for might be found or into which it * should be inserted. */ jumped: segments = ops->get_key_chunk(index_key, level); pr_devel("segments[%d]: %lx\n", level, segments); if (assoc_array_ptr_is_shortcut(cursor)) goto follow_shortcut; consider_node: node = assoc_array_ptr_to_node(cursor); slot = segments >> (level & ASSOC_ARRAY_KEY_CHUNK_MASK); slot &= ASSOC_ARRAY_FAN_MASK; ptr = READ_ONCE(node->slots[slot]); /* Address dependency. */ pr_devel("consider slot %x [ix=%d type=%lu]\n", slot, level, (unsigned long)ptr & 3); if (!assoc_array_ptr_is_meta(ptr)) { /* The node doesn't have a node/shortcut pointer in the slot * corresponding to the index key that we have to follow. */ result->terminal_node.node = node; result->terminal_node.level = level; result->terminal_node.slot = slot; pr_devel("<--%s() = terminal_node\n", __func__); return assoc_array_walk_found_terminal_node; } if (assoc_array_ptr_is_node(ptr)) { /* There is a pointer to a node in the slot corresponding to * this index key segment, so we need to follow it. */ cursor = ptr; level += ASSOC_ARRAY_LEVEL_STEP; if ((level & ASSOC_ARRAY_KEY_CHUNK_MASK) != 0) goto consider_node; goto jumped; } /* There is a shortcut in the slot corresponding to the index key * segment. We follow the shortcut if its partial index key matches * this leaf's. Otherwise we need to split the shortcut. */ cursor = ptr; follow_shortcut: shortcut = assoc_array_ptr_to_shortcut(cursor); pr_devel("shortcut to %d\n", shortcut->skip_to_level); sc_level = level + ASSOC_ARRAY_LEVEL_STEP; BUG_ON(sc_level > shortcut->skip_to_level); do { /* Check the leaf against the shortcut's index key a word at a * time, trimming the final word (the shortcut stores the index * key completely from the root to the shortcut's target). */ if ((sc_level & ASSOC_ARRAY_KEY_CHUNK_MASK) == 0) segments = ops->get_key_chunk(index_key, sc_level); sc_segments = shortcut->index_key[sc_level >> ASSOC_ARRAY_KEY_CHUNK_SHIFT]; dissimilarity = segments ^ sc_segments; if (round_up(sc_level, ASSOC_ARRAY_KEY_CHUNK_SIZE) > shortcut->skip_to_level) { /* Trim segments that are beyond the shortcut */ int shift = shortcut->skip_to_level & ASSOC_ARRAY_KEY_CHUNK_MASK; dissimilarity &= ~(ULONG_MAX << shift); next_sc_level = shortcut->skip_to_level; } else { next_sc_level = sc_level + ASSOC_ARRAY_KEY_CHUNK_SIZE; next_sc_level = round_down(next_sc_level, ASSOC_ARRAY_KEY_CHUNK_SIZE); } if (dissimilarity != 0) { /* This shortcut points elsewhere */ result->wrong_shortcut.shortcut = shortcut; result->wrong_shortcut.level = level; result->wrong_shortcut.sc_level = sc_level; result->wrong_shortcut.sc_segments = sc_segments; result->wrong_shortcut.dissimilarity = dissimilarity; return assoc_array_walk_found_wrong_shortcut; } sc_level = next_sc_level; } while (sc_level < shortcut->skip_to_level); /* The shortcut matches the leaf's index to this point. */ cursor = READ_ONCE(shortcut->next_node); /* Address dependency. */ if (((level ^ sc_level) & ~ASSOC_ARRAY_KEY_CHUNK_MASK) != 0) { level = sc_level; goto jumped; } else { level = sc_level; goto consider_node; } } /** * assoc_array_find - Find an object by index key * @array: The associative array to search. * @ops: The operations to use. * @index_key: The key to the object. * * Find an object in an associative array by walking through the internal tree * to the node that should contain the object and then searching the leaves * there. NULL is returned if the requested object was not found in the array. * * The caller must hold the RCU read lock or better. */ void *assoc_array_find(const struct assoc_array *array, const struct assoc_array_ops *ops, const void *index_key) { struct assoc_array_walk_result result; const struct assoc_array_node *node; const struct assoc_array_ptr *ptr; const void *leaf; int slot; if (assoc_array_walk(array, ops, index_key, &result) != assoc_array_walk_found_terminal_node) return NULL; node = result.terminal_node.node; /* If the target key is available to us, it's has to be pointed to by * the terminal node. */ for (slot = 0; slot < ASSOC_ARRAY_FAN_OUT; slot++) { ptr = READ_ONCE(node->slots[slot]); /* Address dependency. */ if (ptr && assoc_array_ptr_is_leaf(ptr)) { /* We need a barrier between the read of the pointer * and dereferencing the pointer - but only if we are * actually going to dereference it. */ leaf = assoc_array_ptr_to_leaf(ptr); if (ops->compare_object(leaf, index_key)) return (void *)leaf; } } return NULL; } /* * Destructively iterate over an associative array. The caller must prevent * other simultaneous accesses. */ static void assoc_array_destroy_subtree(struct assoc_array_ptr *root, const struct assoc_array_ops *ops) { struct assoc_array_shortcut *shortcut; struct assoc_array_node *node; struct assoc_array_ptr *cursor, *parent = NULL; int slot = -1; pr_devel("-->%s()\n", __func__); cursor = root; if (!cursor) { pr_devel("empty\n"); return; } move_to_meta: if (assoc_array_ptr_is_shortcut(cursor)) { /* Descend through a shortcut */ pr_devel("[%d] shortcut\n", slot); BUG_ON(!assoc_array_ptr_is_shortcut(cursor)); shortcut = assoc_array_ptr_to_shortcut(cursor); BUG_ON(shortcut->back_pointer != parent); BUG_ON(slot != -1 && shortcut->parent_slot != slot); parent = cursor; cursor = shortcut->next_node; slot = -1; BUG_ON(!assoc_array_ptr_is_node(cursor)); } pr_devel("[%d] node\n", slot); node = assoc_array_ptr_to_node(cursor); BUG_ON(node->back_pointer != parent); BUG_ON(slot != -1 && node->parent_slot != slot); slot = 0; continue_node: pr_devel("Node %p [back=%p]\n", node, node->back_pointer); for (; slot < ASSOC_ARRAY_FAN_OUT; slot++) { struct assoc_array_ptr *ptr = node->slots[slot]; if (!ptr) continue; if (assoc_array_ptr_is_meta(ptr)) { parent = cursor; cursor = ptr; goto move_to_meta; } if (ops) { pr_devel("[%d] free leaf\n", slot); ops->free_object(assoc_array_ptr_to_leaf(ptr)); } } parent = node->back_pointer; slot = node->parent_slot; pr_devel("free node\n"); kfree(node); if (!parent) return; /* Done */ /* Move back up to the parent (may need to free a shortcut on * the way up) */ if (assoc_array_ptr_is_shortcut(parent)) { shortcut = assoc_array_ptr_to_shortcut(parent); BUG_ON(shortcut->next_node != cursor); cursor = parent; parent = shortcut->back_pointer; slot = shortcut->parent_slot; pr_devel("free shortcut\n"); kfree(shortcut); if (!parent) return; BUG_ON(!assoc_array_ptr_is_node(parent)); } /* Ascend to next slot in parent node */ pr_devel("ascend to %p[%d]\n", parent, slot); cursor = parent; node = assoc_array_ptr_to_node(cursor); slot++; goto continue_node; } /** * assoc_array_destroy - Destroy an associative array * @array: The array to destroy. * @ops: The operations to use. * * Discard all metadata and free all objects in an associative array. The * array will be empty and ready to use again upon completion. This function * cannot fail. * * The caller must prevent all other accesses whilst this takes place as no * attempt is made to adjust pointers gracefully to permit RCU readlock-holding * accesses to continue. On the other hand, no memory allocation is required. */ void assoc_array_destroy(struct assoc_array *array, const struct assoc_array_ops *ops) { assoc_array_destroy_subtree(array->root, ops); array->root = NULL; } /* * Handle insertion into an empty tree. */ static bool assoc_array_insert_in_empty_tree(struct assoc_array_edit *edit) { struct assoc_array_node *new_n0; pr_devel("-->%s()\n", __func__); new_n0 = kzalloc_obj(struct assoc_array_node); if (!new_n0) return false; edit->new_meta[0] = assoc_array_node_to_ptr(new_n0); edit->leaf_p = &new_n0->slots[0]; edit->adjust_count_on = new_n0; edit->set[0].ptr = &edit->array->root; edit->set[0].to = assoc_array_node_to_ptr(new_n0); pr_devel("<--%s() = ok [no root]\n", __func__); return true; } /* * Handle insertion into a terminal node. */ static bool assoc_array_insert_into_terminal_node(struct assoc_array_edit *edit, const struct assoc_array_ops *ops, const void *index_key, struct assoc_array_walk_result *result) { struct assoc_array_shortcut *shortcut, *new_s0; struct assoc_array_node *node, *new_n0, *new_n1, *side; struct assoc_array_ptr *ptr; unsigned long dissimilarity, base_seg, blank; size_t keylen; bool have_meta; int level, diff; int slot, next_slot, free_slot, i, j; node = result->terminal_node.node; level = result->terminal_node.level; edit->segment_cache[ASSOC_ARRAY_FAN_OUT] = result->terminal_node.slot; pr_devel("-->%s()\n", __func__); /* We arrived at a node which doesn't have an onward node or shortcut * pointer that we have to follow. This means that (a) the leaf we * want must go here (either by insertion or replacement) or (b) we * need to split this node and insert in one of the fragments. */ free_slot = -1; /* Firstly, we have to check the leaves in this node to see if there's * a matching one we should replace in place. */ for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { ptr = node->slots[i]; if (!ptr) { free_slot = i; continue; } if (assoc_array_ptr_is_leaf(ptr) && ops->compare_object(assoc_array_ptr_to_leaf(ptr), index_key)) { pr_devel("replace in slot %d\n", i); edit->leaf_p = &node->slots[i]; edit->dead_leaf = node->slots[i]; pr_devel("<--%s() = ok [replace]\n", __func__); return true; } } /* If there is a free slot in this node then we can just insert the * leaf here. */ if (free_slot >= 0) { pr_devel("insert in free slot %d\n", free_slot); edit->leaf_p = &node->slots[free_slot]; edit->adjust_count_on = node; pr_devel("<--%s() = ok [insert]\n", __func__); return true; } /* The node has no spare slots - so we're either going to have to split * it or insert another node before it. * * Whatever, we're going to need at least two new nodes - so allocate * those now. We may also need a new shortcut, but we deal with that * when we need it. */ new_n0 = kzalloc_obj(struct assoc_array_node); if (!new_n0) return false; edit->new_meta[0] = assoc_array_node_to_ptr(new_n0); new_n1 = kzalloc_obj(struct assoc_array_node); if (!new_n1) return false; edit->new_meta[1] = assoc_array_node_to_ptr(new_n1); /* We need to find out how similar the leaves are. */ pr_devel("no spare slots\n"); have_meta = false; for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { ptr = node->slots[i]; if (assoc_array_ptr_is_meta(ptr)) { edit->segment_cache[i] = 0xff; have_meta = true; continue; } base_seg = ops->get_object_key_chunk( assoc_array_ptr_to_leaf(ptr), level); base_seg >>= level & ASSOC_ARRAY_KEY_CHUNK_MASK; edit->segment_cache[i] = base_seg & ASSOC_ARRAY_FAN_MASK; } if (have_meta) { pr_devel("have meta\n"); goto split_node; } /* The node contains only leaves */ dissimilarity = 0; base_seg = edit->segment_cache[0]; for (i = 1; i < ASSOC_ARRAY_FAN_OUT; i++) dissimilarity |= edit->segment_cache[i] ^ base_seg; pr_devel("only leaves; dissimilarity=%lx\n", dissimilarity); if ((dissimilarity & ASSOC_ARRAY_FAN_MASK) == 0) { /* The old leaves all cluster in the same slot. We will need * to insert a shortcut if the new node wants to cluster with them. */ if ((edit->segment_cache[ASSOC_ARRAY_FAN_OUT] ^ base_seg) == 0) goto all_leaves_cluster_together; /* Otherwise all the old leaves cluster in the same slot, but * the new leaf wants to go into a different slot - so we * create a new node (n0) to hold the new leaf and a pointer to * a new node (n1) holding all the old leaves. * * This can be done by falling through to the node splitting * path. */ pr_devel("present leaves cluster but not new leaf\n"); } split_node: pr_devel("split node\n"); /* We need to split the current node. The node must contain anything * from a single leaf (in the one leaf case, this leaf will cluster * with the new leaf) and the rest meta-pointers, to all leaves, some * of which may cluster. * * It won't contain the case in which all the current leaves plus the * new leaves want to cluster in the same slot. * * We need to expel at least two leaves out of a set consisting of the * leaves in the node and the new leaf. The current meta pointers can * just be copied as they shouldn't cluster with any of the leaves. * * We need a new node (n0) to replace the current one and a new node to * take the expelled nodes (n1). */ edit->set[0].to = assoc_array_node_to_ptr(new_n0); new_n0->back_pointer = node->back_pointer; new_n0->parent_slot = node->parent_slot; new_n1->back_pointer = assoc_array_node_to_ptr(new_n0); new_n1->parent_slot = -1; /* Need to calculate this */ do_split_node: pr_devel("do_split_node\n"); new_n0->nr_leaves_on_branch = node->nr_leaves_on_branch; new_n1->nr_leaves_on_branch = 0; /* Begin by finding two matching leaves. There have to be at least two * that match - even if there are meta pointers - because any leaf that * would match a slot with a meta pointer in it must be somewhere * behind that meta pointer and cannot be here. Further, given N * remaining leaf slots, we now have N+1 leaves to go in them. */ for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { slot = edit->segment_cache[i]; if (slot != 0xff) for (j = i + 1; j < ASSOC_ARRAY_FAN_OUT + 1; j++) if (edit->segment_cache[j] == slot) goto found_slot_for_multiple_occupancy; } found_slot_for_multiple_occupancy: pr_devel("same slot: %x %x [%02x]\n", i, j, slot); BUG_ON(i >= ASSOC_ARRAY_FAN_OUT); BUG_ON(j >= ASSOC_ARRAY_FAN_OUT + 1); BUG_ON(slot >= ASSOC_ARRAY_FAN_OUT); new_n1->parent_slot = slot; /* Metadata pointers cannot change slot */ for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) if (assoc_array_ptr_is_meta(node->slots[i])) new_n0->slots[i] = node->slots[i]; else new_n0->slots[i] = NULL; BUG_ON(new_n0->slots[slot] != NULL); new_n0->slots[slot] = assoc_array_node_to_ptr(new_n1); /* Filter the leaf pointers between the new nodes */ free_slot = -1; next_slot = 0; for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { if (assoc_array_ptr_is_meta(node->slots[i])) continue; if (edit->segment_cache[i] == slot) { new_n1->slots[next_slot++] = node->slots[i]; new_n1->nr_leaves_on_branch++; } else { do { free_slot++; } while (new_n0->slots[free_slot] != NULL); new_n0->slots[free_slot] = node->slots[i]; } } pr_devel("filtered: f=%x n=%x\n", free_slot, next_slot); if (edit->segment_cache[ASSOC_ARRAY_FAN_OUT] != slot) { do { free_slot++; } while (new_n0->slots[free_slot] != NULL); edit->leaf_p = &new_n0->slots[free_slot]; edit->adjust_count_on = new_n0; } else { edit->leaf_p = &new_n1->slots[next_slot++]; edit->adjust_count_on = new_n1; } BUG_ON(next_slot <= 1); edit->set_backpointers_to = assoc_array_node_to_ptr(new_n0); for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { if (edit->segment_cache[i] == 0xff) { ptr = node->slots[i]; BUG_ON(assoc_array_ptr_is_leaf(ptr)); if (assoc_array_ptr_is_node(ptr)) { side = assoc_array_ptr_to_node(ptr); edit->set_backpointers[i] = &side->back_pointer; } else { shortcut = assoc_array_ptr_to_shortcut(ptr); edit->set_backpointers[i] = &shortcut->back_pointer; } } } ptr = node->back_pointer; if (!ptr) edit->set[0].ptr = &edit->array->root; else if (assoc_array_ptr_is_node(ptr)) edit->set[0].ptr = &assoc_array_ptr_to_node(ptr)->slots[node->parent_slot]; else edit->set[0].ptr = &assoc_array_ptr_to_shortcut(ptr)->next_node; edit->excised_meta[0] = assoc_array_node_to_ptr(node); pr_devel("<--%s() = ok [split node]\n", __func__); return true; all_leaves_cluster_together: /* All the leaves, new and old, want to cluster together in this node * in the same slot, so we have to replace this node with a shortcut to * skip over the identical parts of the key and then place a pair of * nodes, one inside the other, at the end of the shortcut and * distribute the keys between them. * * Firstly we need to work out where the leaves start diverging as a * bit position into their keys so that we know how big the shortcut * needs to be. * * We only need to make a single pass of N of the N+1 leaves because if * any keys differ between themselves at bit X then at least one of * them must also differ with the base key at bit X or before. */ pr_devel("all leaves cluster together\n"); diff = INT_MAX; for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { int x = ops->diff_objects(assoc_array_ptr_to_leaf(node->slots[i]), index_key); if (x < diff) { BUG_ON(x < 0); diff = x; } } BUG_ON(diff == INT_MAX); BUG_ON(diff < level + ASSOC_ARRAY_LEVEL_STEP); keylen = round_up(diff, ASSOC_ARRAY_KEY_CHUNK_SIZE); keylen >>= ASSOC_ARRAY_KEY_CHUNK_SHIFT; new_s0 = kzalloc_flex(*new_s0, index_key, keylen); if (!new_s0) return false; edit->new_meta[2] = assoc_array_shortcut_to_ptr(new_s0); edit->set[0].to = assoc_array_shortcut_to_ptr(new_s0); new_s0->back_pointer = node->back_pointer; new_s0->parent_slot = node->parent_slot; new_s0->next_node = assoc_array_node_to_ptr(new_n0); new_n0->back_pointer = assoc_array_shortcut_to_ptr(new_s0); new_n0->parent_slot = 0; new_n1->back_pointer = assoc_array_node_to_ptr(new_n0); new_n1->parent_slot = -1; /* Need to calculate this */ new_s0->skip_to_level = level = diff & ~ASSOC_ARRAY_LEVEL_STEP_MASK; pr_devel("skip_to_level = %d [diff %d]\n", level, diff); BUG_ON(level <= 0); for (i = 0; i < keylen; i++) new_s0->index_key[i] = ops->get_key_chunk(index_key, i * ASSOC_ARRAY_KEY_CHUNK_SIZE); if (level & ASSOC_ARRAY_KEY_CHUNK_MASK) { blank = ULONG_MAX << (level & ASSOC_ARRAY_KEY_CHUNK_MASK); pr_devel("blank off [%zu] %d: %lx\n", keylen - 1, level, blank); new_s0->index_key[keylen - 1] &= ~blank; } /* This now reduces to a node splitting exercise for which we'll need * to regenerate the disparity table. */ for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { ptr = node->slots[i]; base_seg = ops->get_object_key_chunk(assoc_array_ptr_to_leaf(ptr), level); base_seg >>= level & ASSOC_ARRAY_KEY_CHUNK_MASK; edit->segment_cache[i] = base_seg & ASSOC_ARRAY_FAN_MASK; } base_seg = ops->get_key_chunk(index_key, level); base_seg >>= level & ASSOC_ARRAY_KEY_CHUNK_MASK; edit->segment_cache[ASSOC_ARRAY_FAN_OUT] = base_seg & ASSOC_ARRAY_FAN_MASK; goto do_split_node; } /* * Handle insertion into the middle of a shortcut. */ static bool assoc_array_insert_mid_shortcut(struct assoc_array_edit *edit, const struct assoc_array_ops *ops, struct assoc_array_walk_result *result) { struct assoc_array_shortcut *shortcut, *new_s0, *new_s1; struct assoc_array_node *node, *new_n0, *side; unsigned long sc_segments, dissimilarity, blank; size_t keylen; int level, sc_level, diff; int sc_slot; shortcut = result->wrong_shortcut.shortcut; level = result->wrong_shortcut.level; sc_level = result->wrong_shortcut.sc_level; sc_segments = result->wrong_shortcut.sc_segments; dissimilarity = result->wrong_shortcut.dissimilarity; pr_devel("-->%s(ix=%d dis=%lx scix=%d)\n", __func__, level, dissimilarity, sc_level); /* We need to split a shortcut and insert a node between the two * pieces. Zero-length pieces will be dispensed with entirely. * * First of all, we need to find out in which level the first * difference was. */ diff = __ffs(dissimilarity); diff &= ~ASSOC_ARRAY_LEVEL_STEP_MASK; diff += sc_level & ~ASSOC_ARRAY_KEY_CHUNK_MASK; pr_devel("diff=%d\n", diff); if (!shortcut->back_pointer) { edit->set[0].ptr = &edit->array->root; } else if (assoc_array_ptr_is_node(shortcut->back_pointer)) { node = assoc_array_ptr_to_node(shortcut->back_pointer); edit->set[0].ptr = &node->slots[shortcut->parent_slot]; } else { BUG(); } edit->excised_meta[0] = assoc_array_shortcut_to_ptr(shortcut); /* Create a new node now since we're going to need it anyway */ new_n0 = kzalloc_obj(struct assoc_array_node); if (!new_n0) return false; edit->new_meta[0] = assoc_array_node_to_ptr(new_n0); edit->adjust_count_on = new_n0; /* Insert a new shortcut before the new node if this segment isn't of * zero length - otherwise we just connect the new node directly to the * parent. */ level += ASSOC_ARRAY_LEVEL_STEP; if (diff > level) { pr_devel("pre-shortcut %d...%d\n", level, diff); keylen = round_up(diff, ASSOC_ARRAY_KEY_CHUNK_SIZE); keylen >>= ASSOC_ARRAY_KEY_CHUNK_SHIFT; new_s0 = kzalloc_flex(*new_s0, index_key, keylen); if (!new_s0) return false; edit->new_meta[1] = assoc_array_shortcut_to_ptr(new_s0); edit->set[0].to = assoc_array_shortcut_to_ptr(new_s0); new_s0->back_pointer = shortcut->back_pointer; new_s0->parent_slot = shortcut->parent_slot; new_s0->next_node = assoc_array_node_to_ptr(new_n0); new_s0->skip_to_level = diff; new_n0->back_pointer = assoc_array_shortcut_to_ptr(new_s0); new_n0->parent_slot = 0; memcpy(new_s0->index_key, shortcut->index_key, flex_array_size(new_s0, index_key, keylen)); blank = ULONG_MAX << (diff & ASSOC_ARRAY_KEY_CHUNK_MASK); pr_devel("blank off [%zu] %d: %lx\n", keylen - 1, diff, blank); new_s0->index_key[keylen - 1] &= ~blank; } else { pr_devel("no pre-shortcut\n"); edit->set[0].to = assoc_array_node_to_ptr(new_n0); new_n0->back_pointer = shortcut->back_pointer; new_n0->parent_slot = shortcut->parent_slot; } side = assoc_array_ptr_to_node(shortcut->next_node); new_n0->nr_leaves_on_branch = side->nr_leaves_on_branch; /* We need to know which slot in the new node is going to take a * metadata pointer. */ sc_slot = sc_segments >> (diff & ASSOC_ARRAY_KEY_CHUNK_MASK); sc_slot &= ASSOC_ARRAY_FAN_MASK; pr_devel("new slot %lx >> %d -> %d\n", sc_segments, diff & ASSOC_ARRAY_KEY_CHUNK_MASK, sc_slot); /* Determine whether we need to follow the new node with a replacement * for the current shortcut. We could in theory reuse the current * shortcut if its parent slot number doesn't change - but that's a * 1-in-16 chance so not worth expending the code upon. */ level = diff + ASSOC_ARRAY_LEVEL_STEP; if (level < shortcut->skip_to_level) { pr_devel("post-shortcut %d...%d\n", level, shortcut->skip_to_level); keylen = round_up(shortcut->skip_to_level, ASSOC_ARRAY_KEY_CHUNK_SIZE); keylen >>= ASSOC_ARRAY_KEY_CHUNK_SHIFT; new_s1 = kzalloc_flex(*new_s1, index_key, keylen); if (!new_s1) return false; edit->new_meta[2] = assoc_array_shortcut_to_ptr(new_s1); new_s1->back_pointer = assoc_array_node_to_ptr(new_n0); new_s1->parent_slot = sc_slot; new_s1->next_node = shortcut->next_node; new_s1->skip_to_level = shortcut->skip_to_level; new_n0->slots[sc_slot] = assoc_array_shortcut_to_ptr(new_s1); memcpy(new_s1->index_key, shortcut->index_key, flex_array_size(new_s1, index_key, keylen)); edit->set[1].ptr = &side->back_pointer; edit->set[1].to = assoc_array_shortcut_to_ptr(new_s1); } else { pr_devel("no post-shortcut\n"); /* We don't have to replace the pointed-to node as long as we * use memory barriers to make sure the parent slot number is * changed before the back pointer (the parent slot number is * irrelevant to the old parent shortcut). */ new_n0->slots[sc_slot] = shortcut->next_node; edit->set_parent_slot[0].p = &side->parent_slot; edit->set_parent_slot[0].to = sc_slot; edit->set[1].ptr = &side->back_pointer; edit->set[1].to = assoc_array_node_to_ptr(new_n0); } /* Install the new leaf in a spare slot in the new node. */ if (sc_slot == 0) edit->leaf_p = &new_n0->slots[1]; else edit->leaf_p = &new_n0->slots[0]; pr_devel("<--%s() = ok [split shortcut]\n", __func__); return true; } /** * assoc_array_insert - Script insertion of an object into an associative array * @array: The array to insert into. * @ops: The operations to use. * @index_key: The key to insert at. * @object: The object to insert. * * Precalculate and preallocate a script for the insertion or replacement of an * object in an associative array. This results in an edit script that can * either be applied or cancelled. * * The function returns a pointer to an edit script or -ENOMEM. * * The caller should lock against other modifications and must continue to hold * the lock until assoc_array_apply_edit() has been called. * * Accesses to the tree may take place concurrently with this function, * provided they hold the RCU read lock. */ struct assoc_array_edit *assoc_array_insert(struct assoc_array *array, const struct assoc_array_ops *ops, const void *index_key, void *object) { struct assoc_array_walk_result result; struct assoc_array_edit *edit; pr_devel("-->%s()\n", __func__); /* The leaf pointer we're given must not have the bottom bit set as we * use those for type-marking the pointer. NULL pointers are also not * allowed as they indicate an empty slot but we have to allow them * here as they can be updated later. */ BUG_ON(assoc_array_ptr_is_meta(object)); edit = kzalloc_obj(struct assoc_array_edit); if (!edit) return ERR_PTR(-ENOMEM); edit->array = array; edit->ops = ops; edit->leaf = assoc_array_leaf_to_ptr(object); edit->adjust_count_by = 1; switch (assoc_array_walk(array, ops, index_key, &result)) { case assoc_array_walk_tree_empty: /* Allocate a root node if there isn't one yet */ if (!assoc_array_insert_in_empty_tree(edit)) goto enomem; return edit; case assoc_array_walk_found_terminal_node: /* We found a node that doesn't have a node/shortcut pointer in * the slot corresponding to the index key that we have to * follow. */ if (!assoc_array_insert_into_terminal_node(edit, ops, index_key, &result)) goto enomem; return edit; case assoc_array_walk_found_wrong_shortcut: /* We found a shortcut that didn't match our key in a slot we * needed to follow. */ if (!assoc_array_insert_mid_shortcut(edit, ops, &result)) goto enomem; return edit; } enomem: /* Clean up after an out of memory error */ pr_devel("enomem\n"); assoc_array_cancel_edit(edit); return ERR_PTR(-ENOMEM); } /** * assoc_array_insert_set_object - Set the new object pointer in an edit script * @edit: The edit script to modify. * @object: The object pointer to set. * * Change the object to be inserted in an edit script. The object pointed to * by the old object is not freed. This must be done prior to applying the * script. */ void assoc_array_insert_set_object(struct assoc_array_edit *edit, void *object) { BUG_ON(!object); edit->leaf = assoc_array_leaf_to_ptr(object); } struct assoc_array_delete_collapse_context { struct assoc_array_node *node; const void *skip_leaf; int slot; }; /* * Subtree collapse to node iterator. */ static int assoc_array_delete_collapse_iterator(const void *leaf, void *iterator_data) { struct assoc_array_delete_collapse_context *collapse = iterator_data; if (leaf == collapse->skip_leaf) return 0; BUG_ON(collapse->slot >= ASSOC_ARRAY_FAN_OUT); collapse->node->slots[collapse->slot++] = assoc_array_leaf_to_ptr(leaf); return 0; } /** * assoc_array_delete - Script deletion of an object from an associative array * @array: The array to search. * @ops: The operations to use. * @index_key: The key to the object. * * Precalculate and preallocate a script for the deletion of an object from an * associative array. This results in an edit script that can either be * applied or cancelled. * * The function returns a pointer to an edit script if the object was found, * NULL if the object was not found or -ENOMEM. * * The caller should lock against other modifications and must continue to hold * the lock until assoc_array_apply_edit() has been called. * * Accesses to the tree may take place concurrently with this function, * provided they hold the RCU read lock. */ struct assoc_array_edit *assoc_array_delete(struct assoc_array *array, const struct assoc_array_ops *ops, const void *index_key) { struct assoc_array_delete_collapse_context collapse; struct assoc_array_walk_result result; struct assoc_array_node *node, *new_n0; struct assoc_array_edit *edit; struct assoc_array_ptr *ptr; bool has_meta; int slot, i; pr_devel("-->%s()\n", __func__); edit = kzalloc_obj(struct assoc_array_edit); if (!edit) return ERR_PTR(-ENOMEM); edit->array = array; edit->ops = ops; edit->adjust_count_by = -1; switch (assoc_array_walk(array, ops, index_key, &result)) { case assoc_array_walk_found_terminal_node: /* We found a node that should contain the leaf we've been * asked to remove - *if* it's in the tree. */ pr_devel("terminal_node\n"); node = result.terminal_node.node; for (slot = 0; slot < ASSOC_ARRAY_FAN_OUT; slot++) { ptr = node->slots[slot]; if (ptr && assoc_array_ptr_is_leaf(ptr) && ops->compare_object(assoc_array_ptr_to_leaf(ptr), index_key)) goto found_leaf; } fallthrough; case assoc_array_walk_tree_empty: case assoc_array_walk_found_wrong_shortcut: default: assoc_array_cancel_edit(edit); pr_devel("not found\n"); return NULL; } found_leaf: BUG_ON(array->nr_leaves_on_tree <= 0); /* In the simplest form of deletion we just clear the slot and release * the leaf after a suitable interval. */ edit->dead_leaf = node->slots[slot]; edit->set[0].ptr = &node->slots[slot]; edit->set[0].to = NULL; edit->adjust_count_on = node; /* If that concludes erasure of the last leaf, then delete the entire * internal array. */ if (array->nr_leaves_on_tree == 1) { edit->set[1].ptr = &array->root; edit->set[1].to = NULL; edit->adjust_count_on = NULL; edit->excised_subtree = array->root; pr_devel("all gone\n"); return edit; } /* However, we'd also like to clear up some metadata blocks if we * possibly can. * * We go for a simple algorithm of: if this node has FAN_OUT or fewer * leaves in it, then attempt to collapse it - and attempt to * recursively collapse up the tree. * * We could also try and collapse in partially filled subtrees to take * up space in this node. */ if (node->nr_leaves_on_branch <= ASSOC_ARRAY_FAN_OUT + 1) { struct assoc_array_node *parent, *grandparent; struct assoc_array_ptr *ptr; /* First of all, we need to know if this node has metadata so * that we don't try collapsing if all the leaves are already * here. */ has_meta = false; for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { ptr = node->slots[i]; if (assoc_array_ptr_is_meta(ptr)) { has_meta = true; break; } } pr_devel("leaves: %ld [m=%d]\n", node->nr_leaves_on_branch - 1, has_meta); /* Look further up the tree to see if we can collapse this node * into a more proximal node too. */ parent = node; collapse_up: pr_devel("collapse subtree: %ld\n", parent->nr_leaves_on_branch); ptr = parent->back_pointer; if (!ptr) goto do_collapse; if (assoc_array_ptr_is_shortcut(ptr)) { struct assoc_array_shortcut *s = assoc_array_ptr_to_shortcut(ptr); ptr = s->back_pointer; if (!ptr) goto do_collapse; } grandparent = assoc_array_ptr_to_node(ptr); if (grandparent->nr_leaves_on_branch <= ASSOC_ARRAY_FAN_OUT + 1) { parent = grandparent; goto collapse_up; } do_collapse: /* There's no point collapsing if the original node has no meta * pointers to discard and if we didn't merge into one of that * node's ancestry. */ if (has_meta || parent != node) { node = parent; /* Create a new node to collapse into */ new_n0 = kzalloc_obj(struct assoc_array_node); if (!new_n0) goto enomem; edit->new_meta[0] = assoc_array_node_to_ptr(new_n0); new_n0->back_pointer = node->back_pointer; new_n0->parent_slot = node->parent_slot; new_n0->nr_leaves_on_branch = node->nr_leaves_on_branch; edit->adjust_count_on = new_n0; collapse.node = new_n0; collapse.skip_leaf = assoc_array_ptr_to_leaf(edit->dead_leaf); collapse.slot = 0; assoc_array_subtree_iterate(assoc_array_node_to_ptr(node), node->back_pointer, assoc_array_delete_collapse_iterator, &collapse); pr_devel("collapsed %d,%lu\n", collapse.slot, new_n0->nr_leaves_on_branch); BUG_ON(collapse.slot != new_n0->nr_leaves_on_branch - 1); if (!node->back_pointer) { edit->set[1].ptr = &array->root; } else if (assoc_array_ptr_is_leaf(node->back_pointer)) { BUG(); } else if (assoc_array_ptr_is_node(node->back_pointer)) { struct assoc_array_node *p = assoc_array_ptr_to_node(node->back_pointer); edit->set[1].ptr = &p->slots[node->parent_slot]; } else if (assoc_array_ptr_is_shortcut(node->back_pointer)) { struct assoc_array_shortcut *s = assoc_array_ptr_to_shortcut(node->back_pointer); edit->set[1].ptr = &s->next_node; } edit->set[1].to = assoc_array_node_to_ptr(new_n0); edit->excised_subtree = assoc_array_node_to_ptr(node); } } return edit; enomem: /* Clean up after an out of memory error */ pr_devel("enomem\n"); assoc_array_cancel_edit(edit); return ERR_PTR(-ENOMEM); } /** * assoc_array_clear - Script deletion of all objects from an associative array * @array: The array to clear. * @ops: The operations to use. * * Precalculate and preallocate a script for the deletion of all the objects * from an associative array. This results in an edit script that can either * be applied or cancelled. * * The function returns a pointer to an edit script if there are objects to be * deleted, NULL if there are no objects in the array or -ENOMEM. * * The caller should lock against other modifications and must continue to hold * the lock until assoc_array_apply_edit() has been called. * * Accesses to the tree may take place concurrently with this function, * provided they hold the RCU read lock. */ struct assoc_array_edit *assoc_array_clear(struct assoc_array *array, const struct assoc_array_ops *ops) { struct assoc_array_edit *edit; pr_devel("-->%s()\n", __func__); if (!array->root) return NULL; edit = kzalloc_obj(struct assoc_array_edit); if (!edit) return ERR_PTR(-ENOMEM); edit->array = array; edit->ops = ops; edit->set[1].ptr = &array->root; edit->set[1].to = NULL; edit->excised_subtree = array->root; edit->ops_for_excised_subtree = ops; pr_devel("all gone\n"); return edit; } /* * Handle the deferred destruction after an applied edit. */ static void assoc_array_rcu_cleanup(struct rcu_head *head) { struct assoc_array_edit *edit = container_of(head, struct assoc_array_edit, rcu); int i; pr_devel("-->%s()\n", __func__); if (edit->dead_leaf) edit->ops->free_object(assoc_array_ptr_to_leaf(edit->dead_leaf)); for (i = 0; i < ARRAY_SIZE(edit->excised_meta); i++) if (edit->excised_meta[i]) kfree(assoc_array_ptr_to_node(edit->excised_meta[i])); if (edit->excised_subtree) { BUG_ON(assoc_array_ptr_is_leaf(edit->excised_subtree)); if (assoc_array_ptr_is_node(edit->excised_subtree)) { struct assoc_array_node *n = assoc_array_ptr_to_node(edit->excised_subtree); n->back_pointer = NULL; } else { struct assoc_array_shortcut *s = assoc_array_ptr_to_shortcut(edit->excised_subtree); s->back_pointer = NULL; } assoc_array_destroy_subtree(edit->excised_subtree, edit->ops_for_excised_subtree); } kfree(edit); } /** * assoc_array_apply_edit - Apply an edit script to an associative array * @edit: The script to apply. * * Apply an edit script to an associative array to effect an insertion, * deletion or clearance. As the edit script includes preallocated memory, * this is guaranteed not to fail. * * The edit script, dead objects and dead metadata will be scheduled for * destruction after an RCU grace period to permit those doing read-only * accesses on the array to continue to do so under the RCU read lock whilst * the edit is taking place. */ void assoc_array_apply_edit(struct assoc_array_edit *edit) { struct assoc_array_shortcut *shortcut; struct assoc_array_node *node; struct assoc_array_ptr *ptr; int i; pr_devel("-->%s()\n", __func__); smp_wmb(); if (edit->leaf_p) *edit->leaf_p = edit->leaf; smp_wmb(); for (i = 0; i < ARRAY_SIZE(edit->set_parent_slot); i++) if (edit->set_parent_slot[i].p) *edit->set_parent_slot[i].p = edit->set_parent_slot[i].to; smp_wmb(); for (i = 0; i < ARRAY_SIZE(edit->set_backpointers); i++) if (edit->set_backpointers[i]) *edit->set_backpointers[i] = edit->set_backpointers_to; smp_wmb(); for (i = 0; i < ARRAY_SIZE(edit->set); i++) if (edit->set[i].ptr) *edit->set[i].ptr = edit->set[i].to; if (edit->array->root == NULL) { edit->array->nr_leaves_on_tree = 0; } else if (edit->adjust_count_on) { node = edit->adjust_count_on; for (;;) { node->nr_leaves_on_branch += edit->adjust_count_by; ptr = node->back_pointer; if (!ptr) break; if (assoc_array_ptr_is_shortcut(ptr)) { shortcut = assoc_array_ptr_to_shortcut(ptr); ptr = shortcut->back_pointer; if (!ptr) break; } BUG_ON(!assoc_array_ptr_is_node(ptr)); node = assoc_array_ptr_to_node(ptr); } edit->array->nr_leaves_on_tree += edit->adjust_count_by; } call_rcu(&edit->rcu, assoc_array_rcu_cleanup); } /** * assoc_array_cancel_edit - Discard an edit script. * @edit: The script to discard. * * Free an edit script and all the preallocated data it holds without making * any changes to the associative array it was intended for. * * NOTE! In the case of an insertion script, this does _not_ release the leaf * that was to be inserted. That is left to the caller. */ void assoc_array_cancel_edit(struct assoc_array_edit *edit) { struct assoc_array_ptr *ptr; int i; pr_devel("-->%s()\n", __func__); /* Clean up after an out of memory error */ for (i = 0; i < ARRAY_SIZE(edit->new_meta); i++) { ptr = edit->new_meta[i]; if (ptr) { if (assoc_array_ptr_is_node(ptr)) kfree(assoc_array_ptr_to_node(ptr)); else kfree(assoc_array_ptr_to_shortcut(ptr)); } } kfree(edit); } /** * assoc_array_gc - Garbage collect an associative array. * @array: The array to clean. * @ops: The operations to use. * @iterator: A callback function to pass judgement on each object. * @iterator_data: Private data for the callback function. * * Collect garbage from an associative array and pack down the internal tree to * save memory. * * The iterator function is asked to pass judgement upon each object in the * array. If it returns false, the object is discard and if it returns true, * the object is kept. If it returns true, it must increment the object's * usage count (or whatever it needs to do to retain it) before returning. * * This function returns 0 if successful or -ENOMEM if out of memory. In the * latter case, the array is not changed. * * The caller should lock against other modifications and must continue to hold * the lock until assoc_array_apply_edit() has been called. * * Accesses to the tree may take place concurrently with this function, * provided they hold the RCU read lock. */ int assoc_array_gc(struct assoc_array *array, const struct assoc_array_ops *ops, bool (*iterator)(void *object, void *iterator_data), void *iterator_data) { struct assoc_array_shortcut *shortcut, *new_s; struct assoc_array_node *node, *new_n; struct assoc_array_edit *edit; struct assoc_array_ptr *cursor, *ptr; struct assoc_array_ptr *new_root, *new_parent, **new_ptr_pp; unsigned long nr_leaves_on_tree; bool retained; int keylen, slot, nr_free, next_slot, i; pr_devel("-->%s()\n", __func__); if (!array->root) return 0; edit = kzalloc_obj(struct assoc_array_edit); if (!edit) return -ENOMEM; edit->array = array; edit->ops = ops; edit->ops_for_excised_subtree = ops; edit->set[0].ptr = &array->root; edit->excised_subtree = array->root; new_root = new_parent = NULL; new_ptr_pp = &new_root; cursor = array->root; descend: /* If this point is a shortcut, then we need to duplicate it and * advance the target cursor. */ if (assoc_array_ptr_is_shortcut(cursor)) { shortcut = assoc_array_ptr_to_shortcut(cursor); keylen = round_up(shortcut->skip_to_level, ASSOC_ARRAY_KEY_CHUNK_SIZE); keylen >>= ASSOC_ARRAY_KEY_CHUNK_SHIFT; new_s = kmalloc_flex(*new_s, index_key, keylen); if (!new_s) goto enomem; pr_devel("dup shortcut %p -> %p\n", shortcut, new_s); memcpy(new_s, shortcut, struct_size(new_s, index_key, keylen)); new_s->back_pointer = new_parent; new_s->parent_slot = shortcut->parent_slot; *new_ptr_pp = new_parent = assoc_array_shortcut_to_ptr(new_s); new_ptr_pp = &new_s->next_node; cursor = shortcut->next_node; } /* Duplicate the node at this position */ node = assoc_array_ptr_to_node(cursor); new_n = kzalloc_obj(struct assoc_array_node); if (!new_n) goto enomem; pr_devel("dup node %p -> %p\n", node, new_n); new_n->back_pointer = new_parent; new_n->parent_slot = node->parent_slot; *new_ptr_pp = new_parent = assoc_array_node_to_ptr(new_n); new_ptr_pp = NULL; slot = 0; continue_node: /* Filter across any leaves and gc any subtrees */ for (; slot < ASSOC_ARRAY_FAN_OUT; slot++) { ptr = node->slots[slot]; if (!ptr) continue; if (assoc_array_ptr_is_leaf(ptr)) { if (iterator(assoc_array_ptr_to_leaf(ptr), iterator_data)) /* The iterator will have done any reference * counting on the object for us. */ new_n->slots[slot] = ptr; continue; } new_ptr_pp = &new_n->slots[slot]; cursor = ptr; goto descend; } retry_compress: pr_devel("-- compress node %p --\n", new_n); /* Count up the number of empty slots in this node and work out the * subtree leaf count. */ new_n->nr_leaves_on_branch = 0; nr_free = 0; for (slot = 0; slot < ASSOC_ARRAY_FAN_OUT; slot++) { ptr = new_n->slots[slot]; if (!ptr) nr_free++; else if (assoc_array_ptr_is_leaf(ptr)) new_n->nr_leaves_on_branch++; } pr_devel("free=%d, leaves=%lu\n", nr_free, new_n->nr_leaves_on_branch); /* See what we can fold in */ retained = false; next_slot = 0; for (slot = 0; slot < ASSOC_ARRAY_FAN_OUT; slot++) { struct assoc_array_shortcut *s; struct assoc_array_node *child; ptr = new_n->slots[slot]; if (!ptr || assoc_array_ptr_is_leaf(ptr)) continue; s = NULL; if (assoc_array_ptr_is_shortcut(ptr)) { s = assoc_array_ptr_to_shortcut(ptr); ptr = s->next_node; } child = assoc_array_ptr_to_node(ptr); new_n->nr_leaves_on_branch += child->nr_leaves_on_branch; if (child->nr_leaves_on_branch <= nr_free + 1) { /* Fold the child node into this one */ pr_devel("[%d] fold node %lu/%d [nx %d]\n", slot, child->nr_leaves_on_branch, nr_free + 1, next_slot); /* We would already have reaped an intervening shortcut * on the way back up the tree. */ BUG_ON(s); new_n->slots[slot] = NULL; nr_free++; if (slot < next_slot) next_slot = slot; for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { struct assoc_array_ptr *p = child->slots[i]; if (!p) continue; BUG_ON(assoc_array_ptr_is_meta(p)); while (new_n->slots[next_slot]) next_slot++; BUG_ON(next_slot >= ASSOC_ARRAY_FAN_OUT); new_n->slots[next_slot++] = p; nr_free--; } kfree(child); } else { pr_devel("[%d] retain node %lu/%d [nx %d]\n", slot, child->nr_leaves_on_branch, nr_free + 1, next_slot); retained = true; } } if (retained && new_n->nr_leaves_on_branch <= ASSOC_ARRAY_FAN_OUT) { pr_devel("internal nodes remain despite enough space, retrying\n"); goto retry_compress; } pr_devel("after: %lu\n", new_n->nr_leaves_on_branch); nr_leaves_on_tree = new_n->nr_leaves_on_branch; /* Excise this node if it is singly occupied by a shortcut */ if (nr_free == ASSOC_ARRAY_FAN_OUT - 1) { for (slot = 0; slot < ASSOC_ARRAY_FAN_OUT; slot++) if ((ptr = new_n->slots[slot])) break; if (assoc_array_ptr_is_meta(ptr) && assoc_array_ptr_is_shortcut(ptr)) { pr_devel("excise node %p with 1 shortcut\n", new_n); new_s = assoc_array_ptr_to_shortcut(ptr); new_parent = new_n->back_pointer; slot = new_n->parent_slot; kfree(new_n); if (!new_parent) { new_s->back_pointer = NULL; new_s->parent_slot = 0; new_root = ptr; goto gc_complete; } if (assoc_array_ptr_is_shortcut(new_parent)) { /* We can discard any preceding shortcut also */ struct assoc_array_shortcut *s = assoc_array_ptr_to_shortcut(new_parent); pr_devel("excise preceding shortcut\n"); new_parent = new_s->back_pointer = s->back_pointer; slot = new_s->parent_slot = s->parent_slot; kfree(s); if (!new_parent) { new_s->back_pointer = NULL; new_s->parent_slot = 0; new_root = ptr; goto gc_complete; } } new_s->back_pointer = new_parent; new_s->parent_slot = slot; new_n = assoc_array_ptr_to_node(new_parent); new_n->slots[slot] = ptr; goto ascend_old_tree; } } /* Excise any shortcuts we might encounter that point to nodes that * only contain leaves. */ ptr = new_n->back_pointer; if (!ptr) goto gc_complete; if (assoc_array_ptr_is_shortcut(ptr)) { new_s = assoc_array_ptr_to_shortcut(ptr); new_parent = new_s->back_pointer; slot = new_s->parent_slot; if (new_n->nr_leaves_on_branch <= ASSOC_ARRAY_FAN_OUT) { struct assoc_array_node *n; pr_devel("excise shortcut\n"); new_n->back_pointer = new_parent; new_n->parent_slot = slot; kfree(new_s); if (!new_parent) { new_root = assoc_array_node_to_ptr(new_n); goto gc_complete; } n = assoc_array_ptr_to_node(new_parent); n->slots[slot] = assoc_array_node_to_ptr(new_n); } } else { new_parent = ptr; } new_n = assoc_array_ptr_to_node(new_parent); ascend_old_tree: ptr = node->back_pointer; if (assoc_array_ptr_is_shortcut(ptr)) { shortcut = assoc_array_ptr_to_shortcut(ptr); slot = shortcut->parent_slot; cursor = shortcut->back_pointer; if (!cursor) goto gc_complete; } else { slot = node->parent_slot; cursor = ptr; } BUG_ON(!cursor); node = assoc_array_ptr_to_node(cursor); slot++; goto continue_node; gc_complete: edit->set[0].to = new_root; assoc_array_apply_edit(edit); array->nr_leaves_on_tree = nr_leaves_on_tree; return 0; enomem: pr_devel("enomem\n"); assoc_array_destroy_subtree(new_root, edit->ops); kfree(edit); return -ENOMEM; }
23 12 11 1 12 11 11 23 11 23 25 25 26 14 1 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 // SPDX-License-Identifier: GPL-2.0 #include <linux/file.h> #include <linux/mount.h> #include <linux/namei.h> #include <linux/utime.h> #include <linux/syscalls.h> #include <linux/uaccess.h> #include <linux/compat.h> #include <asm/unistd.h> #include <linux/filelock.h> #include "internal.h" static bool nsec_valid(long nsec) { if (nsec == UTIME_OMIT || nsec == UTIME_NOW) return true; return nsec >= 0 && nsec <= 999999999; } int vfs_utimes(const struct path *path, struct timespec64 *times) { int error; struct iattr newattrs; struct inode *inode = path->dentry->d_inode; struct delegated_inode delegated_inode = { }; if (times) { if (!nsec_valid(times[0].tv_nsec) || !nsec_valid(times[1].tv_nsec)) return -EINVAL; if (times[0].tv_nsec == UTIME_NOW && times[1].tv_nsec == UTIME_NOW) times = NULL; } error = mnt_want_write(path->mnt); if (error) goto out; newattrs.ia_valid = ATTR_CTIME | ATTR_MTIME | ATTR_ATIME; if (times) { if (times[0].tv_nsec == UTIME_OMIT) newattrs.ia_valid &= ~ATTR_ATIME; else if (times[0].tv_nsec != UTIME_NOW) { newattrs.ia_atime = times[0]; newattrs.ia_valid |= ATTR_ATIME_SET; } if (times[1].tv_nsec == UTIME_OMIT) newattrs.ia_valid &= ~ATTR_MTIME; else if (times[1].tv_nsec != UTIME_NOW) { newattrs.ia_mtime = times[1]; newattrs.ia_valid |= ATTR_MTIME_SET; } /* * Tell setattr_prepare(), that this is an explicit time * update, even if neither ATTR_ATIME_SET nor ATTR_MTIME_SET * were used. */ newattrs.ia_valid |= ATTR_TIMES_SET; } else { newattrs.ia_valid |= ATTR_TOUCH; } retry_deleg: inode_lock(inode); error = notify_change(mnt_idmap(path->mnt), path->dentry, &newattrs, &delegated_inode); inode_unlock(inode); if (is_delegated(&delegated_inode)) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } mnt_drop_write(path->mnt); out: return error; } EXPORT_SYMBOL_GPL(vfs_utimes); static int do_utimes_path(int dfd, const char __user *filename, struct timespec64 *times, int flags) { struct path path; int lookup_flags = 0, error; if (flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) return -EINVAL; if (!(flags & AT_SYMLINK_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; CLASS(filename_uflags, name)(filename, flags); retry: error = filename_lookup(dfd, name, lookup_flags, &path, NULL); if (error) return error; error = vfs_utimes(&path, times); path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } return error; } static int do_utimes_fd(int fd, struct timespec64 *times, int flags) { if (flags) return -EINVAL; CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; return vfs_utimes(&fd_file(f)->f_path, times); } /* * do_utimes - change times on filename or file descriptor * @dfd: open file descriptor, -1 or AT_FDCWD * @filename: path name or NULL * @times: new times or NULL * @flags: zero or more flags (only AT_SYMLINK_NOFOLLOW for the moment) * * If filename is NULL and dfd refers to an open file, then operate on * the file. Otherwise look up filename, possibly using dfd as a * starting point. * * If times==NULL, set access and modification to current time, * must be owner or have write permission. * Else, update from *times, must be owner or super user. */ long do_utimes(int dfd, const char __user *filename, struct timespec64 *times, int flags) { if (filename == NULL && dfd != AT_FDCWD) return do_utimes_fd(dfd, times, flags); return do_utimes_path(dfd, filename, times, flags); } SYSCALL_DEFINE4(utimensat, int, dfd, const char __user *, filename, struct __kernel_timespec __user *, utimes, int, flags) { struct timespec64 tstimes[2]; if (utimes) { if ((get_timespec64(&tstimes[0], &utimes[0]) || get_timespec64(&tstimes[1], &utimes[1]))) return -EFAULT; /* Nothing to do, we must not even check the path. */ if (tstimes[0].tv_nsec == UTIME_OMIT && tstimes[1].tv_nsec == UTIME_OMIT) return 0; } return do_utimes(dfd, filename, utimes ? tstimes : NULL, flags); } #ifdef __ARCH_WANT_SYS_UTIME /* * futimesat(), utimes() and utime() are older versions of utimensat() * that are provided for compatibility with traditional C libraries. * On modern architectures, we always use libc wrappers around * utimensat() instead. */ static long do_futimesat(int dfd, const char __user *filename, struct __kernel_old_timeval __user *utimes) { struct __kernel_old_timeval times[2]; struct timespec64 tstimes[2]; if (utimes) { if (copy_from_user(&times, utimes, sizeof(times))) return -EFAULT; /* This test is needed to catch all invalid values. If we would test only in do_utimes we would miss those invalid values truncated by the multiplication with 1000. Note that we also catch UTIME_{NOW,OMIT} here which are only valid for utimensat. */ if (times[0].tv_usec >= 1000000 || times[0].tv_usec < 0 || times[1].tv_usec >= 1000000 || times[1].tv_usec < 0) return -EINVAL; tstimes[0].tv_sec = times[0].tv_sec; tstimes[0].tv_nsec = 1000 * times[0].tv_usec; tstimes[1].tv_sec = times[1].tv_sec; tstimes[1].tv_nsec = 1000 * times[1].tv_usec; } return do_utimes(dfd, filename, utimes ? tstimes : NULL, 0); } SYSCALL_DEFINE3(futimesat, int, dfd, const char __user *, filename, struct __kernel_old_timeval __user *, utimes) { return do_futimesat(dfd, filename, utimes); } SYSCALL_DEFINE2(utimes, char __user *, filename, struct __kernel_old_timeval __user *, utimes) { return do_futimesat(AT_FDCWD, filename, utimes); } SYSCALL_DEFINE2(utime, char __user *, filename, struct utimbuf __user *, times) { struct timespec64 tv[2]; if (times) { if (get_user(tv[0].tv_sec, &times->actime) || get_user(tv[1].tv_sec, &times->modtime)) return -EFAULT; tv[0].tv_nsec = 0; tv[1].tv_nsec = 0; } return do_utimes(AT_FDCWD, filename, times ? tv : NULL, 0); } #endif #ifdef CONFIG_COMPAT_32BIT_TIME /* * Not all architectures have sys_utime, so implement this in terms * of sys_utimes. */ #ifdef __ARCH_WANT_SYS_UTIME32 SYSCALL_DEFINE2(utime32, const char __user *, filename, struct old_utimbuf32 __user *, t) { struct timespec64 tv[2]; if (t) { if (get_user(tv[0].tv_sec, &t->actime) || get_user(tv[1].tv_sec, &t->modtime)) return -EFAULT; tv[0].tv_nsec = 0; tv[1].tv_nsec = 0; } return do_utimes(AT_FDCWD, filename, t ? tv : NULL, 0); } #endif SYSCALL_DEFINE4(utimensat_time32, unsigned int, dfd, const char __user *, filename, struct old_timespec32 __user *, t, int, flags) { struct timespec64 tv[2]; if (t) { if (get_old_timespec32(&tv[0], &t[0]) || get_old_timespec32(&tv[1], &t[1])) return -EFAULT; if (tv[0].tv_nsec == UTIME_OMIT && tv[1].tv_nsec == UTIME_OMIT) return 0; } return do_utimes(dfd, filename, t ? tv : NULL, flags); } #ifdef __ARCH_WANT_SYS_UTIME32 static long do_compat_futimesat(unsigned int dfd, const char __user *filename, struct old_timeval32 __user *t) { struct timespec64 tv[2]; if (t) { if (get_user(tv[0].tv_sec, &t[0].tv_sec) || get_user(tv[0].tv_nsec, &t[0].tv_usec) || get_user(tv[1].tv_sec, &t[1].tv_sec) || get_user(tv[1].tv_nsec, &t[1].tv_usec)) return -EFAULT; if (tv[0].tv_nsec >= 1000000 || tv[0].tv_nsec < 0 || tv[1].tv_nsec >= 1000000 || tv[1].tv_nsec < 0) return -EINVAL; tv[0].tv_nsec *= 1000; tv[1].tv_nsec *= 1000; } return do_utimes(dfd, filename, t ? tv : NULL, 0); } SYSCALL_DEFINE3(futimesat_time32, unsigned int, dfd, const char __user *, filename, struct old_timeval32 __user *, t) { return do_compat_futimesat(dfd, filename, t); } SYSCALL_DEFINE2(utimes_time32, const char __user *, filename, struct old_timeval32 __user *, t) { return do_compat_futimesat(AT_FDCWD, filename, t); } #endif #endif
15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 13 13 13 13 13 13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 // SPDX-License-Identifier: GPL-2.0-or-later /* * Fast Userspace Mutexes (which I call "Futexes!"). * (C) Rusty Russell, IBM 2002 * * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar * (C) Copyright 2003 Red Hat Inc, All Rights Reserved * * Removed page pinning, fix privately mapped COW pages and other cleanups * (C) Copyright 2003, 2004 Jamie Lokier * * Robust futex support started by Ingo Molnar * (C) Copyright 2006 Red Hat Inc, All Rights Reserved * Thanks to Thomas Gleixner for suggestions, analysis and fixes. * * PI-futex support started by Ingo Molnar and Thomas Gleixner * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> * * PRIVATE futexes by Eric Dumazet * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com> * * Requeue-PI support by Darren Hart <dvhltc@us.ibm.com> * Copyright (C) IBM Corporation, 2009 * Thanks to Thomas Gleixner for conceptual design and careful reviews. * * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly * enough at me, Linus for the original (flawed) idea, Matthew * Kirkwood for proof-of-concept implementation. * * "The futexes are also cursed." * "But they come in a choice of three flavours!" */ #include <linux/compat.h> #include <linux/jhash.h> #include <linux/pagemap.h> #include <linux/debugfs.h> #include <linux/plist.h> #include <linux/gfp.h> #include <linux/vmalloc.h> #include <linux/memblock.h> #include <linux/fault-inject.h> #include <linux/slab.h> #include <linux/prctl.h> #include <linux/mempolicy.h> #include <linux/mmap_lock.h> #include "futex.h" #include "../locking/rtmutex_common.h" /* * The base of the bucket array and its size are always used together * (after initialization only in futex_hash()), so ensure that they * reside in the same cacheline. */ static struct { unsigned long hashmask; unsigned int hashshift; struct futex_hash_bucket *queues[MAX_NUMNODES]; } __futex_data __read_mostly __aligned(2*sizeof(long)); #define futex_hashmask (__futex_data.hashmask) #define futex_hashshift (__futex_data.hashshift) #define futex_queues (__futex_data.queues) struct futex_private_hash { int state; unsigned int hash_mask; struct rcu_head rcu; void *mm; bool custom; struct futex_hash_bucket queues[]; }; /* * Fault injections for futexes. */ #ifdef CONFIG_FAIL_FUTEX static struct { struct fault_attr attr; bool ignore_private; } fail_futex = { .attr = FAULT_ATTR_INITIALIZER, .ignore_private = false, }; static int __init setup_fail_futex(char *str) { return setup_fault_attr(&fail_futex.attr, str); } __setup("fail_futex=", setup_fail_futex); bool should_fail_futex(bool fshared) { if (fail_futex.ignore_private && !fshared) return false; return should_fail(&fail_futex.attr, 1); } #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS static int __init fail_futex_debugfs(void) { umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; struct dentry *dir; dir = fault_create_debugfs_attr("fail_futex", NULL, &fail_futex.attr); if (IS_ERR(dir)) return PTR_ERR(dir); debugfs_create_bool("ignore-private", mode, dir, &fail_futex.ignore_private); return 0; } late_initcall(fail_futex_debugfs); #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ #endif /* CONFIG_FAIL_FUTEX */ static struct futex_hash_bucket * __futex_hash(union futex_key *key, struct futex_private_hash *fph); #ifdef CONFIG_FUTEX_PRIVATE_HASH static bool futex_ref_get(struct futex_private_hash *fph); static bool futex_ref_put(struct futex_private_hash *fph); static bool futex_ref_is_dead(struct futex_private_hash *fph); enum { FR_PERCPU = 0, FR_ATOMIC }; static inline bool futex_key_is_private(union futex_key *key) { /* * Relies on get_futex_key() to set either bit for shared * futexes -- see comment with union futex_key. */ return !(key->both.offset & (FUT_OFF_INODE | FUT_OFF_MMSHARED)); } static bool futex_private_hash_get(struct futex_private_hash *fph) { return futex_ref_get(fph); } void futex_private_hash_put(struct futex_private_hash *fph) { if (futex_ref_put(fph)) wake_up_var(fph->mm); } /** * futex_hash_get - Get an additional reference for the local hash. * @hb: ptr to the private local hash. * * Obtain an additional reference for the already obtained hash bucket. The * caller must already own an reference. */ void futex_hash_get(struct futex_hash_bucket *hb) { struct futex_private_hash *fph = hb->priv; if (!fph) return; WARN_ON_ONCE(!futex_private_hash_get(fph)); } void futex_hash_put(struct futex_hash_bucket *hb) { struct futex_private_hash *fph = hb->priv; if (!fph) return; futex_private_hash_put(fph); } static struct futex_hash_bucket * __futex_hash_private(union futex_key *key, struct futex_private_hash *fph) { u32 hash; if (!futex_key_is_private(key)) return NULL; if (!fph) fph = rcu_dereference(key->private.mm->futex_phash); if (!fph || !fph->hash_mask) return NULL; hash = jhash2((void *)&key->private.address, sizeof(key->private.address) / 4, key->both.offset); return &fph->queues[hash & fph->hash_mask]; } static void futex_rehash_private(struct futex_private_hash *old, struct futex_private_hash *new) { struct futex_hash_bucket *hb_old, *hb_new; unsigned int slots = old->hash_mask + 1; unsigned int i; for (i = 0; i < slots; i++) { struct futex_q *this, *tmp; hb_old = &old->queues[i]; spin_lock(&hb_old->lock); plist_for_each_entry_safe(this, tmp, &hb_old->chain, list) { plist_del(&this->list, &hb_old->chain); futex_hb_waiters_dec(hb_old); WARN_ON_ONCE(this->lock_ptr != &hb_old->lock); hb_new = __futex_hash(&this->key, new); futex_hb_waiters_inc(hb_new); /* * The new pointer isn't published yet but an already * moved user can be unqueued due to timeout or signal. */ spin_lock_nested(&hb_new->lock, SINGLE_DEPTH_NESTING); plist_add(&this->list, &hb_new->chain); this->lock_ptr = &hb_new->lock; spin_unlock(&hb_new->lock); } spin_unlock(&hb_old->lock); } } static bool __futex_pivot_hash(struct mm_struct *mm, struct futex_private_hash *new) { struct futex_private_hash *fph; WARN_ON_ONCE(mm->futex_phash_new); fph = rcu_dereference_protected(mm->futex_phash, lockdep_is_held(&mm->futex_hash_lock)); if (fph) { if (!futex_ref_is_dead(fph)) { mm->futex_phash_new = new; return false; } futex_rehash_private(fph, new); } new->state = FR_PERCPU; scoped_guard(rcu) { mm->futex_batches = get_state_synchronize_rcu(); rcu_assign_pointer(mm->futex_phash, new); } kvfree_rcu(fph, rcu); return true; } static void futex_pivot_hash(struct mm_struct *mm) { scoped_guard(mutex, &mm->futex_hash_lock) { struct futex_private_hash *fph; fph = mm->futex_phash_new; if (fph) { mm->futex_phash_new = NULL; __futex_pivot_hash(mm, fph); } } } struct futex_private_hash *futex_private_hash(void) { struct mm_struct *mm = current->mm; /* * Ideally we don't loop. If there is a replacement in progress * then a new private hash is already prepared and a reference can't be * obtained once the last user dropped it's. * In that case we block on mm_struct::futex_hash_lock and either have * to perform the replacement or wait while someone else is doing the * job. Eitherway, on the second iteration we acquire a reference on the * new private hash or loop again because a new replacement has been * requested. */ again: scoped_guard(rcu) { struct futex_private_hash *fph; fph = rcu_dereference(mm->futex_phash); if (!fph) return NULL; if (futex_private_hash_get(fph)) return fph; } futex_pivot_hash(mm); goto again; } struct futex_hash_bucket *futex_hash(union futex_key *key) { struct futex_private_hash *fph; struct futex_hash_bucket *hb; again: scoped_guard(rcu) { hb = __futex_hash(key, NULL); fph = hb->priv; if (!fph || futex_private_hash_get(fph)) return hb; } futex_pivot_hash(key->private.mm); goto again; } #else /* !CONFIG_FUTEX_PRIVATE_HASH */ static struct futex_hash_bucket * __futex_hash_private(union futex_key *key, struct futex_private_hash *fph) { return NULL; } struct futex_hash_bucket *futex_hash(union futex_key *key) { return __futex_hash(key, NULL); } #endif /* CONFIG_FUTEX_PRIVATE_HASH */ #ifdef CONFIG_FUTEX_MPOL static int __futex_key_to_node(struct mm_struct *mm, unsigned long addr) { struct vm_area_struct *vma = vma_lookup(mm, addr); struct mempolicy *mpol; int node = FUTEX_NO_NODE; if (!vma) return FUTEX_NO_NODE; mpol = READ_ONCE(vma->vm_policy); if (!mpol) return FUTEX_NO_NODE; switch (mpol->mode) { case MPOL_PREFERRED: node = first_node(mpol->nodes); break; case MPOL_PREFERRED_MANY: case MPOL_BIND: if (mpol->home_node != NUMA_NO_NODE) node = mpol->home_node; break; default: break; } return node; } static int futex_key_to_node_opt(struct mm_struct *mm, unsigned long addr) { int seq, node; guard(rcu)(); if (!mmap_lock_speculate_try_begin(mm, &seq)) return -EBUSY; node = __futex_key_to_node(mm, addr); if (mmap_lock_speculate_retry(mm, seq)) return -EAGAIN; return node; } static int futex_mpol(struct mm_struct *mm, unsigned long addr) { int node; node = futex_key_to_node_opt(mm, addr); if (node >= FUTEX_NO_NODE) return node; guard(mmap_read_lock)(mm); return __futex_key_to_node(mm, addr); } #else /* !CONFIG_FUTEX_MPOL */ static int futex_mpol(struct mm_struct *mm, unsigned long addr) { return FUTEX_NO_NODE; } #endif /* CONFIG_FUTEX_MPOL */ /** * __futex_hash - Return the hash bucket * @key: Pointer to the futex key for which the hash is calculated * @fph: Pointer to private hash if known * * We hash on the keys returned from get_futex_key (see below) and return the * corresponding hash bucket. * If the FUTEX is PROCESS_PRIVATE then a per-process hash bucket (from the * private hash) is returned if existing. Otherwise a hash bucket from the * global hash is returned. */ static struct futex_hash_bucket * __futex_hash(union futex_key *key, struct futex_private_hash *fph) { int node = key->both.node; u32 hash; if (node == FUTEX_NO_NODE) { struct futex_hash_bucket *hb; hb = __futex_hash_private(key, fph); if (hb) return hb; } hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / sizeof(u32), key->both.offset); if (node == FUTEX_NO_NODE) { /* * In case of !FLAGS_NUMA, use some unused hash bits to pick a * node -- this ensures regular futexes are interleaved across * the nodes and avoids having to allocate multiple * hash-tables. * * NOTE: this isn't perfectly uniform, but it is fast and * handles sparse node masks. */ node = (hash >> futex_hashshift) % nr_node_ids; if (!node_possible(node)) { node = find_next_bit_wrap(node_possible_map.bits, nr_node_ids, node); } } return &futex_queues[node][hash & futex_hashmask]; } /** * futex_setup_timer - set up the sleeping hrtimer. * @time: ptr to the given timeout value * @timeout: the hrtimer_sleeper structure to be set up * @flags: futex flags * @range_ns: optional range in ns * * Return: Initialized hrtimer_sleeper structure or NULL if no timeout * value given */ struct hrtimer_sleeper * futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, int flags, u64 range_ns) { if (!time) return NULL; hrtimer_setup_sleeper_on_stack(timeout, (flags & FLAGS_CLOCKRT) ? CLOCK_REALTIME : CLOCK_MONOTONIC, HRTIMER_MODE_ABS); /* * If range_ns is 0, calling hrtimer_set_expires_range_ns() is * effectively the same as calling hrtimer_set_expires(). */ hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns); return timeout; } /* * Generate a machine wide unique identifier for this inode. * * This relies on u64 not wrapping in the life-time of the machine; which with * 1ns resolution means almost 585 years. * * This further relies on the fact that a well formed program will not unmap * the file while it has a (shared) futex waiting on it. This mapping will have * a file reference which pins the mount and inode. * * If for some reason an inode gets evicted and read back in again, it will get * a new sequence number and will _NOT_ match, even though it is the exact same * file. * * It is important that futex_match() will never have a false-positive, esp. * for PI futexes that can mess up the state. The above argues that false-negatives * are only possible for malformed programs. */ static u64 get_inode_sequence_number(struct inode *inode) { static atomic64_t i_seq; u64 old; /* Does the inode already have a sequence number? */ old = atomic64_read(&inode->i_sequence); if (likely(old)) return old; for (;;) { u64 new = atomic64_inc_return(&i_seq); if (WARN_ON_ONCE(!new)) continue; old = 0; if (!atomic64_try_cmpxchg_relaxed(&inode->i_sequence, &old, new)) return old; return new; } } /** * get_futex_key() - Get parameters which are the keys for a futex * @uaddr: virtual address of the futex * @flags: FLAGS_* * @key: address where result is stored. * @rw: mapping needs to be read/write (values: FUTEX_READ, * FUTEX_WRITE) * * Return: a negative error code or 0 * * The key words are stored in @key on success. * * For shared mappings (when @fshared), the key is: * * ( inode->i_sequence, page offset within mapping, offset_within_page ) * * [ also see get_inode_sequence_number() ] * * For private mappings (or when !@fshared), the key is: * * ( current->mm, address, 0 ) * * This allows (cross process, where applicable) identification of the futex * without keeping the page pinned for the duration of the FUTEX_WAIT. * * lock_page() might sleep, the caller should not hold a spinlock. */ int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key, enum futex_access rw) { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; struct page *page; struct folio *folio; struct address_space *mapping; int node, err, size, ro = 0; bool node_updated = false; bool fshared; fshared = flags & FLAGS_SHARED; size = futex_size(flags); if (flags & FLAGS_NUMA) size *= 2; /* * The futex address must be "naturally" aligned. */ key->both.offset = address % PAGE_SIZE; if (unlikely((address % size) != 0)) return -EINVAL; address -= key->both.offset; if (unlikely(!access_ok(uaddr, size))) return -EFAULT; if (unlikely(should_fail_futex(fshared))) return -EFAULT; node = FUTEX_NO_NODE; if (flags & FLAGS_NUMA) { u32 __user *naddr = (void *)uaddr + size / 2; if (get_user_inline(node, naddr)) return -EFAULT; if ((node != FUTEX_NO_NODE) && ((unsigned int)node >= MAX_NUMNODES || !node_possible(node))) return -EINVAL; } if (node == FUTEX_NO_NODE && (flags & FLAGS_MPOL)) { node = futex_mpol(mm, address); node_updated = true; } if (flags & FLAGS_NUMA) { u32 __user *naddr = (void *)uaddr + size / 2; if (node == FUTEX_NO_NODE) { node = numa_node_id(); node_updated = true; } if (node_updated && put_user_inline(node, naddr)) return -EFAULT; } key->both.node = node; /* * PROCESS_PRIVATE futexes are fast. * As the mm cannot disappear under us and the 'key' only needs * virtual address, we dont even have to find the underlying vma. * Note : We do have to check 'uaddr' is a valid user address, * but access_ok() should be faster than find_vma() */ if (!fshared) { /* * On no-MMU, shared futexes are treated as private, therefore * we must not include the current process in the key. Since * there is only one address space, the address is a unique key * on its own. */ if (IS_ENABLED(CONFIG_MMU)) key->private.mm = mm; else key->private.mm = NULL; key->private.address = address; return 0; } again: /* Ignore any VERIFY_READ mapping (futex common case) */ if (unlikely(should_fail_futex(true))) return -EFAULT; err = get_user_pages_fast(address, 1, FOLL_WRITE, &page); /* * If write access is not required (eg. FUTEX_WAIT), try * and get read-only access. */ if (err == -EFAULT && rw == FUTEX_READ) { err = get_user_pages_fast(address, 1, 0, &page); ro = 1; } if (err < 0) return err; else err = 0; /* * The treatment of mapping from this point on is critical. The folio * lock protects many things but in this context the folio lock * stabilizes mapping, prevents inode freeing in the shared * file-backed region case and guards against movement to swap cache. * * Strictly speaking the folio lock is not needed in all cases being * considered here and folio lock forces unnecessarily serialization. * From this point on, mapping will be re-verified if necessary and * folio lock will be acquired only if it is unavoidable * * Mapping checks require the folio so it is looked up now. For * anonymous pages, it does not matter if the folio is split * in the future as the key is based on the address. For * filesystem-backed pages, the precise page is required as the * index of the page determines the key. */ folio = page_folio(page); mapping = READ_ONCE(folio->mapping); /* * If folio->mapping is NULL, then it cannot be an anonymous * page; but it might be the ZERO_PAGE or in the gate area or * in a special mapping (all cases which we are happy to fail); * or it may have been a good file page when get_user_pages_fast * found it, but truncated or holepunched or subjected to * invalidate_complete_page2 before we got the folio lock (also * cases which we are happy to fail). And we hold a reference, * so refcount care in invalidate_inode_page's remove_mapping * prevents drop_caches from setting mapping to NULL beneath us. * * The case we do have to guard against is when memory pressure made * shmem_writepage move it from filecache to swapcache beneath us: * an unlikely race, but we do need to retry for folio->mapping. */ if (unlikely(!mapping)) { int shmem_swizzled; /* * Folio lock is required to identify which special case above * applies. If this is really a shmem page then the folio lock * will prevent unexpected transitions. */ folio_lock(folio); shmem_swizzled = folio_test_swapcache(folio) || folio->mapping; folio_unlock(folio); folio_put(folio); if (shmem_swizzled) goto again; return -EFAULT; } /* * Private mappings are handled in a simple way. * * If the futex key is stored in anonymous memory, then the associated * object is the mm which is implicitly pinned by the calling process. * * NOTE: When userspace waits on a MAP_SHARED mapping, even if * it's a read-only handle, it's expected that futexes attach to * the object not the particular process. */ if (folio_test_anon(folio)) { /* * A RO anonymous page will never change and thus doesn't make * sense for futex operations. */ if (unlikely(should_fail_futex(true)) || ro) { err = -EFAULT; goto out; } key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ key->private.mm = mm; key->private.address = address; } else { struct inode *inode; /* * The associated futex object in this case is the inode and * the folio->mapping must be traversed. Ordinarily this should * be stabilised under folio lock but it's not strictly * necessary in this case as we just want to pin the inode, not * update i_pages or anything like that. * * The RCU read lock is taken as the inode is finally freed * under RCU. If the mapping still matches expectations then the * mapping->host can be safely accessed as being a valid inode. */ rcu_read_lock(); if (READ_ONCE(folio->mapping) != mapping) { rcu_read_unlock(); folio_put(folio); goto again; } inode = READ_ONCE(mapping->host); if (!inode) { rcu_read_unlock(); folio_put(folio); goto again; } key->both.offset |= FUT_OFF_INODE; /* inode-based key */ key->shared.i_seq = get_inode_sequence_number(inode); key->shared.pgoff = page_pgoff(folio, page); rcu_read_unlock(); } out: folio_put(folio); return err; } /** * fault_in_user_writeable() - Fault in user address and verify RW access * @uaddr: pointer to faulting user space address * * Slow path to fixup the fault we just took in the atomic write * access to @uaddr. * * We have no generic implementation of a non-destructive write to the * user address. We know that we faulted in the atomic pagefault * disabled section so we can as well avoid the #PF overhead by * calling get_user_pages() right away. */ int fault_in_user_writeable(u32 __user *uaddr) { struct mm_struct *mm = current->mm; int ret; mmap_read_lock(mm); ret = fixup_user_fault(mm, (unsigned long)uaddr, FAULT_FLAG_WRITE, NULL); mmap_read_unlock(mm); return ret < 0 ? ret : 0; } /** * futex_top_waiter() - Return the highest priority waiter on a futex * @hb: the hash bucket the futex_q's reside in * @key: the futex key (to distinguish it from other futex futex_q's) * * Must be called with the hb lock held. */ struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key *key) { struct futex_q *this; plist_for_each_entry(this, &hb->chain, list) { if (futex_match(&this->key, key)) return this; } return NULL; } /** * wait_for_owner_exiting - Block until the owner has exited * @ret: owner's current futex lock status * @exiting: Pointer to the exiting task * * Caller must hold a refcount on @exiting. */ void wait_for_owner_exiting(int ret, struct task_struct *exiting) { if (ret != -EBUSY) { WARN_ON_ONCE(exiting); return; } if (WARN_ON_ONCE(ret == -EBUSY && !exiting)) return; mutex_lock(&exiting->futex_exit_mutex); /* * No point in doing state checking here. If the waiter got here * while the task was in exec()->exec_futex_release() then it can * have any FUTEX_STATE_* value when the waiter has acquired the * mutex. OK, if running, EXITING or DEAD if it reached exit() * already. Highly unlikely and not a problem. Just one more round * through the futex maze. */ mutex_unlock(&exiting->futex_exit_mutex); put_task_struct(exiting); } /** * __futex_unqueue() - Remove the futex_q from its futex_hash_bucket * @q: The futex_q to unqueue * * The q->lock_ptr must not be NULL and must be held by the caller. */ void __futex_unqueue(struct futex_q *q) { struct futex_hash_bucket *hb; if (WARN_ON_SMP(!q->lock_ptr) || WARN_ON(plist_node_empty(&q->list))) return; lockdep_assert_held(q->lock_ptr); hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock); plist_del(&q->list, &hb->chain); futex_hb_waiters_dec(hb); } /* The key must be already stored in q->key. */ void futex_q_lock(struct futex_q *q, struct futex_hash_bucket *hb) { /* * Increment the counter before taking the lock so that * a potential waker won't miss a to-be-slept task that is * waiting for the spinlock. This is safe as all futex_q_lock() * users end up calling futex_queue(). Similarly, for housekeeping, * decrement the counter at futex_q_unlock() when some error has * occurred and we don't end up adding the task to the list. */ futex_hb_waiters_inc(hb); /* implies smp_mb(); (A) */ q->lock_ptr = &hb->lock; spin_lock(&hb->lock); __acquire(q->lock_ptr); } void futex_q_unlock(struct futex_hash_bucket *hb) { futex_hb_waiters_dec(hb); spin_unlock(&hb->lock); } void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb, struct task_struct *task) { int prio; /* * The priority used to register this element is * - either the real thread-priority for the real-time threads * (i.e. threads with a priority lower than MAX_RT_PRIO) * - or MAX_RT_PRIO for non-RT threads. * Thus, all RT-threads are woken first in priority order, and * the others are woken last, in FIFO order. */ prio = min(current->normal_prio, MAX_RT_PRIO); plist_node_init(&q->list, prio); plist_add(&q->list, &hb->chain); q->task = task; } /** * futex_unqueue() - Remove the futex_q from its futex_hash_bucket * @q: The futex_q to unqueue * * The q->lock_ptr must not be held by the caller. A call to futex_unqueue() must * be paired with exactly one earlier call to futex_queue(). * * Return: * - 1 - if the futex_q was still queued (and we removed unqueued it); * - 0 - if the futex_q was already removed by the waking thread */ int futex_unqueue(struct futex_q *q) { spinlock_t *lock_ptr; int ret = 0; /* RCU so lock_ptr is not going away during locking. */ guard(rcu)(); /* In the common case we don't take the spinlock, which is nice. */ retry: /* * q->lock_ptr can change between this read and the following spin_lock. * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and * optimizing lock_ptr out of the logic below. */ lock_ptr = READ_ONCE(q->lock_ptr); if (lock_ptr != NULL) { spin_lock(lock_ptr); /* * q->lock_ptr can change between reading it and * spin_lock(), causing us to take the wrong lock. This * corrects the race condition. * * Reasoning goes like this: if we have the wrong lock, * q->lock_ptr must have changed (maybe several times) * between reading it and the spin_lock(). It can * change again after the spin_lock() but only if it was * already changed before the spin_lock(). It cannot, * however, change back to the original value. Therefore * we can detect whether we acquired the correct lock. */ if (unlikely(lock_ptr != q->lock_ptr)) { spin_unlock(lock_ptr); goto retry; } __futex_unqueue(q); BUG_ON(q->pi_state); spin_unlock(lock_ptr); ret = 1; } return ret; } void futex_q_lockptr_lock(struct futex_q *q) { spinlock_t *lock_ptr; /* * See futex_unqueue() why lock_ptr can change. */ guard(rcu)(); retry: lock_ptr = READ_ONCE(q->lock_ptr); spin_lock(lock_ptr); if (unlikely(lock_ptr != q->lock_ptr)) { spin_unlock(lock_ptr); goto retry; } } /* * PI futexes can not be requeued and must remove themselves from the hash * bucket. The hash bucket lock (i.e. lock_ptr) is held. */ void futex_unqueue_pi(struct futex_q *q) { /* * If the lock was not acquired (due to timeout or signal) then the * rt_waiter is removed before futex_q is. If this is observed by * an unlocker after dropping the rtmutex wait lock and before * acquiring the hash bucket lock, then the unlocker dequeues the * futex_q from the hash bucket list to guarantee consistent state * vs. userspace. Therefore the dequeue here must be conditional. */ if (!plist_node_empty(&q->list)) __futex_unqueue(q); BUG_ON(!q->pi_state); put_pi_state(q->pi_state); q->pi_state = NULL; } /* Constants for the pending_op argument of handle_futex_death */ #define HANDLE_DEATH_PENDING true #define HANDLE_DEATH_LIST false /* * Process a futex-list entry, check whether it's owned by the * dying task, and do notification if so: */ static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, bool pi, bool pending_op) { u32 uval, nval, mval; pid_t owner; int err; /* Futex address must be 32bit aligned */ if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0) return -1; retry: if (get_user(uval, uaddr)) return -1; /* * Special case for regular (non PI) futexes. The unlock path in * user space has two race scenarios: * * 1. The unlock path releases the user space futex value and * before it can execute the futex() syscall to wake up * waiters it is killed. * * 2. A woken up waiter is killed before it can acquire the * futex in user space. * * In the second case, the wake up notification could be generated * by the unlock path in user space after setting the futex value * to zero or by the kernel after setting the OWNER_DIED bit below. * * In both cases the TID validation below prevents a wakeup of * potential waiters which can cause these waiters to block * forever. * * In both cases the following conditions are met: * * 1) task->robust_list->list_op_pending != NULL * @pending_op == true * 2) The owner part of user space futex value == 0 * 3) Regular futex: @pi == false * * If these conditions are met, it is safe to attempt waking up a * potential waiter without touching the user space futex value and * trying to set the OWNER_DIED bit. If the futex value is zero, * the rest of the user space mutex state is consistent, so a woken * waiter will just take over the uncontended futex. Setting the * OWNER_DIED bit would create inconsistent state and malfunction * of the user space owner died handling. Otherwise, the OWNER_DIED * bit is already set, and the woken waiter is expected to deal with * this. */ owner = uval & FUTEX_TID_MASK; if (pending_op && !pi && !owner) { futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1, FUTEX_BITSET_MATCH_ANY); return 0; } if (owner != task_pid_vnr(curr)) return 0; /* * Ok, this dying thread is truly holding a futex * of interest. Set the OWNER_DIED bit atomically * via cmpxchg, and if the value had FUTEX_WAITERS * set, wake up a waiter (if any). (We have to do a * futex_wake() even if OWNER_DIED is already set - * to handle the rare but possible case of recursive * thread-death.) The rest of the cleanup is done in * userspace. */ mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; /* * We are not holding a lock here, but we want to have * the pagefault_disable/enable() protection because * we want to handle the fault gracefully. If the * access fails we try to fault in the futex with R/W * verification via get_user_pages. get_user() above * does not guarantee R/W access. If that fails we * give up and leave the futex locked. */ if ((err = futex_cmpxchg_value_locked(&nval, uaddr, uval, mval))) { switch (err) { case -EFAULT: if (fault_in_user_writeable(uaddr)) return -1; goto retry; case -EAGAIN: cond_resched(); goto retry; default: WARN_ON_ONCE(1); return err; } } if (nval != uval) goto retry; /* * Wake robust non-PI futexes here. The wakeup of * PI futexes happens in exit_pi_state(): */ if (!pi && (uval & FUTEX_WAITERS)) { futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1, FUTEX_BITSET_MATCH_ANY); } return 0; } /* * Fetch a robust-list pointer. Bit 0 signals PI futexes: */ static inline int fetch_robust_entry(struct robust_list __user **entry, struct robust_list __user * __user *head, unsigned int *pi) { unsigned long uentry; if (get_user(uentry, (unsigned long __user *)head)) return -EFAULT; *entry = (void __user *)(uentry & ~1UL); *pi = uentry & 1; return 0; } /* * Walk curr->robust_list (very carefully, it's a userspace list!) * and mark any locks found there dead, and notify any waiters. * * We silently return on any sign of list-walking problem. */ static void exit_robust_list(struct task_struct *curr) { struct robust_list_head __user *head = curr->robust_list; struct robust_list __user *entry, *next_entry, *pending; unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; unsigned int next_pi; unsigned long futex_offset; int rc; /* * Fetch the list head (which was registered earlier, via * sys_set_robust_list()): */ if (fetch_robust_entry(&entry, &head->list.next, &pi)) return; /* * Fetch the relative futex offset: */ if (get_user(futex_offset, &head->futex_offset)) return; /* * Fetch any possibly pending lock-add first, and handle it * if it exists: */ if (fetch_robust_entry(&pending, &head->list_op_pending, &pip)) return; next_entry = NULL; /* avoid warning with gcc */ while (entry != &head->list) { /* * Fetch the next entry in the list before calling * handle_futex_death: */ rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi); /* * A pending lock might already be on the list, so * don't process it twice: */ if (entry != pending) { if (handle_futex_death((void __user *)entry + futex_offset, curr, pi, HANDLE_DEATH_LIST)) return; } if (rc) return; entry = next_entry; pi = next_pi; /* * Avoid excessively long or circular lists: */ if (!--limit) break; cond_resched(); } if (pending) { handle_futex_death((void __user *)pending + futex_offset, curr, pip, HANDLE_DEATH_PENDING); } } #ifdef CONFIG_COMPAT static void __user *futex_uaddr(struct robust_list __user *entry, compat_long_t futex_offset) { compat_uptr_t base = ptr_to_compat(entry); void __user *uaddr = compat_ptr(base + futex_offset); return uaddr; } /* * Fetch a robust-list pointer. Bit 0 signals PI futexes: */ static inline int compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, compat_uptr_t __user *head, unsigned int *pi) { if (get_user(*uentry, head)) return -EFAULT; *entry = compat_ptr((*uentry) & ~1); *pi = (unsigned int)(*uentry) & 1; return 0; } /* * Walk curr->robust_list (very carefully, it's a userspace list!) * and mark any locks found there dead, and notify any waiters. * * We silently return on any sign of list-walking problem. */ static void compat_exit_robust_list(struct task_struct *curr) { struct compat_robust_list_head __user *head = curr->compat_robust_list; struct robust_list __user *entry, *next_entry, *pending; unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; unsigned int next_pi; compat_uptr_t uentry, next_uentry, upending; compat_long_t futex_offset; int rc; /* * Fetch the list head (which was registered earlier, via * sys_set_robust_list()): */ if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi)) return; /* * Fetch the relative futex offset: */ if (get_user(futex_offset, &head->futex_offset)) return; /* * Fetch any possibly pending lock-add first, and handle it * if it exists: */ if (compat_fetch_robust_entry(&upending, &pending, &head->list_op_pending, &pip)) return; next_entry = NULL; /* avoid warning with gcc */ while (entry != (struct robust_list __user *) &head->list) { /* * Fetch the next entry in the list before calling * handle_futex_death: */ rc = compat_fetch_robust_entry(&next_uentry, &next_entry, (compat_uptr_t __user *)&entry->next, &next_pi); /* * A pending lock might already be on the list, so * dont process it twice: */ if (entry != pending) { void __user *uaddr = futex_uaddr(entry, futex_offset); if (handle_futex_death(uaddr, curr, pi, HANDLE_DEATH_LIST)) return; } if (rc) return; uentry = next_uentry; entry = next_entry; pi = next_pi; /* * Avoid excessively long or circular lists: */ if (!--limit) break; cond_resched(); } if (pending) { void __user *uaddr = futex_uaddr(pending, futex_offset); handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING); } } #endif #ifdef CONFIG_FUTEX_PI /* * This task is holding PI mutexes at exit time => bad. * Kernel cleans up PI-state, but userspace is likely hosed. * (Robust-futex cleanup is separate and might save the day for userspace.) */ static void exit_pi_state_list(struct task_struct *curr) { struct list_head *next, *head = &curr->pi_state_list; struct futex_pi_state *pi_state; union futex_key key = FUTEX_KEY_INIT; /* * The mutex mm_struct::futex_hash_lock might be acquired. */ might_sleep(); /* * Ensure the hash remains stable (no resize) during the while loop * below. The hb pointer is acquired under the pi_lock so we can't block * on the mutex. */ WARN_ON(curr != current); guard(private_hash)(); /* * We are a ZOMBIE and nobody can enqueue itself on * pi_state_list anymore, but we have to be careful * versus waiters unqueueing themselves: */ raw_spin_lock_irq(&curr->pi_lock); while (!list_empty(head)) { next = head->next; pi_state = list_entry(next, struct futex_pi_state, list); key = pi_state->key; if (1) { CLASS(hb, hb)(&key); /* * We can race against put_pi_state() removing itself from the * list (a waiter going away). put_pi_state() will first * decrement the reference count and then modify the list, so * its possible to see the list entry but fail this reference * acquire. * * In that case; drop the locks to let put_pi_state() make * progress and retry the loop. */ if (!refcount_inc_not_zero(&pi_state->refcount)) { raw_spin_unlock_irq(&curr->pi_lock); cpu_relax(); raw_spin_lock_irq(&curr->pi_lock); continue; } raw_spin_unlock_irq(&curr->pi_lock); spin_lock(&hb->lock); raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); raw_spin_lock(&curr->pi_lock); /* * We dropped the pi-lock, so re-check whether this * task still owns the PI-state: */ if (head->next != next) { /* retain curr->pi_lock for the loop invariant */ raw_spin_unlock(&pi_state->pi_mutex.wait_lock); spin_unlock(&hb->lock); put_pi_state(pi_state); continue; } WARN_ON(pi_state->owner != curr); WARN_ON(list_empty(&pi_state->list)); list_del_init(&pi_state->list); pi_state->owner = NULL; raw_spin_unlock(&curr->pi_lock); raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); spin_unlock(&hb->lock); } rt_mutex_futex_unlock(&pi_state->pi_mutex); put_pi_state(pi_state); raw_spin_lock_irq(&curr->pi_lock); } raw_spin_unlock_irq(&curr->pi_lock); } #else static inline void exit_pi_state_list(struct task_struct *curr) { } #endif static void futex_cleanup(struct task_struct *tsk) { if (unlikely(tsk->robust_list)) { exit_robust_list(tsk); tsk->robust_list = NULL; } #ifdef CONFIG_COMPAT if (unlikely(tsk->compat_robust_list)) { compat_exit_robust_list(tsk); tsk->compat_robust_list = NULL; } #endif if (unlikely(!list_empty(&tsk->pi_state_list))) exit_pi_state_list(tsk); } /** * futex_exit_recursive - Set the tasks futex state to FUTEX_STATE_DEAD * @tsk: task to set the state on * * Set the futex exit state of the task lockless. The futex waiter code * observes that state when a task is exiting and loops until the task has * actually finished the futex cleanup. The worst case for this is that the * waiter runs through the wait loop until the state becomes visible. * * This is called from the recursive fault handling path in make_task_dead(). * * This is best effort. Either the futex exit code has run already or * not. If the OWNER_DIED bit has been set on the futex then the waiter can * take it over. If not, the problem is pushed back to user space. If the * futex exit code did not run yet, then an already queued waiter might * block forever, but there is nothing which can be done about that. */ void futex_exit_recursive(struct task_struct *tsk) { /* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */ if (tsk->futex_state == FUTEX_STATE_EXITING) { __assume_ctx_lock(&tsk->futex_exit_mutex); mutex_unlock(&tsk->futex_exit_mutex); } tsk->futex_state = FUTEX_STATE_DEAD; } static void futex_cleanup_begin(struct task_struct *tsk) __acquires(&tsk->futex_exit_mutex) { /* * Prevent various race issues against a concurrent incoming waiter * including live locks by forcing the waiter to block on * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in * attach_to_pi_owner(). */ mutex_lock(&tsk->futex_exit_mutex); /* * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock. * * This ensures that all subsequent checks of tsk->futex_state in * attach_to_pi_owner() must observe FUTEX_STATE_EXITING with * tsk->pi_lock held. * * It guarantees also that a pi_state which was queued right before * the state change under tsk->pi_lock by a concurrent waiter must * be observed in exit_pi_state_list(). */ raw_spin_lock_irq(&tsk->pi_lock); tsk->futex_state = FUTEX_STATE_EXITING; raw_spin_unlock_irq(&tsk->pi_lock); } static void futex_cleanup_end(struct task_struct *tsk, int state) __releases(&tsk->futex_exit_mutex) { /* * Lockless store. The only side effect is that an observer might * take another loop until it becomes visible. */ tsk->futex_state = state; /* * Drop the exit protection. This unblocks waiters which observed * FUTEX_STATE_EXITING to reevaluate the state. */ mutex_unlock(&tsk->futex_exit_mutex); } void futex_exec_release(struct task_struct *tsk) { /* * The state handling is done for consistency, but in the case of * exec() there is no way to prevent further damage as the PID stays * the same. But for the unlikely and arguably buggy case that a * futex is held on exec(), this provides at least as much state * consistency protection which is possible. */ futex_cleanup_begin(tsk); futex_cleanup(tsk); /* * Reset the state to FUTEX_STATE_OK. The task is alive and about * exec a new binary. */ futex_cleanup_end(tsk, FUTEX_STATE_OK); } void futex_exit_release(struct task_struct *tsk) { futex_cleanup_begin(tsk); futex_cleanup(tsk); futex_cleanup_end(tsk, FUTEX_STATE_DEAD); } static void futex_hash_bucket_init(struct futex_hash_bucket *fhb, struct futex_private_hash *fph) { #ifdef CONFIG_FUTEX_PRIVATE_HASH fhb->priv = fph; #endif atomic_set(&fhb->waiters, 0); plist_head_init(&fhb->chain); spin_lock_init(&fhb->lock); } #define FH_CUSTOM 0x01 #ifdef CONFIG_FUTEX_PRIVATE_HASH /* * futex-ref * * Heavily inspired by percpu-rwsem/percpu-refcount; not reusing any of that * code because it just doesn't fit right. * * Dual counter, per-cpu / atomic approach like percpu-refcount, except it * re-initializes the state automatically, such that the fph swizzle is also a * transition back to per-cpu. */ static void futex_ref_rcu(struct rcu_head *head); static void __futex_ref_atomic_begin(struct futex_private_hash *fph) { struct mm_struct *mm = fph->mm; /* * The counter we're about to switch to must have fully switched; * otherwise it would be impossible for it to have reported success * from futex_ref_is_dead(). */ WARN_ON_ONCE(atomic_long_read(&mm->futex_atomic) != 0); /* * Set the atomic to the bias value such that futex_ref_{get,put}() * will never observe 0. Will be fixed up in __futex_ref_atomic_end() * when folding in the percpu count. */ atomic_long_set(&mm->futex_atomic, LONG_MAX); smp_store_release(&fph->state, FR_ATOMIC); call_rcu_hurry(&mm->futex_rcu, futex_ref_rcu); } static void __futex_ref_atomic_end(struct futex_private_hash *fph) { struct mm_struct *mm = fph->mm; unsigned int count = 0; long ret; int cpu; /* * Per __futex_ref_atomic_begin() the state of the fph must be ATOMIC * and per this RCU callback, everybody must now observe this state and * use the atomic variable. */ WARN_ON_ONCE(fph->state != FR_ATOMIC); /* * Therefore the per-cpu counter is now stable, sum and reset. */ for_each_possible_cpu(cpu) { unsigned int *ptr = per_cpu_ptr(mm->futex_ref, cpu); count += *ptr; *ptr = 0; } /* * Re-init for the next cycle. */ this_cpu_inc(*mm->futex_ref); /* 0 -> 1 */ /* * Add actual count, subtract bias and initial refcount. * * The moment this atomic operation happens, futex_ref_is_dead() can * become true. */ ret = atomic_long_add_return(count - LONG_MAX - 1, &mm->futex_atomic); if (!ret) wake_up_var(mm); WARN_ON_ONCE(ret < 0); mmput_async(mm); } static void futex_ref_rcu(struct rcu_head *head) { struct mm_struct *mm = container_of(head, struct mm_struct, futex_rcu); struct futex_private_hash *fph = rcu_dereference_raw(mm->futex_phash); if (fph->state == FR_PERCPU) { /* * Per this extra grace-period, everybody must now observe * fph as the current fph and no previously observed fph's * are in-flight. * * Notably, nobody will now rely on the atomic * futex_ref_is_dead() state anymore so we can begin the * migration of the per-cpu counter into the atomic. */ __futex_ref_atomic_begin(fph); return; } __futex_ref_atomic_end(fph); } /* * Drop the initial refcount and transition to atomics. */ static void futex_ref_drop(struct futex_private_hash *fph) { struct mm_struct *mm = fph->mm; /* * Can only transition the current fph; */ WARN_ON_ONCE(rcu_dereference_raw(mm->futex_phash) != fph); /* * We enqueue at least one RCU callback. Ensure mm stays if the task * exits before the transition is completed. */ mmget(mm); /* * In order to avoid the following scenario: * * futex_hash() __futex_pivot_hash() * guard(rcu); guard(mm->futex_hash_lock); * fph = mm->futex_phash; * rcu_assign_pointer(&mm->futex_phash, new); * futex_hash_allocate() * futex_ref_drop() * fph->state = FR_ATOMIC; * atomic_set(, BIAS); * * futex_private_hash_get(fph); // OOPS * * Where an old fph (which is FR_ATOMIC) and should fail on * inc_not_zero, will succeed because a new transition is started and * the atomic is bias'ed away from 0. * * There must be at least one full grace-period between publishing a * new fph and trying to replace it. */ if (poll_state_synchronize_rcu(mm->futex_batches)) { /* * There was a grace-period, we can begin now. */ __futex_ref_atomic_begin(fph); return; } call_rcu_hurry(&mm->futex_rcu, futex_ref_rcu); } static bool futex_ref_get(struct futex_private_hash *fph) { struct mm_struct *mm = fph->mm; guard(preempt)(); if (READ_ONCE(fph->state) == FR_PERCPU) { __this_cpu_inc(*mm->futex_ref); return true; } return atomic_long_inc_not_zero(&mm->futex_atomic); } static bool futex_ref_put(struct futex_private_hash *fph) { struct mm_struct *mm = fph->mm; guard(preempt)(); if (READ_ONCE(fph->state) == FR_PERCPU) { __this_cpu_dec(*mm->futex_ref); return false; } return atomic_long_dec_and_test(&mm->futex_atomic); } static bool futex_ref_is_dead(struct futex_private_hash *fph) { struct mm_struct *mm = fph->mm; guard(rcu)(); if (smp_load_acquire(&fph->state) == FR_PERCPU) return false; return atomic_long_read(&mm->futex_atomic) == 0; } int futex_mm_init(struct mm_struct *mm) { mutex_init(&mm->futex_hash_lock); RCU_INIT_POINTER(mm->futex_phash, NULL); mm->futex_phash_new = NULL; /* futex-ref */ mm->futex_ref = NULL; atomic_long_set(&mm->futex_atomic, 0); mm->futex_batches = get_state_synchronize_rcu(); return 0; } void futex_hash_free(struct mm_struct *mm) { struct futex_private_hash *fph; free_percpu(mm->futex_ref); kvfree(mm->futex_phash_new); fph = rcu_dereference_raw(mm->futex_phash); if (fph) kvfree(fph); } static bool futex_pivot_pending(struct mm_struct *mm) { struct futex_private_hash *fph; guard(rcu)(); if (!mm->futex_phash_new) return true; fph = rcu_dereference(mm->futex_phash); return futex_ref_is_dead(fph); } static bool futex_hash_less(struct futex_private_hash *a, struct futex_private_hash *b) { /* user provided always wins */ if (!a->custom && b->custom) return true; if (a->custom && !b->custom) return false; /* zero-sized hash wins */ if (!b->hash_mask) return true; if (!a->hash_mask) return false; /* keep the biggest */ if (a->hash_mask < b->hash_mask) return true; if (a->hash_mask > b->hash_mask) return false; return false; /* equal */ } static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags) { struct mm_struct *mm = current->mm; struct futex_private_hash *fph; bool custom = flags & FH_CUSTOM; int i; if (hash_slots && (hash_slots == 1 || !is_power_of_2(hash_slots))) return -EINVAL; /* * Once we've disabled the global hash there is no way back. */ scoped_guard(rcu) { fph = rcu_dereference(mm->futex_phash); if (fph && !fph->hash_mask) { if (custom) return -EBUSY; return 0; } } if (!mm->futex_ref) { /* * This will always be allocated by the first thread and * therefore requires no locking. */ mm->futex_ref = alloc_percpu(unsigned int); if (!mm->futex_ref) return -ENOMEM; this_cpu_inc(*mm->futex_ref); /* 0 -> 1 */ } fph = kvzalloc(struct_size(fph, queues, hash_slots), GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (!fph) return -ENOMEM; fph->hash_mask = hash_slots ? hash_slots - 1 : 0; fph->custom = custom; fph->mm = mm; for (i = 0; i < hash_slots; i++) futex_hash_bucket_init(&fph->queues[i], fph); if (custom) { /* * Only let prctl() wait / retry; don't unduly delay clone(). */ again: wait_var_event(mm, futex_pivot_pending(mm)); } scoped_guard(mutex, &mm->futex_hash_lock) { struct futex_private_hash *free __free(kvfree) = NULL; struct futex_private_hash *cur, *new; cur = rcu_dereference_protected(mm->futex_phash, lockdep_is_held(&mm->futex_hash_lock)); new = mm->futex_phash_new; mm->futex_phash_new = NULL; if (fph) { if (cur && !cur->hash_mask) { /* * If two threads simultaneously request the global * hash then the first one performs the switch, * the second one returns here. */ free = fph; mm->futex_phash_new = new; return -EBUSY; } if (cur && !new) { /* * If we have an existing hash, but do not yet have * allocated a replacement hash, drop the initial * reference on the existing hash. */ futex_ref_drop(cur); } if (new) { /* * Two updates raced; throw out the lesser one. */ if (futex_hash_less(new, fph)) { free = new; new = fph; } else { free = fph; } } else { new = fph; } fph = NULL; } if (new) { /* * Will set mm->futex_phash_new on failure; * futex_private_hash_get() will try again. */ if (!__futex_pivot_hash(mm, new) && custom) goto again; } } return 0; } int futex_hash_allocate_default(void) { unsigned int threads, buckets, current_buckets = 0; struct futex_private_hash *fph; if (!current->mm) return 0; scoped_guard(rcu) { threads = min_t(unsigned int, get_nr_threads(current), num_online_cpus()); fph = rcu_dereference(current->mm->futex_phash); if (fph) { if (fph->custom) return 0; current_buckets = fph->hash_mask + 1; } } /* * The default allocation will remain within * 16 <= threads * 4 <= global hash size */ buckets = roundup_pow_of_two(4 * threads); buckets = clamp(buckets, 16, futex_hashmask + 1); if (current_buckets >= buckets) return 0; return futex_hash_allocate(buckets, 0); } static int futex_hash_get_slots(void) { struct futex_private_hash *fph; guard(rcu)(); fph = rcu_dereference(current->mm->futex_phash); if (fph && fph->hash_mask) return fph->hash_mask + 1; return 0; } #else static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags) { return -EINVAL; } static int futex_hash_get_slots(void) { return 0; } #endif int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4) { unsigned int flags = FH_CUSTOM; int ret; switch (arg2) { case PR_FUTEX_HASH_SET_SLOTS: if (arg4) return -EINVAL; ret = futex_hash_allocate(arg3, flags); break; case PR_FUTEX_HASH_GET_SLOTS: ret = futex_hash_get_slots(); break; default: ret = -EINVAL; break; } return ret; } static int __init futex_init(void) { unsigned long hashsize, i; unsigned int order, n; unsigned long size; #ifdef CONFIG_BASE_SMALL hashsize = 16; #else hashsize = 256 * num_possible_cpus(); hashsize /= num_possible_nodes(); hashsize = max(4, hashsize); hashsize = roundup_pow_of_two(hashsize); #endif futex_hashshift = ilog2(hashsize); size = sizeof(struct futex_hash_bucket) * hashsize; order = get_order(size); for_each_node(n) { struct futex_hash_bucket *table; if (order > MAX_PAGE_ORDER) table = vmalloc_huge_node(size, GFP_KERNEL, n); else table = alloc_pages_exact_nid(n, size, GFP_KERNEL); BUG_ON(!table); for (i = 0; i < hashsize; i++) futex_hash_bucket_init(&table[i], NULL); futex_queues[n] = table; } futex_hashmask = hashsize - 1; pr_info("futex hash table entries: %lu (%lu bytes on %d NUMA nodes, total %lu KiB, %s).\n", hashsize, size, num_possible_nodes(), size * num_possible_nodes() / 1024, order > MAX_PAGE_ORDER ? "vmalloc" : "linear"); return 0; } core_initcall(futex_init);
151 151 8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 // SPDX-License-Identifier: GPL-2.0 #include <linux/cache.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/pid_namespace.h> #include "internal.h" /* * /proc/self: */ static const char *proc_self_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { struct pid_namespace *ns = proc_pid_ns(inode->i_sb); pid_t tgid = task_tgid_nr_ns(current, ns); char *name; if (!tgid) return ERR_PTR(-ENOENT); /* max length of unsigned int in decimal + NULL term */ name = kmalloc(10 + 1, dentry ? GFP_KERNEL : GFP_ATOMIC); if (unlikely(!name)) return dentry ? ERR_PTR(-ENOMEM) : ERR_PTR(-ECHILD); sprintf(name, "%u", tgid); set_delayed_call(done, kfree_link, name); return name; } static const struct inode_operations proc_self_inode_operations = { .get_link = proc_self_get_link, }; unsigned self_inum __ro_after_init; int proc_setup_self(struct super_block *s) { struct dentry *self; int ret = -ENOMEM; self = d_alloc_name(s->s_root, "self"); if (self) { struct inode *inode = new_inode(s); if (inode) { inode->i_ino = self_inum; simple_inode_init_ts(inode); inode->i_mode = S_IFLNK | S_IRWXUGO; inode->i_uid = GLOBAL_ROOT_UID; inode->i_gid = GLOBAL_ROOT_GID; inode->i_op = &proc_self_inode_operations; d_make_persistent(self, inode); ret = 0; } dput(self); } if (ret) pr_err("proc_fill_super: can't allocate /proc/self\n"); return ret; } void __init proc_self_init(void) { proc_alloc_inum(&self_inum); }
10 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 10 10 10 10 2 2 2 2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 // SPDX-License-Identifier: GPL-2.0-or-later /* * V4L2 controls framework core implementation. * * Copyright (C) 2010-2021 Hans Verkuil <hverkuil@kernel.org> */ #include <linux/export.h> #include <linux/mm.h> #include <linux/slab.h> #include <media/v4l2-ctrls.h> #include <media/v4l2-event.h> #include <media/v4l2-fwnode.h> #include "v4l2-ctrls-priv.h" static const union v4l2_ctrl_ptr ptr_null; static void fill_event(struct v4l2_event *ev, struct v4l2_ctrl *ctrl, u32 changes) { memset(ev, 0, sizeof(*ev)); ev->type = V4L2_EVENT_CTRL; ev->id = ctrl->id; ev->u.ctrl.changes = changes; ev->u.ctrl.type = ctrl->type; ev->u.ctrl.flags = user_flags(ctrl); if (ctrl->is_ptr) ev->u.ctrl.value64 = 0; else ev->u.ctrl.value64 = *ctrl->p_cur.p_s64; ev->u.ctrl.minimum = ctrl->minimum; ev->u.ctrl.maximum = ctrl->maximum; if (ctrl->type == V4L2_CTRL_TYPE_MENU || ctrl->type == V4L2_CTRL_TYPE_INTEGER_MENU) ev->u.ctrl.step = 1; else ev->u.ctrl.step = ctrl->step; ev->u.ctrl.default_value = ctrl->default_value; } void send_initial_event(struct v4l2_fh *fh, struct v4l2_ctrl *ctrl) { struct v4l2_event ev; u32 changes = V4L2_EVENT_CTRL_CH_FLAGS; if (!(ctrl->flags & V4L2_CTRL_FLAG_WRITE_ONLY)) changes |= V4L2_EVENT_CTRL_CH_VALUE; fill_event(&ev, ctrl, changes); v4l2_event_queue_fh(fh, &ev); } void send_event(struct v4l2_fh *fh, struct v4l2_ctrl *ctrl, u32 changes) { struct v4l2_event ev; struct v4l2_subscribed_event *sev; if (list_empty(&ctrl->ev_subs)) return; fill_event(&ev, ctrl, changes); list_for_each_entry(sev, &ctrl->ev_subs, node) if (sev->fh != fh || (sev->flags & V4L2_EVENT_SUB_FL_ALLOW_FEEDBACK)) v4l2_event_queue_fh(sev->fh, &ev); } bool v4l2_ctrl_type_op_equal(const struct v4l2_ctrl *ctrl, union v4l2_ctrl_ptr ptr1, union v4l2_ctrl_ptr ptr2) { unsigned int i; switch (ctrl->type) { case V4L2_CTRL_TYPE_BUTTON: return false; case V4L2_CTRL_TYPE_STRING: for (i = 0; i < ctrl->elems; i++) { unsigned int idx = i * ctrl->elem_size; /* strings are always 0-terminated */ if (strcmp(ptr1.p_char + idx, ptr2.p_char + idx)) return false; } return true; default: return !memcmp(ptr1.p_const, ptr2.p_const, ctrl->elems * ctrl->elem_size); } } EXPORT_SYMBOL(v4l2_ctrl_type_op_equal); /* Default intra MPEG-2 quantisation coefficients, from the specification. */ static const u8 mpeg2_intra_quant_matrix[64] = { 8, 16, 16, 19, 16, 19, 22, 22, 22, 22, 22, 22, 26, 24, 26, 27, 27, 27, 26, 26, 26, 26, 27, 27, 27, 29, 29, 29, 34, 34, 34, 29, 29, 29, 27, 27, 29, 29, 32, 32, 34, 34, 37, 38, 37, 35, 35, 34, 35, 38, 38, 40, 40, 40, 48, 48, 46, 46, 56, 56, 58, 69, 69, 83 }; static void std_init_compound(const struct v4l2_ctrl *ctrl, u32 idx, union v4l2_ctrl_ptr ptr) { struct v4l2_ctrl_mpeg2_sequence *p_mpeg2_sequence; struct v4l2_ctrl_mpeg2_picture *p_mpeg2_picture; struct v4l2_ctrl_mpeg2_quantisation *p_mpeg2_quant; struct v4l2_ctrl_vp8_frame *p_vp8_frame; struct v4l2_ctrl_vp9_frame *p_vp9_frame; struct v4l2_ctrl_fwht_params *p_fwht_params; struct v4l2_ctrl_h264_scaling_matrix *p_h264_scaling_matrix; struct v4l2_ctrl_av1_sequence *p_av1_sequence; void *p = ptr.p + idx * ctrl->elem_size; if (ctrl->p_def.p_const) memcpy(p, ctrl->p_def.p_const, ctrl->elem_size); else memset(p, 0, ctrl->elem_size); switch ((u32)ctrl->type) { case V4L2_CTRL_TYPE_MPEG2_SEQUENCE: p_mpeg2_sequence = p; /* 4:2:0 */ p_mpeg2_sequence->chroma_format = 1; break; case V4L2_CTRL_TYPE_MPEG2_PICTURE: p_mpeg2_picture = p; /* interlaced top field */ p_mpeg2_picture->picture_structure = V4L2_MPEG2_PIC_TOP_FIELD; p_mpeg2_picture->picture_coding_type = V4L2_MPEG2_PIC_CODING_TYPE_I; break; case V4L2_CTRL_TYPE_MPEG2_QUANTISATION: p_mpeg2_quant = p; memcpy(p_mpeg2_quant->intra_quantiser_matrix, mpeg2_intra_quant_matrix, ARRAY_SIZE(mpeg2_intra_quant_matrix)); /* * The default non-intra MPEG-2 quantisation * coefficients are all 16, as per the specification. */ memset(p_mpeg2_quant->non_intra_quantiser_matrix, 16, sizeof(p_mpeg2_quant->non_intra_quantiser_matrix)); break; case V4L2_CTRL_TYPE_VP8_FRAME: p_vp8_frame = p; p_vp8_frame->num_dct_parts = 1; break; case V4L2_CTRL_TYPE_VP9_FRAME: p_vp9_frame = p; p_vp9_frame->profile = 0; p_vp9_frame->bit_depth = 8; p_vp9_frame->flags |= V4L2_VP9_FRAME_FLAG_X_SUBSAMPLING | V4L2_VP9_FRAME_FLAG_Y_SUBSAMPLING; break; case V4L2_CTRL_TYPE_AV1_SEQUENCE: p_av1_sequence = p; /* * The initial profile is 0 which only allows YUV 420 subsampled * data. Set the subsampling flags accordingly. */ p_av1_sequence->bit_depth = 8; p_av1_sequence->flags |= V4L2_AV1_SEQUENCE_FLAG_SUBSAMPLING_X | V4L2_AV1_SEQUENCE_FLAG_SUBSAMPLING_Y; break; case V4L2_CTRL_TYPE_FWHT_PARAMS: p_fwht_params = p; p_fwht_params->version = V4L2_FWHT_VERSION; p_fwht_params->width = 1280; p_fwht_params->height = 720; p_fwht_params->flags = V4L2_FWHT_FL_PIXENC_YUV | (2 << V4L2_FWHT_FL_COMPONENTS_NUM_OFFSET); break; case V4L2_CTRL_TYPE_H264_SCALING_MATRIX: p_h264_scaling_matrix = p; /* * The default (flat) H.264 scaling matrix when none are * specified in the bitstream, this is according to formulas * (7-8) and (7-9) of the specification. */ memset(p_h264_scaling_matrix, 16, sizeof(*p_h264_scaling_matrix)); break; } } static void std_min_compound(const struct v4l2_ctrl *ctrl, u32 idx, union v4l2_ctrl_ptr ptr) { void *p = ptr.p + idx * ctrl->elem_size; if (ctrl->p_min.p_const) memcpy(p, ctrl->p_min.p_const, ctrl->elem_size); else memset(p, 0, ctrl->elem_size); } static void std_max_compound(const struct v4l2_ctrl *ctrl, u32 idx, union v4l2_ctrl_ptr ptr) { void *p = ptr.p + idx * ctrl->elem_size; if (ctrl->p_max.p_const) memcpy(p, ctrl->p_max.p_const, ctrl->elem_size); else memset(p, 0, ctrl->elem_size); } static void __v4l2_ctrl_type_op_init(const struct v4l2_ctrl *ctrl, u32 from_idx, u32 which, union v4l2_ctrl_ptr ptr) { unsigned int i; u32 tot_elems = ctrl->elems; u32 elems = tot_elems - from_idx; s64 value; switch (which) { case V4L2_CTRL_WHICH_DEF_VAL: value = ctrl->default_value; break; case V4L2_CTRL_WHICH_MAX_VAL: value = ctrl->maximum; break; case V4L2_CTRL_WHICH_MIN_VAL: value = ctrl->minimum; break; default: return; } switch (ctrl->type) { case V4L2_CTRL_TYPE_STRING: if (which == V4L2_CTRL_WHICH_DEF_VAL) value = ctrl->minimum; for (i = from_idx; i < tot_elems; i++) { unsigned int offset = i * ctrl->elem_size; memset(ptr.p_char + offset, ' ', value); ptr.p_char[offset + value] = '\0'; } break; case V4L2_CTRL_TYPE_INTEGER64: if (value) { for (i = from_idx; i < tot_elems; i++) ptr.p_s64[i] = value; } else { memset(ptr.p_s64 + from_idx, 0, elems * sizeof(s64)); } break; case V4L2_CTRL_TYPE_INTEGER: case V4L2_CTRL_TYPE_INTEGER_MENU: case V4L2_CTRL_TYPE_MENU: case V4L2_CTRL_TYPE_BITMASK: case V4L2_CTRL_TYPE_BOOLEAN: if (value) { for (i = from_idx; i < tot_elems; i++) ptr.p_s32[i] = value; } else { memset(ptr.p_s32 + from_idx, 0, elems * sizeof(s32)); } break; case V4L2_CTRL_TYPE_BUTTON: case V4L2_CTRL_TYPE_CTRL_CLASS: memset(ptr.p_s32 + from_idx, 0, elems * sizeof(s32)); break; case V4L2_CTRL_TYPE_U8: memset(ptr.p_u8 + from_idx, value, elems); break; case V4L2_CTRL_TYPE_U16: if (value) { for (i = from_idx; i < tot_elems; i++) ptr.p_u16[i] = value; } else { memset(ptr.p_u16 + from_idx, 0, elems * sizeof(u16)); } break; case V4L2_CTRL_TYPE_U32: if (value) { for (i = from_idx; i < tot_elems; i++) ptr.p_u32[i] = value; } else { memset(ptr.p_u32 + from_idx, 0, elems * sizeof(u32)); } break; default: for (i = from_idx; i < tot_elems; i++) { switch (which) { case V4L2_CTRL_WHICH_DEF_VAL: std_init_compound(ctrl, i, ptr); break; case V4L2_CTRL_WHICH_MAX_VAL: std_max_compound(ctrl, i, ptr); break; case V4L2_CTRL_WHICH_MIN_VAL: std_min_compound(ctrl, i, ptr); break; } } break; } } void v4l2_ctrl_type_op_init(const struct v4l2_ctrl *ctrl, u32 from_idx, union v4l2_ctrl_ptr ptr) { __v4l2_ctrl_type_op_init(ctrl, from_idx, V4L2_CTRL_WHICH_DEF_VAL, ptr); } EXPORT_SYMBOL(v4l2_ctrl_type_op_init); static void v4l2_ctrl_type_op_minimum(const struct v4l2_ctrl *ctrl, u32 from_idx, union v4l2_ctrl_ptr ptr) { __v4l2_ctrl_type_op_init(ctrl, from_idx, V4L2_CTRL_WHICH_MIN_VAL, ptr); } static void v4l2_ctrl_type_op_maximum(const struct v4l2_ctrl *ctrl, u32 from_idx, union v4l2_ctrl_ptr ptr) { __v4l2_ctrl_type_op_init(ctrl, from_idx, V4L2_CTRL_WHICH_MAX_VAL, ptr); } void v4l2_ctrl_type_op_log(const struct v4l2_ctrl *ctrl) { union v4l2_ctrl_ptr ptr = ctrl->p_cur; if (ctrl->is_array) { unsigned i; for (i = 0; i < ctrl->nr_of_dims; i++) pr_cont("[%u]", ctrl->dims[i]); pr_cont(" "); } switch (ctrl->type) { case V4L2_CTRL_TYPE_INTEGER: pr_cont("%d", *ptr.p_s32); break; case V4L2_CTRL_TYPE_BOOLEAN: pr_cont("%s", *ptr.p_s32 ? "true" : "false"); break; case V4L2_CTRL_TYPE_MENU: pr_cont("%s", ctrl->qmenu[*ptr.p_s32]); break; case V4L2_CTRL_TYPE_INTEGER_MENU: pr_cont("%lld", ctrl->qmenu_int[*ptr.p_s32]); break; case V4L2_CTRL_TYPE_BITMASK: pr_cont("0x%08x", *ptr.p_s32); break; case V4L2_CTRL_TYPE_INTEGER64: pr_cont("%lld", *ptr.p_s64); break; case V4L2_CTRL_TYPE_STRING: pr_cont("%s", ptr.p_char); break; case V4L2_CTRL_TYPE_U8: pr_cont("%u", (unsigned)*ptr.p_u8); break; case V4L2_CTRL_TYPE_U16: pr_cont("%u", (unsigned)*ptr.p_u16); break; case V4L2_CTRL_TYPE_U32: pr_cont("%u", (unsigned)*ptr.p_u32); break; case V4L2_CTRL_TYPE_AREA: pr_cont("%ux%u", ptr.p_area->width, ptr.p_area->height); break; case V4L2_CTRL_TYPE_H264_SPS: pr_cont("H264_SPS"); break; case V4L2_CTRL_TYPE_H264_PPS: pr_cont("H264_PPS"); break; case V4L2_CTRL_TYPE_H264_SCALING_MATRIX: pr_cont("H264_SCALING_MATRIX"); break; case V4L2_CTRL_TYPE_H264_SLICE_PARAMS: pr_cont("H264_SLICE_PARAMS"); break; case V4L2_CTRL_TYPE_H264_DECODE_PARAMS: pr_cont("H264_DECODE_PARAMS"); break; case V4L2_CTRL_TYPE_H264_PRED_WEIGHTS: pr_cont("H264_PRED_WEIGHTS"); break; case V4L2_CTRL_TYPE_FWHT_PARAMS: pr_cont("FWHT_PARAMS"); break; case V4L2_CTRL_TYPE_VP8_FRAME: pr_cont("VP8_FRAME"); break; case V4L2_CTRL_TYPE_HDR10_CLL_INFO: pr_cont("HDR10_CLL_INFO"); break; case V4L2_CTRL_TYPE_HDR10_MASTERING_DISPLAY: pr_cont("HDR10_MASTERING_DISPLAY"); break; case V4L2_CTRL_TYPE_MPEG2_QUANTISATION: pr_cont("MPEG2_QUANTISATION"); break; case V4L2_CTRL_TYPE_MPEG2_SEQUENCE: pr_cont("MPEG2_SEQUENCE"); break; case V4L2_CTRL_TYPE_MPEG2_PICTURE: pr_cont("MPEG2_PICTURE"); break; case V4L2_CTRL_TYPE_VP9_COMPRESSED_HDR: pr_cont("VP9_COMPRESSED_HDR"); break; case V4L2_CTRL_TYPE_VP9_FRAME: pr_cont("VP9_FRAME"); break; case V4L2_CTRL_TYPE_HEVC_SPS: pr_cont("HEVC_SPS"); break; case V4L2_CTRL_TYPE_HEVC_PPS: pr_cont("HEVC_PPS"); break; case V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS: pr_cont("HEVC_SLICE_PARAMS"); break; case V4L2_CTRL_TYPE_HEVC_EXT_SPS_ST_RPS: pr_cont("HEVC_EXT_SPS_ST_RPS"); break; case V4L2_CTRL_TYPE_HEVC_EXT_SPS_LT_RPS: pr_cont("HEVC_EXT_SPS_LT_RPS"); break; case V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX: pr_cont("HEVC_SCALING_MATRIX"); break; case V4L2_CTRL_TYPE_HEVC_DECODE_PARAMS: pr_cont("HEVC_DECODE_PARAMS"); break; case V4L2_CTRL_TYPE_AV1_SEQUENCE: pr_cont("AV1_SEQUENCE"); break; case V4L2_CTRL_TYPE_AV1_TILE_GROUP_ENTRY: pr_cont("AV1_TILE_GROUP_ENTRY"); break; case V4L2_CTRL_TYPE_AV1_FRAME: pr_cont("AV1_FRAME"); break; case V4L2_CTRL_TYPE_AV1_FILM_GRAIN: pr_cont("AV1_FILM_GRAIN"); break; case V4L2_CTRL_TYPE_RECT: pr_cont("(%d,%d)/%ux%u", ptr.p_rect->left, ptr.p_rect->top, ptr.p_rect->width, ptr.p_rect->height); break; default: pr_cont("unknown type %d", ctrl->type); break; } } EXPORT_SYMBOL(v4l2_ctrl_type_op_log); /* * Round towards the closest legal value. Be careful when we are * close to the maximum range of the control type to prevent * wrap-arounds. */ #define ROUND_TO_RANGE(val, offset_type, ctrl) \ ({ \ offset_type offset; \ if ((ctrl)->maximum >= 0 && \ val >= (ctrl)->maximum - (s32)((ctrl)->step / 2)) \ val = (ctrl)->maximum; \ else \ val += (s32)((ctrl)->step / 2); \ val = clamp_t(typeof(val), val, \ (ctrl)->minimum, (ctrl)->maximum); \ offset = (val) - (ctrl)->minimum; \ offset = (ctrl)->step * (offset / (u32)(ctrl)->step); \ val = (ctrl)->minimum + offset; \ 0; \ }) /* Validate a new control */ #define zero_padding(s) \ memset(&(s).padding, 0, sizeof((s).padding)) #define zero_reserved(s) \ memset(&(s).reserved, 0, sizeof((s).reserved)) static int validate_vp9_lf_params(struct v4l2_vp9_loop_filter *lf) { unsigned int i; if (lf->flags & ~(V4L2_VP9_LOOP_FILTER_FLAG_DELTA_ENABLED | V4L2_VP9_LOOP_FILTER_FLAG_DELTA_UPDATE)) return -EINVAL; /* That all values are in the accepted range. */ if (lf->level > GENMASK(5, 0)) return -EINVAL; if (lf->sharpness > GENMASK(2, 0)) return -EINVAL; for (i = 0; i < ARRAY_SIZE(lf->ref_deltas); i++) if (lf->ref_deltas[i] < -63 || lf->ref_deltas[i] > 63) return -EINVAL; for (i = 0; i < ARRAY_SIZE(lf->mode_deltas); i++) if (lf->mode_deltas[i] < -63 || lf->mode_deltas[i] > 63) return -EINVAL; zero_reserved(*lf); return 0; } static int validate_vp9_quant_params(struct v4l2_vp9_quantization *quant) { if (quant->delta_q_y_dc < -15 || quant->delta_q_y_dc > 15 || quant->delta_q_uv_dc < -15 || quant->delta_q_uv_dc > 15 || quant->delta_q_uv_ac < -15 || quant->delta_q_uv_ac > 15) return -EINVAL; zero_reserved(*quant); return 0; } static int validate_vp9_seg_params(struct v4l2_vp9_segmentation *seg) { unsigned int i, j; if (seg->flags & ~(V4L2_VP9_SEGMENTATION_FLAG_ENABLED | V4L2_VP9_SEGMENTATION_FLAG_UPDATE_MAP | V4L2_VP9_SEGMENTATION_FLAG_TEMPORAL_UPDATE | V4L2_VP9_SEGMENTATION_FLAG_UPDATE_DATA | V4L2_VP9_SEGMENTATION_FLAG_ABS_OR_DELTA_UPDATE)) return -EINVAL; for (i = 0; i < ARRAY_SIZE(seg->feature_enabled); i++) { if (seg->feature_enabled[i] & ~V4L2_VP9_SEGMENT_FEATURE_ENABLED_MASK) return -EINVAL; } for (i = 0; i < ARRAY_SIZE(seg->feature_data); i++) { static const int range[] = { 255, 63, 3, 0 }; for (j = 0; j < ARRAY_SIZE(seg->feature_data[j]); j++) { if (seg->feature_data[i][j] < -range[j] || seg->feature_data[i][j] > range[j]) return -EINVAL; } } zero_reserved(*seg); return 0; } static int validate_vp9_compressed_hdr(struct v4l2_ctrl_vp9_compressed_hdr *hdr) { if (hdr->tx_mode > V4L2_VP9_TX_MODE_SELECT) return -EINVAL; return 0; } static int validate_vp9_frame(struct v4l2_ctrl_vp9_frame *frame) { int ret; /* Make sure we're not passed invalid flags. */ if (frame->flags & ~(V4L2_VP9_FRAME_FLAG_KEY_FRAME | V4L2_VP9_FRAME_FLAG_SHOW_FRAME | V4L2_VP9_FRAME_FLAG_ERROR_RESILIENT | V4L2_VP9_FRAME_FLAG_INTRA_ONLY | V4L2_VP9_FRAME_FLAG_ALLOW_HIGH_PREC_MV | V4L2_VP9_FRAME_FLAG_REFRESH_FRAME_CTX | V4L2_VP9_FRAME_FLAG_PARALLEL_DEC_MODE | V4L2_VP9_FRAME_FLAG_X_SUBSAMPLING | V4L2_VP9_FRAME_FLAG_Y_SUBSAMPLING | V4L2_VP9_FRAME_FLAG_COLOR_RANGE_FULL_SWING)) return -EINVAL; if (frame->flags & V4L2_VP9_FRAME_FLAG_ERROR_RESILIENT && frame->flags & V4L2_VP9_FRAME_FLAG_REFRESH_FRAME_CTX) return -EINVAL; if (frame->profile > V4L2_VP9_PROFILE_MAX) return -EINVAL; if (frame->reset_frame_context > V4L2_VP9_RESET_FRAME_CTX_ALL) return -EINVAL; if (frame->frame_context_idx >= V4L2_VP9_NUM_FRAME_CTX) return -EINVAL; /* * Profiles 0 and 1 only support 8-bit depth, profiles 2 and 3 only 10 * and 12 bit depths. */ if ((frame->profile < 2 && frame->bit_depth != 8) || (frame->profile >= 2 && (frame->bit_depth != 10 && frame->bit_depth != 12))) return -EINVAL; /* Profile 0 and 2 only accept YUV 4:2:0. */ if ((frame->profile == 0 || frame->profile == 2) && (!(frame->flags & V4L2_VP9_FRAME_FLAG_X_SUBSAMPLING) || !(frame->flags & V4L2_VP9_FRAME_FLAG_Y_SUBSAMPLING))) return -EINVAL; /* Profile 1 and 3 only accept YUV 4:2:2, 4:4:0 and 4:4:4. */ if ((frame->profile == 1 || frame->profile == 3) && ((frame->flags & V4L2_VP9_FRAME_FLAG_X_SUBSAMPLING) && (frame->flags & V4L2_VP9_FRAME_FLAG_Y_SUBSAMPLING))) return -EINVAL; if (frame->interpolation_filter > V4L2_VP9_INTERP_FILTER_SWITCHABLE) return -EINVAL; /* * According to the spec, tile_cols_log2 shall be less than or equal * to 6. */ if (frame->tile_cols_log2 > 6) return -EINVAL; if (frame->reference_mode > V4L2_VP9_REFERENCE_MODE_SELECT) return -EINVAL; ret = validate_vp9_lf_params(&frame->lf); if (ret) return ret; ret = validate_vp9_quant_params(&frame->quant); if (ret) return ret; ret = validate_vp9_seg_params(&frame->seg); if (ret) return ret; zero_reserved(*frame); return 0; } static int validate_av1_quantization(struct v4l2_av1_quantization *q) { if (q->flags > GENMASK(2, 0)) return -EINVAL; if (q->delta_q_y_dc < -64 || q->delta_q_y_dc > 63 || q->delta_q_u_dc < -64 || q->delta_q_u_dc > 63 || q->delta_q_v_dc < -64 || q->delta_q_v_dc > 63 || q->delta_q_u_ac < -64 || q->delta_q_u_ac > 63 || q->delta_q_v_ac < -64 || q->delta_q_v_ac > 63 || q->delta_q_res > GENMASK(1, 0)) return -EINVAL; if (q->qm_y > GENMASK(3, 0) || q->qm_u > GENMASK(3, 0) || q->qm_v > GENMASK(3, 0)) return -EINVAL; return 0; } static int validate_av1_segmentation(struct v4l2_av1_segmentation *s) { u32 i; u32 j; if (s->flags > GENMASK(4, 0)) return -EINVAL; for (i = 0; i < ARRAY_SIZE(s->feature_data); i++) { static const int segmentation_feature_signed[] = { 1, 1, 1, 1, 1, 0, 0, 0 }; static const int segmentation_feature_max[] = { 255, 63, 63, 63, 63, 7, 0, 0}; for (j = 0; j < ARRAY_SIZE(s->feature_data[j]); j++) { s32 limit = segmentation_feature_max[j]; if (segmentation_feature_signed[j]) { if (s->feature_data[i][j] < -limit || s->feature_data[i][j] > limit) return -EINVAL; } else { if (s->feature_data[i][j] < 0 || s->feature_data[i][j] > limit) return -EINVAL; } } } return 0; } static int validate_av1_loop_filter(struct v4l2_av1_loop_filter *lf) { u32 i; if (lf->flags > GENMASK(3, 0)) return -EINVAL; for (i = 0; i < ARRAY_SIZE(lf->level); i++) { if (lf->level[i] > GENMASK(5, 0)) return -EINVAL; } if (lf->sharpness > GENMASK(2, 0)) return -EINVAL; for (i = 0; i < ARRAY_SIZE(lf->ref_deltas); i++) { if (lf->ref_deltas[i] < -64 || lf->ref_deltas[i] > 63) return -EINVAL; } for (i = 0; i < ARRAY_SIZE(lf->mode_deltas); i++) { if (lf->mode_deltas[i] < -64 || lf->mode_deltas[i] > 63) return -EINVAL; } return 0; } static int validate_av1_cdef(struct v4l2_av1_cdef *cdef) { u32 i; if (cdef->damping_minus_3 > GENMASK(1, 0) || cdef->bits > GENMASK(1, 0)) return -EINVAL; for (i = 0; i < 1 << cdef->bits; i++) { if (cdef->y_pri_strength[i] > GENMASK(3, 0) || cdef->y_sec_strength[i] > 4 || cdef->uv_pri_strength[i] > GENMASK(3, 0) || cdef->uv_sec_strength[i] > 4) return -EINVAL; } return 0; } static int validate_av1_loop_restauration(struct v4l2_av1_loop_restoration *lr) { if (lr->lr_unit_shift > 3 || lr->lr_uv_shift > 1) return -EINVAL; return 0; } static int validate_av1_film_grain(struct v4l2_ctrl_av1_film_grain *fg) { u32 i; if (fg->flags > GENMASK(4, 0)) return -EINVAL; if (fg->film_grain_params_ref_idx > GENMASK(2, 0) || fg->num_y_points > 14 || fg->num_cb_points > 10 || fg->num_cr_points > GENMASK(3, 0) || fg->grain_scaling_minus_8 > GENMASK(1, 0) || fg->ar_coeff_lag > GENMASK(1, 0) || fg->ar_coeff_shift_minus_6 > GENMASK(1, 0) || fg->grain_scale_shift > GENMASK(1, 0)) return -EINVAL; if (!(fg->flags & V4L2_AV1_FILM_GRAIN_FLAG_APPLY_GRAIN)) return 0; for (i = 1; i < fg->num_y_points; i++) if (fg->point_y_value[i] <= fg->point_y_value[i - 1]) return -EINVAL; for (i = 1; i < fg->num_cb_points; i++) if (fg->point_cb_value[i] <= fg->point_cb_value[i - 1]) return -EINVAL; for (i = 1; i < fg->num_cr_points; i++) if (fg->point_cr_value[i] <= fg->point_cr_value[i - 1]) return -EINVAL; return 0; } static int validate_av1_frame(struct v4l2_ctrl_av1_frame *f) { int ret = 0; ret = validate_av1_quantization(&f->quantization); if (ret) return ret; ret = validate_av1_segmentation(&f->segmentation); if (ret) return ret; ret = validate_av1_loop_filter(&f->loop_filter); if (ret) return ret; ret = validate_av1_cdef(&f->cdef); if (ret) return ret; ret = validate_av1_loop_restauration(&f->loop_restoration); if (ret) return ret; if (f->flags & ~(V4L2_AV1_FRAME_FLAG_SHOW_FRAME | V4L2_AV1_FRAME_FLAG_SHOWABLE_FRAME | V4L2_AV1_FRAME_FLAG_ERROR_RESILIENT_MODE | V4L2_AV1_FRAME_FLAG_DISABLE_CDF_UPDATE | V4L2_AV1_FRAME_FLAG_ALLOW_SCREEN_CONTENT_TOOLS | V4L2_AV1_FRAME_FLAG_FORCE_INTEGER_MV | V4L2_AV1_FRAME_FLAG_ALLOW_INTRABC | V4L2_AV1_FRAME_FLAG_USE_SUPERRES | V4L2_AV1_FRAME_FLAG_ALLOW_HIGH_PRECISION_MV | V4L2_AV1_FRAME_FLAG_IS_MOTION_MODE_SWITCHABLE | V4L2_AV1_FRAME_FLAG_USE_REF_FRAME_MVS | V4L2_AV1_FRAME_FLAG_DISABLE_FRAME_END_UPDATE_CDF | V4L2_AV1_FRAME_FLAG_ALLOW_WARPED_MOTION | V4L2_AV1_FRAME_FLAG_REFERENCE_SELECT | V4L2_AV1_FRAME_FLAG_REDUCED_TX_SET | V4L2_AV1_FRAME_FLAG_SKIP_MODE_ALLOWED | V4L2_AV1_FRAME_FLAG_SKIP_MODE_PRESENT | V4L2_AV1_FRAME_FLAG_FRAME_SIZE_OVERRIDE | V4L2_AV1_FRAME_FLAG_BUFFER_REMOVAL_TIME_PRESENT | V4L2_AV1_FRAME_FLAG_FRAME_REFS_SHORT_SIGNALING)) return -EINVAL; if (f->superres_denom > GENMASK(2, 0) + 9) return -EINVAL; return 0; } /** * validate_av1_sequence - validate AV1 sequence header fields * @s: control struct from userspace * * Implements AV1 spec §5.5.2 color_config() checks that are * possible with the current v4l2_ctrl_av1_sequence definition. * * TODO: extend validation once additional fields such as * color_primaries, transfer_characteristics, * matrix_coefficients, and chroma_sample_position * are added to the uAPI. * * Returns 0 if valid, -EINVAL otherwise. */ static int validate_av1_sequence(struct v4l2_ctrl_av1_sequence *s) { const bool mono = s->flags & V4L2_AV1_SEQUENCE_FLAG_MONO_CHROME; const bool sx = s->flags & V4L2_AV1_SEQUENCE_FLAG_SUBSAMPLING_X; const bool sy = s->flags & V4L2_AV1_SEQUENCE_FLAG_SUBSAMPLING_Y; const bool uv_dq = s->flags & V4L2_AV1_SEQUENCE_FLAG_SEPARATE_UV_DELTA_Q; /* 1. Reject unknown flags */ if (s->flags & ~(V4L2_AV1_SEQUENCE_FLAG_STILL_PICTURE | V4L2_AV1_SEQUENCE_FLAG_USE_128X128_SUPERBLOCK | V4L2_AV1_SEQUENCE_FLAG_ENABLE_FILTER_INTRA | V4L2_AV1_SEQUENCE_FLAG_ENABLE_INTRA_EDGE_FILTER | V4L2_AV1_SEQUENCE_FLAG_ENABLE_INTERINTRA_COMPOUND | V4L2_AV1_SEQUENCE_FLAG_ENABLE_MASKED_COMPOUND | V4L2_AV1_SEQUENCE_FLAG_ENABLE_WARPED_MOTION | V4L2_AV1_SEQUENCE_FLAG_ENABLE_DUAL_FILTER | V4L2_AV1_SEQUENCE_FLAG_ENABLE_ORDER_HINT | V4L2_AV1_SEQUENCE_FLAG_ENABLE_JNT_COMP | V4L2_AV1_SEQUENCE_FLAG_ENABLE_REF_FRAME_MVS | V4L2_AV1_SEQUENCE_FLAG_ENABLE_SUPERRES | V4L2_AV1_SEQUENCE_FLAG_ENABLE_CDEF | V4L2_AV1_SEQUENCE_FLAG_ENABLE_RESTORATION | V4L2_AV1_SEQUENCE_FLAG_MONO_CHROME | V4L2_AV1_SEQUENCE_FLAG_COLOR_RANGE | V4L2_AV1_SEQUENCE_FLAG_SUBSAMPLING_X | V4L2_AV1_SEQUENCE_FLAG_SUBSAMPLING_Y | V4L2_AV1_SEQUENCE_FLAG_FILM_GRAIN_PARAMS_PRESENT | V4L2_AV1_SEQUENCE_FLAG_SEPARATE_UV_DELTA_Q)) return -EINVAL; /* 2. Profile range */ if (s->seq_profile > 2) return -EINVAL; /* 3. Monochrome shortcut */ if (mono) { /* Profile 1 forbids monochrome */ if (s->seq_profile == 1) return -EINVAL; /* Mono → subsampling must look like 4:0:0: sx=1, sy=1 */ if (!sx || !sy) return -EINVAL; /* separate_uv_delta_q must be 0 */ if (uv_dq) return -EINVAL; return 0; } /* 4. Profile-specific rules */ switch (s->seq_profile) { case 0: /* Profile 0: only 8/10-bit, subsampling=4:2:0 (sx=1, sy=1) */ if (s->bit_depth != 8 && s->bit_depth != 10) return -EINVAL; if (!(sx && sy)) return -EINVAL; break; case 1: /* Profile 1: only 8/10-bit, subsampling=4:4:4 (sx=0, sy=0) */ if (s->bit_depth != 8 && s->bit_depth != 10) return -EINVAL; if (sx || sy) return -EINVAL; break; case 2: /* Profile 2: 8/10/12-bit allowed */ if (s->bit_depth != 8 && s->bit_depth != 10 && s->bit_depth != 12) return -EINVAL; if (s->bit_depth == 12) { if (!sx) { /* 4:4:4 → sy must be 0 */ if (sy) return -EINVAL; } else { /* sx=1 → sy=0 (4:2:2) or sy=1 (4:2:0) */ if (sy != 0 && sy != 1) return -EINVAL; } } else { /* 8/10-bit → only 4:2:2 allowed (sx=1, sy=0) */ if (!(sx && !sy)) return -EINVAL; } break; } return 0; } /* * Compound controls validation requires setting unused fields/flags to zero * in order to properly detect unchanged controls with v4l2_ctrl_type_op_equal's * memcmp. */ static int std_validate_compound(const struct v4l2_ctrl *ctrl, u32 idx, union v4l2_ctrl_ptr ptr) { struct v4l2_ctrl_mpeg2_sequence *p_mpeg2_sequence; struct v4l2_ctrl_mpeg2_picture *p_mpeg2_picture; struct v4l2_ctrl_vp8_frame *p_vp8_frame; struct v4l2_ctrl_fwht_params *p_fwht_params; struct v4l2_ctrl_h264_sps *p_h264_sps; struct v4l2_ctrl_h264_pps *p_h264_pps; struct v4l2_ctrl_h264_pred_weights *p_h264_pred_weights; struct v4l2_ctrl_h264_slice_params *p_h264_slice_params; struct v4l2_ctrl_h264_decode_params *p_h264_dec_params; struct v4l2_ctrl_hevc_ext_sps_lt_rps *p_hevc_lt_rps; struct v4l2_ctrl_hevc_ext_sps_st_rps *p_hevc_st_rps; struct v4l2_ctrl_hevc_sps *p_hevc_sps; struct v4l2_ctrl_hevc_pps *p_hevc_pps; struct v4l2_ctrl_hdr10_mastering_display *p_hdr10_mastering; struct v4l2_ctrl_hevc_decode_params *p_hevc_decode_params; struct v4l2_area *area; struct v4l2_rect *rect; void *p = ptr.p + idx * ctrl->elem_size; unsigned int i; switch ((u32)ctrl->type) { case V4L2_CTRL_TYPE_MPEG2_SEQUENCE: p_mpeg2_sequence = p; switch (p_mpeg2_sequence->chroma_format) { case 1: /* 4:2:0 */ case 2: /* 4:2:2 */ case 3: /* 4:4:4 */ break; default: return -EINVAL; } break; case V4L2_CTRL_TYPE_MPEG2_PICTURE: p_mpeg2_picture = p; switch (p_mpeg2_picture->intra_dc_precision) { case 0: /* 8 bits */ case 1: /* 9 bits */ case 2: /* 10 bits */ case 3: /* 11 bits */ break; default: return -EINVAL; } switch (p_mpeg2_picture->picture_structure) { case V4L2_MPEG2_PIC_TOP_FIELD: case V4L2_MPEG2_PIC_BOTTOM_FIELD: case V4L2_MPEG2_PIC_FRAME: break; default: return -EINVAL; } switch (p_mpeg2_picture->picture_coding_type) { case V4L2_MPEG2_PIC_CODING_TYPE_I: case V4L2_MPEG2_PIC_CODING_TYPE_P: case V4L2_MPEG2_PIC_CODING_TYPE_B: break; default: return -EINVAL; } zero_reserved(*p_mpeg2_picture); break; case V4L2_CTRL_TYPE_MPEG2_QUANTISATION: break; case V4L2_CTRL_TYPE_FWHT_PARAMS: p_fwht_params = p; if (p_fwht_params->version < V4L2_FWHT_VERSION) return -EINVAL; if (!p_fwht_params->width || !p_fwht_params->height) return -EINVAL; break; case V4L2_CTRL_TYPE_H264_SPS: p_h264_sps = p; /* Some syntax elements are only conditionally valid */ if (p_h264_sps->pic_order_cnt_type != 0) { p_h264_sps->log2_max_pic_order_cnt_lsb_minus4 = 0; } else if (p_h264_sps->pic_order_cnt_type != 1) { p_h264_sps->num_ref_frames_in_pic_order_cnt_cycle = 0; p_h264_sps->offset_for_non_ref_pic = 0; p_h264_sps->offset_for_top_to_bottom_field = 0; memset(&p_h264_sps->offset_for_ref_frame, 0, sizeof(p_h264_sps->offset_for_ref_frame)); } if (!V4L2_H264_SPS_HAS_CHROMA_FORMAT(p_h264_sps)) { p_h264_sps->chroma_format_idc = 1; p_h264_sps->bit_depth_luma_minus8 = 0; p_h264_sps->bit_depth_chroma_minus8 = 0; p_h264_sps->flags &= ~V4L2_H264_SPS_FLAG_QPPRIME_Y_ZERO_TRANSFORM_BYPASS; } if (p_h264_sps->chroma_format_idc < 3) p_h264_sps->flags &= ~V4L2_H264_SPS_FLAG_SEPARATE_COLOUR_PLANE; if (p_h264_sps->flags & V4L2_H264_SPS_FLAG_FRAME_MBS_ONLY) p_h264_sps->flags &= ~V4L2_H264_SPS_FLAG_MB_ADAPTIVE_FRAME_FIELD; /* * Chroma 4:2:2 format require at least High 4:2:2 profile. * * The H264 specification and well-known parser implementations * use profile-idc values directly, as that is clearer and * less ambiguous. We do the same here. */ if (p_h264_sps->profile_idc < 122 && p_h264_sps->chroma_format_idc > 1) return -EINVAL; /* Chroma 4:4:4 format require at least High 4:2:2 profile */ if (p_h264_sps->profile_idc < 244 && p_h264_sps->chroma_format_idc > 2) return -EINVAL; if (p_h264_sps->chroma_format_idc > 3) return -EINVAL; if (p_h264_sps->bit_depth_luma_minus8 > 6) return -EINVAL; if (p_h264_sps->bit_depth_chroma_minus8 > 6) return -EINVAL; if (p_h264_sps->log2_max_frame_num_minus4 > 12) return -EINVAL; if (p_h264_sps->pic_order_cnt_type > 2) return -EINVAL; if (p_h264_sps->log2_max_pic_order_cnt_lsb_minus4 > 12) return -EINVAL; if (p_h264_sps->max_num_ref_frames > V4L2_H264_REF_LIST_LEN) return -EINVAL; break; case V4L2_CTRL_TYPE_H264_PPS: p_h264_pps = p; if (p_h264_pps->num_slice_groups_minus1 > 7) return -EINVAL; if (p_h264_pps->num_ref_idx_l0_default_active_minus1 > (V4L2_H264_REF_LIST_LEN - 1)) return -EINVAL; if (p_h264_pps->num_ref_idx_l1_default_active_minus1 > (V4L2_H264_REF_LIST_LEN - 1)) return -EINVAL; if (p_h264_pps->weighted_bipred_idc > 2) return -EINVAL; /* * pic_init_qp_minus26 shall be in the range of * -(26 + QpBdOffset_y) to +25, inclusive, * where QpBdOffset_y is 6 * bit_depth_luma_minus8 */ if (p_h264_pps->pic_init_qp_minus26 < -62 || p_h264_pps->pic_init_qp_minus26 > 25) return -EINVAL; if (p_h264_pps->pic_init_qs_minus26 < -26 || p_h264_pps->pic_init_qs_minus26 > 25) return -EINVAL; if (p_h264_pps->chroma_qp_index_offset < -12 || p_h264_pps->chroma_qp_index_offset > 12) return -EINVAL; if (p_h264_pps->second_chroma_qp_index_offset < -12 || p_h264_pps->second_chroma_qp_index_offset > 12) return -EINVAL; break; case V4L2_CTRL_TYPE_H264_SCALING_MATRIX: break; case V4L2_CTRL_TYPE_H264_PRED_WEIGHTS: p_h264_pred_weights = p; if (p_h264_pred_weights->luma_log2_weight_denom > 7) return -EINVAL; if (p_h264_pred_weights->chroma_log2_weight_denom > 7) return -EINVAL; break; case V4L2_CTRL_TYPE_H264_SLICE_PARAMS: p_h264_slice_params = p; if (p_h264_slice_params->slice_type != V4L2_H264_SLICE_TYPE_B) p_h264_slice_params->flags &= ~V4L2_H264_SLICE_FLAG_DIRECT_SPATIAL_MV_PRED; if (p_h264_slice_params->colour_plane_id > 2) return -EINVAL; if (p_h264_slice_params->cabac_init_idc > 2) return -EINVAL; if (p_h264_slice_params->disable_deblocking_filter_idc > 2) return -EINVAL; if (p_h264_slice_params->slice_alpha_c0_offset_div2 < -6 || p_h264_slice_params->slice_alpha_c0_offset_div2 > 6) return -EINVAL; if (p_h264_slice_params->slice_beta_offset_div2 < -6 || p_h264_slice_params->slice_beta_offset_div2 > 6) return -EINVAL; if (p_h264_slice_params->slice_type == V4L2_H264_SLICE_TYPE_I || p_h264_slice_params->slice_type == V4L2_H264_SLICE_TYPE_SI) p_h264_slice_params->num_ref_idx_l0_active_minus1 = 0; if (p_h264_slice_params->slice_type != V4L2_H264_SLICE_TYPE_B) p_h264_slice_params->num_ref_idx_l1_active_minus1 = 0; if (p_h264_slice_params->num_ref_idx_l0_active_minus1 > (V4L2_H264_REF_LIST_LEN - 1)) return -EINVAL; if (p_h264_slice_params->num_ref_idx_l1_active_minus1 > (V4L2_H264_REF_LIST_LEN - 1)) return -EINVAL; zero_reserved(*p_h264_slice_params); break; case V4L2_CTRL_TYPE_H264_DECODE_PARAMS: p_h264_dec_params = p; if (p_h264_dec_params->nal_ref_idc > 3) return -EINVAL; for (i = 0; i < V4L2_H264_NUM_DPB_ENTRIES; i++) { struct v4l2_h264_dpb_entry *dpb_entry = &p_h264_dec_params->dpb[i]; zero_reserved(*dpb_entry); } zero_reserved(*p_h264_dec_params); break; case V4L2_CTRL_TYPE_VP8_FRAME: p_vp8_frame = p; switch (p_vp8_frame->num_dct_parts) { case 1: case 2: case 4: case 8: break; default: return -EINVAL; } zero_padding(p_vp8_frame->segment); zero_padding(p_vp8_frame->lf); zero_padding(p_vp8_frame->quant); zero_padding(p_vp8_frame->entropy); zero_padding(p_vp8_frame->coder_state); break; case V4L2_CTRL_TYPE_HEVC_SPS: p_hevc_sps = p; if (!(p_hevc_sps->flags & V4L2_HEVC_SPS_FLAG_PCM_ENABLED)) { p_hevc_sps->pcm_sample_bit_depth_luma_minus1 = 0; p_hevc_sps->pcm_sample_bit_depth_chroma_minus1 = 0; p_hevc_sps->log2_min_pcm_luma_coding_block_size_minus3 = 0; p_hevc_sps->log2_diff_max_min_pcm_luma_coding_block_size = 0; } if (!(p_hevc_sps->flags & V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT)) p_hevc_sps->num_long_term_ref_pics_sps = 0; break; case V4L2_CTRL_TYPE_HEVC_PPS: p_hevc_pps = p; if (!(p_hevc_pps->flags & V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED)) p_hevc_pps->diff_cu_qp_delta_depth = 0; if (!(p_hevc_pps->flags & V4L2_HEVC_PPS_FLAG_TILES_ENABLED)) { p_hevc_pps->num_tile_columns_minus1 = 0; p_hevc_pps->num_tile_rows_minus1 = 0; memset(&p_hevc_pps->column_width_minus1, 0, sizeof(p_hevc_pps->column_width_minus1)); memset(&p_hevc_pps->row_height_minus1, 0, sizeof(p_hevc_pps->row_height_minus1)); p_hevc_pps->flags &= ~V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED; } if (p_hevc_pps->flags & V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER) { p_hevc_pps->pps_beta_offset_div2 = 0; p_hevc_pps->pps_tc_offset_div2 = 0; } break; case V4L2_CTRL_TYPE_HEVC_DECODE_PARAMS: p_hevc_decode_params = p; if (p_hevc_decode_params->num_active_dpb_entries > V4L2_HEVC_DPB_ENTRIES_NUM_MAX) return -EINVAL; break; case V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS: break; case V4L2_CTRL_TYPE_HEVC_EXT_SPS_ST_RPS: p_hevc_st_rps = p; if (p_hevc_st_rps->flags & ~V4L2_HEVC_EXT_SPS_ST_RPS_FLAG_INTER_REF_PIC_SET_PRED) return -EINVAL; break; case V4L2_CTRL_TYPE_HEVC_EXT_SPS_LT_RPS: p_hevc_lt_rps = p; if (p_hevc_lt_rps->flags & ~V4L2_HEVC_EXT_SPS_LT_RPS_FLAG_USED_LT) return -EINVAL; break; case V4L2_CTRL_TYPE_HDR10_CLL_INFO: break; case V4L2_CTRL_TYPE_HDR10_MASTERING_DISPLAY: p_hdr10_mastering = p; for (i = 0; i < 3; ++i) { if (p_hdr10_mastering->display_primaries_x[i] < V4L2_HDR10_MASTERING_PRIMARIES_X_LOW || p_hdr10_mastering->display_primaries_x[i] > V4L2_HDR10_MASTERING_PRIMARIES_X_HIGH || p_hdr10_mastering->display_primaries_y[i] < V4L2_HDR10_MASTERING_PRIMARIES_Y_LOW || p_hdr10_mastering->display_primaries_y[i] > V4L2_HDR10_MASTERING_PRIMARIES_Y_HIGH) return -EINVAL; } if (p_hdr10_mastering->white_point_x < V4L2_HDR10_MASTERING_WHITE_POINT_X_LOW || p_hdr10_mastering->white_point_x > V4L2_HDR10_MASTERING_WHITE_POINT_X_HIGH || p_hdr10_mastering->white_point_y < V4L2_HDR10_MASTERING_WHITE_POINT_Y_LOW || p_hdr10_mastering->white_point_y > V4L2_HDR10_MASTERING_WHITE_POINT_Y_HIGH) return -EINVAL; if (p_hdr10_mastering->max_display_mastering_luminance < V4L2_HDR10_MASTERING_MAX_LUMA_LOW || p_hdr10_mastering->max_display_mastering_luminance > V4L2_HDR10_MASTERING_MAX_LUMA_HIGH || p_hdr10_mastering->min_display_mastering_luminance < V4L2_HDR10_MASTERING_MIN_LUMA_LOW || p_hdr10_mastering->min_display_mastering_luminance > V4L2_HDR10_MASTERING_MIN_LUMA_HIGH) return -EINVAL; /* The following restriction comes from ITU-T Rec. H.265 spec */ if (p_hdr10_mastering->max_display_mastering_luminance == V4L2_HDR10_MASTERING_MAX_LUMA_LOW && p_hdr10_mastering->min_display_mastering_luminance == V4L2_HDR10_MASTERING_MIN_LUMA_HIGH) return -EINVAL; break; case V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX: break; case V4L2_CTRL_TYPE_VP9_COMPRESSED_HDR: return validate_vp9_compressed_hdr(p); case V4L2_CTRL_TYPE_VP9_FRAME: return validate_vp9_frame(p); case V4L2_CTRL_TYPE_AV1_FRAME: return validate_av1_frame(p); case V4L2_CTRL_TYPE_AV1_SEQUENCE: return validate_av1_sequence(p); case V4L2_CTRL_TYPE_AV1_TILE_GROUP_ENTRY: break; case V4L2_CTRL_TYPE_AV1_FILM_GRAIN: return validate_av1_film_grain(p); case V4L2_CTRL_TYPE_AREA: area = p; if (!area->width || !area->height) return -EINVAL; break; case V4L2_CTRL_TYPE_RECT: rect = p; if (!rect->width || !rect->height) return -EINVAL; break; default: return -EINVAL; } return 0; } static int std_validate_elem(const struct v4l2_ctrl *ctrl, u32 idx, union v4l2_ctrl_ptr ptr) { size_t len; u64 offset; s64 val; switch ((u32)ctrl->type) { case V4L2_CTRL_TYPE_INTEGER: return ROUND_TO_RANGE(ptr.p_s32[idx], u32, ctrl); case V4L2_CTRL_TYPE_INTEGER64: /* * We can't use the ROUND_TO_RANGE define here due to * the u64 divide that needs special care. */ val = ptr.p_s64[idx]; if (ctrl->maximum >= 0 && val >= ctrl->maximum - (s64)(ctrl->step / 2)) val = ctrl->maximum; else val += (s64)(ctrl->step / 2); val = clamp_t(s64, val, ctrl->minimum, ctrl->maximum); offset = val - ctrl->minimum; do_div(offset, ctrl->step); ptr.p_s64[idx] = ctrl->minimum + offset * ctrl->step; return 0; case V4L2_CTRL_TYPE_U8: return ROUND_TO_RANGE(ptr.p_u8[idx], u8, ctrl); case V4L2_CTRL_TYPE_U16: return ROUND_TO_RANGE(ptr.p_u16[idx], u16, ctrl); case V4L2_CTRL_TYPE_U32: return ROUND_TO_RANGE(ptr.p_u32[idx], u32, ctrl); case V4L2_CTRL_TYPE_BOOLEAN: ptr.p_s32[idx] = !!ptr.p_s32[idx]; return 0; case V4L2_CTRL_TYPE_MENU: case V4L2_CTRL_TYPE_INTEGER_MENU: if (ptr.p_s32[idx] < ctrl->minimum || ptr.p_s32[idx] > ctrl->maximum) return -ERANGE; if (ptr.p_s32[idx] < BITS_PER_LONG_LONG && (ctrl->menu_skip_mask & BIT_ULL(ptr.p_s32[idx]))) return -EINVAL; if (ctrl->type == V4L2_CTRL_TYPE_MENU && ctrl->qmenu[ptr.p_s32[idx]][0] == '\0') return -EINVAL; return 0; case V4L2_CTRL_TYPE_BITMASK: ptr.p_s32[idx] &= ctrl->maximum; return 0; case V4L2_CTRL_TYPE_BUTTON: case V4L2_CTRL_TYPE_CTRL_CLASS: ptr.p_s32[idx] = 0; return 0; case V4L2_CTRL_TYPE_STRING: idx *= ctrl->elem_size; len = strlen(ptr.p_char + idx); if (len < ctrl->minimum) return -ERANGE; if ((len - (u32)ctrl->minimum) % (u32)ctrl->step) return -ERANGE; return 0; default: return std_validate_compound(ctrl, idx, ptr); } } int v4l2_ctrl_type_op_validate(const struct v4l2_ctrl *ctrl, union v4l2_ctrl_ptr ptr) { unsigned int i; int ret = 0; switch ((u32)ctrl->type) { case V4L2_CTRL_TYPE_U8: if (ctrl->maximum == 0xff && ctrl->minimum == 0 && ctrl->step == 1) return 0; break; case V4L2_CTRL_TYPE_U16: if (ctrl->maximum == 0xffff && ctrl->minimum == 0 && ctrl->step == 1) return 0; break; case V4L2_CTRL_TYPE_U32: if (ctrl->maximum == 0xffffffff && ctrl->minimum == 0 && ctrl->step == 1) return 0; break; case V4L2_CTRL_TYPE_BUTTON: case V4L2_CTRL_TYPE_CTRL_CLASS: memset(ptr.p_s32, 0, ctrl->new_elems * sizeof(s32)); return 0; } for (i = 0; !ret && i < ctrl->new_elems; i++) ret = std_validate_elem(ctrl, i, ptr); return ret; } EXPORT_SYMBOL(v4l2_ctrl_type_op_validate); static const struct v4l2_ctrl_type_ops std_type_ops = { .equal = v4l2_ctrl_type_op_equal, .init = v4l2_ctrl_type_op_init, .minimum = v4l2_ctrl_type_op_minimum, .maximum = v4l2_ctrl_type_op_maximum, .log = v4l2_ctrl_type_op_log, .validate = v4l2_ctrl_type_op_validate, }; void v4l2_ctrl_notify(struct v4l2_ctrl *ctrl, v4l2_ctrl_notify_fnc notify, void *priv) { if (!ctrl) return; if (!notify) { ctrl->call_notify = 0; return; } if (WARN_ON(ctrl->handler->notify && ctrl->handler->notify != notify)) return; ctrl->handler->notify = notify; ctrl->handler->notify_priv = priv; ctrl->call_notify = 1; } EXPORT_SYMBOL(v4l2_ctrl_notify); /* Copy the one value to another. */ static void ptr_to_ptr(struct v4l2_ctrl *ctrl, union v4l2_ctrl_ptr from, union v4l2_ctrl_ptr to, unsigned int elems) { if (ctrl == NULL) return; memcpy(to.p, from.p_const, elems * ctrl->elem_size); } /* Copy the new value to the current value. */ void new_to_cur(struct v4l2_fh *fh, struct v4l2_ctrl *ctrl, u32 ch_flags) { bool changed; if (ctrl == NULL) return; /* has_changed is set by cluster_changed */ changed = ctrl->has_changed; if (changed) { if (ctrl->is_dyn_array) ctrl->elems = ctrl->new_elems; ptr_to_ptr(ctrl, ctrl->p_new, ctrl->p_cur, ctrl->elems); } if (ch_flags & V4L2_EVENT_CTRL_CH_FLAGS) { /* Note: CH_FLAGS is only set for auto clusters. */ ctrl->flags &= ~(V4L2_CTRL_FLAG_INACTIVE | V4L2_CTRL_FLAG_VOLATILE); if (!is_cur_manual(ctrl->cluster[0])) { ctrl->flags |= V4L2_CTRL_FLAG_INACTIVE; if (ctrl->cluster[0]->has_volatiles) ctrl->flags |= V4L2_CTRL_FLAG_VOLATILE; } fh = NULL; } if (changed || ch_flags) { /* If a control was changed that was not one of the controls modified by the application, then send the event to all. */ if (!ctrl->is_new) fh = NULL; send_event(fh, ctrl, (changed ? V4L2_EVENT_CTRL_CH_VALUE : 0) | ch_flags); if (ctrl->call_notify && changed && ctrl->handler->notify) ctrl->handler->notify(ctrl, ctrl->handler->notify_priv); } } /* Copy the current value to the new value */ void cur_to_new(struct v4l2_ctrl *ctrl) { if (ctrl == NULL) return; if (ctrl->is_dyn_array) ctrl->new_elems = ctrl->elems; ptr_to_ptr(ctrl, ctrl->p_cur, ctrl->p_new, ctrl->new_elems); } static bool req_alloc_array(struct v4l2_ctrl_ref *ref, u32 elems) { void *tmp; if (elems == ref->p_req_array_alloc_elems) return true; if (ref->ctrl->is_dyn_array && elems < ref->p_req_array_alloc_elems) return true; tmp = kvmalloc(elems * ref->ctrl->elem_size, GFP_KERNEL); if (!tmp) { ref->p_req_array_enomem = true; return false; } ref->p_req_array_enomem = false; kvfree(ref->p_req.p); ref->p_req.p = tmp; ref->p_req_array_alloc_elems = elems; return true; } /* Copy the new value to the request value */ void new_to_req(struct v4l2_ctrl_ref *ref) { struct v4l2_ctrl *ctrl; if (!ref) return; ctrl = ref->ctrl; if (ctrl->is_array && !req_alloc_array(ref, ctrl->new_elems)) return; ref->p_req_elems = ctrl->new_elems; ptr_to_ptr(ctrl, ctrl->p_new, ref->p_req, ref->p_req_elems); ref->p_req_valid = true; } /* Copy the current value to the request value */ void cur_to_req(struct v4l2_ctrl_ref *ref) { struct v4l2_ctrl *ctrl; if (!ref) return; ctrl = ref->ctrl; if (ctrl->is_array && !req_alloc_array(ref, ctrl->elems)) return; ref->p_req_elems = ctrl->elems; ptr_to_ptr(ctrl, ctrl->p_cur, ref->p_req, ctrl->elems); ref->p_req_valid = true; } /* Copy the request value to the new value */ int req_to_new(struct v4l2_ctrl_ref *ref) { struct v4l2_ctrl *ctrl; if (!ref) return 0; ctrl = ref->ctrl; /* * This control was never set in the request, so just use the current * value. */ if (!ref->p_req_valid) { if (ctrl->is_dyn_array) ctrl->new_elems = ctrl->elems; ptr_to_ptr(ctrl, ctrl->p_cur, ctrl->p_new, ctrl->new_elems); return 0; } /* Not an array, so just copy the request value */ if (!ctrl->is_array) { ptr_to_ptr(ctrl, ref->p_req, ctrl->p_new, ctrl->new_elems); return 0; } /* Sanity check, should never happen */ if (WARN_ON(!ref->p_req_array_alloc_elems)) return -ENOMEM; if (!ctrl->is_dyn_array && ref->p_req_elems != ctrl->p_array_alloc_elems) return -ENOMEM; /* * Check if the number of elements in the request is more than the * elements in ctrl->p_array. If so, attempt to realloc ctrl->p_array. * Note that p_array is allocated with twice the number of elements * in the dynamic array since it has to store both the current and * new value of such a control. */ if (ref->p_req_elems > ctrl->p_array_alloc_elems) { unsigned int sz = ref->p_req_elems * ctrl->elem_size; void *old = ctrl->p_array; void *tmp = kvzalloc(2 * sz, GFP_KERNEL); if (!tmp) return -ENOMEM; memcpy(tmp, ctrl->p_new.p, ctrl->elems * ctrl->elem_size); memcpy(tmp + sz, ctrl->p_cur.p, ctrl->elems * ctrl->elem_size); ctrl->p_new.p = tmp; ctrl->p_cur.p = tmp + sz; ctrl->p_array = tmp; ctrl->p_array_alloc_elems = ref->p_req_elems; kvfree(old); } ctrl->new_elems = ref->p_req_elems; ptr_to_ptr(ctrl, ref->p_req, ctrl->p_new, ctrl->new_elems); return 0; } /* Control range checking */ int check_range(enum v4l2_ctrl_type type, s64 min, s64 max, u64 step, s64 def) { switch (type) { case V4L2_CTRL_TYPE_BOOLEAN: if (step != 1 || max > 1 || min < 0) return -ERANGE; fallthrough; case V4L2_CTRL_TYPE_U8: case V4L2_CTRL_TYPE_U16: case V4L2_CTRL_TYPE_U32: case V4L2_CTRL_TYPE_INTEGER: case V4L2_CTRL_TYPE_INTEGER64: if (step == 0 || min > max || def < min || def > max) return -ERANGE; return 0; case V4L2_CTRL_TYPE_BITMASK: if (step || min || !max || (def & ~max)) return -ERANGE; return 0; case V4L2_CTRL_TYPE_MENU: case V4L2_CTRL_TYPE_INTEGER_MENU: if (min > max || def < min || def > max || min < 0 || (step && max >= BITS_PER_LONG_LONG)) return -ERANGE; /* Note: step == menu_skip_mask for menu controls. So here we check if the default value is masked out. */ if (def < BITS_PER_LONG_LONG && (step & BIT_ULL(def))) return -EINVAL; return 0; case V4L2_CTRL_TYPE_STRING: if (min > max || min < 0 || step < 1 || def) return -ERANGE; return 0; default: return 0; } } /* Set the handler's error code if it wasn't set earlier already */ static inline int handler_set_err(struct v4l2_ctrl_handler *hdl, int err) { if (hdl->error == 0) hdl->error = err; return err; } /* Initialize the handler */ int v4l2_ctrl_handler_init_class(struct v4l2_ctrl_handler *hdl, unsigned nr_of_controls_hint, struct lock_class_key *key, const char *name) { mutex_init(&hdl->_lock); hdl->lock = &hdl->_lock; lockdep_set_class_and_name(hdl->lock, key, name); INIT_LIST_HEAD(&hdl->ctrls); INIT_LIST_HEAD(&hdl->ctrl_refs); hdl->nr_of_buckets = 1 + nr_of_controls_hint / 8; hdl->buckets = kvzalloc_objs(hdl->buckets[0], hdl->nr_of_buckets); hdl->error = hdl->buckets ? 0 : -ENOMEM; v4l2_ctrl_handler_init_request(hdl); return hdl->error; } EXPORT_SYMBOL(v4l2_ctrl_handler_init_class); /* Free all controls and control refs */ int v4l2_ctrl_handler_free(struct v4l2_ctrl_handler *hdl) { struct v4l2_ctrl_ref *ref, *next_ref; struct v4l2_ctrl *ctrl, *next_ctrl; struct v4l2_subscribed_event *sev, *next_sev; if (!hdl) return 0; if (!hdl->buckets) return hdl->error; v4l2_ctrl_handler_free_request(hdl); mutex_lock(hdl->lock); /* Free all nodes */ list_for_each_entry_safe(ref, next_ref, &hdl->ctrl_refs, node) { list_del(&ref->node); if (ref->p_req_array_alloc_elems) kvfree(ref->p_req.p); kfree(ref); } /* Free all controls owned by the handler */ list_for_each_entry_safe(ctrl, next_ctrl, &hdl->ctrls, node) { list_del(&ctrl->node); list_for_each_entry_safe(sev, next_sev, &ctrl->ev_subs, node) list_del(&sev->node); kvfree(ctrl->p_array); kvfree(ctrl); } kvfree(hdl->buckets); hdl->buckets = NULL; hdl->cached = NULL; mutex_unlock(hdl->lock); mutex_destroy(&hdl->_lock); return hdl->error; } EXPORT_SYMBOL(v4l2_ctrl_handler_free); /* For backwards compatibility: V4L2_CID_PRIVATE_BASE should no longer be used except in G_CTRL, S_CTRL, QUERYCTRL and QUERYMENU when dealing with applications that do not use the NEXT_CTRL flag. We just find the n-th private user control. It's O(N), but that should not be an issue in this particular case. */ static struct v4l2_ctrl_ref *find_private_ref( struct v4l2_ctrl_handler *hdl, u32 id) { struct v4l2_ctrl_ref *ref; id -= V4L2_CID_PRIVATE_BASE; list_for_each_entry(ref, &hdl->ctrl_refs, node) { /* Search for private user controls that are compatible with VIDIOC_G/S_CTRL. */ if (V4L2_CTRL_ID2WHICH(ref->ctrl->id) == V4L2_CTRL_CLASS_USER && V4L2_CTRL_DRIVER_PRIV(ref->ctrl->id)) { if (!ref->ctrl->is_int) continue; if (id == 0) return ref; id--; } } return NULL; } /* Find a control with the given ID. */ struct v4l2_ctrl_ref *find_ref(struct v4l2_ctrl_handler *hdl, u32 id) { struct v4l2_ctrl_ref *ref; int bucket; id &= V4L2_CTRL_ID_MASK; /* Old-style private controls need special handling */ if (id >= V4L2_CID_PRIVATE_BASE) return find_private_ref(hdl, id); bucket = id % hdl->nr_of_buckets; /* Simple optimization: cache the last control found */ if (hdl->cached && hdl->cached->ctrl->id == id) return hdl->cached; /* Not in cache, search the hash */ ref = hdl->buckets ? hdl->buckets[bucket] : NULL; while (ref && ref->ctrl->id != id) ref = ref->next; if (ref) hdl->cached = ref; /* cache it! */ return ref; } /* Find a control with the given ID. Take the handler's lock first. */ struct v4l2_ctrl_ref *find_ref_lock(struct v4l2_ctrl_handler *hdl, u32 id) { struct v4l2_ctrl_ref *ref = NULL; if (hdl) { mutex_lock(hdl->lock); ref = find_ref(hdl, id); mutex_unlock(hdl->lock); } return ref; } /* Find a control with the given ID. */ struct v4l2_ctrl *v4l2_ctrl_find(struct v4l2_ctrl_handler *hdl, u32 id) { struct v4l2_ctrl_ref *ref = find_ref_lock(hdl, id); return ref ? ref->ctrl : NULL; } EXPORT_SYMBOL(v4l2_ctrl_find); /* Allocate a new v4l2_ctrl_ref and hook it into the handler. */ int handler_new_ref(struct v4l2_ctrl_handler *hdl, struct v4l2_ctrl *ctrl, struct v4l2_ctrl_ref **ctrl_ref, bool from_other_dev, bool allocate_req) { struct v4l2_ctrl_ref *ref; struct v4l2_ctrl_ref *new_ref; u32 id = ctrl->id; u32 class_ctrl = V4L2_CTRL_ID2WHICH(id) | 1; int bucket = id % hdl->nr_of_buckets; /* which bucket to use */ unsigned int size_extra_req = 0; if (ctrl_ref) *ctrl_ref = NULL; /* * Automatically add the control class if it is not yet present and * the new control is not a compound control. */ if (ctrl->type < V4L2_CTRL_COMPOUND_TYPES && id != class_ctrl && find_ref_lock(hdl, class_ctrl) == NULL) if (!v4l2_ctrl_new_std(hdl, NULL, class_ctrl, 0, 0, 0, 0)) return hdl->error; if (hdl->error) return hdl->error; if (allocate_req && !ctrl->is_array) size_extra_req = ctrl->elems * ctrl->elem_size; new_ref = kzalloc(sizeof(*new_ref) + size_extra_req, GFP_KERNEL); if (!new_ref) return handler_set_err(hdl, -ENOMEM); new_ref->ctrl = ctrl; new_ref->from_other_dev = from_other_dev; if (size_extra_req) new_ref->p_req.p = &new_ref[1]; INIT_LIST_HEAD(&new_ref->node); mutex_lock(hdl->lock); /* Add immediately at the end of the list if the list is empty, or if the last element in the list has a lower ID. This ensures that when elements are added in ascending order the insertion is an O(1) operation. */ if (list_empty(&hdl->ctrl_refs) || id > node2id(hdl->ctrl_refs.prev)) { list_add_tail(&new_ref->node, &hdl->ctrl_refs); goto insert_in_hash; } /* Find insert position in sorted list */ list_for_each_entry(ref, &hdl->ctrl_refs, node) { if (ref->ctrl->id < id) continue; /* Don't add duplicates */ if (ref->ctrl->id == id) { kfree(new_ref); goto unlock; } list_add(&new_ref->node, ref->node.prev); break; } insert_in_hash: /* Insert the control node in the hash */ new_ref->next = hdl->buckets[bucket]; hdl->buckets[bucket] = new_ref; if (ctrl_ref) *ctrl_ref = new_ref; if (ctrl->handler == hdl) { /* By default each control starts in a cluster of its own. * new_ref->ctrl is basically a cluster array with one * element, so that's perfect to use as the cluster pointer. * But only do this for the handler that owns the control. */ ctrl->cluster = &new_ref->ctrl; ctrl->ncontrols = 1; } unlock: mutex_unlock(hdl->lock); return 0; } /* Add a new control */ static struct v4l2_ctrl *v4l2_ctrl_new(struct v4l2_ctrl_handler *hdl, const struct v4l2_ctrl_ops *ops, const struct v4l2_ctrl_type_ops *type_ops, u32 id, const char *name, enum v4l2_ctrl_type type, s64 min, s64 max, u64 step, s64 def, const u32 dims[V4L2_CTRL_MAX_DIMS], u32 elem_size, u32 flags, const char * const *qmenu, const s64 *qmenu_int, const union v4l2_ctrl_ptr p_def, const union v4l2_ctrl_ptr p_min, const union v4l2_ctrl_ptr p_max, void *priv) { struct v4l2_ctrl *ctrl; unsigned sz_extra; unsigned nr_of_dims = 0; unsigned elems = 1; bool is_array; unsigned tot_ctrl_size; void *data; int err; if (hdl->error) return NULL; while (dims && dims[nr_of_dims]) { elems *= dims[nr_of_dims]; nr_of_dims++; if (nr_of_dims == V4L2_CTRL_MAX_DIMS) break; } is_array = nr_of_dims > 0; /* Prefill elem_size for all types handled by std_type_ops */ switch ((u32)type) { case V4L2_CTRL_TYPE_INTEGER64: elem_size = sizeof(s64); break; case V4L2_CTRL_TYPE_STRING: elem_size = max + 1; break; case V4L2_CTRL_TYPE_U8: elem_size = sizeof(u8); break; case V4L2_CTRL_TYPE_U16: elem_size = sizeof(u16); break; case V4L2_CTRL_TYPE_U32: elem_size = sizeof(u32); break; case V4L2_CTRL_TYPE_MPEG2_SEQUENCE: elem_size = sizeof(struct v4l2_ctrl_mpeg2_sequence); break; case V4L2_CTRL_TYPE_MPEG2_PICTURE: elem_size = sizeof(struct v4l2_ctrl_mpeg2_picture); break; case V4L2_CTRL_TYPE_MPEG2_QUANTISATION: elem_size = sizeof(struct v4l2_ctrl_mpeg2_quantisation); break; case V4L2_CTRL_TYPE_FWHT_PARAMS: elem_size = sizeof(struct v4l2_ctrl_fwht_params); break; case V4L2_CTRL_TYPE_H264_SPS: elem_size = sizeof(struct v4l2_ctrl_h264_sps); break; case V4L2_CTRL_TYPE_H264_PPS: elem_size = sizeof(struct v4l2_ctrl_h264_pps); break; case V4L2_CTRL_TYPE_H264_SCALING_MATRIX: elem_size = sizeof(struct v4l2_ctrl_h264_scaling_matrix); break; case V4L2_CTRL_TYPE_H264_SLICE_PARAMS: elem_size = sizeof(struct v4l2_ctrl_h264_slice_params); break; case V4L2_CTRL_TYPE_H264_DECODE_PARAMS: elem_size = sizeof(struct v4l2_ctrl_h264_decode_params); break; case V4L2_CTRL_TYPE_H264_PRED_WEIGHTS: elem_size = sizeof(struct v4l2_ctrl_h264_pred_weights); break; case V4L2_CTRL_TYPE_VP8_FRAME: elem_size = sizeof(struct v4l2_ctrl_vp8_frame); break; case V4L2_CTRL_TYPE_HEVC_SPS: elem_size = sizeof(struct v4l2_ctrl_hevc_sps); break; case V4L2_CTRL_TYPE_HEVC_PPS: elem_size = sizeof(struct v4l2_ctrl_hevc_pps); break; case V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS: elem_size = sizeof(struct v4l2_ctrl_hevc_slice_params); break; case V4L2_CTRL_TYPE_HEVC_EXT_SPS_ST_RPS: elem_size = sizeof(struct v4l2_ctrl_hevc_ext_sps_st_rps); break; case V4L2_CTRL_TYPE_HEVC_EXT_SPS_LT_RPS: elem_size = sizeof(struct v4l2_ctrl_hevc_ext_sps_lt_rps); break; case V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX: elem_size = sizeof(struct v4l2_ctrl_hevc_scaling_matrix); break; case V4L2_CTRL_TYPE_HEVC_DECODE_PARAMS: elem_size = sizeof(struct v4l2_ctrl_hevc_decode_params); break; case V4L2_CTRL_TYPE_HDR10_CLL_INFO: elem_size = sizeof(struct v4l2_ctrl_hdr10_cll_info); break; case V4L2_CTRL_TYPE_HDR10_MASTERING_DISPLAY: elem_size = sizeof(struct v4l2_ctrl_hdr10_mastering_display); break; case V4L2_CTRL_TYPE_VP9_COMPRESSED_HDR: elem_size = sizeof(struct v4l2_ctrl_vp9_compressed_hdr); break; case V4L2_CTRL_TYPE_VP9_FRAME: elem_size = sizeof(struct v4l2_ctrl_vp9_frame); break; case V4L2_CTRL_TYPE_AV1_SEQUENCE: elem_size = sizeof(struct v4l2_ctrl_av1_sequence); break; case V4L2_CTRL_TYPE_AV1_TILE_GROUP_ENTRY: elem_size = sizeof(struct v4l2_ctrl_av1_tile_group_entry); break; case V4L2_CTRL_TYPE_AV1_FRAME: elem_size = sizeof(struct v4l2_ctrl_av1_frame); break; case V4L2_CTRL_TYPE_AV1_FILM_GRAIN: elem_size = sizeof(struct v4l2_ctrl_av1_film_grain); break; case V4L2_CTRL_TYPE_AREA: elem_size = sizeof(struct v4l2_area); break; case V4L2_CTRL_TYPE_RECT: elem_size = sizeof(struct v4l2_rect); break; default: if (type < V4L2_CTRL_COMPOUND_TYPES) elem_size = sizeof(s32); break; } if (type < V4L2_CTRL_COMPOUND_TYPES && type != V4L2_CTRL_TYPE_BUTTON && type != V4L2_CTRL_TYPE_CTRL_CLASS && type != V4L2_CTRL_TYPE_STRING) flags |= V4L2_CTRL_FLAG_HAS_WHICH_MIN_MAX; /* Sanity checks */ if (id == 0 || name == NULL || !elem_size || id >= V4L2_CID_PRIVATE_BASE || (type == V4L2_CTRL_TYPE_MENU && qmenu == NULL) || (type == V4L2_CTRL_TYPE_INTEGER_MENU && qmenu_int == NULL)) { handler_set_err(hdl, -ERANGE); return NULL; } err = check_range(type, min, max, step, def); if (err) { handler_set_err(hdl, err); return NULL; } if (is_array && (type == V4L2_CTRL_TYPE_BUTTON || type == V4L2_CTRL_TYPE_CTRL_CLASS)) { handler_set_err(hdl, -EINVAL); return NULL; } if (flags & V4L2_CTRL_FLAG_DYNAMIC_ARRAY) { /* * For now only support this for one-dimensional arrays only. * * This can be relaxed in the future, but this will * require more effort. */ if (nr_of_dims != 1) { handler_set_err(hdl, -EINVAL); return NULL; } /* Start with just 1 element */ elems = 1; } tot_ctrl_size = elem_size * elems; sz_extra = 0; if (type == V4L2_CTRL_TYPE_BUTTON) flags |= V4L2_CTRL_FLAG_WRITE_ONLY | V4L2_CTRL_FLAG_EXECUTE_ON_WRITE; else if (type == V4L2_CTRL_TYPE_CTRL_CLASS) flags |= V4L2_CTRL_FLAG_READ_ONLY; else if (!is_array && (type == V4L2_CTRL_TYPE_INTEGER64 || type == V4L2_CTRL_TYPE_STRING || type >= V4L2_CTRL_COMPOUND_TYPES)) sz_extra += 2 * tot_ctrl_size; if (type >= V4L2_CTRL_COMPOUND_TYPES && p_def.p_const) sz_extra += elem_size; if (type >= V4L2_CTRL_COMPOUND_TYPES && p_min.p_const) sz_extra += elem_size; if (type >= V4L2_CTRL_COMPOUND_TYPES && p_max.p_const) sz_extra += elem_size; ctrl = kvzalloc(sizeof(*ctrl) + sz_extra, GFP_KERNEL); if (ctrl == NULL) { handler_set_err(hdl, -ENOMEM); return NULL; } INIT_LIST_HEAD(&ctrl->node); INIT_LIST_HEAD(&ctrl->ev_subs); ctrl->handler = hdl; ctrl->ops = ops; ctrl->type_ops = type_ops ? type_ops : &std_type_ops; ctrl->id = id; ctrl->name = name; ctrl->type = type; ctrl->flags = flags; ctrl->minimum = min; ctrl->maximum = max; ctrl->step = step; ctrl->default_value = def; ctrl->is_string = !is_array && type == V4L2_CTRL_TYPE_STRING; ctrl->is_ptr = is_array || type >= V4L2_CTRL_COMPOUND_TYPES || ctrl->is_string; ctrl->is_int = !ctrl->is_ptr && type != V4L2_CTRL_TYPE_INTEGER64; ctrl->is_array = is_array; ctrl->is_dyn_array = !!(flags & V4L2_CTRL_FLAG_DYNAMIC_ARRAY); ctrl->elems = elems; ctrl->new_elems = elems; ctrl->nr_of_dims = nr_of_dims; if (nr_of_dims) memcpy(ctrl->dims, dims, nr_of_dims * sizeof(dims[0])); ctrl->elem_size = elem_size; if (type == V4L2_CTRL_TYPE_MENU) ctrl->qmenu = qmenu; else if (type == V4L2_CTRL_TYPE_INTEGER_MENU) ctrl->qmenu_int = qmenu_int; ctrl->priv = priv; ctrl->cur.val = ctrl->val = def; data = &ctrl[1]; if (ctrl->is_array) { ctrl->p_array_alloc_elems = elems; ctrl->p_array = kvzalloc(2 * elems * elem_size, GFP_KERNEL); if (!ctrl->p_array) { kvfree(ctrl); return NULL; } data = ctrl->p_array; } if (!ctrl->is_int) { ctrl->p_new.p = data; ctrl->p_cur.p = data + tot_ctrl_size; } else { ctrl->p_new.p = &ctrl->val; ctrl->p_cur.p = &ctrl->cur.val; } if (type >= V4L2_CTRL_COMPOUND_TYPES && p_def.p_const) { if (ctrl->is_array) ctrl->p_def.p = &ctrl[1]; else ctrl->p_def.p = ctrl->p_cur.p + tot_ctrl_size; memcpy(ctrl->p_def.p, p_def.p_const, elem_size); } if (flags & V4L2_CTRL_FLAG_HAS_WHICH_MIN_MAX) { void *ptr = ctrl->p_def.p; if (p_min.p_const) { ptr += elem_size; ctrl->p_min.p = ptr; memcpy(ctrl->p_min.p, p_min.p_const, elem_size); } if (p_max.p_const) { ptr += elem_size; ctrl->p_max.p = ptr; memcpy(ctrl->p_max.p, p_max.p_const, elem_size); } } ctrl->type_ops->init(ctrl, 0, ctrl->p_cur); cur_to_new(ctrl); if (handler_new_ref(hdl, ctrl, NULL, false, false)) { kvfree(ctrl->p_array); kvfree(ctrl); return NULL; } mutex_lock(hdl->lock); list_add_tail(&ctrl->node, &hdl->ctrls); mutex_unlock(hdl->lock); return ctrl; } struct v4l2_ctrl *v4l2_ctrl_new_custom(struct v4l2_ctrl_handler *hdl, const struct v4l2_ctrl_config *cfg, void *priv) { bool is_menu; struct v4l2_ctrl *ctrl; const char *name = cfg->name; const char * const *qmenu = cfg->qmenu; const s64 *qmenu_int = cfg->qmenu_int; enum v4l2_ctrl_type type = cfg->type; u32 flags = cfg->flags; s64 min = cfg->min; s64 max = cfg->max; u64 step = cfg->step; s64 def = cfg->def; if (name == NULL) v4l2_ctrl_fill(cfg->id, &name, &type, &min, &max, &step, &def, &flags); is_menu = (type == V4L2_CTRL_TYPE_MENU || type == V4L2_CTRL_TYPE_INTEGER_MENU); if (is_menu) WARN_ON(step); else WARN_ON(cfg->menu_skip_mask); if (type == V4L2_CTRL_TYPE_MENU && !qmenu) { qmenu = v4l2_ctrl_get_menu(cfg->id); } else if (type == V4L2_CTRL_TYPE_INTEGER_MENU && !qmenu_int) { handler_set_err(hdl, -EINVAL); return NULL; } ctrl = v4l2_ctrl_new(hdl, cfg->ops, cfg->type_ops, cfg->id, name, type, min, max, is_menu ? cfg->menu_skip_mask : step, def, cfg->dims, cfg->elem_size, flags, qmenu, qmenu_int, cfg->p_def, cfg->p_min, cfg->p_max, priv); if (ctrl) ctrl->is_private = cfg->is_private; return ctrl; } EXPORT_SYMBOL(v4l2_ctrl_new_custom); /* Helper function for standard non-menu controls */ struct v4l2_ctrl *v4l2_ctrl_new_std(struct v4l2_ctrl_handler *hdl, const struct v4l2_ctrl_ops *ops, u32 id, s64 min, s64 max, u64 step, s64 def) { const char *name; enum v4l2_ctrl_type type; u32 flags; v4l2_ctrl_fill(id, &name, &type, &min, &max, &step, &def, &flags); if (type == V4L2_CTRL_TYPE_MENU || type == V4L2_CTRL_TYPE_INTEGER_MENU || type >= V4L2_CTRL_COMPOUND_TYPES) { handler_set_err(hdl, -EINVAL); return NULL; } return v4l2_ctrl_new(hdl, ops, NULL, id, name, type, min, max, step, def, NULL, 0, flags, NULL, NULL, ptr_null, ptr_null, ptr_null, NULL); } EXPORT_SYMBOL(v4l2_ctrl_new_std); /* Helper function for standard menu controls */ struct v4l2_ctrl *v4l2_ctrl_new_std_menu(struct v4l2_ctrl_handler *hdl, const struct v4l2_ctrl_ops *ops, u32 id, u8 _max, u64 mask, u8 _def) { const char * const *qmenu = NULL; const s64 *qmenu_int = NULL; unsigned int qmenu_int_len = 0; const char *name; enum v4l2_ctrl_type type; s64 min; s64 max = _max; s64 def = _def; u64 step; u32 flags; v4l2_ctrl_fill(id, &name, &type, &min, &max, &step, &def, &flags); if (type == V4L2_CTRL_TYPE_MENU) qmenu = v4l2_ctrl_get_menu(id); else if (type == V4L2_CTRL_TYPE_INTEGER_MENU) qmenu_int = v4l2_ctrl_get_int_menu(id, &qmenu_int_len); if ((!qmenu && !qmenu_int) || (qmenu_int && max >= qmenu_int_len)) { handler_set_err(hdl, -EINVAL); return NULL; } return v4l2_ctrl_new(hdl, ops, NULL, id, name, type, 0, max, mask, def, NULL, 0, flags, qmenu, qmenu_int, ptr_null, ptr_null, ptr_null, NULL); } EXPORT_SYMBOL(v4l2_ctrl_new_std_menu); /* Helper function for standard menu controls with driver defined menu */ struct v4l2_ctrl *v4l2_ctrl_new_std_menu_items(struct v4l2_ctrl_handler *hdl, const struct v4l2_ctrl_ops *ops, u32 id, u8 _max, u64 mask, u8 _def, const char * const *qmenu) { enum v4l2_ctrl_type type; const char *name; u32 flags; u64 step; s64 min; s64 max = _max; s64 def = _def; /* v4l2_ctrl_new_std_menu_items() should only be called for * standard controls without a standard menu. */ if (v4l2_ctrl_get_menu(id)) { handler_set_err(hdl, -EINVAL); return NULL; } v4l2_ctrl_fill(id, &name, &type, &min, &max, &step, &def, &flags); if (type != V4L2_CTRL_TYPE_MENU || qmenu == NULL) { handler_set_err(hdl, -EINVAL); return NULL; } return v4l2_ctrl_new(hdl, ops, NULL, id, name, type, 0, max, mask, def, NULL, 0, flags, qmenu, NULL, ptr_null, ptr_null, ptr_null, NULL); } EXPORT_SYMBOL(v4l2_ctrl_new_std_menu_items); /* Helper function for standard compound controls */ struct v4l2_ctrl *v4l2_ctrl_new_std_compound(struct v4l2_ctrl_handler *hdl, const struct v4l2_ctrl_ops *ops, u32 id, const union v4l2_ctrl_ptr p_def, const union v4l2_ctrl_ptr p_min, const union v4l2_ctrl_ptr p_max) { const char *name; enum v4l2_ctrl_type type; u32 flags; s64 min, max, step, def; v4l2_ctrl_fill(id, &name, &type, &min, &max, &step, &def, &flags); if (type < V4L2_CTRL_COMPOUND_TYPES) { handler_set_err(hdl, -EINVAL); return NULL; } return v4l2_ctrl_new(hdl, ops, NULL, id, name, type, min, max, step, def, NULL, 0, flags, NULL, NULL, p_def, p_min, p_max, NULL); } EXPORT_SYMBOL(v4l2_ctrl_new_std_compound); /* Helper function for standard integer menu controls */ struct v4l2_ctrl *v4l2_ctrl_new_int_menu(struct v4l2_ctrl_handler *hdl, const struct v4l2_ctrl_ops *ops, u32 id, u8 _max, u8 _def, const s64 *qmenu_int) { const char *name; enum v4l2_ctrl_type type; s64 min; u64 step; s64 max = _max; s64 def = _def; u32 flags; v4l2_ctrl_fill(id, &name, &type, &min, &max, &step, &def, &flags); if (type != V4L2_CTRL_TYPE_INTEGER_MENU) { handler_set_err(hdl, -EINVAL); return NULL; } return v4l2_ctrl_new(hdl, ops, NULL, id, name, type, 0, max, 0, def, NULL, 0, flags, NULL, qmenu_int, ptr_null, ptr_null, ptr_null, NULL); } EXPORT_SYMBOL(v4l2_ctrl_new_int_menu); /* Add the controls from another handler to our own. */ int v4l2_ctrl_add_handler(struct v4l2_ctrl_handler *hdl, struct v4l2_ctrl_handler *add, bool (*filter)(const struct v4l2_ctrl *ctrl), bool from_other_dev) { struct v4l2_ctrl_ref *ref; int ret = 0; /* Do nothing if either handler is NULL or if they are the same */ if (!hdl || !add || hdl == add) return 0; if (hdl->error) return hdl->error; mutex_lock(add->lock); list_for_each_entry(ref, &add->ctrl_refs, node) { struct v4l2_ctrl *ctrl = ref->ctrl; /* Skip handler-private controls. */ if (ctrl->is_private) continue; /* And control classes */ if (ctrl->type == V4L2_CTRL_TYPE_CTRL_CLASS) continue; /* Filter any unwanted controls */ if (filter && !filter(ctrl)) continue; ret = handler_new_ref(hdl, ctrl, NULL, from_other_dev, false); if (ret) break; } mutex_unlock(add->lock); return ret; } EXPORT_SYMBOL(v4l2_ctrl_add_handler); bool v4l2_ctrl_radio_filter(const struct v4l2_ctrl *ctrl) { if (V4L2_CTRL_ID2WHICH(ctrl->id) == V4L2_CTRL_CLASS_FM_TX) return true; if (V4L2_CTRL_ID2WHICH(ctrl->id) == V4L2_CTRL_CLASS_FM_RX) return true; switch (ctrl->id) { case V4L2_CID_AUDIO_MUTE: case V4L2_CID_AUDIO_VOLUME: case V4L2_CID_AUDIO_BALANCE: case V4L2_CID_AUDIO_BASS: case V4L2_CID_AUDIO_TREBLE: case V4L2_CID_AUDIO_LOUDNESS: return true; default: break; } return false; } EXPORT_SYMBOL(v4l2_ctrl_radio_filter); /* Cluster controls */ void v4l2_ctrl_cluster(unsigned ncontrols, struct v4l2_ctrl **controls) { bool has_volatiles = false; int i; /* The first control is the master control and it must not be NULL */ if (WARN_ON(ncontrols == 0 || controls[0] == NULL)) return; for (i = 0; i < ncontrols; i++) { if (controls[i]) { controls[i]->cluster = controls; controls[i]->ncontrols = ncontrols; if (controls[i]->flags & V4L2_CTRL_FLAG_VOLATILE) has_volatiles = true; } } controls[0]->has_volatiles = has_volatiles; } EXPORT_SYMBOL(v4l2_ctrl_cluster); void v4l2_ctrl_auto_cluster(unsigned ncontrols, struct v4l2_ctrl **controls, u8 manual_val, bool set_volatile) { struct v4l2_ctrl *master = controls[0]; u32 flag = 0; int i; v4l2_ctrl_cluster(ncontrols, controls); WARN_ON(ncontrols <= 1); WARN_ON(manual_val < master->minimum || manual_val > master->maximum); WARN_ON(set_volatile && !has_op(master, g_volatile_ctrl)); master->is_auto = true; master->has_volatiles = set_volatile; master->manual_mode_value = manual_val; master->flags |= V4L2_CTRL_FLAG_UPDATE; if (!is_cur_manual(master)) flag = V4L2_CTRL_FLAG_INACTIVE | (set_volatile ? V4L2_CTRL_FLAG_VOLATILE : 0); for (i = 1; i < ncontrols; i++) if (controls[i]) controls[i]->flags |= flag; } EXPORT_SYMBOL(v4l2_ctrl_auto_cluster); /* * Obtain the current volatile values of an autocluster and mark them * as new. */ void update_from_auto_cluster(struct v4l2_ctrl *master) { int i; for (i = 1; i < master->ncontrols; i++) cur_to_new(master->cluster[i]); if (!call_op(master, g_volatile_ctrl)) for (i = 1; i < master->ncontrols; i++) if (master->cluster[i]) master->cluster[i]->is_new = 1; } /* * Return non-zero if one or more of the controls in the cluster has a new * value that differs from the current value. */ static int cluster_changed(struct v4l2_ctrl *master) { bool changed = false; int i; for (i = 0; i < master->ncontrols; i++) { struct v4l2_ctrl *ctrl = master->cluster[i]; bool ctrl_changed = false; if (!ctrl) continue; if (ctrl->flags & V4L2_CTRL_FLAG_EXECUTE_ON_WRITE) { changed = true; ctrl_changed = true; } /* * Set has_changed to false to avoid generating * the event V4L2_EVENT_CTRL_CH_VALUE */ if (ctrl->flags & V4L2_CTRL_FLAG_VOLATILE) { ctrl->has_changed = false; continue; } if (ctrl->elems != ctrl->new_elems) ctrl_changed = true; if (!ctrl_changed) ctrl_changed = !ctrl->type_ops->equal(ctrl, ctrl->p_cur, ctrl->p_new); ctrl->has_changed = ctrl_changed; changed |= ctrl->has_changed; } return changed; } /* * Core function that calls try/s_ctrl and ensures that the new value is * copied to the current value on a set. * Must be called with ctrl->handler->lock held. */ int try_or_set_cluster(struct v4l2_fh *fh, struct v4l2_ctrl *master, bool set, u32 ch_flags) { bool update_flag; int ret; int i; /* * Go through the cluster and either validate the new value or * (if no new value was set), copy the current value to the new * value, ensuring a consistent view for the control ops when * called. */ for (i = 0; i < master->ncontrols; i++) { struct v4l2_ctrl *ctrl = master->cluster[i]; if (!ctrl) continue; if (!ctrl->is_new) { cur_to_new(ctrl); continue; } /* * Check again: it may have changed since the * previous check in try_or_set_ext_ctrls(). */ if (set && (ctrl->flags & V4L2_CTRL_FLAG_GRABBED)) return -EBUSY; } ret = call_op(master, try_ctrl); /* Don't set if there is no change */ if (ret || !set || !cluster_changed(master)) return ret; ret = call_op(master, s_ctrl); if (ret) return ret; /* If OK, then make the new values permanent. */ update_flag = is_cur_manual(master) != is_new_manual(master); for (i = 0; i < master->ncontrols; i++) { /* * If we switch from auto to manual mode, and this cluster * contains volatile controls, then all non-master controls * have to be marked as changed. The 'new' value contains * the volatile value (obtained by update_from_auto_cluster), * which now has to become the current value. */ if (i && update_flag && is_new_manual(master) && master->has_volatiles && master->cluster[i]) master->cluster[i]->has_changed = true; new_to_cur(fh, master->cluster[i], ch_flags | ((update_flag && i > 0) ? V4L2_EVENT_CTRL_CH_FLAGS : 0)); } return 0; } /* Activate/deactivate a control. */ void v4l2_ctrl_activate(struct v4l2_ctrl *ctrl, bool active) { /* invert since the actual flag is called 'inactive' */ bool inactive = !active; bool old; if (ctrl == NULL) return; if (inactive) /* set V4L2_CTRL_FLAG_INACTIVE */ old = test_and_set_bit(4, &ctrl->flags); else /* clear V4L2_CTRL_FLAG_INACTIVE */ old = test_and_clear_bit(4, &ctrl->flags); if (old != inactive) send_event(NULL, ctrl, V4L2_EVENT_CTRL_CH_FLAGS); } EXPORT_SYMBOL(v4l2_ctrl_activate); void __v4l2_ctrl_grab(struct v4l2_ctrl *ctrl, bool grabbed) { bool old; if (ctrl == NULL) return; lockdep_assert_held(ctrl->handler->lock); if (grabbed) /* set V4L2_CTRL_FLAG_GRABBED */ old = test_and_set_bit(1, &ctrl->flags); else /* clear V4L2_CTRL_FLAG_GRABBED */ old = test_and_clear_bit(1, &ctrl->flags); if (old != grabbed) send_event(NULL, ctrl, V4L2_EVENT_CTRL_CH_FLAGS); } EXPORT_SYMBOL(__v4l2_ctrl_grab); /* Call s_ctrl for all controls owned by the handler */ int __v4l2_ctrl_handler_setup(struct v4l2_ctrl_handler *hdl) { struct v4l2_ctrl *ctrl; int ret = 0; if (hdl == NULL) return 0; lockdep_assert_held(hdl->lock); list_for_each_entry(ctrl, &hdl->ctrls, node) ctrl->done = false; list_for_each_entry(ctrl, &hdl->ctrls, node) { struct v4l2_ctrl *master = ctrl->cluster[0]; int i; /* Skip if this control was already handled by a cluster. */ /* Skip button controls and read-only controls. */ if (ctrl->done || ctrl->type == V4L2_CTRL_TYPE_BUTTON || (ctrl->flags & V4L2_CTRL_FLAG_READ_ONLY)) continue; for (i = 0; i < master->ncontrols; i++) { if (master->cluster[i]) { cur_to_new(master->cluster[i]); master->cluster[i]->is_new = 1; master->cluster[i]->done = true; } } ret = call_op(master, s_ctrl); if (ret) break; } return ret; } EXPORT_SYMBOL_GPL(__v4l2_ctrl_handler_setup); int v4l2_ctrl_handler_setup(struct v4l2_ctrl_handler *hdl) { int ret; if (hdl == NULL) return 0; mutex_lock(hdl->lock); ret = __v4l2_ctrl_handler_setup(hdl); mutex_unlock(hdl->lock); return ret; } EXPORT_SYMBOL(v4l2_ctrl_handler_setup); /* Log the control name and value */ static void log_ctrl(const struct v4l2_ctrl *ctrl, const char *prefix, const char *colon) { if (ctrl->flags & (V4L2_CTRL_FLAG_DISABLED | V4L2_CTRL_FLAG_WRITE_ONLY)) return; if (ctrl->type == V4L2_CTRL_TYPE_CTRL_CLASS) return; pr_info("%s%s%s: ", prefix, colon, ctrl->name); ctrl->type_ops->log(ctrl); if (ctrl->flags & (V4L2_CTRL_FLAG_INACTIVE | V4L2_CTRL_FLAG_GRABBED | V4L2_CTRL_FLAG_VOLATILE)) { if (ctrl->flags & V4L2_CTRL_FLAG_INACTIVE) pr_cont(" inactive"); if (ctrl->flags & V4L2_CTRL_FLAG_GRABBED) pr_cont(" grabbed"); if (ctrl->flags & V4L2_CTRL_FLAG_VOLATILE) pr_cont(" volatile"); } pr_cont("\n"); } /* Log all controls owned by the handler */ void v4l2_ctrl_handler_log_status(struct v4l2_ctrl_handler *hdl, const char *prefix) { struct v4l2_ctrl *ctrl; const char *colon = ""; int len; if (!hdl) return; if (!prefix) prefix = ""; len = strlen(prefix); if (len && prefix[len - 1] != ' ') colon = ": "; mutex_lock(hdl->lock); list_for_each_entry(ctrl, &hdl->ctrls, node) if (!(ctrl->flags & V4L2_CTRL_FLAG_DISABLED)) log_ctrl(ctrl, prefix, colon); mutex_unlock(hdl->lock); } EXPORT_SYMBOL(v4l2_ctrl_handler_log_status); int v4l2_ctrl_new_fwnode_properties(struct v4l2_ctrl_handler *hdl, const struct v4l2_ctrl_ops *ctrl_ops, const struct v4l2_fwnode_device_properties *p) { if (hdl->error) return hdl->error; if (p->orientation != V4L2_FWNODE_PROPERTY_UNSET) { u32 orientation_ctrl; switch (p->orientation) { case V4L2_FWNODE_ORIENTATION_FRONT: orientation_ctrl = V4L2_CAMERA_ORIENTATION_FRONT; break; case V4L2_FWNODE_ORIENTATION_BACK: orientation_ctrl = V4L2_CAMERA_ORIENTATION_BACK; break; case V4L2_FWNODE_ORIENTATION_EXTERNAL: orientation_ctrl = V4L2_CAMERA_ORIENTATION_EXTERNAL; break; default: hdl->error = -EINVAL; return hdl->error; } if (!v4l2_ctrl_new_std_menu(hdl, ctrl_ops, V4L2_CID_CAMERA_ORIENTATION, V4L2_CAMERA_ORIENTATION_EXTERNAL, 0, orientation_ctrl)) return hdl->error; } if (p->rotation != V4L2_FWNODE_PROPERTY_UNSET) { if (!v4l2_ctrl_new_std(hdl, ctrl_ops, V4L2_CID_CAMERA_SENSOR_ROTATION, p->rotation, p->rotation, 1, p->rotation)) return hdl->error; } return hdl->error; } EXPORT_SYMBOL(v4l2_ctrl_new_fwnode_properties);
480 164 5821 2673 302 2180 217 1 1 1 3 200 22 181 631 2 631 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_DCACHE_H #define __LINUX_DCACHE_H #include <linux/atomic.h> #include <linux/list.h> #include <linux/math.h> #include <linux/rculist.h> #include <linux/rculist_bl.h> #include <linux/spinlock.h> #include <linux/seqlock.h> #include <linux/cache.h> #include <linux/rcupdate.h> #include <linux/lockref.h> #include <linux/stringhash.h> #include <linux/wait.h> struct path; struct file; struct vfsmount; /* * linux/include/linux/dcache.h * * Dirent cache data structures * * (C) Copyright 1997 Thomas Schoebel-Theuer, * with heavy changes by Linus Torvalds */ #define IS_ROOT(x) ((x) == (x)->d_parent) /* The hash is always the low bits of hash_len */ #ifdef __LITTLE_ENDIAN #define HASH_LEN_DECLARE u32 hash; u32 len #define bytemask_from_count(cnt) (~(~0ul << (cnt)*8)) #else #define HASH_LEN_DECLARE u32 len; u32 hash #define bytemask_from_count(cnt) (~(~0ul >> (cnt)*8)) #endif /* * "quick string" -- eases parameter passing, but more importantly * saves "metadata" about the string (ie length and the hash). * * hash comes first so it snuggles against d_parent in the * dentry. */ struct qstr { union { struct { HASH_LEN_DECLARE; }; u64 hash_len; }; const unsigned char *name; }; #define QSTR_INIT(n,l) { { { .len = l } }, .name = n } #define QSTR_LEN(n,l) (struct qstr)QSTR_INIT(n,l) #define QSTR(n) QSTR_LEN(n, strlen(n)) extern const struct qstr empty_name; extern const struct qstr slash_name; extern const struct qstr dotdot_name; /* * Try to keep struct dentry aligned on 64 byte cachelines (this will * give reasonable cacheline footprint with larger lines without the * large memory footprint increase). */ #ifdef CONFIG_64BIT # define DNAME_INLINE_WORDS 5 /* 192 bytes */ #else # ifdef CONFIG_SMP # define DNAME_INLINE_WORDS 9 /* 128 bytes */ # else # define DNAME_INLINE_WORDS 11 /* 128 bytes */ # endif #endif #define DNAME_INLINE_LEN (DNAME_INLINE_WORDS*sizeof(unsigned long)) union shortname_store { unsigned char string[DNAME_INLINE_LEN]; unsigned long words[DNAME_INLINE_WORDS]; }; #define d_lock d_lockref.lock #define d_iname d_shortname.string struct completion_list; struct dentry { /* RCU lookup touched fields */ unsigned int d_flags; /* protected by d_lock */ seqcount_spinlock_t d_seq; /* per dentry seqlock */ struct hlist_bl_node d_hash; /* lookup hash list */ struct dentry *d_parent; /* parent directory */ union { struct qstr __d_name; /* for use ONLY in fs/dcache.c */ const struct qstr d_name; }; struct inode *d_inode; /* Where the name belongs to - NULL is * negative */ union shortname_store d_shortname; /* --- cacheline 1 boundary (64 bytes) was 32 bytes ago --- */ /* Ref lookup also touches following */ const struct dentry_operations *d_op; struct super_block *d_sb; /* The root of the dentry tree */ unsigned long d_time; /* used by d_revalidate */ void *d_fsdata; /* fs-specific data */ /* --- cacheline 2 boundary (128 bytes) --- */ struct lockref d_lockref; /* per-dentry lock and refcount * keep separate from RCU lookup area if * possible! */ union { struct list_head d_lru; /* LRU list */ wait_queue_head_t *d_wait; /* in-lookup ones only */ }; struct hlist_node d_sib; /* child of parent list */ struct hlist_head d_children; /* our children */ /* * the following members can share memory - their uses are * mutually exclusive. */ union { /* positives: inode alias list */ struct hlist_node d_alias; /* in-lookup ones (all negative, live): hash chain */ struct hlist_bl_node d_in_lookup_hash; /* killed ones: (already negative) used to schedule freeing */ struct rcu_head d_rcu; /* * live non-in-lookup negatives: used if shrink_dcache_tree() * races with eviction by another thread and needs to wait for * this dentry to get killed . Remains NULL for almost all * negative dentries. */ struct completion_list *waiters; }; }; /* * dentry->d_lock spinlock nesting subclasses: * * 0: normal * 1: nested */ enum dentry_d_lock_class { DENTRY_D_LOCK_NORMAL, /* implicitly used by plain spin_lock() APIs. */ DENTRY_D_LOCK_NESTED }; enum d_real_type { D_REAL_DATA, D_REAL_METADATA, }; struct dentry_operations { int (*d_revalidate)(struct inode *, const struct qstr *, struct dentry *, unsigned int); int (*d_weak_revalidate)(struct dentry *, unsigned int); int (*d_hash)(const struct dentry *, struct qstr *); int (*d_compare)(const struct dentry *, unsigned int, const char *, const struct qstr *); int (*d_delete)(const struct dentry *); int (*d_init)(struct dentry *); void (*d_release)(struct dentry *); void (*d_prune)(struct dentry *); void (*d_iput)(struct dentry *, struct inode *); char *(*d_dname)(struct dentry *, char *, int); struct vfsmount *(*d_automount)(struct path *); int (*d_manage)(const struct path *, bool); struct dentry *(*d_real)(struct dentry *, enum d_real_type type); bool (*d_unalias_trylock)(const struct dentry *); void (*d_unalias_unlock)(const struct dentry *); } ____cacheline_aligned; /* * Locking rules for dentry_operations callbacks are to be found in * Documentation/filesystems/locking.rst. Keep it updated! * * FUrther descriptions are found in Documentation/filesystems/vfs.rst. * Keep it updated too! */ /* d_flags entries */ enum dentry_flags { DCACHE_OP_HASH = BIT(0), DCACHE_OP_COMPARE = BIT(1), DCACHE_OP_REVALIDATE = BIT(2), DCACHE_OP_DELETE = BIT(3), DCACHE_OP_PRUNE = BIT(4), /* * This dentry is possibly not currently connected to the dcache tree, * in which case its parent will either be itself, or will have this * flag as well. nfsd will not use a dentry with this bit set, but will * first endeavour to clear the bit either by discovering that it is * connected, or by performing lookup operations. Any filesystem which * supports nfsd_operations MUST have a lookup function which, if it * finds a directory inode with a DCACHE_DISCONNECTED dentry, will * d_move that dentry into place and return that dentry rather than the * passed one, typically using d_splice_alias. */ DCACHE_DISCONNECTED = BIT(5), DCACHE_REFERENCED = BIT(6), /* Recently used, don't discard. */ DCACHE_DONTCACHE = BIT(7), /* Purge from memory on final dput() */ DCACHE_CANT_MOUNT = BIT(8), DCACHE_SHRINK_LIST = BIT(10), DCACHE_OP_WEAK_REVALIDATE = BIT(11), /* * this dentry has been "silly renamed" and has to be deleted on the * last dput() */ DCACHE_NFSFS_RENAMED = BIT(12), DCACHE_FSNOTIFY_PARENT_WATCHED = BIT(13), /* Parent inode is watched by some fsnotify listener */ DCACHE_DENTRY_KILLED = BIT(14), DCACHE_MOUNTED = BIT(15), /* is a mountpoint */ DCACHE_NEED_AUTOMOUNT = BIT(16), /* handle automount on this dir */ DCACHE_MANAGE_TRANSIT = BIT(17), /* manage transit from this dirent */ DCACHE_LRU_LIST = BIT(18), DCACHE_ENTRY_TYPE = (7 << 19), /* bits 19..21 are for storing type: */ DCACHE_MISS_TYPE = (0 << 19), /* Negative dentry */ DCACHE_WHITEOUT_TYPE = (1 << 19), /* Whiteout dentry (stop pathwalk) */ DCACHE_DIRECTORY_TYPE = (2 << 19), /* Normal directory */ DCACHE_AUTODIR_TYPE = (3 << 19), /* Lookupless directory (presumed automount) */ DCACHE_REGULAR_TYPE = (4 << 19), /* Regular file type */ DCACHE_SPECIAL_TYPE = (5 << 19), /* Other file type */ DCACHE_SYMLINK_TYPE = (6 << 19), /* Symlink */ DCACHE_NOKEY_NAME = BIT(22), /* Encrypted name encoded without key */ DCACHE_OP_REAL = BIT(23), DCACHE_PAR_LOOKUP = BIT(24), /* being looked up (with parent locked shared) */ DCACHE_DENTRY_CURSOR = BIT(25), DCACHE_NORCU = BIT(26), /* No RCU delay for freeing */ DCACHE_PERSISTENT = BIT(27) }; #define DCACHE_MANAGED_DENTRY \ (DCACHE_MOUNTED|DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT) extern seqlock_t rename_lock; /* * These are the low-level FS interfaces to the dcache.. */ extern void d_instantiate(struct dentry *, struct inode *); extern void d_instantiate_new(struct dentry *, struct inode *); extern void __d_drop(struct dentry *dentry); extern void d_drop(struct dentry *dentry); extern void d_delete(struct dentry *); /* allocate/de-allocate */ extern struct dentry * d_alloc(struct dentry *, const struct qstr *); extern struct dentry * d_alloc_anon(struct super_block *); extern struct dentry * d_alloc_parallel(struct dentry *, const struct qstr *, wait_queue_head_t *); extern struct dentry * d_splice_alias(struct inode *, struct dentry *); /* weird procfs mess; *NOT* exported */ extern struct dentry * d_splice_alias_ops(struct inode *, struct dentry *, const struct dentry_operations *); extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *); extern bool d_same_name(const struct dentry *dentry, const struct dentry *parent, const struct qstr *name); extern struct dentry *d_find_any_alias(struct inode *inode); extern struct dentry * d_obtain_alias(struct inode *); extern struct dentry * d_obtain_root(struct inode *); extern void shrink_dcache_sb(struct super_block *); extern void shrink_dcache_parent(struct dentry *); extern void d_invalidate(struct dentry *); /* only used at mount-time */ extern struct dentry * d_make_root(struct inode *); extern void d_mark_tmpfile(struct file *, struct inode *); int d_mark_tmpfile_name(struct file *file, const struct qstr *name); extern void d_tmpfile(struct file *, struct inode *); extern struct dentry *d_find_alias(struct inode *); extern void d_prune_aliases(struct inode *); extern void d_dispose_if_unused(struct dentry *, struct list_head *); extern void shrink_dentry_list(struct list_head *); extern struct dentry *d_find_alias_rcu(struct inode *); /* test whether we have any submounts in a subdir tree */ extern int path_has_submounts(const struct path *); /* * This adds the entry to the hash queues. */ extern void d_rehash(struct dentry *); extern void d_add(struct dentry *, struct inode *); /* used for rename() and baskets */ extern void d_move(struct dentry *, struct dentry *); extern void d_exchange(struct dentry *, struct dentry *); extern struct dentry *d_ancestor(struct dentry *, struct dentry *); extern struct dentry *d_lookup(const struct dentry *, const struct qstr *); static inline unsigned d_count(const struct dentry *dentry) { return dentry->d_lockref.count; } ino_t d_parent_ino(struct dentry *dentry); /* * helper function for dentry_operations.d_dname() members */ extern __printf(3, 4) char *dynamic_dname(char *, int, const char *, ...); extern char *__d_path(const struct path *, const struct path *, char *, int); extern char *d_absolute_path(const struct path *, char *, int); extern char *d_path(const struct path *, char *, int); extern char *dentry_path_raw(const struct dentry *, char *, int); extern char *dentry_path(const struct dentry *, char *, int); /* Allocation counts.. */ /** * dget_dlock - get a reference to a dentry * @dentry: dentry to get a reference to * * Given a live dentry, increment the reference count and return the dentry. * Caller must hold @dentry->d_lock. Making sure that dentry is alive is * caller's resonsibility. There are many conditions sufficient to guarantee * that; e.g. anything with non-negative refcount is alive, so's anything * hashed, anything positive, anyone's parent, etc. */ static inline struct dentry *dget_dlock(struct dentry *dentry) { dentry->d_lockref.count++; return dentry; } /** * dget - get a reference to a dentry * @dentry: dentry to get a reference to * * Given a dentry or %NULL pointer increment the reference count * if appropriate and return the dentry. A dentry will not be * destroyed when it has references. Conversely, a dentry with * no references can disappear for any number of reasons, starting * with memory pressure. In other words, that primitive is * used to clone an existing reference; using it on something with * zero refcount is a bug. * * NOTE: it will spin if @dentry->d_lock is held. From the deadlock * avoidance point of view it is equivalent to spin_lock()/increment * refcount/spin_unlock(), so calling it under @dentry->d_lock is * always a bug; so's calling it under ->d_lock on any of its descendents. * */ static inline struct dentry *dget(struct dentry *dentry) { if (dentry) lockref_get(&dentry->d_lockref); return dentry; } extern struct dentry *dget_parent(struct dentry *dentry); /** * d_unhashed - is dentry hashed * @dentry: entry to check * * Returns true if the dentry passed is not currently hashed. */ static inline int d_unhashed(const struct dentry *dentry) { return hlist_bl_unhashed(&dentry->d_hash); } static inline int d_unlinked(const struct dentry *dentry) { return d_unhashed(dentry) && !IS_ROOT(dentry); } static inline int cant_mount(const struct dentry *dentry) { return (dentry->d_flags & DCACHE_CANT_MOUNT); } static inline void dont_mount(struct dentry *dentry) { spin_lock(&dentry->d_lock); dentry->d_flags |= DCACHE_CANT_MOUNT; spin_unlock(&dentry->d_lock); } extern void __d_lookup_unhash_wake(struct dentry *dentry); static inline int d_in_lookup(const struct dentry *dentry) { return dentry->d_flags & DCACHE_PAR_LOOKUP; } static inline void d_lookup_done(struct dentry *dentry) { if (unlikely(d_in_lookup(dentry))) __d_lookup_unhash_wake(dentry); } extern void dput(struct dentry *); static inline bool d_managed(const struct dentry *dentry) { return dentry->d_flags & DCACHE_MANAGED_DENTRY; } static inline bool d_mountpoint(const struct dentry *dentry) { return dentry->d_flags & DCACHE_MOUNTED; } /* * Directory cache entry type accessor functions. */ static inline unsigned __d_entry_type(const struct dentry *dentry) { return dentry->d_flags & DCACHE_ENTRY_TYPE; } static inline bool d_is_miss(const struct dentry *dentry) { return __d_entry_type(dentry) == DCACHE_MISS_TYPE; } static inline bool d_is_whiteout(const struct dentry *dentry) { return __d_entry_type(dentry) == DCACHE_WHITEOUT_TYPE; } static inline bool d_can_lookup(const struct dentry *dentry) { return __d_entry_type(dentry) == DCACHE_DIRECTORY_TYPE; } static inline bool d_is_autodir(const struct dentry *dentry) { return __d_entry_type(dentry) == DCACHE_AUTODIR_TYPE; } static inline bool d_is_dir(const struct dentry *dentry) { return d_can_lookup(dentry) || d_is_autodir(dentry); } static inline bool d_is_symlink(const struct dentry *dentry) { return __d_entry_type(dentry) == DCACHE_SYMLINK_TYPE; } static inline bool d_is_reg(const struct dentry *dentry) { return __d_entry_type(dentry) == DCACHE_REGULAR_TYPE; } static inline bool d_is_special(const struct dentry *dentry) { return __d_entry_type(dentry) == DCACHE_SPECIAL_TYPE; } static inline bool d_is_file(const struct dentry *dentry) { return d_is_reg(dentry) || d_is_special(dentry); } static inline bool d_is_negative(const struct dentry *dentry) { // TODO: check d_is_whiteout(dentry) also. return d_is_miss(dentry); } static inline bool d_flags_negative(unsigned flags) { return (flags & DCACHE_ENTRY_TYPE) == DCACHE_MISS_TYPE; } static inline bool d_is_positive(const struct dentry *dentry) { return !d_is_negative(dentry); } /** * d_really_is_negative - Determine if a dentry is really negative (ignoring fallthroughs) * @dentry: The dentry in question * * Returns true if the dentry represents either an absent name or a name that * doesn't map to an inode (ie. ->d_inode is NULL). The dentry could represent * a true miss, a whiteout that isn't represented by a 0,0 chardev or a * fallthrough marker in an opaque directory. * * Note! (1) This should be used *only* by a filesystem to examine its own * dentries. It should not be used to look at some other filesystem's * dentries. (2) It should also be used in combination with d_inode() to get * the inode. (3) The dentry may have something attached to ->d_lower and the * type field of the flags may be set to something other than miss or whiteout. */ static inline bool d_really_is_negative(const struct dentry *dentry) { return dentry->d_inode == NULL; } /** * d_really_is_positive - Determine if a dentry is really positive (ignoring fallthroughs) * @dentry: The dentry in question * * Returns true if the dentry represents a name that maps to an inode * (ie. ->d_inode is not NULL). The dentry might still represent a whiteout if * that is represented on medium as a 0,0 chardev. * * Note! (1) This should be used *only* by a filesystem to examine its own * dentries. It should not be used to look at some other filesystem's * dentries. (2) It should also be used in combination with d_inode() to get * the inode. */ static inline bool d_really_is_positive(const struct dentry *dentry) { return dentry->d_inode != NULL; } static inline int simple_positive(const struct dentry *dentry) { return d_really_is_positive(dentry) && !d_unhashed(dentry); } unsigned long vfs_pressure_ratio(unsigned long val); /** * d_inode - Get the actual inode of this dentry * @dentry: The dentry to query * * This is the helper normal filesystems should use to get at their own inodes * in their own dentries and ignore the layering superimposed upon them. */ static inline struct inode *d_inode(const struct dentry *dentry) { return dentry->d_inode; } /** * d_inode_rcu - Get the actual inode of this dentry with READ_ONCE() * @dentry: The dentry to query * * This is the helper normal filesystems should use to get at their own inodes * in their own dentries and ignore the layering superimposed upon them. */ static inline struct inode *d_inode_rcu(const struct dentry *dentry) { return READ_ONCE(dentry->d_inode); } /** * d_backing_inode - Get upper or lower inode we should be using * @upper: The upper layer * * This is the helper that should be used to get at the inode that will be used * if this dentry were to be opened as a file. The inode may be on the upper * dentry or it may be on a lower dentry pinned by the upper. * * Normal filesystems should not use this to access their own inodes. */ static inline struct inode *d_backing_inode(const struct dentry *upper) { struct inode *inode = upper->d_inode; return inode; } /** * d_real - Return the real dentry * @dentry: the dentry to query * @type: the type of real dentry (data or metadata) * * If dentry is on a union/overlay, then return the underlying, real dentry. * Otherwise return the dentry itself. * * See also: Documentation/filesystems/vfs.rst */ static inline struct dentry *d_real(struct dentry *dentry, enum d_real_type type) { if (unlikely(dentry->d_flags & DCACHE_OP_REAL)) return dentry->d_op->d_real(dentry, type); else return dentry; } /** * d_real_inode - Return the real inode hosting the data * @dentry: The dentry to query * * If dentry is on a union/overlay, then return the underlying, real inode. * Otherwise return d_inode(). */ static inline struct inode *d_real_inode(const struct dentry *dentry) { /* This usage of d_real() results in const dentry */ return d_inode(d_real((struct dentry *) dentry, D_REAL_DATA)); } struct name_snapshot { struct qstr name; union shortname_store inline_name; }; void take_dentry_name_snapshot(struct name_snapshot *, struct dentry *); void release_dentry_name_snapshot(struct name_snapshot *); static inline struct dentry *d_first_child(const struct dentry *dentry) { return hlist_entry_safe(dentry->d_children.first, struct dentry, d_sib); } static inline struct dentry *d_next_sibling(const struct dentry *dentry) { return hlist_entry_safe(dentry->d_sib.next, struct dentry, d_sib); } void set_default_d_op(struct super_block *, const struct dentry_operations *); struct dentry *d_make_persistent(struct dentry *, struct inode *); void d_make_discardable(struct dentry *dentry); /* inode->i_lock must be held over that */ #define for_each_alias(dentry, inode) \ hlist_for_each_entry(dentry, &(inode)->i_dentry, d_alias) #endif /* __LINUX_DCACHE_H */
12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __SEQ_FILE_NET_H__ #define __SEQ_FILE_NET_H__ #include <linux/seq_file.h> #include <net/net_trackers.h> struct net; extern struct net init_net; struct seq_net_private { #ifdef CONFIG_NET_NS struct net *net; netns_tracker ns_tracker; #endif }; static inline struct net *seq_file_net(struct seq_file *seq) { #ifdef CONFIG_NET_NS return ((struct seq_net_private *)seq->private)->net; #else return &init_net; #endif } /* * This one is needed for proc_create_net_single since net is stored directly * in private not as a struct i.e. seq_file_net can't be used. */ static inline struct net *seq_file_single_net(struct seq_file *seq) { #ifdef CONFIG_NET_NS return (struct net *)seq->private; #else return &init_net; #endif } #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 /* * net/tipc/bearer.h: Include file for TIPC bearer code * * Copyright (c) 1996-2006, 2013-2016, Ericsson AB * Copyright (c) 2005, 2010-2011, Wind River Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #ifndef _TIPC_BEARER_H #define _TIPC_BEARER_H #include "netlink.h" #include "core.h" #include "msg.h" #include <net/genetlink.h> #define MAX_MEDIA 3 /* Identifiers associated with TIPC message header media address info * - address info field is 32 bytes long * - the field's actual content and length is defined per media * - remaining unused bytes in the field are set to zero */ #define TIPC_MEDIA_INFO_SIZE 32 #define TIPC_MEDIA_TYPE_OFFSET 3 #define TIPC_MEDIA_ADDR_OFFSET 4 /* * Identifiers of supported TIPC media types */ #define TIPC_MEDIA_TYPE_ETH 1 #define TIPC_MEDIA_TYPE_IB 2 #define TIPC_MEDIA_TYPE_UDP 3 /* Minimum bearer MTU */ #define TIPC_MIN_BEARER_MTU (MAX_H_SIZE + INT_H_SIZE) /* Identifiers for distinguishing between broadcast/multicast and replicast */ #define TIPC_BROADCAST_SUPPORT 1 #define TIPC_REPLICAST_SUPPORT 2 /** * struct tipc_media_addr - destination address used by TIPC bearers * @value: address info (format defined by media) * @media_id: TIPC media type identifier * @broadcast: non-zero if address is a broadcast address */ struct tipc_media_addr { u8 value[TIPC_MEDIA_INFO_SIZE]; u8 media_id; u8 broadcast; }; struct tipc_bearer; /** * struct tipc_media - Media specific info exposed to generic bearer layer * @send_msg: routine which handles buffer transmission * @enable_media: routine which enables a media * @disable_media: routine which disables a media * @addr2str: convert media address format to string * @addr2msg: convert from media addr format to discovery msg addr format * @msg2addr: convert from discovery msg addr format to media addr format * @raw2addr: convert from raw addr format to media addr format * @priority: default link (and bearer) priority * @tolerance: default time (in ms) before declaring link failure * @min_win: minimum window (in packets) before declaring link congestion * @max_win: maximum window (in packets) before declaring link congestion * @mtu: max packet size bearer can support for media type not dependent on * underlying device MTU * @type_id: TIPC media identifier * @hwaddr_len: TIPC media address len * @name: media name */ struct tipc_media { int (*send_msg)(struct net *net, struct sk_buff *buf, struct tipc_bearer *b, struct tipc_media_addr *dest); int (*enable_media)(struct net *net, struct tipc_bearer *b, struct nlattr *attr[]); void (*disable_media)(struct tipc_bearer *b); int (*addr2str)(struct tipc_media_addr *addr, char *strbuf, int bufsz); int (*addr2msg)(char *msg, struct tipc_media_addr *addr); int (*msg2addr)(struct tipc_bearer *b, struct tipc_media_addr *addr, char *msg); int (*raw2addr)(struct tipc_bearer *b, struct tipc_media_addr *addr, const char *raw); u32 priority; u32 tolerance; u32 min_win; u32 max_win; u32 mtu; u32 type_id; u32 hwaddr_len; char name[TIPC_MAX_MEDIA_NAME]; }; /** * struct tipc_bearer - Generic TIPC bearer structure * @media_ptr: pointer to additional media-specific information about bearer * @mtu: max packet size bearer can support * @addr: media-specific address associated with bearer * @name: bearer name (format = media:interface) * @media: ptr to media structure associated with bearer * @bcast_addr: media address used in broadcasting * @pt: packet type for bearer * @rcu: rcu struct for tipc_bearer * @priority: default link priority for bearer * @min_win: minimum window (in packets) before declaring link congestion * @max_win: maximum window (in packets) before declaring link congestion * @tolerance: default link tolerance for bearer * @domain: network domain to which links can be established * @identity: array index of this bearer within TIPC bearer array * @disc: ptr to link setup request * @net_plane: network plane ('A' through 'H') currently associated with bearer * @encap_hlen: encap headers length * @up: bearer up flag (bit 0) * @refcnt: tipc_bearer reference counter * * Note: media-specific code is responsible for initialization of the fields * indicated below when a bearer is enabled; TIPC's generic bearer code takes * care of initializing all other fields. */ struct tipc_bearer { void __rcu *media_ptr; /* initialized by media */ u32 mtu; /* initialized by media */ struct tipc_media_addr addr; /* initialized by media */ char name[TIPC_MAX_BEARER_NAME]; struct tipc_media *media; struct tipc_media_addr bcast_addr; struct packet_type pt; struct rcu_head rcu; u32 priority; u32 min_win; u32 max_win; u32 tolerance; u32 domain; u32 identity; struct tipc_discoverer *disc; char net_plane; u16 encap_hlen; unsigned long up; refcount_t refcnt; }; struct tipc_bearer_names { char media_name[TIPC_MAX_MEDIA_NAME]; char if_name[TIPC_MAX_IF_NAME]; }; /* * TIPC routines available to supported media types */ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b); /* * Routines made available to TIPC by supported media types */ extern struct tipc_media eth_media_info; #ifdef CONFIG_TIPC_MEDIA_IB extern struct tipc_media ib_media_info; #endif #ifdef CONFIG_TIPC_MEDIA_UDP extern struct tipc_media udp_media_info; #endif int tipc_nl_bearer_disable(struct sk_buff *skb, struct genl_info *info); int __tipc_nl_bearer_disable(struct sk_buff *skb, struct genl_info *info); int tipc_nl_bearer_enable(struct sk_buff *skb, struct genl_info *info); int __tipc_nl_bearer_enable(struct sk_buff *skb, struct genl_info *info); int tipc_nl_bearer_dump(struct sk_buff *skb, struct netlink_callback *cb); int tipc_nl_bearer_get(struct sk_buff *skb, struct genl_info *info); int tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info); int __tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info); int tipc_nl_bearer_add(struct sk_buff *skb, struct genl_info *info); int tipc_nl_media_dump(struct sk_buff *skb, struct netlink_callback *cb); int tipc_nl_media_get(struct sk_buff *skb, struct genl_info *info); int tipc_nl_media_set(struct sk_buff *skb, struct genl_info *info); int __tipc_nl_media_set(struct sk_buff *skb, struct genl_info *info); int tipc_media_addr_printf(char *buf, int len, struct tipc_media_addr *a); int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b, struct nlattr *attrs[]); bool tipc_bearer_hold(struct tipc_bearer *b); void tipc_bearer_put(struct tipc_bearer *b); void tipc_disable_l2_media(struct tipc_bearer *b); int tipc_l2_send_msg(struct net *net, struct sk_buff *buf, struct tipc_bearer *b, struct tipc_media_addr *dest); void tipc_bearer_add_dest(struct net *net, u32 bearer_id, u32 dest); void tipc_bearer_remove_dest(struct net *net, u32 bearer_id, u32 dest); struct tipc_bearer *tipc_bearer_find(struct net *net, const char *name); int tipc_bearer_get_name(struct net *net, char *name, u32 bearer_id); struct tipc_media *tipc_media_find(const char *name); int tipc_bearer_setup(void); void tipc_bearer_cleanup(void); void tipc_bearer_stop(struct net *net); int tipc_bearer_mtu(struct net *net, u32 bearer_id); int tipc_bearer_min_mtu(struct net *net, u32 bearer_id); bool tipc_bearer_bcast_support(struct net *net, u32 bearer_id); void tipc_bearer_xmit_skb(struct net *net, u32 bearer_id, struct sk_buff *skb, struct tipc_media_addr *dest); void tipc_bearer_xmit(struct net *net, u32 bearer_id, struct sk_buff_head *xmitq, struct tipc_media_addr *dst, struct tipc_node *__dnode); void tipc_bearer_bc_xmit(struct net *net, u32 bearer_id, struct sk_buff_head *xmitq); void tipc_clone_to_loopback(struct net *net, struct sk_buff_head *pkts); int tipc_attach_loopback(struct net *net); void tipc_detach_loopback(struct net *net); static inline void tipc_loopback_trace(struct net *net, struct sk_buff_head *pkts) { if (unlikely(dev_nit_active(net->loopback_dev))) tipc_clone_to_loopback(net, pkts); } /* check if device MTU is too low for tipc headers */ static inline bool tipc_mtu_bad(struct net_device *dev) { if (dev->mtu >= TIPC_MIN_BEARER_MTU) return false; netdev_warn(dev, "MTU too low for tipc bearer\n"); return true; } #endif /* _TIPC_BEARER_H */
3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 // SPDX-License-Identifier: GPL-2.0-or-later /* * Linux IPv6 multicast routing support for BSD pim6sd * Based on net/ipv4/ipmr.c. * * (c) 2004 Mickael Hoerdt, <hoerdt@clarinet.u-strasbg.fr> * LSIIT Laboratory, Strasbourg, France * (c) 2004 Jean-Philippe Andriot, <jean-philippe.andriot@6WIND.com> * 6WIND, Paris, France * Copyright (C)2007,2008 USAGI/WIDE Project * YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org> */ #include <linux/uaccess.h> #include <linux/types.h> #include <linux/sched.h> #include <linux/errno.h> #include <linux/mm.h> #include <linux/kernel.h> #include <linux/fcntl.h> #include <linux/stat.h> #include <linux/socket.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/inetdevice.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/init.h> #include <linux/compat.h> #include <linux/rhashtable.h> #include <net/protocol.h> #include <linux/skbuff.h> #include <net/raw.h> #include <linux/notifier.h> #include <linux/if_arp.h> #include <net/checksum.h> #include <net/netlink.h> #include <net/fib_rules.h> #include <net/ipv6.h> #include <net/ip6_route.h> #include <linux/mroute6.h> #include <linux/pim.h> #include <net/addrconf.h> #include <linux/netfilter_ipv6.h> #include <linux/export.h> #include <net/ip6_checksum.h> #include <linux/netconf.h> #include <net/ip_tunnels.h> #include <linux/nospec.h> struct ip6mr_rule { struct fib_rule common; }; struct ip6mr_result { struct mr_table *mrt; }; /* Big lock, protecting vif table, mrt cache and mroute socket state. Note that the changes are semaphored via rtnl_lock. */ static DEFINE_SPINLOCK(mrt_lock); static struct net_device *vif_dev_read(const struct vif_device *vif) { return rcu_dereference(vif->dev); } /* Multicast router control variables */ /* Special spinlock for queue of unresolved entries */ static DEFINE_SPINLOCK(mfc_unres_lock); /* We return to original Alan's scheme. Hash table of resolved entries is changed only in process context and protected with weak lock mrt_lock. Queue of unresolved entries is protected with strong spinlock mfc_unres_lock. In this case data path is free of exclusive locks at all. */ static struct kmem_cache *mrt_cachep __read_mostly; static struct mr_table *ip6mr_new_table(struct net *net, u32 id); static void ip6mr_free_table(struct mr_table *mrt); static void ip6_mr_forward(struct net *net, struct mr_table *mrt, struct net_device *dev, struct sk_buff *skb, struct mfc6_cache *cache); static int ip6mr_cache_report(const struct mr_table *mrt, struct sk_buff *pkt, mifi_t mifi, int assert); static void mr6_netlink_event(struct mr_table *mrt, struct mfc6_cache *mfc, int cmd); static void mrt6msg_netlink_event(const struct mr_table *mrt, struct sk_buff *pkt); static int ip6mr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack); static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb); static void mroute_clean_tables(struct mr_table *mrt, int flags); static void ipmr_expire_process(struct timer_list *t); #ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES #define ip6mr_for_each_table(mrt, net) \ list_for_each_entry_rcu(mrt, &net->ipv6.mr6_tables, list, \ lockdep_rtnl_is_held() || \ list_empty(&net->ipv6.mr6_tables)) static struct mr_table *ip6mr_mr_table_iter(struct net *net, struct mr_table *mrt) { struct mr_table *ret; if (!mrt) ret = list_entry_rcu(net->ipv6.mr6_tables.next, struct mr_table, list); else ret = list_entry_rcu(mrt->list.next, struct mr_table, list); if (&ret->list == &net->ipv6.mr6_tables) return NULL; return ret; } static struct mr_table *__ip6mr_get_table(struct net *net, u32 id) { struct mr_table *mrt; ip6mr_for_each_table(mrt, net) { if (mrt->id == id) return mrt; } return NULL; } static struct mr_table *ip6mr_get_table(struct net *net, u32 id) { struct mr_table *mrt; rcu_read_lock(); mrt = __ip6mr_get_table(net, id); rcu_read_unlock(); return mrt; } static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6, struct mr_table **mrt) { int err; struct ip6mr_result res; struct fib_lookup_arg arg = { .result = &res, .flags = FIB_LOOKUP_NOREF, }; /* update flow if oif or iif point to device enslaved to l3mdev */ l3mdev_update_flow(net, flowi6_to_flowi(flp6)); err = fib_rules_lookup(net->ipv6.mr6_rules_ops, flowi6_to_flowi(flp6), 0, &arg); if (err < 0) return err; *mrt = res.mrt; return 0; } static int ip6mr_rule_action(struct fib_rule *rule, struct flowi *flp, int flags, struct fib_lookup_arg *arg) { struct ip6mr_result *res = arg->result; struct mr_table *mrt; switch (rule->action) { case FR_ACT_TO_TBL: break; case FR_ACT_UNREACHABLE: return -ENETUNREACH; case FR_ACT_PROHIBIT: return -EACCES; case FR_ACT_BLACKHOLE: default: return -EINVAL; } arg->table = fib_rule_get_table(rule, arg); mrt = __ip6mr_get_table(rule->fr_net, arg->table); if (!mrt) return -EAGAIN; res->mrt = mrt; return 0; } static int ip6mr_rule_match(struct fib_rule *rule, struct flowi *flp, int flags) { return 1; } static int ip6mr_rule_configure(struct fib_rule *rule, struct sk_buff *skb, struct fib_rule_hdr *frh, struct nlattr **tb, struct netlink_ext_ack *extack) { return 0; } static int ip6mr_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, struct nlattr **tb) { return 1; } static int ip6mr_rule_fill(struct fib_rule *rule, struct sk_buff *skb, struct fib_rule_hdr *frh) { frh->dst_len = 0; frh->src_len = 0; frh->tos = 0; return 0; } static const struct fib_rules_ops __net_initconst ip6mr_rules_ops_template = { .family = RTNL_FAMILY_IP6MR, .rule_size = sizeof(struct ip6mr_rule), .addr_size = sizeof(struct in6_addr), .action = ip6mr_rule_action, .match = ip6mr_rule_match, .configure = ip6mr_rule_configure, .compare = ip6mr_rule_compare, .fill = ip6mr_rule_fill, .nlgroup = RTNLGRP_IPV6_RULE, .owner = THIS_MODULE, }; static int __net_init ip6mr_rules_init(struct net *net) { struct fib_rules_ops *ops; struct mr_table *mrt; int err; ops = fib_rules_register(&ip6mr_rules_ops_template, net); if (IS_ERR(ops)) return PTR_ERR(ops); INIT_LIST_HEAD(&net->ipv6.mr6_tables); mrt = ip6mr_new_table(net, RT6_TABLE_DFLT); if (IS_ERR(mrt)) { err = PTR_ERR(mrt); goto err1; } err = fib_default_rule_add(ops, 0x7fff, RT6_TABLE_DFLT); if (err < 0) goto err2; net->ipv6.mr6_rules_ops = ops; return 0; err2: rtnl_lock(); ip6mr_free_table(mrt); rtnl_unlock(); err1: fib_rules_unregister(ops); return err; } static void __net_exit ip6mr_rules_exit(struct net *net) { struct mr_table *mrt, *next; ASSERT_RTNL(); list_for_each_entry_safe(mrt, next, &net->ipv6.mr6_tables, list) { list_del(&mrt->list); ip6mr_free_table(mrt); } fib_rules_unregister(net->ipv6.mr6_rules_ops); } static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { return fib_rules_dump(net, nb, RTNL_FAMILY_IP6MR, extack); } static unsigned int ip6mr_rules_seq_read(const struct net *net) { return fib_rules_seq_read(net, RTNL_FAMILY_IP6MR); } bool ip6mr_rule_default(const struct fib_rule *rule) { return fib_rule_matchall(rule) && rule->action == FR_ACT_TO_TBL && rule->table == RT6_TABLE_DFLT && !rule->l3mdev; } EXPORT_SYMBOL(ip6mr_rule_default); #else #define ip6mr_for_each_table(mrt, net) \ for (mrt = net->ipv6.mrt6; mrt; mrt = NULL) static struct mr_table *ip6mr_mr_table_iter(struct net *net, struct mr_table *mrt) { if (!mrt) return net->ipv6.mrt6; return NULL; } static struct mr_table *ip6mr_get_table(struct net *net, u32 id) { return net->ipv6.mrt6; } #define __ip6mr_get_table ip6mr_get_table static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6, struct mr_table **mrt) { *mrt = net->ipv6.mrt6; return 0; } static int __net_init ip6mr_rules_init(struct net *net) { struct mr_table *mrt; mrt = ip6mr_new_table(net, RT6_TABLE_DFLT); if (IS_ERR(mrt)) return PTR_ERR(mrt); net->ipv6.mrt6 = mrt; return 0; } static void __net_exit ip6mr_rules_exit(struct net *net) { ASSERT_RTNL(); ip6mr_free_table(net->ipv6.mrt6); net->ipv6.mrt6 = NULL; } static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { return 0; } static unsigned int ip6mr_rules_seq_read(const struct net *net) { return 0; } #endif static int ip6mr_hash_cmp(struct rhashtable_compare_arg *arg, const void *ptr) { const struct mfc6_cache_cmp_arg *cmparg = arg->key; struct mfc6_cache *c = (struct mfc6_cache *)ptr; return !ipv6_addr_equal(&c->mf6c_mcastgrp, &cmparg->mf6c_mcastgrp) || !ipv6_addr_equal(&c->mf6c_origin, &cmparg->mf6c_origin); } static const struct rhashtable_params ip6mr_rht_params = { .head_offset = offsetof(struct mr_mfc, mnode), .key_offset = offsetof(struct mfc6_cache, cmparg), .key_len = sizeof(struct mfc6_cache_cmp_arg), .nelem_hint = 3, .obj_cmpfn = ip6mr_hash_cmp, .automatic_shrinking = true, }; static void ip6mr_new_table_set(struct mr_table *mrt, struct net *net) { #ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES list_add_tail_rcu(&mrt->list, &net->ipv6.mr6_tables); #endif } static struct mfc6_cache_cmp_arg ip6mr_mr_table_ops_cmparg_any = { .mf6c_origin = IN6ADDR_ANY_INIT, .mf6c_mcastgrp = IN6ADDR_ANY_INIT, }; static struct mr_table_ops ip6mr_mr_table_ops = { .rht_params = &ip6mr_rht_params, .cmparg_any = &ip6mr_mr_table_ops_cmparg_any, }; static struct mr_table *ip6mr_new_table(struct net *net, u32 id) { struct mr_table *mrt; mrt = __ip6mr_get_table(net, id); if (mrt) return mrt; return mr_table_alloc(net, id, &ip6mr_mr_table_ops, ipmr_expire_process, ip6mr_new_table_set); } static void ip6mr_free_table(struct mr_table *mrt) { struct net *net = read_pnet(&mrt->net); WARN_ON_ONCE(!mr_can_free_table(net)); timer_shutdown_sync(&mrt->ipmr_expire_timer); mroute_clean_tables(mrt, MRT6_FLUSH_MIFS | MRT6_FLUSH_MIFS_STATIC | MRT6_FLUSH_MFC | MRT6_FLUSH_MFC_STATIC); rhltable_destroy(&mrt->mfc_hash); kfree(mrt); } #ifdef CONFIG_PROC_FS /* The /proc interfaces to multicast routing * /proc/ip6_mr_cache /proc/ip6_mr_vif */ static void *ip6mr_vif_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { struct mr_vif_iter *iter = seq->private; struct net *net = seq_file_net(seq); struct mr_table *mrt; rcu_read_lock(); mrt = __ip6mr_get_table(net, RT6_TABLE_DFLT); if (!mrt) { rcu_read_unlock(); return ERR_PTR(-ENOENT); } iter->mrt = mrt; return mr_vif_seq_start(seq, pos); } static void ip6mr_vif_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { rcu_read_unlock(); } static int ip6mr_vif_seq_show(struct seq_file *seq, void *v) { struct mr_vif_iter *iter = seq->private; struct mr_table *mrt = iter->mrt; if (v == SEQ_START_TOKEN) { seq_puts(seq, "Interface BytesIn PktsIn BytesOut PktsOut Flags\n"); } else { const struct vif_device *vif = v; const struct net_device *vif_dev; const char *name; vif_dev = vif_dev_read(vif); name = vif_dev ? vif_dev->name : "none"; seq_printf(seq, "%2td %-10s %8ld %7ld %8ld %7ld %05X\n", vif - mrt->vif_table, name, vif->bytes_in, vif->pkt_in, vif->bytes_out, vif->pkt_out, vif->flags); } return 0; } static const struct seq_operations ip6mr_vif_seq_ops = { .start = ip6mr_vif_seq_start, .next = mr_vif_seq_next, .stop = ip6mr_vif_seq_stop, .show = ip6mr_vif_seq_show, }; static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos) { struct net *net = seq_file_net(seq); struct mr_table *mrt; mrt = ip6mr_get_table(net, RT6_TABLE_DFLT); if (!mrt) return ERR_PTR(-ENOENT); return mr_mfc_seq_start(seq, pos, mrt, &mfc_unres_lock); } static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) { int n; if (v == SEQ_START_TOKEN) { seq_puts(seq, "Group " "Origin " "Iif Pkts Bytes Wrong Oifs\n"); } else { const struct mfc6_cache *mfc = v; const struct mr_mfc_iter *it = seq->private; struct mr_table *mrt = it->mrt; seq_printf(seq, "%pI6 %pI6 %-3hd", &mfc->mf6c_mcastgrp, &mfc->mf6c_origin, mfc->_c.mfc_parent); if (it->cache != &mrt->mfc_unres_queue) { seq_printf(seq, " %8lu %8lu %8lu", atomic_long_read(&mfc->_c.mfc_un.res.pkt), atomic_long_read(&mfc->_c.mfc_un.res.bytes), atomic_long_read(&mfc->_c.mfc_un.res.wrong_if)); for (n = mfc->_c.mfc_un.res.minvif; n < mfc->_c.mfc_un.res.maxvif; n++) { if (VIF_EXISTS(mrt, n) && mfc->_c.mfc_un.res.ttls[n] < 255) seq_printf(seq, " %2d:%-3d", n, mfc->_c.mfc_un.res.ttls[n]); } } else { /* unresolved mfc_caches don't contain * pkt, bytes and wrong_if values */ seq_printf(seq, " %8lu %8lu %8lu", 0ul, 0ul, 0ul); } seq_putc(seq, '\n'); } return 0; } static const struct seq_operations ipmr_mfc_seq_ops = { .start = ipmr_mfc_seq_start, .next = mr_mfc_seq_next, .stop = mr_mfc_seq_stop, .show = ipmr_mfc_seq_show, }; #endif #ifdef CONFIG_IPV6_PIMSM_V2 static int pim6_rcv(struct sk_buff *skb) { struct pimreghdr *pim; struct ipv6hdr *encap; struct net_device *reg_dev = NULL; struct net *net = dev_net(skb->dev); struct mr_table *mrt; struct flowi6 fl6 = { .flowi6_iif = skb->dev->ifindex, .flowi6_mark = skb->mark, }; int reg_vif_num; if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap))) goto drop; pim = (struct pimreghdr *)skb_transport_header(skb); if (pim->type != ((PIM_VERSION << 4) | PIM_TYPE_REGISTER) || (pim->flags & PIM_NULL_REGISTER) || (csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, sizeof(*pim), IPPROTO_PIM, csum_partial((void *)pim, sizeof(*pim), 0)) && csum_fold(skb_checksum(skb, 0, skb->len, 0)))) goto drop; /* check if the inner packet is destined to mcast group */ encap = (struct ipv6hdr *)(skb_transport_header(skb) + sizeof(*pim)); if (!ipv6_addr_is_multicast(&encap->daddr) || encap->payload_len == 0 || ntohs(encap->payload_len) + sizeof(*pim) > skb->len) goto drop; if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0) goto drop; /* Pairs with WRITE_ONCE() in mif6_add()/mif6_delete() */ reg_vif_num = READ_ONCE(mrt->mroute_reg_vif_num); if (reg_vif_num >= 0) reg_dev = vif_dev_read(&mrt->vif_table[reg_vif_num]); if (!reg_dev) goto drop; skb->mac_header = skb->network_header; skb_pull(skb, (u8 *)encap - skb->data); skb_reset_network_header(skb); skb->protocol = htons(ETH_P_IPV6); skb->ip_summed = CHECKSUM_NONE; skb_tunnel_rx(skb, reg_dev, dev_net(reg_dev)); netif_rx(skb); return 0; drop: kfree_skb(skb); return 0; } static const struct inet6_protocol pim6_protocol = { .handler = pim6_rcv, }; /* Service routines creating virtual interfaces: PIMREG */ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev) { struct net *net = dev_net(dev); struct mr_table *mrt; struct flowi6 fl6 = { .flowi6_oif = dev->ifindex, .flowi6_iif = skb->skb_iif ? : LOOPBACK_IFINDEX, .flowi6_mark = skb->mark, }; if (!pskb_inet_may_pull(skb)) goto tx_err; if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0) goto tx_err; DEV_STATS_ADD(dev, tx_bytes, skb->len); DEV_STATS_INC(dev, tx_packets); rcu_read_lock(); ip6mr_cache_report(mrt, skb, READ_ONCE(mrt->mroute_reg_vif_num), MRT6MSG_WHOLEPKT); rcu_read_unlock(); kfree_skb(skb); return NETDEV_TX_OK; tx_err: DEV_STATS_INC(dev, tx_errors); kfree_skb(skb); return NETDEV_TX_OK; } static int reg_vif_get_iflink(const struct net_device *dev) { return 0; } static const struct net_device_ops reg_vif_netdev_ops = { .ndo_start_xmit = reg_vif_xmit, .ndo_get_iflink = reg_vif_get_iflink, }; static void reg_vif_setup(struct net_device *dev) { dev->type = ARPHRD_PIMREG; dev->mtu = 1500 - sizeof(struct ipv6hdr) - 8; dev->flags = IFF_NOARP; dev->netdev_ops = &reg_vif_netdev_ops; dev->needs_free_netdev = true; dev->netns_immutable = true; } static struct net_device *ip6mr_reg_vif(struct net *net, struct mr_table *mrt) { struct net_device *dev; char name[IFNAMSIZ]; if (mrt->id == RT6_TABLE_DFLT) sprintf(name, "pim6reg"); else sprintf(name, "pim6reg%u", mrt->id); dev = alloc_netdev(0, name, NET_NAME_UNKNOWN, reg_vif_setup); if (!dev) return NULL; dev_net_set(dev, net); if (register_netdevice(dev)) { free_netdev(dev); return NULL; } if (dev_open(dev, NULL)) goto failure; dev_hold(dev); return dev; failure: unregister_netdevice(dev); return NULL; } #endif static int call_ip6mr_vif_entry_notifiers(struct net *net, enum fib_event_type event_type, struct vif_device *vif, struct net_device *vif_dev, mifi_t vif_index, u32 tb_id) { return mr_call_vif_notifiers(net, RTNL_FAMILY_IP6MR, event_type, vif, vif_dev, vif_index, tb_id, &net->ipv6.ipmr_seq); } static int call_ip6mr_mfc_entry_notifiers(struct net *net, enum fib_event_type event_type, struct mfc6_cache *mfc, u32 tb_id) { return mr_call_mfc_notifiers(net, RTNL_FAMILY_IP6MR, event_type, &mfc->_c, tb_id, &net->ipv6.ipmr_seq); } /* Delete a VIF entry */ static int mif6_delete(struct mr_table *mrt, int vifi, int notify, struct list_head *head) { struct vif_device *v; struct net_device *dev; struct inet6_dev *in6_dev; if (vifi < 0 || vifi >= mrt->maxvif) return -EADDRNOTAVAIL; v = &mrt->vif_table[vifi]; dev = rtnl_dereference(v->dev); if (!dev) return -EADDRNOTAVAIL; call_ip6mr_vif_entry_notifiers(read_pnet(&mrt->net), FIB_EVENT_VIF_DEL, v, dev, vifi, mrt->id); spin_lock(&mrt_lock); RCU_INIT_POINTER(v->dev, NULL); #ifdef CONFIG_IPV6_PIMSM_V2 if (vifi == mrt->mroute_reg_vif_num) { /* Pairs with READ_ONCE() in ip6mr_cache_report() and reg_vif_xmit() */ WRITE_ONCE(mrt->mroute_reg_vif_num, -1); } #endif if (vifi + 1 == mrt->maxvif) { int tmp; for (tmp = vifi - 1; tmp >= 0; tmp--) { if (VIF_EXISTS(mrt, tmp)) break; } WRITE_ONCE(mrt->maxvif, tmp + 1); } spin_unlock(&mrt_lock); dev_set_allmulti(dev, -1); in6_dev = __in6_dev_get(dev); if (in6_dev) { atomic_dec(&in6_dev->cnf.mc_forwarding); inet6_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF, NETCONFA_MC_FORWARDING, dev->ifindex, &in6_dev->cnf); } if ((v->flags & MIFF_REGISTER) && !notify) unregister_netdevice_queue(dev, head); netdev_put(dev, &v->dev_tracker); return 0; } static inline void ip6mr_cache_free_rcu(struct rcu_head *head) { struct mr_mfc *c = container_of(head, struct mr_mfc, rcu); kmem_cache_free(mrt_cachep, (struct mfc6_cache *)c); } static inline void ip6mr_cache_free(struct mfc6_cache *c) { call_rcu(&c->_c.rcu, ip6mr_cache_free_rcu); } /* Destroy an unresolved cache entry, killing queued skbs and reporting error to netlink readers. */ static void ip6mr_destroy_unres(struct mr_table *mrt, struct mfc6_cache *c) { struct net *net = read_pnet(&mrt->net); struct sk_buff *skb; atomic_dec(&mrt->cache_resolve_queue_len); while ((skb = skb_dequeue(&c->_c.mfc_un.unres.unresolved)) != NULL) { if (ipv6_hdr(skb)->version == 0) { struct nlmsghdr *nlh = skb_pull(skb, sizeof(struct ipv6hdr)); nlh->nlmsg_type = NLMSG_ERROR; nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr)); skb_trim(skb, nlh->nlmsg_len); ((struct nlmsgerr *)nlmsg_data(nlh))->error = -ETIMEDOUT; rtnl_unicast(skb, net, NETLINK_CB(skb).portid); } else kfree_skb(skb); } ip6mr_cache_free(c); } /* Timer process for all the unresolved queue. */ static void ipmr_do_expire_process(struct mr_table *mrt) { unsigned long now = jiffies; unsigned long expires = 10 * HZ; struct mr_mfc *c, *next; list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) { if (time_after(c->mfc_un.unres.expires, now)) { /* not yet... */ unsigned long interval = c->mfc_un.unres.expires - now; if (interval < expires) expires = interval; continue; } list_del(&c->list); mr6_netlink_event(mrt, (struct mfc6_cache *)c, RTM_DELROUTE); ip6mr_destroy_unres(mrt, (struct mfc6_cache *)c); } if (!list_empty(&mrt->mfc_unres_queue)) mod_timer(&mrt->ipmr_expire_timer, jiffies + expires); } static void ipmr_expire_process(struct timer_list *t) { struct mr_table *mrt = timer_container_of(mrt, t, ipmr_expire_timer); if (!spin_trylock(&mfc_unres_lock)) { mod_timer(&mrt->ipmr_expire_timer, jiffies + 1); return; } if (!list_empty(&mrt->mfc_unres_queue)) ipmr_do_expire_process(mrt); spin_unlock(&mfc_unres_lock); } /* Fill oifs list. It is called under locked mrt_lock. */ static void ip6mr_update_thresholds(struct mr_table *mrt, struct mr_mfc *cache, unsigned char *ttls) { int vifi; cache->mfc_un.res.minvif = MAXMIFS; cache->mfc_un.res.maxvif = 0; memset(cache->mfc_un.res.ttls, 255, MAXMIFS); for (vifi = 0; vifi < mrt->maxvif; vifi++) { if (VIF_EXISTS(mrt, vifi) && ttls[vifi] && ttls[vifi] < 255) { cache->mfc_un.res.ttls[vifi] = ttls[vifi]; if (cache->mfc_un.res.minvif > vifi) cache->mfc_un.res.minvif = vifi; if (cache->mfc_un.res.maxvif <= vifi) cache->mfc_un.res.maxvif = vifi + 1; } } WRITE_ONCE(cache->mfc_un.res.lastuse, jiffies); } static int mif6_add(struct net *net, struct mr_table *mrt, struct mif6ctl *vifc, int mrtsock) { int vifi = vifc->mif6c_mifi; struct vif_device *v = &mrt->vif_table[vifi]; struct net_device *dev; struct inet6_dev *in6_dev; int err; /* Is vif busy ? */ if (VIF_EXISTS(mrt, vifi)) return -EADDRINUSE; switch (vifc->mif6c_flags) { #ifdef CONFIG_IPV6_PIMSM_V2 case MIFF_REGISTER: /* * Special Purpose VIF in PIM * All the packets will be sent to the daemon */ if (mrt->mroute_reg_vif_num >= 0) return -EADDRINUSE; dev = ip6mr_reg_vif(net, mrt); if (!dev) return -ENOBUFS; err = dev_set_allmulti(dev, 1); if (err) { unregister_netdevice(dev); dev_put(dev); return err; } break; #endif case 0: dev = dev_get_by_index(net, vifc->mif6c_pifi); if (!dev) return -EADDRNOTAVAIL; err = dev_set_allmulti(dev, 1); if (err) { dev_put(dev); return err; } break; default: return -EINVAL; } in6_dev = __in6_dev_get(dev); if (in6_dev) { atomic_inc(&in6_dev->cnf.mc_forwarding); inet6_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF, NETCONFA_MC_FORWARDING, dev->ifindex, &in6_dev->cnf); } /* Fill in the VIF structures */ vif_device_init(v, dev, vifc->vifc_rate_limit, vifc->vifc_threshold, vifc->mif6c_flags | (!mrtsock ? VIFF_STATIC : 0), MIFF_REGISTER); /* And finish update writing critical data */ spin_lock(&mrt_lock); rcu_assign_pointer(v->dev, dev); netdev_tracker_alloc(dev, &v->dev_tracker, GFP_ATOMIC); #ifdef CONFIG_IPV6_PIMSM_V2 if (v->flags & MIFF_REGISTER) WRITE_ONCE(mrt->mroute_reg_vif_num, vifi); #endif if (vifi + 1 > mrt->maxvif) WRITE_ONCE(mrt->maxvif, vifi + 1); spin_unlock(&mrt_lock); call_ip6mr_vif_entry_notifiers(net, FIB_EVENT_VIF_ADD, v, dev, vifi, mrt->id); return 0; } static struct mfc6_cache *ip6mr_cache_find(struct mr_table *mrt, const struct in6_addr *origin, const struct in6_addr *mcastgrp) { struct mfc6_cache_cmp_arg arg = { .mf6c_origin = *origin, .mf6c_mcastgrp = *mcastgrp, }; return mr_mfc_find(mrt, &arg); } /* Look for a (*,G) entry */ static struct mfc6_cache *ip6mr_cache_find_any(struct mr_table *mrt, struct in6_addr *mcastgrp, mifi_t mifi) { struct mfc6_cache_cmp_arg arg = { .mf6c_origin = in6addr_any, .mf6c_mcastgrp = *mcastgrp, }; if (ipv6_addr_any(mcastgrp)) return mr_mfc_find_any_parent(mrt, mifi); return mr_mfc_find_any(mrt, mifi, &arg); } /* Look for a (S,G,iif) entry if parent != -1 */ static struct mfc6_cache * ip6mr_cache_find_parent(struct mr_table *mrt, const struct in6_addr *origin, const struct in6_addr *mcastgrp, int parent) { struct mfc6_cache_cmp_arg arg = { .mf6c_origin = *origin, .mf6c_mcastgrp = *mcastgrp, }; return mr_mfc_find_parent(mrt, &arg, parent); } /* Allocate a multicast cache entry */ static struct mfc6_cache *ip6mr_cache_alloc(void) { struct mfc6_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL); if (!c) return NULL; c->_c.mfc_un.res.last_assert = jiffies - MFC_ASSERT_THRESH - 1; c->_c.mfc_un.res.minvif = MAXMIFS; c->_c.free = ip6mr_cache_free_rcu; refcount_set(&c->_c.mfc_un.res.refcount, 1); return c; } static struct mfc6_cache *ip6mr_cache_alloc_unres(void) { struct mfc6_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC); if (!c) return NULL; skb_queue_head_init(&c->_c.mfc_un.unres.unresolved); c->_c.mfc_un.unres.expires = jiffies + 10 * HZ; return c; } /* * A cache entry has gone into a resolved state from queued */ static void ip6mr_cache_resolve(struct net *net, struct mr_table *mrt, struct mfc6_cache *uc, struct mfc6_cache *c) { struct sk_buff *skb; /* * Play the pending entries through our router */ while ((skb = __skb_dequeue(&uc->_c.mfc_un.unres.unresolved))) { if (ipv6_hdr(skb)->version == 0) { struct nlmsghdr *nlh = skb_pull(skb, sizeof(struct ipv6hdr)); if (mr_fill_mroute(mrt, skb, &c->_c, nlmsg_data(nlh)) > 0) { nlh->nlmsg_len = skb_tail_pointer(skb) - (u8 *)nlh; } else { nlh->nlmsg_type = NLMSG_ERROR; nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr)); skb_trim(skb, nlh->nlmsg_len); ((struct nlmsgerr *)nlmsg_data(nlh))->error = -EMSGSIZE; } rtnl_unicast(skb, net, NETLINK_CB(skb).portid); } else { rcu_read_lock(); ip6_mr_forward(net, mrt, skb->dev, skb, c); rcu_read_unlock(); } } } /* * Bounce a cache query up to pim6sd and netlink. * * Called under rcu_read_lock() */ static int ip6mr_cache_report(const struct mr_table *mrt, struct sk_buff *pkt, mifi_t mifi, int assert) { struct sock *mroute6_sk; struct sk_buff *skb; struct mrt6msg *msg; int ret; #ifdef CONFIG_IPV6_PIMSM_V2 if (assert == MRT6MSG_WHOLEPKT || assert == MRT6MSG_WRMIFWHOLE) skb = skb_realloc_headroom(pkt, -skb_network_offset(pkt) +sizeof(*msg)); else #endif skb = alloc_skb(sizeof(struct ipv6hdr) + sizeof(*msg), GFP_ATOMIC); if (!skb) return -ENOBUFS; /* I suppose that internal messages * do not require checksums */ skb->ip_summed = CHECKSUM_UNNECESSARY; #ifdef CONFIG_IPV6_PIMSM_V2 if (assert == MRT6MSG_WHOLEPKT || assert == MRT6MSG_WRMIFWHOLE) { /* Ugly, but we have no choice with this interface. Duplicate old header, fix length etc. And all this only to mangle msg->im6_msgtype and to set msg->im6_mbz to "mbz" :-) */ __skb_pull(skb, skb_network_offset(pkt)); skb_push(skb, sizeof(*msg)); skb_reset_transport_header(skb); msg = (struct mrt6msg *)skb_transport_header(skb); msg->im6_mbz = 0; msg->im6_msgtype = assert; if (assert == MRT6MSG_WRMIFWHOLE) msg->im6_mif = mifi; else msg->im6_mif = READ_ONCE(mrt->mroute_reg_vif_num); msg->im6_pad = 0; msg->im6_src = ipv6_hdr(pkt)->saddr; msg->im6_dst = ipv6_hdr(pkt)->daddr; skb->ip_summed = CHECKSUM_UNNECESSARY; } else #endif { /* * Copy the IP header */ skb_put(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); skb_copy_to_linear_data(skb, ipv6_hdr(pkt), sizeof(struct ipv6hdr)); /* * Add our header */ skb_put(skb, sizeof(*msg)); skb_reset_transport_header(skb); msg = (struct mrt6msg *)skb_transport_header(skb); msg->im6_mbz = 0; msg->im6_msgtype = assert; msg->im6_mif = mifi; msg->im6_pad = 0; msg->im6_src = ipv6_hdr(pkt)->saddr; msg->im6_dst = ipv6_hdr(pkt)->daddr; skb_dst_set(skb, dst_clone(skb_dst(pkt))); skb->ip_summed = CHECKSUM_UNNECESSARY; } mroute6_sk = rcu_dereference(mrt->mroute_sk); if (!mroute6_sk) { kfree_skb(skb); return -EINVAL; } mrt6msg_netlink_event(mrt, skb); /* Deliver to user space multicast routing algorithms */ ret = sock_queue_rcv_skb(mroute6_sk, skb); if (ret < 0) { net_warn_ratelimited("mroute6: pending queue full, dropping entries\n"); kfree_skb(skb); } return ret; } /* Queue a packet for resolution. It gets locked cache entry! */ static int ip6mr_cache_unresolved(struct mr_table *mrt, mifi_t mifi, struct sk_buff *skb, struct net_device *dev) { struct mfc6_cache *c; bool found = false; int err; spin_lock_bh(&mfc_unres_lock); list_for_each_entry(c, &mrt->mfc_unres_queue, _c.list) { if (ipv6_addr_equal(&c->mf6c_mcastgrp, &ipv6_hdr(skb)->daddr) && ipv6_addr_equal(&c->mf6c_origin, &ipv6_hdr(skb)->saddr)) { found = true; break; } } if (!found) { /* * Create a new entry if allowable */ c = ip6mr_cache_alloc_unres(); if (!c) { spin_unlock_bh(&mfc_unres_lock); kfree_skb(skb); return -ENOBUFS; } /* Fill in the new cache entry */ c->_c.mfc_parent = -1; c->mf6c_origin = ipv6_hdr(skb)->saddr; c->mf6c_mcastgrp = ipv6_hdr(skb)->daddr; /* * Reflect first query at pim6sd */ err = ip6mr_cache_report(mrt, skb, mifi, MRT6MSG_NOCACHE); if (err < 0) { /* If the report failed throw the cache entry out - Brad Parker */ spin_unlock_bh(&mfc_unres_lock); ip6mr_cache_free(c); kfree_skb(skb); return err; } atomic_inc(&mrt->cache_resolve_queue_len); list_add(&c->_c.list, &mrt->mfc_unres_queue); mr6_netlink_event(mrt, c, RTM_NEWROUTE); ipmr_do_expire_process(mrt); } /* See if we can append the packet */ if (c->_c.mfc_un.unres.unresolved.qlen > 3) { kfree_skb(skb); err = -ENOBUFS; } else { if (dev) { skb->dev = dev; skb->skb_iif = dev->ifindex; } skb_queue_tail(&c->_c.mfc_un.unres.unresolved, skb); err = 0; } spin_unlock_bh(&mfc_unres_lock); return err; } /* * MFC6 cache manipulation by user space */ static int ip6mr_mfc_delete(struct mr_table *mrt, struct mf6cctl *mfc, int parent) { struct mfc6_cache *c; /* The entries are added/deleted only under RTNL */ rcu_read_lock(); c = ip6mr_cache_find_parent(mrt, &mfc->mf6cc_origin.sin6_addr, &mfc->mf6cc_mcastgrp.sin6_addr, parent); rcu_read_unlock(); if (!c) return -ENOENT; rhltable_remove(&mrt->mfc_hash, &c->_c.mnode, ip6mr_rht_params); list_del_rcu(&c->_c.list); call_ip6mr_mfc_entry_notifiers(read_pnet(&mrt->net), FIB_EVENT_ENTRY_DEL, c, mrt->id); mr6_netlink_event(mrt, c, RTM_DELROUTE); mr_cache_put(&c->_c); return 0; } static int ip6mr_device_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); struct mr_table *mrt; struct vif_device *v; int ct; if (event != NETDEV_UNREGISTER) return NOTIFY_DONE; ip6mr_for_each_table(mrt, net) { v = &mrt->vif_table[0]; for (ct = 0; ct < mrt->maxvif; ct++, v++) { if (rcu_access_pointer(v->dev) == dev) mif6_delete(mrt, ct, 1, NULL); } } return NOTIFY_DONE; } static unsigned int ip6mr_seq_read(const struct net *net) { return atomic_read(&net->ipv6.ipmr_seq) + ip6mr_rules_seq_read(net); } static int ip6mr_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { return mr_dump(net, nb, RTNL_FAMILY_IP6MR, ip6mr_rules_dump, ip6mr_mr_table_iter, extack); } static struct notifier_block ip6_mr_notifier = { .notifier_call = ip6mr_device_event }; static const struct fib_notifier_ops ip6mr_notifier_ops_template = { .family = RTNL_FAMILY_IP6MR, .fib_seq_read = ip6mr_seq_read, .fib_dump = ip6mr_dump, .owner = THIS_MODULE, }; static int __net_init ip6mr_notifier_init(struct net *net) { struct fib_notifier_ops *ops; atomic_set(&net->ipv6.ipmr_seq, 0); ops = fib_notifier_ops_register(&ip6mr_notifier_ops_template, net); if (IS_ERR(ops)) return PTR_ERR(ops); net->ipv6.ip6mr_notifier_ops = ops; return 0; } static void __net_exit ip6mr_notifier_exit(struct net *net) { fib_notifier_ops_unregister(net->ipv6.ip6mr_notifier_ops); net->ipv6.ip6mr_notifier_ops = NULL; } /* Setup for IP multicast routing */ static int __net_init ip6mr_net_init(struct net *net) { int err; err = ip6mr_notifier_init(net); if (err) return err; err = ip6mr_rules_init(net); if (err < 0) goto ip6mr_rules_fail; #ifdef CONFIG_PROC_FS err = -ENOMEM; if (!proc_create_net("ip6_mr_vif", 0, net->proc_net, &ip6mr_vif_seq_ops, sizeof(struct mr_vif_iter))) goto proc_vif_fail; if (!proc_create_net("ip6_mr_cache", 0, net->proc_net, &ipmr_mfc_seq_ops, sizeof(struct mr_mfc_iter))) goto proc_cache_fail; #endif return 0; #ifdef CONFIG_PROC_FS proc_cache_fail: remove_proc_entry("ip6_mr_vif", net->proc_net); proc_vif_fail: rtnl_lock(); ip6mr_rules_exit(net); rtnl_unlock(); #endif ip6mr_rules_fail: ip6mr_notifier_exit(net); return err; } static void __net_exit ip6mr_net_exit(struct net *net) { #ifdef CONFIG_PROC_FS remove_proc_entry("ip6_mr_cache", net->proc_net); remove_proc_entry("ip6_mr_vif", net->proc_net); #endif ip6mr_notifier_exit(net); } static void __net_exit ip6mr_net_exit_batch(struct list_head *net_list) { struct net *net; rtnl_lock(); list_for_each_entry(net, net_list, exit_list) ip6mr_rules_exit(net); rtnl_unlock(); } static struct pernet_operations ip6mr_net_ops = { .init = ip6mr_net_init, .exit = ip6mr_net_exit, .exit_batch = ip6mr_net_exit_batch, }; static const struct rtnl_msg_handler ip6mr_rtnl_msg_handlers[] __initconst_or_module = { {.owner = THIS_MODULE, .protocol = RTNL_FAMILY_IP6MR, .msgtype = RTM_GETROUTE, .doit = ip6mr_rtm_getroute, .dumpit = ip6mr_rtm_dumproute}, }; int __init ip6_mr_init(void) { int err; mrt_cachep = KMEM_CACHE(mfc6_cache, SLAB_HWCACHE_ALIGN); if (!mrt_cachep) return -ENOMEM; err = register_pernet_subsys(&ip6mr_net_ops); if (err) goto reg_pernet_fail; err = register_netdevice_notifier(&ip6_mr_notifier); if (err) goto reg_notif_fail; #ifdef CONFIG_IPV6_PIMSM_V2 if (inet6_add_protocol(&pim6_protocol, IPPROTO_PIM) < 0) { pr_err("%s: can't add PIM protocol\n", __func__); err = -EAGAIN; goto add_proto_fail; } #endif err = rtnl_register_many(ip6mr_rtnl_msg_handlers); if (!err) return 0; #ifdef CONFIG_IPV6_PIMSM_V2 inet6_del_protocol(&pim6_protocol, IPPROTO_PIM); add_proto_fail: unregister_netdevice_notifier(&ip6_mr_notifier); #endif reg_notif_fail: unregister_pernet_subsys(&ip6mr_net_ops); reg_pernet_fail: kmem_cache_destroy(mrt_cachep); return err; } void __init ip6_mr_cleanup(void) { rtnl_unregister_many(ip6mr_rtnl_msg_handlers); #ifdef CONFIG_IPV6_PIMSM_V2 inet6_del_protocol(&pim6_protocol, IPPROTO_PIM); #endif unregister_netdevice_notifier(&ip6_mr_notifier); unregister_pernet_subsys(&ip6mr_net_ops); kmem_cache_destroy(mrt_cachep); } static int ip6mr_mfc_add(struct net *net, struct mr_table *mrt, struct mf6cctl *mfc, int mrtsock, int parent) { unsigned char ttls[MAXMIFS]; struct mfc6_cache *uc, *c; struct mr_mfc *_uc; bool found; int i, err; if (mfc->mf6cc_parent >= MAXMIFS) return -ENFILE; memset(ttls, 255, MAXMIFS); for (i = 0; i < MAXMIFS; i++) { if (IF_ISSET(i, &mfc->mf6cc_ifset)) ttls[i] = 1; } /* The entries are added/deleted only under RTNL */ rcu_read_lock(); c = ip6mr_cache_find_parent(mrt, &mfc->mf6cc_origin.sin6_addr, &mfc->mf6cc_mcastgrp.sin6_addr, parent); rcu_read_unlock(); if (c) { spin_lock(&mrt_lock); c->_c.mfc_parent = mfc->mf6cc_parent; ip6mr_update_thresholds(mrt, &c->_c, ttls); if (!mrtsock) c->_c.mfc_flags |= MFC_STATIC; spin_unlock(&mrt_lock); call_ip6mr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, c, mrt->id); mr6_netlink_event(mrt, c, RTM_NEWROUTE); return 0; } if (!ipv6_addr_any(&mfc->mf6cc_mcastgrp.sin6_addr) && !ipv6_addr_is_multicast(&mfc->mf6cc_mcastgrp.sin6_addr)) return -EINVAL; c = ip6mr_cache_alloc(); if (!c) return -ENOMEM; c->mf6c_origin = mfc->mf6cc_origin.sin6_addr; c->mf6c_mcastgrp = mfc->mf6cc_mcastgrp.sin6_addr; c->_c.mfc_parent = mfc->mf6cc_parent; ip6mr_update_thresholds(mrt, &c->_c, ttls); if (!mrtsock) c->_c.mfc_flags |= MFC_STATIC; err = rhltable_insert_key(&mrt->mfc_hash, &c->cmparg, &c->_c.mnode, ip6mr_rht_params); if (err) { pr_err("ip6mr: rhtable insert error %d\n", err); ip6mr_cache_free(c); return err; } list_add_tail_rcu(&c->_c.list, &mrt->mfc_cache_list); /* Check to see if we resolved a queued list. If so we * need to send on the frames and tidy up. */ found = false; spin_lock_bh(&mfc_unres_lock); list_for_each_entry(_uc, &mrt->mfc_unres_queue, list) { uc = (struct mfc6_cache *)_uc; if (ipv6_addr_equal(&uc->mf6c_origin, &c->mf6c_origin) && ipv6_addr_equal(&uc->mf6c_mcastgrp, &c->mf6c_mcastgrp)) { list_del(&_uc->list); atomic_dec(&mrt->cache_resolve_queue_len); found = true; break; } } if (list_empty(&mrt->mfc_unres_queue)) timer_delete(&mrt->ipmr_expire_timer); spin_unlock_bh(&mfc_unres_lock); if (found) { ip6mr_cache_resolve(net, mrt, uc, c); ip6mr_cache_free(uc); } call_ip6mr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_ADD, c, mrt->id); mr6_netlink_event(mrt, c, RTM_NEWROUTE); return 0; } /* * Close the multicast socket, and clear the vif tables etc */ static void mroute_clean_tables(struct mr_table *mrt, int flags) { struct mr_mfc *c, *tmp; LIST_HEAD(list); int i; /* Shut down all active vif entries */ if (flags & (MRT6_FLUSH_MIFS | MRT6_FLUSH_MIFS_STATIC)) { for (i = 0; i < mrt->maxvif; i++) { if (((mrt->vif_table[i].flags & VIFF_STATIC) && !(flags & MRT6_FLUSH_MIFS_STATIC)) || (!(mrt->vif_table[i].flags & VIFF_STATIC) && !(flags & MRT6_FLUSH_MIFS))) continue; mif6_delete(mrt, i, 0, &list); } unregister_netdevice_many(&list); } /* Wipe the cache */ if (flags & (MRT6_FLUSH_MFC | MRT6_FLUSH_MFC_STATIC)) { list_for_each_entry_safe(c, tmp, &mrt->mfc_cache_list, list) { if (((c->mfc_flags & MFC_STATIC) && !(flags & MRT6_FLUSH_MFC_STATIC)) || (!(c->mfc_flags & MFC_STATIC) && !(flags & MRT6_FLUSH_MFC))) continue; rhltable_remove(&mrt->mfc_hash, &c->mnode, ip6mr_rht_params); list_del_rcu(&c->list); call_ip6mr_mfc_entry_notifiers(read_pnet(&mrt->net), FIB_EVENT_ENTRY_DEL, (struct mfc6_cache *)c, mrt->id); mr6_netlink_event(mrt, (struct mfc6_cache *)c, RTM_DELROUTE); mr_cache_put(c); } } if (flags & MRT6_FLUSH_MFC) { if (atomic_read(&mrt->cache_resolve_queue_len) != 0) { spin_lock_bh(&mfc_unres_lock); list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) { list_del(&c->list); mr6_netlink_event(mrt, (struct mfc6_cache *)c, RTM_DELROUTE); ip6mr_destroy_unres(mrt, (struct mfc6_cache *)c); } spin_unlock_bh(&mfc_unres_lock); } } } static int ip6mr_sk_init(struct mr_table *mrt, struct sock *sk) { int err = 0; struct net *net = sock_net(sk); rtnl_lock(); spin_lock(&mrt_lock); if (rtnl_dereference(mrt->mroute_sk)) { err = -EADDRINUSE; } else { rcu_assign_pointer(mrt->mroute_sk, sk); sock_set_flag(sk, SOCK_RCU_FREE); atomic_inc(&net->ipv6.devconf_all->mc_forwarding); } spin_unlock(&mrt_lock); if (!err) inet6_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_MC_FORWARDING, NETCONFA_IFINDEX_ALL, net->ipv6.devconf_all); rtnl_unlock(); return err; } int ip6mr_sk_done(struct sock *sk) { struct net *net = sock_net(sk); struct ipv6_devconf *devconf; struct mr_table *mrt; int err = -EACCES; if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num != IPPROTO_ICMPV6) return err; devconf = net->ipv6.devconf_all; if (!devconf || !atomic_read(&devconf->mc_forwarding)) return err; rtnl_lock(); ip6mr_for_each_table(mrt, net) { if (sk == rtnl_dereference(mrt->mroute_sk)) { spin_lock(&mrt_lock); RCU_INIT_POINTER(mrt->mroute_sk, NULL); /* Note that mroute_sk had SOCK_RCU_FREE set, * so the RCU grace period before sk freeing * is guaranteed by sk_destruct() */ atomic_dec(&devconf->mc_forwarding); spin_unlock(&mrt_lock); inet6_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_MC_FORWARDING, NETCONFA_IFINDEX_ALL, net->ipv6.devconf_all); mroute_clean_tables(mrt, MRT6_FLUSH_MIFS | MRT6_FLUSH_MFC); err = 0; break; } } rtnl_unlock(); return err; } bool mroute6_is_socket(struct net *net, struct sk_buff *skb) { struct mr_table *mrt; struct flowi6 fl6 = { .flowi6_iif = skb->skb_iif ? : LOOPBACK_IFINDEX, .flowi6_oif = skb->dev->ifindex, .flowi6_mark = skb->mark, }; if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0) return NULL; return rcu_access_pointer(mrt->mroute_sk); } EXPORT_SYMBOL(mroute6_is_socket); /* * Socket options and virtual interface manipulation. The whole * virtual interface system is a complete heap, but unfortunately * that's how BSD mrouted happens to think. Maybe one day with a proper * MOSPF/PIM router set up we can clean this up. */ int ip6_mroute_setsockopt(struct sock *sk, int optname, sockptr_t optval, unsigned int optlen) { int ret, parent = 0; struct mif6ctl vif; struct mf6cctl mfc; mifi_t mifi; struct net *net = sock_net(sk); struct mr_table *mrt; if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num != IPPROTO_ICMPV6) return -EOPNOTSUPP; mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT); if (!mrt) return -ENOENT; if (optname != MRT6_INIT) { if (sk != rcu_access_pointer(mrt->mroute_sk) && !ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EACCES; } switch (optname) { case MRT6_INIT: if (optlen < sizeof(int)) return -EINVAL; return ip6mr_sk_init(mrt, sk); case MRT6_DONE: return ip6mr_sk_done(sk); case MRT6_ADD_MIF: if (optlen < sizeof(vif)) return -EINVAL; if (copy_from_sockptr(&vif, optval, sizeof(vif))) return -EFAULT; if (vif.mif6c_mifi >= MAXMIFS) return -ENFILE; rtnl_lock(); ret = mif6_add(net, mrt, &vif, sk == rtnl_dereference(mrt->mroute_sk)); rtnl_unlock(); return ret; case MRT6_DEL_MIF: if (optlen < sizeof(mifi_t)) return -EINVAL; if (copy_from_sockptr(&mifi, optval, sizeof(mifi_t))) return -EFAULT; rtnl_lock(); ret = mif6_delete(mrt, mifi, 0, NULL); rtnl_unlock(); return ret; /* * Manipulate the forwarding caches. These live * in a sort of kernel/user symbiosis. */ case MRT6_ADD_MFC: case MRT6_DEL_MFC: parent = -1; fallthrough; case MRT6_ADD_MFC_PROXY: case MRT6_DEL_MFC_PROXY: if (optlen < sizeof(mfc)) return -EINVAL; if (copy_from_sockptr(&mfc, optval, sizeof(mfc))) return -EFAULT; if (parent == 0) parent = mfc.mf6cc_parent; rtnl_lock(); if (optname == MRT6_DEL_MFC || optname == MRT6_DEL_MFC_PROXY) ret = ip6mr_mfc_delete(mrt, &mfc, parent); else ret = ip6mr_mfc_add(net, mrt, &mfc, sk == rtnl_dereference(mrt->mroute_sk), parent); rtnl_unlock(); return ret; case MRT6_FLUSH: { int flags; if (optlen != sizeof(flags)) return -EINVAL; if (copy_from_sockptr(&flags, optval, sizeof(flags))) return -EFAULT; rtnl_lock(); mroute_clean_tables(mrt, flags); rtnl_unlock(); return 0; } /* * Control PIM assert (to activate pim will activate assert) */ case MRT6_ASSERT: { int v; if (optlen != sizeof(v)) return -EINVAL; if (copy_from_sockptr(&v, optval, sizeof(v))) return -EFAULT; mrt->mroute_do_assert = v; return 0; } #ifdef CONFIG_IPV6_PIMSM_V2 case MRT6_PIM: { bool do_wrmifwhole; int v; if (optlen != sizeof(v)) return -EINVAL; if (copy_from_sockptr(&v, optval, sizeof(v))) return -EFAULT; do_wrmifwhole = (v == MRT6MSG_WRMIFWHOLE); v = !!v; rtnl_lock(); ret = 0; if (v != mrt->mroute_do_pim) { mrt->mroute_do_pim = v; mrt->mroute_do_assert = v; mrt->mroute_do_wrvifwhole = do_wrmifwhole; } rtnl_unlock(); return ret; } #endif #ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES case MRT6_TABLE: { u32 v; if (optlen != sizeof(u32)) return -EINVAL; if (copy_from_sockptr(&v, optval, sizeof(v))) return -EFAULT; /* "pim6reg%u" should not exceed 16 bytes (IFNAMSIZ) */ if (v != RT_TABLE_DEFAULT && v >= 100000000) return -EINVAL; if (sk == rcu_access_pointer(mrt->mroute_sk)) return -EBUSY; rtnl_lock(); ret = 0; mrt = ip6mr_new_table(net, v); if (IS_ERR(mrt)) ret = PTR_ERR(mrt); else raw6_sk(sk)->ip6mr_table = v; rtnl_unlock(); return ret; } #endif /* * Spurious command, or MRT6_VERSION which you cannot * set. */ default: return -ENOPROTOOPT; } } /* * Getsock opt support for the multicast routing system. */ int ip6_mroute_getsockopt(struct sock *sk, int optname, sockptr_t optval, sockptr_t optlen) { int olr; int val; struct net *net = sock_net(sk); struct mr_table *mrt; if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num != IPPROTO_ICMPV6) return -EOPNOTSUPP; mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT); if (!mrt) return -ENOENT; switch (optname) { case MRT6_VERSION: val = 0x0305; break; #ifdef CONFIG_IPV6_PIMSM_V2 case MRT6_PIM: val = mrt->mroute_do_pim; break; #endif case MRT6_ASSERT: val = mrt->mroute_do_assert; break; default: return -ENOPROTOOPT; } if (copy_from_sockptr(&olr, optlen, sizeof(int))) return -EFAULT; olr = min_t(int, olr, sizeof(int)); if (olr < 0) return -EINVAL; if (copy_to_sockptr(optlen, &olr, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, &val, olr)) return -EFAULT; return 0; } /* * The IP multicast ioctl support routines. */ int ip6mr_ioctl(struct sock *sk, int cmd, void *arg) { struct sioc_sg_req6 *sr; struct sioc_mif_req6 *vr; struct vif_device *vif; struct mfc6_cache *c; struct net *net = sock_net(sk); struct mr_table *mrt; mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT); if (!mrt) return -ENOENT; switch (cmd) { case SIOCGETMIFCNT_IN6: vr = (struct sioc_mif_req6 *)arg; if (vr->mifi >= mrt->maxvif) return -EINVAL; vr->mifi = array_index_nospec(vr->mifi, mrt->maxvif); rcu_read_lock(); vif = &mrt->vif_table[vr->mifi]; if (VIF_EXISTS(mrt, vr->mifi)) { vr->icount = READ_ONCE(vif->pkt_in); vr->ocount = READ_ONCE(vif->pkt_out); vr->ibytes = READ_ONCE(vif->bytes_in); vr->obytes = READ_ONCE(vif->bytes_out); rcu_read_unlock(); return 0; } rcu_read_unlock(); return -EADDRNOTAVAIL; case SIOCGETSGCNT_IN6: sr = (struct sioc_sg_req6 *)arg; rcu_read_lock(); c = ip6mr_cache_find(mrt, &sr->src.sin6_addr, &sr->grp.sin6_addr); if (c) { sr->pktcnt = atomic_long_read(&c->_c.mfc_un.res.pkt); sr->bytecnt = atomic_long_read(&c->_c.mfc_un.res.bytes); sr->wrong_if = atomic_long_read(&c->_c.mfc_un.res.wrong_if); rcu_read_unlock(); return 0; } rcu_read_unlock(); return -EADDRNOTAVAIL; default: return -ENOIOCTLCMD; } } #ifdef CONFIG_COMPAT struct compat_sioc_sg_req6 { struct sockaddr_in6 src; struct sockaddr_in6 grp; compat_ulong_t pktcnt; compat_ulong_t bytecnt; compat_ulong_t wrong_if; }; struct compat_sioc_mif_req6 { mifi_t mifi; compat_ulong_t icount; compat_ulong_t ocount; compat_ulong_t ibytes; compat_ulong_t obytes; }; int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) { struct compat_sioc_sg_req6 sr; struct compat_sioc_mif_req6 vr; struct vif_device *vif; struct mfc6_cache *c; struct net *net = sock_net(sk); struct mr_table *mrt; mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT); if (!mrt) return -ENOENT; switch (cmd) { case SIOCGETMIFCNT_IN6: if (copy_from_user(&vr, arg, sizeof(vr))) return -EFAULT; if (vr.mifi >= mrt->maxvif) return -EINVAL; vr.mifi = array_index_nospec(vr.mifi, mrt->maxvif); rcu_read_lock(); vif = &mrt->vif_table[vr.mifi]; if (VIF_EXISTS(mrt, vr.mifi)) { vr.icount = READ_ONCE(vif->pkt_in); vr.ocount = READ_ONCE(vif->pkt_out); vr.ibytes = READ_ONCE(vif->bytes_in); vr.obytes = READ_ONCE(vif->bytes_out); rcu_read_unlock(); if (copy_to_user(arg, &vr, sizeof(vr))) return -EFAULT; return 0; } rcu_read_unlock(); return -EADDRNOTAVAIL; case SIOCGETSGCNT_IN6: if (copy_from_user(&sr, arg, sizeof(sr))) return -EFAULT; rcu_read_lock(); c = ip6mr_cache_find(mrt, &sr.src.sin6_addr, &sr.grp.sin6_addr); if (c) { sr.pktcnt = atomic_long_read(&c->_c.mfc_un.res.pkt); sr.bytecnt = atomic_long_read(&c->_c.mfc_un.res.bytes); sr.wrong_if = atomic_long_read(&c->_c.mfc_un.res.wrong_if); rcu_read_unlock(); if (copy_to_user(arg, &sr, sizeof(sr))) return -EFAULT; return 0; } rcu_read_unlock(); return -EADDRNOTAVAIL; default: return -ENOIOCTLCMD; } } #endif static inline int ip6mr_forward2_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUTFORWDATAGRAMS); return dst_output(net, sk, skb); } /* * Processing handlers for ip6mr_forward */ static int ip6mr_prepare_xmit(struct net *net, struct mr_table *mrt, struct sk_buff *skb, int vifi) { struct vif_device *vif = &mrt->vif_table[vifi]; struct net_device *vif_dev; struct ipv6hdr *ipv6h; struct dst_entry *dst; struct flowi6 fl6; vif_dev = vif_dev_read(vif); if (!vif_dev) return -1; #ifdef CONFIG_IPV6_PIMSM_V2 if (vif->flags & MIFF_REGISTER) { WRITE_ONCE(vif->pkt_out, vif->pkt_out + 1); WRITE_ONCE(vif->bytes_out, vif->bytes_out + skb->len); DEV_STATS_ADD(vif_dev, tx_bytes, skb->len); DEV_STATS_INC(vif_dev, tx_packets); ip6mr_cache_report(mrt, skb, vifi, MRT6MSG_WHOLEPKT); return -1; } #endif ipv6h = ipv6_hdr(skb); fl6 = (struct flowi6) { .flowi6_oif = vif->link, .daddr = ipv6h->daddr, }; dst = ip6_route_output(net, NULL, &fl6); if (dst->error) { dst_release(dst); return -1; } skb_dst_drop(skb); skb_dst_set(skb, dst); /* * RFC1584 teaches, that DVMRP/PIM router must deliver packets locally * not only before forwarding, but after forwarding on all output * interfaces. It is clear, if mrouter runs a multicasting * program, it should receive packets not depending to what interface * program is joined. * If we will not make it, the program will have to join on all * interfaces. On the other hand, multihoming host (or router, but * not mrouter) cannot join to more than one interface - it will * result in receiving multiple packets. */ skb->dev = vif_dev; WRITE_ONCE(vif->pkt_out, vif->pkt_out + 1); WRITE_ONCE(vif->bytes_out, vif->bytes_out + skb->len); /* We are about to write */ /* XXX: extension headers? */ if (skb_cow(skb, sizeof(*ipv6h) + LL_RESERVED_SPACE(vif_dev))) return -1; ipv6h = ipv6_hdr(skb); ipv6h->hop_limit--; return 0; } static void ip6mr_forward2(struct net *net, struct mr_table *mrt, struct sk_buff *skb, int vifi) { struct net_device *indev = skb->dev; if (ip6mr_prepare_xmit(net, mrt, skb, vifi)) goto out_free; IP6CB(skb)->flags |= IP6SKB_FORWARDED; NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, net, NULL, skb, indev, skb->dev, ip6mr_forward2_finish); return; out_free: kfree_skb(skb); } static void ip6mr_output2(struct net *net, struct mr_table *mrt, struct sk_buff *skb, int vifi) { if (ip6mr_prepare_xmit(net, mrt, skb, vifi)) goto out_free; ip6_output(net, NULL, skb); return; out_free: kfree_skb(skb); } /* Called with rcu_read_lock() */ static int ip6mr_find_vif(struct mr_table *mrt, struct net_device *dev) { int ct; /* Pairs with WRITE_ONCE() in mif6_delete()/mif6_add() */ for (ct = READ_ONCE(mrt->maxvif) - 1; ct >= 0; ct--) { if (rcu_access_pointer(mrt->vif_table[ct].dev) == dev) break; } return ct; } /* Called under rcu_read_lock() */ static void ip6_mr_forward(struct net *net, struct mr_table *mrt, struct net_device *dev, struct sk_buff *skb, struct mfc6_cache *c) { int psend = -1; int vif, ct; int true_vifi = ip6mr_find_vif(mrt, dev); vif = c->_c.mfc_parent; atomic_long_inc(&c->_c.mfc_un.res.pkt); atomic_long_add(skb->len, &c->_c.mfc_un.res.bytes); WRITE_ONCE(c->_c.mfc_un.res.lastuse, jiffies); if (ipv6_addr_any(&c->mf6c_origin) && true_vifi >= 0) { struct mfc6_cache *cache_proxy; /* For an (*,G) entry, we only check that the incoming * interface is part of the static tree. */ cache_proxy = mr_mfc_find_any_parent(mrt, vif); if (cache_proxy && cache_proxy->_c.mfc_un.res.ttls[true_vifi] < 255) goto forward; } /* * Wrong interface: drop packet and (maybe) send PIM assert. */ if (rcu_access_pointer(mrt->vif_table[vif].dev) != dev) { atomic_long_inc(&c->_c.mfc_un.res.wrong_if); if (true_vifi >= 0 && mrt->mroute_do_assert && /* pimsm uses asserts, when switching from RPT to SPT, so that we cannot check that packet arrived on an oif. It is bad, but otherwise we would need to move pretty large chunk of pimd to kernel. Ough... --ANK */ (mrt->mroute_do_pim || c->_c.mfc_un.res.ttls[true_vifi] < 255) && time_after(jiffies, c->_c.mfc_un.res.last_assert + MFC_ASSERT_THRESH)) { c->_c.mfc_un.res.last_assert = jiffies; ip6mr_cache_report(mrt, skb, true_vifi, MRT6MSG_WRONGMIF); if (mrt->mroute_do_wrvifwhole) ip6mr_cache_report(mrt, skb, true_vifi, MRT6MSG_WRMIFWHOLE); } goto dont_forward; } forward: WRITE_ONCE(mrt->vif_table[vif].pkt_in, mrt->vif_table[vif].pkt_in + 1); WRITE_ONCE(mrt->vif_table[vif].bytes_in, mrt->vif_table[vif].bytes_in + skb->len); /* * Forward the frame */ if (ipv6_addr_any(&c->mf6c_origin) && ipv6_addr_any(&c->mf6c_mcastgrp)) { if (true_vifi >= 0 && true_vifi != c->_c.mfc_parent && ipv6_hdr(skb)->hop_limit > c->_c.mfc_un.res.ttls[c->_c.mfc_parent]) { /* It's an (*,*) entry and the packet is not coming from * the upstream: forward the packet to the upstream * only. */ psend = c->_c.mfc_parent; goto last_forward; } goto dont_forward; } for (ct = c->_c.mfc_un.res.maxvif - 1; ct >= c->_c.mfc_un.res.minvif; ct--) { /* For (*,G) entry, don't forward to the incoming interface */ if ((!ipv6_addr_any(&c->mf6c_origin) || ct != true_vifi) && ipv6_hdr(skb)->hop_limit > c->_c.mfc_un.res.ttls[ct]) { if (psend != -1) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) ip6mr_forward2(net, mrt, skb2, psend); } psend = ct; } } last_forward: if (psend != -1) { ip6mr_forward2(net, mrt, skb, psend); return; } dont_forward: kfree_skb(skb); } /* Called under rcu_read_lock() */ static void ip6_mr_output_finish(struct net *net, struct mr_table *mrt, struct net_device *dev, struct sk_buff *skb, struct mfc6_cache *c) { int psend = -1; int ct; WARN_ON_ONCE(!rcu_read_lock_held()); atomic_long_inc(&c->_c.mfc_un.res.pkt); atomic_long_add(skb->len, &c->_c.mfc_un.res.bytes); WRITE_ONCE(c->_c.mfc_un.res.lastuse, jiffies); /* Forward the frame */ if (ipv6_addr_any(&c->mf6c_origin) && ipv6_addr_any(&c->mf6c_mcastgrp)) { if (ipv6_hdr(skb)->hop_limit > c->_c.mfc_un.res.ttls[c->_c.mfc_parent]) { /* It's an (*,*) entry and the packet is not coming from * the upstream: forward the packet to the upstream * only. */ psend = c->_c.mfc_parent; goto last_forward; } goto dont_forward; } for (ct = c->_c.mfc_un.res.maxvif - 1; ct >= c->_c.mfc_un.res.minvif; ct--) { if (ipv6_hdr(skb)->hop_limit > c->_c.mfc_un.res.ttls[ct]) { if (psend != -1) { struct sk_buff *skb2; skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) ip6mr_output2(net, mrt, skb2, psend); } psend = ct; } } last_forward: if (psend != -1) { ip6mr_output2(net, mrt, skb, psend); return; } dont_forward: kfree_skb(skb); } /* * Multicast packets for forwarding arrive here */ int ip6_mr_input(struct sk_buff *skb) { struct net_device *dev = skb->dev; struct net *net = dev_net_rcu(dev); struct mfc6_cache *cache; struct mr_table *mrt; struct flowi6 fl6 = { .flowi6_iif = dev->ifindex, .flowi6_mark = skb->mark, }; int err; /* skb->dev passed in is the master dev for vrfs. * Get the proper interface that does have a vif associated with it. */ if (netif_is_l3_master(dev)) { dev = dev_get_by_index_rcu(net, IPCB(skb)->iif); if (!dev) { kfree_skb(skb); return -ENODEV; } } err = ip6mr_fib_lookup(net, &fl6, &mrt); if (err < 0) { kfree_skb(skb); return err; } cache = ip6mr_cache_find(mrt, &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr); if (!cache) { int vif = ip6mr_find_vif(mrt, dev); if (vif >= 0) cache = ip6mr_cache_find_any(mrt, &ipv6_hdr(skb)->daddr, vif); } /* * No usable cache entry */ if (!cache) { int vif; vif = ip6mr_find_vif(mrt, dev); if (vif >= 0) { int err = ip6mr_cache_unresolved(mrt, vif, skb, dev); return err; } kfree_skb(skb); return -ENODEV; } ip6_mr_forward(net, mrt, dev, skb, cache); return 0; } int ip6_mr_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb_dst(skb)->dev; struct flowi6 fl6 = (struct flowi6) { .flowi6_iif = LOOPBACK_IFINDEX, .flowi6_mark = skb->mark, }; struct mfc6_cache *cache; struct mr_table *mrt; int err; int vif; guard(rcu)(); if (IP6CB(skb)->flags & IP6SKB_FORWARDED) goto ip6_output; if (!(IP6CB(skb)->flags & IP6SKB_MCROUTE)) goto ip6_output; err = ip6mr_fib_lookup(net, &fl6, &mrt); if (err < 0) { kfree_skb(skb); return err; } cache = ip6mr_cache_find(mrt, &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr); if (!cache) { vif = ip6mr_find_vif(mrt, dev); if (vif >= 0) cache = ip6mr_cache_find_any(mrt, &ipv6_hdr(skb)->daddr, vif); } /* No usable cache entry */ if (!cache) { vif = ip6mr_find_vif(mrt, dev); if (vif >= 0) return ip6mr_cache_unresolved(mrt, vif, skb, dev); goto ip6_output; } /* Wrong interface */ vif = cache->_c.mfc_parent; if (rcu_access_pointer(mrt->vif_table[vif].dev) != dev) goto ip6_output; ip6_mr_output_finish(net, mrt, dev, skb, cache); return 0; ip6_output: return ip6_output(net, sk, skb); } int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm, u32 portid) { int err; struct mr_table *mrt; struct mfc6_cache *cache; struct rt6_info *rt = dst_rt6_info(skb_dst(skb)); rcu_read_lock(); mrt = __ip6mr_get_table(net, RT6_TABLE_DFLT); if (!mrt) { rcu_read_unlock(); return -ENOENT; } cache = ip6mr_cache_find(mrt, &rt->rt6i_src.addr, &rt->rt6i_dst.addr); if (!cache && skb->dev) { int vif = ip6mr_find_vif(mrt, skb->dev); if (vif >= 0) cache = ip6mr_cache_find_any(mrt, &rt->rt6i_dst.addr, vif); } if (!cache) { struct sk_buff *skb2; struct ipv6hdr *iph; struct net_device *dev; int vif; dev = skb->dev; if (!dev || (vif = ip6mr_find_vif(mrt, dev)) < 0) { rcu_read_unlock(); return -ENODEV; } /* really correct? */ skb2 = alloc_skb(sizeof(struct ipv6hdr), GFP_ATOMIC); if (!skb2) { rcu_read_unlock(); return -ENOMEM; } NETLINK_CB(skb2).portid = portid; skb_reset_transport_header(skb2); skb_put(skb2, sizeof(struct ipv6hdr)); skb_reset_network_header(skb2); iph = ipv6_hdr(skb2); iph->version = 0; iph->priority = 0; iph->flow_lbl[0] = 0; iph->flow_lbl[1] = 0; iph->flow_lbl[2] = 0; iph->payload_len = 0; iph->nexthdr = IPPROTO_NONE; iph->hop_limit = 0; iph->saddr = rt->rt6i_src.addr; iph->daddr = rt->rt6i_dst.addr; err = ip6mr_cache_unresolved(mrt, vif, skb2, dev); rcu_read_unlock(); return err; } err = mr_fill_mroute(mrt, skb, &cache->_c, rtm); rcu_read_unlock(); return err; } static int ip6mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, u32 portid, u32 seq, struct mfc6_cache *c, int cmd, int flags) { struct nlmsghdr *nlh; struct rtmsg *rtm; int err; nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rtm), flags); if (!nlh) return -EMSGSIZE; rtm = nlmsg_data(nlh); rtm->rtm_family = RTNL_FAMILY_IP6MR; rtm->rtm_dst_len = 128; rtm->rtm_src_len = 128; rtm->rtm_tos = 0; rtm->rtm_table = mrt->id; if (nla_put_u32(skb, RTA_TABLE, mrt->id)) goto nla_put_failure; rtm->rtm_type = RTN_MULTICAST; rtm->rtm_scope = RT_SCOPE_UNIVERSE; if (c->_c.mfc_flags & MFC_STATIC) rtm->rtm_protocol = RTPROT_STATIC; else rtm->rtm_protocol = RTPROT_MROUTED; rtm->rtm_flags = 0; if (nla_put_in6_addr(skb, RTA_SRC, &c->mf6c_origin) || nla_put_in6_addr(skb, RTA_DST, &c->mf6c_mcastgrp)) goto nla_put_failure; err = mr_fill_mroute(mrt, skb, &c->_c, rtm); /* do not break the dump if cache is unresolved */ if (err < 0 && err != -ENOENT) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int _ip6mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, u32 portid, u32 seq, struct mr_mfc *c, int cmd, int flags) { return ip6mr_fill_mroute(mrt, skb, portid, seq, (struct mfc6_cache *)c, cmd, flags); } static int mr6_msgsize(bool unresolved, int maxvif) { size_t len = NLMSG_ALIGN(sizeof(struct rtmsg)) + nla_total_size(4) /* RTA_TABLE */ + nla_total_size(sizeof(struct in6_addr)) /* RTA_SRC */ + nla_total_size(sizeof(struct in6_addr)) /* RTA_DST */ ; if (!unresolved) len = len + nla_total_size(4) /* RTA_IIF */ + nla_total_size(0) /* RTA_MULTIPATH */ + maxvif * NLA_ALIGN(sizeof(struct rtnexthop)) /* RTA_MFC_STATS */ + nla_total_size_64bit(sizeof(struct rta_mfc_stats)) ; return len; } static void mr6_netlink_event(struct mr_table *mrt, struct mfc6_cache *mfc, int cmd) { struct net *net = read_pnet(&mrt->net); struct sk_buff *skb; int err = -ENOBUFS; skb = nlmsg_new(mr6_msgsize(mfc->_c.mfc_parent >= MAXMIFS, mrt->maxvif), GFP_ATOMIC); if (!skb) goto errout; err = ip6mr_fill_mroute(mrt, skb, 0, 0, mfc, cmd, 0); if (err < 0) goto errout; rtnl_notify(skb, net, 0, RTNLGRP_IPV6_MROUTE, NULL, GFP_ATOMIC); return; errout: kfree_skb(skb); rtnl_set_sk_err(net, RTNLGRP_IPV6_MROUTE, err); } static size_t mrt6msg_netlink_msgsize(size_t payloadlen) { size_t len = NLMSG_ALIGN(sizeof(struct rtgenmsg)) + nla_total_size(1) /* IP6MRA_CREPORT_MSGTYPE */ + nla_total_size(4) /* IP6MRA_CREPORT_MIF_ID */ /* IP6MRA_CREPORT_SRC_ADDR */ + nla_total_size(sizeof(struct in6_addr)) /* IP6MRA_CREPORT_DST_ADDR */ + nla_total_size(sizeof(struct in6_addr)) /* IP6MRA_CREPORT_PKT */ + nla_total_size(payloadlen) ; return len; } static void mrt6msg_netlink_event(const struct mr_table *mrt, struct sk_buff *pkt) { struct net *net = read_pnet(&mrt->net); struct nlmsghdr *nlh; struct rtgenmsg *rtgenm; struct mrt6msg *msg; struct sk_buff *skb; struct nlattr *nla; int payloadlen; payloadlen = pkt->len - sizeof(struct mrt6msg); msg = (struct mrt6msg *)skb_transport_header(pkt); skb = nlmsg_new(mrt6msg_netlink_msgsize(payloadlen), GFP_ATOMIC); if (!skb) goto errout; nlh = nlmsg_put(skb, 0, 0, RTM_NEWCACHEREPORT, sizeof(struct rtgenmsg), 0); if (!nlh) goto errout; rtgenm = nlmsg_data(nlh); rtgenm->rtgen_family = RTNL_FAMILY_IP6MR; if (nla_put_u8(skb, IP6MRA_CREPORT_MSGTYPE, msg->im6_msgtype) || nla_put_u32(skb, IP6MRA_CREPORT_MIF_ID, msg->im6_mif) || nla_put_in6_addr(skb, IP6MRA_CREPORT_SRC_ADDR, &msg->im6_src) || nla_put_in6_addr(skb, IP6MRA_CREPORT_DST_ADDR, &msg->im6_dst)) goto nla_put_failure; nla = nla_reserve(skb, IP6MRA_CREPORT_PKT, payloadlen); if (!nla || skb_copy_bits(pkt, sizeof(struct mrt6msg), nla_data(nla), payloadlen)) goto nla_put_failure; nlmsg_end(skb, nlh); rtnl_notify(skb, net, 0, RTNLGRP_IPV6_MROUTE_R, NULL, GFP_ATOMIC); return; nla_put_failure: nlmsg_cancel(skb, nlh); errout: kfree_skb(skb); rtnl_set_sk_err(net, RTNLGRP_IPV6_MROUTE_R, -ENOBUFS); } static const struct nla_policy ip6mr_getroute_policy[RTA_MAX + 1] = { [RTA_SRC] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)), [RTA_DST] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)), [RTA_TABLE] = { .type = NLA_U32 }, }; static int ip6mr_rtm_valid_getroute_req(struct sk_buff *skb, const struct nlmsghdr *nlh, struct nlattr **tb, struct netlink_ext_ack *extack) { struct rtmsg *rtm; int err; err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, ip6mr_getroute_policy, extack); if (err) return err; rtm = nlmsg_data(nlh); if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) || (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) || rtm->rtm_tos || rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope || rtm->rtm_type || rtm->rtm_flags) { NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for multicast route get request"); return -EINVAL; } if ((tb[RTA_SRC] && !rtm->rtm_src_len) || (tb[RTA_DST] && !rtm->rtm_dst_len)) { NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6"); return -EINVAL; } return 0; } static int ip6mr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); struct in6_addr src = {}, grp = {}; struct nlattr *tb[RTA_MAX + 1]; struct mfc6_cache *cache; struct mr_table *mrt; struct sk_buff *skb; u32 tableid; int err; err = ip6mr_rtm_valid_getroute_req(in_skb, nlh, tb, extack); if (err < 0) return err; if (tb[RTA_SRC]) src = nla_get_in6_addr(tb[RTA_SRC]); if (tb[RTA_DST]) grp = nla_get_in6_addr(tb[RTA_DST]); tableid = nla_get_u32_default(tb[RTA_TABLE], 0); mrt = __ip6mr_get_table(net, tableid ?: RT_TABLE_DEFAULT); if (!mrt) { NL_SET_ERR_MSG_MOD(extack, "MR table does not exist"); return -ENOENT; } /* entries are added/deleted only under RTNL */ rcu_read_lock(); cache = ip6mr_cache_find(mrt, &src, &grp); rcu_read_unlock(); if (!cache) { NL_SET_ERR_MSG_MOD(extack, "MR cache entry not found"); return -ENOENT; } skb = nlmsg_new(mr6_msgsize(false, mrt->maxvif), GFP_KERNEL); if (!skb) return -ENOBUFS; err = ip6mr_fill_mroute(mrt, skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, cache, RTM_NEWROUTE, 0); if (err < 0) { kfree_skb(skb); return err; } return rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); } static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) { const struct nlmsghdr *nlh = cb->nlh; struct fib_dump_filter filter = { .rtnl_held = true, }; int err; if (cb->strict_check) { err = ip_valid_fib_dump_req(sock_net(skb->sk), nlh, &filter, cb); if (err < 0) return err; } if (filter.table_id) { struct mr_table *mrt; mrt = __ip6mr_get_table(sock_net(skb->sk), filter.table_id); if (!mrt) { if (rtnl_msg_family(cb->nlh) != RTNL_FAMILY_IP6MR) return skb->len; NL_SET_ERR_MSG_MOD(cb->extack, "MR table does not exist"); return -ENOENT; } err = mr_table_dump(mrt, skb, cb, _ip6mr_fill_mroute, &mfc_unres_lock, &filter); return skb->len ? : err; } return mr_rtm_dumproute(skb, cb, ip6mr_mr_table_iter, _ip6mr_fill_mroute, &mfc_unres_lock, &filter); }
1 1 1 43 44 43 43 44 43 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 // SPDX-License-Identifier: GPL-2.0 /* * Devices PM QoS constraints management * * Copyright (C) 2011 Texas Instruments, Inc. * * This module exposes the interface to kernel space for specifying * per-device PM QoS dependencies. It provides infrastructure for registration * of: * * Dependents on a QoS value : register requests * Watchers of QoS value : get notified when target QoS value changes * * This QoS design is best effort based. Dependents register their QoS needs. * Watchers register to keep track of the current QoS needs of the system. * Watchers can register a per-device notification callback using the * dev_pm_qos_*_notifier API. The notification chain data is stored in the * per-device constraint data struct. * * Note about the per-device constraint data struct allocation: * . The per-device constraints data struct ptr is stored into the device * dev_pm_info. * . To minimize the data usage by the per-device constraints, the data struct * is only allocated at the first call to dev_pm_qos_add_request. * . The data is later free'd when the device is removed from the system. * . A global mutex protects the constraints users from the data being * allocated and free'd. */ #include <linux/pm_qos.h> #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/device.h> #include <linux/mutex.h> #include <linux/export.h> #include <linux/pm_runtime.h> #include <linux/err.h> #include <trace/events/power.h> #include "power.h" static DEFINE_MUTEX(dev_pm_qos_mtx); static DEFINE_MUTEX(dev_pm_qos_sysfs_mtx); /** * __dev_pm_qos_flags - Check PM QoS flags for a given device. * @dev: Device to check the PM QoS flags for. * @mask: Flags to check against. * * This routine must be called with dev->power.lock held. */ enum pm_qos_flags_status __dev_pm_qos_flags(struct device *dev, s32 mask) { struct dev_pm_qos *qos = dev->power.qos; struct pm_qos_flags *pqf; s32 val; lockdep_assert_held(&dev->power.lock); if (IS_ERR_OR_NULL(qos)) return PM_QOS_FLAGS_UNDEFINED; pqf = &qos->flags; if (list_empty(&pqf->list)) return PM_QOS_FLAGS_UNDEFINED; val = pqf->effective_flags & mask; if (val) return (val == mask) ? PM_QOS_FLAGS_ALL : PM_QOS_FLAGS_SOME; return PM_QOS_FLAGS_NONE; } /** * dev_pm_qos_flags - Check PM QoS flags for a given device (locked). * @dev: Device to check the PM QoS flags for. * @mask: Flags to check against. */ enum pm_qos_flags_status dev_pm_qos_flags(struct device *dev, s32 mask) { unsigned long irqflags; enum pm_qos_flags_status ret; spin_lock_irqsave(&dev->power.lock, irqflags); ret = __dev_pm_qos_flags(dev, mask); spin_unlock_irqrestore(&dev->power.lock, irqflags); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_flags); /** * __dev_pm_qos_resume_latency - Get resume latency constraint for a given device. * @dev: Device to get the PM QoS constraint value for. * * This routine must be called with dev->power.lock held. */ s32 __dev_pm_qos_resume_latency(struct device *dev) { lockdep_assert_held(&dev->power.lock); return dev_pm_qos_raw_resume_latency(dev); } /** * dev_pm_qos_read_value - Get PM QoS constraint for a given device (locked). * @dev: Device to get the PM QoS constraint value for. * @type: QoS request type. */ s32 dev_pm_qos_read_value(struct device *dev, enum dev_pm_qos_req_type type) { struct dev_pm_qos *qos = dev->power.qos; unsigned long flags; s32 ret; spin_lock_irqsave(&dev->power.lock, flags); switch (type) { case DEV_PM_QOS_RESUME_LATENCY: ret = IS_ERR_OR_NULL(qos) ? PM_QOS_RESUME_LATENCY_NO_CONSTRAINT : pm_qos_read_value(&qos->resume_latency); break; case DEV_PM_QOS_MIN_FREQUENCY: ret = IS_ERR_OR_NULL(qos) ? PM_QOS_MIN_FREQUENCY_DEFAULT_VALUE : freq_qos_read_value(&qos->freq, FREQ_QOS_MIN); break; case DEV_PM_QOS_MAX_FREQUENCY: ret = IS_ERR_OR_NULL(qos) ? PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE : freq_qos_read_value(&qos->freq, FREQ_QOS_MAX); break; default: WARN_ON(1); ret = 0; } spin_unlock_irqrestore(&dev->power.lock, flags); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_read_value); /** * apply_constraint - Add/modify/remove device PM QoS request. * @req: Constraint request to apply * @action: Action to perform (add/update/remove). * @value: Value to assign to the QoS request. * * Internal function to update the constraints list using the PM QoS core * code and if needed call the per-device callbacks. */ static int apply_constraint(struct dev_pm_qos_request *req, enum pm_qos_req_action action, s32 value) { struct dev_pm_qos *qos = req->dev->power.qos; int ret; switch(req->type) { case DEV_PM_QOS_RESUME_LATENCY: if (WARN_ON(action != PM_QOS_REMOVE_REQ && value < 0)) value = 0; ret = pm_qos_update_target(&qos->resume_latency, &req->data.pnode, action, value); break; case DEV_PM_QOS_LATENCY_TOLERANCE: ret = pm_qos_update_target(&qos->latency_tolerance, &req->data.pnode, action, value); if (ret) { value = pm_qos_read_value(&qos->latency_tolerance); req->dev->power.set_latency_tolerance(req->dev, value); } break; case DEV_PM_QOS_MIN_FREQUENCY: case DEV_PM_QOS_MAX_FREQUENCY: ret = freq_qos_apply(&req->data.freq, action, value); break; case DEV_PM_QOS_FLAGS: ret = pm_qos_update_flags(&qos->flags, &req->data.flr, action, value); break; default: ret = -EINVAL; } return ret; } /* * dev_pm_qos_constraints_allocate * @dev: device to allocate data for * * Called at the first call to add_request, for constraint data allocation * Must be called with the dev_pm_qos_mtx mutex held */ static int dev_pm_qos_constraints_allocate(struct device *dev) { struct dev_pm_qos *qos; struct pm_qos_constraints *c; struct blocking_notifier_head *n; qos = kzalloc_obj(*qos); if (!qos) return -ENOMEM; n = kzalloc_objs(*n, 3); if (!n) { kfree(qos); return -ENOMEM; } c = &qos->resume_latency; plist_head_init(&c->list); c->target_value = PM_QOS_RESUME_LATENCY_DEFAULT_VALUE; c->default_value = PM_QOS_RESUME_LATENCY_DEFAULT_VALUE; c->no_constraint_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT; c->type = PM_QOS_MIN; c->notifiers = n; BLOCKING_INIT_NOTIFIER_HEAD(n); c = &qos->latency_tolerance; plist_head_init(&c->list); c->target_value = PM_QOS_LATENCY_TOLERANCE_DEFAULT_VALUE; c->default_value = PM_QOS_LATENCY_TOLERANCE_DEFAULT_VALUE; c->no_constraint_value = PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT; c->type = PM_QOS_MIN; freq_constraints_init(&qos->freq); INIT_LIST_HEAD(&qos->flags.list); spin_lock_irq(&dev->power.lock); dev->power.qos = qos; spin_unlock_irq(&dev->power.lock); return 0; } static void __dev_pm_qos_hide_latency_limit(struct device *dev); static void __dev_pm_qos_hide_flags(struct device *dev); /** * dev_pm_qos_constraints_destroy * @dev: target device * * Called from the device PM subsystem on device removal under device_pm_lock(). */ void dev_pm_qos_constraints_destroy(struct device *dev) { struct dev_pm_qos *qos; struct dev_pm_qos_request *req, *tmp; struct pm_qos_constraints *c; struct pm_qos_flags *f; mutex_lock(&dev_pm_qos_sysfs_mtx); /* * If the device's PM QoS resume latency limit or PM QoS flags have been * exposed to user space, they have to be hidden at this point. */ pm_qos_sysfs_remove_resume_latency(dev); pm_qos_sysfs_remove_flags(dev); mutex_lock(&dev_pm_qos_mtx); __dev_pm_qos_hide_latency_limit(dev); __dev_pm_qos_hide_flags(dev); qos = dev->power.qos; if (!qos) goto out; /* Flush the constraints lists for the device. */ c = &qos->resume_latency; plist_for_each_entry_safe(req, tmp, &c->list, data.pnode) { /* * Update constraints list and call the notification * callbacks if needed */ apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } c = &qos->latency_tolerance; plist_for_each_entry_safe(req, tmp, &c->list, data.pnode) { apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } c = &qos->freq.min_freq; plist_for_each_entry_safe(req, tmp, &c->list, data.freq.pnode) { apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_MIN_FREQUENCY_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } c = &qos->freq.max_freq; plist_for_each_entry_safe(req, tmp, &c->list, data.freq.pnode) { apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } f = &qos->flags; list_for_each_entry_safe(req, tmp, &f->list, data.flr.node) { apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } spin_lock_irq(&dev->power.lock); dev->power.qos = ERR_PTR(-ENODEV); spin_unlock_irq(&dev->power.lock); kfree(qos->resume_latency.notifiers); kfree(qos); out: mutex_unlock(&dev_pm_qos_mtx); mutex_unlock(&dev_pm_qos_sysfs_mtx); } static bool dev_pm_qos_invalid_req_type(struct device *dev, enum dev_pm_qos_req_type type) { return type == DEV_PM_QOS_LATENCY_TOLERANCE && !dev->power.set_latency_tolerance; } static int __dev_pm_qos_add_request(struct device *dev, struct dev_pm_qos_request *req, enum dev_pm_qos_req_type type, s32 value) { int ret = 0; if (!dev || !req || dev_pm_qos_invalid_req_type(dev, type)) return -EINVAL; if (WARN(dev_pm_qos_request_active(req), "%s() called for already added request\n", __func__)) return -EINVAL; if (IS_ERR(dev->power.qos)) ret = -ENODEV; else if (!dev->power.qos) ret = dev_pm_qos_constraints_allocate(dev); trace_dev_pm_qos_add_request(dev_name(dev), type, value); if (ret) return ret; req->dev = dev; req->type = type; if (req->type == DEV_PM_QOS_MIN_FREQUENCY) ret = freq_qos_add_request(&dev->power.qos->freq, &req->data.freq, FREQ_QOS_MIN, value); else if (req->type == DEV_PM_QOS_MAX_FREQUENCY) ret = freq_qos_add_request(&dev->power.qos->freq, &req->data.freq, FREQ_QOS_MAX, value); else ret = apply_constraint(req, PM_QOS_ADD_REQ, value); return ret; } /** * dev_pm_qos_add_request - inserts new qos request into the list * @dev: target device for the constraint * @req: pointer to a preallocated handle * @type: type of the request * @value: defines the qos request * * This function inserts a new entry in the device constraints list of * requested qos performance characteristics. It recomputes the aggregate * QoS expectations of parameters and initializes the dev_pm_qos_request * handle. Caller needs to save this handle for later use in updates and * removal. * * Returns 1 if the aggregated constraint value has changed, * 0 if the aggregated constraint value has not changed, * -EINVAL in case of wrong parameters, -ENOMEM if there's not enough memory * to allocate for data structures, -ENODEV if the device has just been removed * from the system. * * Callers should ensure that the target device is not RPM_SUSPENDED before * using this function for requests of type DEV_PM_QOS_FLAGS. */ int dev_pm_qos_add_request(struct device *dev, struct dev_pm_qos_request *req, enum dev_pm_qos_req_type type, s32 value) { int ret; mutex_lock(&dev_pm_qos_mtx); ret = __dev_pm_qos_add_request(dev, req, type, value); mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_add_request); /** * __dev_pm_qos_update_request - Modify an existing device PM QoS request. * @req : PM QoS request to modify. * @new_value: New value to request. */ static int __dev_pm_qos_update_request(struct dev_pm_qos_request *req, s32 new_value) { s32 curr_value; int ret = 0; if (!req) /*guard against callers passing in null */ return -EINVAL; if (WARN(!dev_pm_qos_request_active(req), "%s() called for unknown object\n", __func__)) return -EINVAL; if (IS_ERR_OR_NULL(req->dev->power.qos)) return -ENODEV; switch(req->type) { case DEV_PM_QOS_RESUME_LATENCY: case DEV_PM_QOS_LATENCY_TOLERANCE: curr_value = req->data.pnode.prio; break; case DEV_PM_QOS_MIN_FREQUENCY: case DEV_PM_QOS_MAX_FREQUENCY: curr_value = req->data.freq.pnode.prio; break; case DEV_PM_QOS_FLAGS: curr_value = req->data.flr.flags; break; default: return -EINVAL; } trace_dev_pm_qos_update_request(dev_name(req->dev), req->type, new_value); if (curr_value != new_value) ret = apply_constraint(req, PM_QOS_UPDATE_REQ, new_value); return ret; } /** * dev_pm_qos_update_request - modifies an existing qos request * @req : handle to list element holding a dev_pm_qos request to use * @new_value: defines the qos request * * Updates an existing dev PM qos request along with updating the * target value. * * Attempts are made to make this code callable on hot code paths. * * Returns 1 if the aggregated constraint value has changed, * 0 if the aggregated constraint value has not changed, * -EINVAL in case of wrong parameters, -ENODEV if the device has been * removed from the system * * Callers should ensure that the target device is not RPM_SUSPENDED before * using this function for requests of type DEV_PM_QOS_FLAGS. */ int dev_pm_qos_update_request(struct dev_pm_qos_request *req, s32 new_value) { int ret; mutex_lock(&dev_pm_qos_mtx); ret = __dev_pm_qos_update_request(req, new_value); mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_update_request); static int __dev_pm_qos_remove_request(struct dev_pm_qos_request *req) { int ret; if (!req) /*guard against callers passing in null */ return -EINVAL; if (WARN(!dev_pm_qos_request_active(req), "%s() called for unknown object\n", __func__)) return -EINVAL; if (IS_ERR_OR_NULL(req->dev->power.qos)) return -ENODEV; trace_dev_pm_qos_remove_request(dev_name(req->dev), req->type, PM_QOS_DEFAULT_VALUE); ret = apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); return ret; } /** * dev_pm_qos_remove_request - modifies an existing qos request * @req: handle to request list element * * Will remove pm qos request from the list of constraints and * recompute the current target value. Call this on slow code paths. * * Returns 1 if the aggregated constraint value has changed, * 0 if the aggregated constraint value has not changed, * -EINVAL in case of wrong parameters, -ENODEV if the device has been * removed from the system * * Callers should ensure that the target device is not RPM_SUSPENDED before * using this function for requests of type DEV_PM_QOS_FLAGS. */ int dev_pm_qos_remove_request(struct dev_pm_qos_request *req) { int ret; mutex_lock(&dev_pm_qos_mtx); ret = __dev_pm_qos_remove_request(req); mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_remove_request); /** * dev_pm_qos_add_notifier - sets notification entry for changes to target value * of per-device PM QoS constraints * * @dev: target device for the constraint * @notifier: notifier block managed by caller. * @type: request type. * * Will register the notifier into a notification chain that gets called * upon changes to the target value for the device. * * If the device's constraints object doesn't exist when this routine is called, * it will be created (or error code will be returned if that fails). */ int dev_pm_qos_add_notifier(struct device *dev, struct notifier_block *notifier, enum dev_pm_qos_req_type type) { int ret = 0; mutex_lock(&dev_pm_qos_mtx); if (IS_ERR(dev->power.qos)) ret = -ENODEV; else if (!dev->power.qos) ret = dev_pm_qos_constraints_allocate(dev); if (ret) goto unlock; switch (type) { case DEV_PM_QOS_RESUME_LATENCY: ret = blocking_notifier_chain_register(dev->power.qos->resume_latency.notifiers, notifier); break; case DEV_PM_QOS_MIN_FREQUENCY: ret = freq_qos_add_notifier(&dev->power.qos->freq, FREQ_QOS_MIN, notifier); break; case DEV_PM_QOS_MAX_FREQUENCY: ret = freq_qos_add_notifier(&dev->power.qos->freq, FREQ_QOS_MAX, notifier); break; default: WARN_ON(1); ret = -EINVAL; } unlock: mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_add_notifier); /** * dev_pm_qos_remove_notifier - deletes notification for changes to target value * of per-device PM QoS constraints * * @dev: target device for the constraint * @notifier: notifier block to be removed. * @type: request type. * * Will remove the notifier from the notification chain that gets called * upon changes to the target value. */ int dev_pm_qos_remove_notifier(struct device *dev, struct notifier_block *notifier, enum dev_pm_qos_req_type type) { int ret = 0; mutex_lock(&dev_pm_qos_mtx); /* Silently return if the constraints object is not present. */ if (IS_ERR_OR_NULL(dev->power.qos)) goto unlock; switch (type) { case DEV_PM_QOS_RESUME_LATENCY: ret = blocking_notifier_chain_unregister(dev->power.qos->resume_latency.notifiers, notifier); break; case DEV_PM_QOS_MIN_FREQUENCY: ret = freq_qos_remove_notifier(&dev->power.qos->freq, FREQ_QOS_MIN, notifier); break; case DEV_PM_QOS_MAX_FREQUENCY: ret = freq_qos_remove_notifier(&dev->power.qos->freq, FREQ_QOS_MAX, notifier); break; default: WARN_ON(1); ret = -EINVAL; } unlock: mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_remove_notifier); /** * dev_pm_qos_add_ancestor_request - Add PM QoS request for device's ancestor. * @dev: Device whose ancestor to add the request for. * @req: Pointer to the preallocated handle. * @type: Type of the request. * @value: Constraint latency value. */ int dev_pm_qos_add_ancestor_request(struct device *dev, struct dev_pm_qos_request *req, enum dev_pm_qos_req_type type, s32 value) { struct device *ancestor = dev->parent; int ret = -ENODEV; switch (type) { case DEV_PM_QOS_RESUME_LATENCY: while (ancestor && !ancestor->power.ignore_children) ancestor = ancestor->parent; break; case DEV_PM_QOS_LATENCY_TOLERANCE: while (ancestor && !ancestor->power.set_latency_tolerance) ancestor = ancestor->parent; break; default: ancestor = NULL; } if (ancestor) ret = dev_pm_qos_add_request(ancestor, req, type, value); if (ret < 0) req->dev = NULL; return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_add_ancestor_request); static void __dev_pm_qos_drop_user_request(struct device *dev, enum dev_pm_qos_req_type type) { struct dev_pm_qos_request *req = NULL; switch(type) { case DEV_PM_QOS_RESUME_LATENCY: req = dev->power.qos->resume_latency_req; dev->power.qos->resume_latency_req = NULL; break; case DEV_PM_QOS_LATENCY_TOLERANCE: req = dev->power.qos->latency_tolerance_req; dev->power.qos->latency_tolerance_req = NULL; break; case DEV_PM_QOS_FLAGS: req = dev->power.qos->flags_req; dev->power.qos->flags_req = NULL; break; default: WARN_ON(1); return; } __dev_pm_qos_remove_request(req); kfree(req); } static void dev_pm_qos_drop_user_request(struct device *dev, enum dev_pm_qos_req_type type) { mutex_lock(&dev_pm_qos_mtx); __dev_pm_qos_drop_user_request(dev, type); mutex_unlock(&dev_pm_qos_mtx); } /** * dev_pm_qos_expose_latency_limit - Expose PM QoS latency limit to user space. * @dev: Device whose PM QoS latency limit is to be exposed to user space. * @value: Initial value of the latency limit. */ int dev_pm_qos_expose_latency_limit(struct device *dev, s32 value) { struct dev_pm_qos_request *req; int ret; if (!device_is_registered(dev) || value < 0) return -EINVAL; req = kzalloc_obj(*req); if (!req) return -ENOMEM; ret = dev_pm_qos_add_request(dev, req, DEV_PM_QOS_RESUME_LATENCY, value); if (ret < 0) { kfree(req); return ret; } mutex_lock(&dev_pm_qos_sysfs_mtx); mutex_lock(&dev_pm_qos_mtx); if (IS_ERR_OR_NULL(dev->power.qos)) ret = -ENODEV; else if (dev->power.qos->resume_latency_req) ret = -EEXIST; if (ret < 0) { __dev_pm_qos_remove_request(req); kfree(req); mutex_unlock(&dev_pm_qos_mtx); goto out; } dev->power.qos->resume_latency_req = req; mutex_unlock(&dev_pm_qos_mtx); ret = pm_qos_sysfs_add_resume_latency(dev); if (ret) dev_pm_qos_drop_user_request(dev, DEV_PM_QOS_RESUME_LATENCY); out: mutex_unlock(&dev_pm_qos_sysfs_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_expose_latency_limit); static void __dev_pm_qos_hide_latency_limit(struct device *dev) { if (!IS_ERR_OR_NULL(dev->power.qos) && dev->power.qos->resume_latency_req) __dev_pm_qos_drop_user_request(dev, DEV_PM_QOS_RESUME_LATENCY); } /** * dev_pm_qos_hide_latency_limit - Hide PM QoS latency limit from user space. * @dev: Device whose PM QoS latency limit is to be hidden from user space. */ void dev_pm_qos_hide_latency_limit(struct device *dev) { mutex_lock(&dev_pm_qos_sysfs_mtx); pm_qos_sysfs_remove_resume_latency(dev); mutex_lock(&dev_pm_qos_mtx); __dev_pm_qos_hide_latency_limit(dev); mutex_unlock(&dev_pm_qos_mtx); mutex_unlock(&dev_pm_qos_sysfs_mtx); } EXPORT_SYMBOL_GPL(dev_pm_qos_hide_latency_limit); /** * dev_pm_qos_expose_flags - Expose PM QoS flags of a device to user space. * @dev: Device whose PM QoS flags are to be exposed to user space. * @val: Initial values of the flags. */ int dev_pm_qos_expose_flags(struct device *dev, s32 val) { struct dev_pm_qos_request *req; int ret; if (!device_is_registered(dev)) return -EINVAL; req = kzalloc_obj(*req); if (!req) return -ENOMEM; ret = dev_pm_qos_add_request(dev, req, DEV_PM_QOS_FLAGS, val); if (ret < 0) { kfree(req); return ret; } pm_runtime_get_sync(dev); mutex_lock(&dev_pm_qos_sysfs_mtx); mutex_lock(&dev_pm_qos_mtx); if (IS_ERR_OR_NULL(dev->power.qos)) ret = -ENODEV; else if (dev->power.qos->flags_req) ret = -EEXIST; if (ret < 0) { __dev_pm_qos_remove_request(req); kfree(req); mutex_unlock(&dev_pm_qos_mtx); goto out; } dev->power.qos->flags_req = req; mutex_unlock(&dev_pm_qos_mtx); ret = pm_qos_sysfs_add_flags(dev); if (ret) dev_pm_qos_drop_user_request(dev, DEV_PM_QOS_FLAGS); out: mutex_unlock(&dev_pm_qos_sysfs_mtx); pm_runtime_put(dev); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_expose_flags); static void __dev_pm_qos_hide_flags(struct device *dev) { if (!IS_ERR_OR_NULL(dev->power.qos) && dev->power.qos->flags_req) __dev_pm_qos_drop_user_request(dev, DEV_PM_QOS_FLAGS); } /** * dev_pm_qos_hide_flags - Hide PM QoS flags of a device from user space. * @dev: Device whose PM QoS flags are to be hidden from user space. */ void dev_pm_qos_hide_flags(struct device *dev) { pm_runtime_get_sync(dev); mutex_lock(&dev_pm_qos_sysfs_mtx); pm_qos_sysfs_remove_flags(dev); mutex_lock(&dev_pm_qos_mtx); __dev_pm_qos_hide_flags(dev); mutex_unlock(&dev_pm_qos_mtx); mutex_unlock(&dev_pm_qos_sysfs_mtx); pm_runtime_put(dev); } EXPORT_SYMBOL_GPL(dev_pm_qos_hide_flags); /** * dev_pm_qos_update_flags - Update PM QoS flags request owned by user space. * @dev: Device to update the PM QoS flags request for. * @mask: Flags to set/clear. * @set: Whether to set or clear the flags (true means set). */ int dev_pm_qos_update_flags(struct device *dev, s32 mask, bool set) { s32 value; int ret; pm_runtime_get_sync(dev); mutex_lock(&dev_pm_qos_mtx); if (IS_ERR_OR_NULL(dev->power.qos) || !dev->power.qos->flags_req) { ret = -EINVAL; goto out; } value = dev_pm_qos_requested_flags(dev); if (set) value |= mask; else value &= ~mask; ret = __dev_pm_qos_update_request(dev->power.qos->flags_req, value); out: mutex_unlock(&dev_pm_qos_mtx); pm_runtime_put(dev); return ret; } /** * dev_pm_qos_get_user_latency_tolerance - Get user space latency tolerance. * @dev: Device to obtain the user space latency tolerance for. */ s32 dev_pm_qos_get_user_latency_tolerance(struct device *dev) { s32 ret; mutex_lock(&dev_pm_qos_mtx); ret = IS_ERR_OR_NULL(dev->power.qos) || !dev->power.qos->latency_tolerance_req ? PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT : dev->power.qos->latency_tolerance_req->data.pnode.prio; mutex_unlock(&dev_pm_qos_mtx); return ret; } /** * dev_pm_qos_update_user_latency_tolerance - Update user space latency tolerance. * @dev: Device to update the user space latency tolerance for. * @val: New user space latency tolerance for @dev (negative values disable). */ int dev_pm_qos_update_user_latency_tolerance(struct device *dev, s32 val) { int ret; mutex_lock(&dev_pm_qos_mtx); if (IS_ERR_OR_NULL(dev->power.qos) || !dev->power.qos->latency_tolerance_req) { struct dev_pm_qos_request *req; if (val < 0) { if (val == PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT) ret = 0; else ret = -EINVAL; goto out; } req = kzalloc_obj(*req); if (!req) { ret = -ENOMEM; goto out; } ret = __dev_pm_qos_add_request(dev, req, DEV_PM_QOS_LATENCY_TOLERANCE, val); if (ret < 0) { kfree(req); goto out; } dev->power.qos->latency_tolerance_req = req; } else { if (val < 0) { __dev_pm_qos_drop_user_request(dev, DEV_PM_QOS_LATENCY_TOLERANCE); ret = 0; } else { ret = __dev_pm_qos_update_request(dev->power.qos->latency_tolerance_req, val); } } out: mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_update_user_latency_tolerance); /** * dev_pm_qos_expose_latency_tolerance - Expose latency tolerance to userspace * @dev: Device whose latency tolerance to expose */ int dev_pm_qos_expose_latency_tolerance(struct device *dev) { int ret; if (!dev->power.set_latency_tolerance) return -EINVAL; mutex_lock(&dev_pm_qos_sysfs_mtx); ret = pm_qos_sysfs_add_latency_tolerance(dev); mutex_unlock(&dev_pm_qos_sysfs_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_expose_latency_tolerance); /** * dev_pm_qos_hide_latency_tolerance - Hide latency tolerance from userspace * @dev: Device whose latency tolerance to hide */ void dev_pm_qos_hide_latency_tolerance(struct device *dev) { mutex_lock(&dev_pm_qos_sysfs_mtx); pm_qos_sysfs_remove_latency_tolerance(dev); mutex_unlock(&dev_pm_qos_sysfs_mtx); /* Remove the request from user space now */ pm_runtime_get_sync(dev); dev_pm_qos_update_user_latency_tolerance(dev, PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT); pm_runtime_put(dev); } EXPORT_SYMBOL_GPL(dev_pm_qos_hide_latency_tolerance);
2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) by Jaroslav Kysela <perex@perex.cz> * Takashi Iwai <tiwai@suse.de> * * Generic memory allocators */ #include <linux/slab.h> #include <linux/mm.h> #include <linux/dma-mapping.h> #include <linux/dma-map-ops.h> #include <linux/genalloc.h> #include <linux/highmem.h> #include <linux/vmalloc.h> #ifdef CONFIG_X86 #include <asm/set_memory.h> #endif #include <sound/memalloc.h> struct snd_malloc_ops { void *(*alloc)(struct snd_dma_buffer *dmab, size_t size); void (*free)(struct snd_dma_buffer *dmab); dma_addr_t (*get_addr)(struct snd_dma_buffer *dmab, size_t offset); struct page *(*get_page)(struct snd_dma_buffer *dmab, size_t offset); unsigned int (*get_chunk_size)(struct snd_dma_buffer *dmab, unsigned int ofs, unsigned int size); int (*mmap)(struct snd_dma_buffer *dmab, struct vm_area_struct *area); void (*sync)(struct snd_dma_buffer *dmab, enum snd_dma_sync_mode mode); }; #define DEFAULT_GFP \ (GFP_KERNEL | \ __GFP_RETRY_MAYFAIL | /* don't trigger OOM-killer */ \ __GFP_NOWARN) /* no stack trace print - this call is non-critical */ static const struct snd_malloc_ops *snd_dma_get_ops(struct snd_dma_buffer *dmab); static void *__snd_dma_alloc_pages(struct snd_dma_buffer *dmab, size_t size) { const struct snd_malloc_ops *ops = snd_dma_get_ops(dmab); if (WARN_ON_ONCE(!ops || !ops->alloc)) return NULL; return ops->alloc(dmab, size); } /** * snd_dma_alloc_dir_pages - allocate the buffer area according to the given * type and direction * @type: the DMA buffer type * @device: the device pointer * @dir: DMA direction * @size: the buffer size to allocate * @dmab: buffer allocation record to store the allocated data * * Calls the memory-allocator function for the corresponding * buffer type. * * Return: Zero if the buffer with the given size is allocated successfully, * otherwise a negative value on error. */ int snd_dma_alloc_dir_pages(int type, struct device *device, enum dma_data_direction dir, size_t size, struct snd_dma_buffer *dmab) { if (WARN_ON(!size)) return -ENXIO; if (WARN_ON(!dmab)) return -ENXIO; size = PAGE_ALIGN(size); dmab->dev.type = type; dmab->dev.dev = device; dmab->dev.dir = dir; dmab->bytes = 0; dmab->addr = 0; dmab->private_data = NULL; dmab->area = __snd_dma_alloc_pages(dmab, size); if (!dmab->area) return -ENOMEM; dmab->bytes = size; return 0; } EXPORT_SYMBOL(snd_dma_alloc_dir_pages); /** * snd_dma_alloc_pages_fallback - allocate the buffer area according to the given type with fallback * @type: the DMA buffer type * @device: the device pointer * @size: the buffer size to allocate * @dmab: buffer allocation record to store the allocated data * * Calls the memory-allocator function for the corresponding * buffer type. When no space is left, this function reduces the size and * tries to allocate again. The size actually allocated is stored in * res_size argument. * * Return: Zero if the buffer with the given size is allocated successfully, * otherwise a negative value on error. */ int snd_dma_alloc_pages_fallback(int type, struct device *device, size_t size, struct snd_dma_buffer *dmab) { int err; while ((err = snd_dma_alloc_pages(type, device, size, dmab)) < 0) { if (err != -ENOMEM) return err; if (size <= PAGE_SIZE) return -ENOMEM; size >>= 1; size = PAGE_SIZE << get_order(size); } if (! dmab->area) return -ENOMEM; return 0; } EXPORT_SYMBOL(snd_dma_alloc_pages_fallback); /** * snd_dma_free_pages - release the allocated buffer * @dmab: the buffer allocation record to release * * Releases the allocated buffer via snd_dma_alloc_pages(). */ void snd_dma_free_pages(struct snd_dma_buffer *dmab) { const struct snd_malloc_ops *ops = snd_dma_get_ops(dmab); if (ops && ops->free) ops->free(dmab); } EXPORT_SYMBOL(snd_dma_free_pages); /* called by devres */ static void __snd_release_pages(struct device *dev, void *res) { snd_dma_free_pages(res); } /** * snd_devm_alloc_dir_pages - allocate the buffer and manage with devres * @dev: the device pointer * @type: the DMA buffer type * @dir: DMA direction * @size: the buffer size to allocate * * Allocate buffer pages depending on the given type and manage using devres. * The pages will be released automatically at the device removal. * * Unlike snd_dma_alloc_pages(), this function requires the real device pointer, * hence it can't work with SNDRV_DMA_TYPE_CONTINUOUS or * SNDRV_DMA_TYPE_VMALLOC type. * * Return: the snd_dma_buffer object at success, or NULL if failed */ struct snd_dma_buffer * snd_devm_alloc_dir_pages(struct device *dev, int type, enum dma_data_direction dir, size_t size) { struct snd_dma_buffer *dmab; int err; if (WARN_ON(type == SNDRV_DMA_TYPE_CONTINUOUS || type == SNDRV_DMA_TYPE_VMALLOC)) return NULL; dmab = devres_alloc(__snd_release_pages, sizeof(*dmab), GFP_KERNEL); if (!dmab) return NULL; err = snd_dma_alloc_dir_pages(type, dev, dir, size, dmab); if (err < 0) { devres_free(dmab); return NULL; } devres_add(dev, dmab); return dmab; } EXPORT_SYMBOL_GPL(snd_devm_alloc_dir_pages); /** * snd_dma_buffer_mmap - perform mmap of the given DMA buffer * @dmab: buffer allocation information * @area: VM area information * * Return: zero if successful, or a negative error code */ int snd_dma_buffer_mmap(struct snd_dma_buffer *dmab, struct vm_area_struct *area) { const struct snd_malloc_ops *ops; if (!dmab) return -ENOENT; ops = snd_dma_get_ops(dmab); if (ops && ops->mmap) return ops->mmap(dmab, area); else return -ENOENT; } EXPORT_SYMBOL(snd_dma_buffer_mmap); #ifdef CONFIG_HAS_DMA /** * snd_dma_buffer_sync - sync DMA buffer between CPU and device * @dmab: buffer allocation information * @mode: sync mode */ void snd_dma_buffer_sync(struct snd_dma_buffer *dmab, enum snd_dma_sync_mode mode) { const struct snd_malloc_ops *ops; if (!dmab || !dmab->dev.need_sync) return; ops = snd_dma_get_ops(dmab); if (ops && ops->sync) ops->sync(dmab, mode); } EXPORT_SYMBOL_GPL(snd_dma_buffer_sync); #endif /* CONFIG_HAS_DMA */ /** * snd_sgbuf_get_addr - return the physical address at the corresponding offset * @dmab: buffer allocation information * @offset: offset in the ring buffer * * Return: the physical address */ dma_addr_t snd_sgbuf_get_addr(struct snd_dma_buffer *dmab, size_t offset) { const struct snd_malloc_ops *ops = snd_dma_get_ops(dmab); if (ops && ops->get_addr) return ops->get_addr(dmab, offset); else return dmab->addr + offset; } EXPORT_SYMBOL(snd_sgbuf_get_addr); /** * snd_sgbuf_get_page - return the physical page at the corresponding offset * @dmab: buffer allocation information * @offset: offset in the ring buffer * * Return: the page pointer */ struct page *snd_sgbuf_get_page(struct snd_dma_buffer *dmab, size_t offset) { const struct snd_malloc_ops *ops = snd_dma_get_ops(dmab); if (ops && ops->get_page) return ops->get_page(dmab, offset); else return virt_to_page(dmab->area + offset); } EXPORT_SYMBOL(snd_sgbuf_get_page); /** * snd_sgbuf_get_chunk_size - compute the max chunk size with continuous pages * on sg-buffer * @dmab: buffer allocation information * @ofs: offset in the ring buffer * @size: the requested size * * Return: the chunk size */ unsigned int snd_sgbuf_get_chunk_size(struct snd_dma_buffer *dmab, unsigned int ofs, unsigned int size) { const struct snd_malloc_ops *ops = snd_dma_get_ops(dmab); if (ops && ops->get_chunk_size) return ops->get_chunk_size(dmab, ofs, size); else return size; } EXPORT_SYMBOL(snd_sgbuf_get_chunk_size); /* * Continuous pages allocator */ static void *do_alloc_pages(struct device *dev, size_t size, dma_addr_t *addr, bool wc) { void *p; gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN; again: p = alloc_pages_exact(size, gfp); if (!p) return NULL; *addr = page_to_phys(virt_to_page(p)); if (!dev) return p; if ((*addr + size - 1) & ~dev->coherent_dma_mask) { if (IS_ENABLED(CONFIG_ZONE_DMA32) && !(gfp & GFP_DMA32)) { gfp |= GFP_DMA32; goto again; } if (IS_ENABLED(CONFIG_ZONE_DMA) && !(gfp & GFP_DMA)) { gfp = (gfp & ~GFP_DMA32) | GFP_DMA; goto again; } } #ifdef CONFIG_X86 if (wc) set_memory_wc((unsigned long)(p), size >> PAGE_SHIFT); #endif return p; } static void do_free_pages(void *p, size_t size, bool wc) { #ifdef CONFIG_X86 if (wc) set_memory_wb((unsigned long)(p), size >> PAGE_SHIFT); #endif free_pages_exact(p, size); } static void *snd_dma_continuous_alloc(struct snd_dma_buffer *dmab, size_t size) { return do_alloc_pages(dmab->dev.dev, size, &dmab->addr, false); } static void snd_dma_continuous_free(struct snd_dma_buffer *dmab) { do_free_pages(dmab->area, dmab->bytes, false); } static int snd_dma_continuous_mmap(struct snd_dma_buffer *dmab, struct vm_area_struct *area) { return remap_pfn_range(area, area->vm_start, dmab->addr >> PAGE_SHIFT, area->vm_end - area->vm_start, area->vm_page_prot); } static const struct snd_malloc_ops snd_dma_continuous_ops = { .alloc = snd_dma_continuous_alloc, .free = snd_dma_continuous_free, .mmap = snd_dma_continuous_mmap, }; /* * VMALLOC allocator */ static void *snd_dma_vmalloc_alloc(struct snd_dma_buffer *dmab, size_t size) { return vmalloc(size); } static void snd_dma_vmalloc_free(struct snd_dma_buffer *dmab) { vfree(dmab->area); } static int snd_dma_vmalloc_mmap(struct snd_dma_buffer *dmab, struct vm_area_struct *area) { return remap_vmalloc_range(area, dmab->area, 0); } #define get_vmalloc_page_addr(dmab, offset) \ page_to_phys(vmalloc_to_page((dmab)->area + (offset))) static dma_addr_t snd_dma_vmalloc_get_addr(struct snd_dma_buffer *dmab, size_t offset) { return get_vmalloc_page_addr(dmab, offset) + offset % PAGE_SIZE; } static struct page *snd_dma_vmalloc_get_page(struct snd_dma_buffer *dmab, size_t offset) { return vmalloc_to_page(dmab->area + offset); } static unsigned int snd_dma_vmalloc_get_chunk_size(struct snd_dma_buffer *dmab, unsigned int ofs, unsigned int size) { unsigned int start, end; unsigned long addr; start = ALIGN_DOWN(ofs, PAGE_SIZE); end = ofs + size - 1; /* the last byte address */ /* check page continuity */ addr = get_vmalloc_page_addr(dmab, start); for (;;) { start += PAGE_SIZE; if (start > end) break; addr += PAGE_SIZE; if (get_vmalloc_page_addr(dmab, start) != addr) return start - ofs; } /* ok, all on continuous pages */ return size; } static const struct snd_malloc_ops snd_dma_vmalloc_ops = { .alloc = snd_dma_vmalloc_alloc, .free = snd_dma_vmalloc_free, .mmap = snd_dma_vmalloc_mmap, .get_addr = snd_dma_vmalloc_get_addr, .get_page = snd_dma_vmalloc_get_page, .get_chunk_size = snd_dma_vmalloc_get_chunk_size, }; #ifdef CONFIG_HAS_DMA /* * IRAM allocator */ #ifdef CONFIG_GENERIC_ALLOCATOR static void *snd_dma_iram_alloc(struct snd_dma_buffer *dmab, size_t size) { struct device *dev = dmab->dev.dev; struct gen_pool *pool; void *p; if (dev->of_node) { pool = of_gen_pool_get(dev->of_node, "iram", 0); /* Assign the pool into private_data field */ dmab->private_data = pool; p = gen_pool_dma_alloc_align(pool, size, &dmab->addr, PAGE_SIZE); if (p) return p; } /* Internal memory might have limited size and no enough space, * so if we fail to malloc, try to fetch memory traditionally. */ dmab->dev.type = SNDRV_DMA_TYPE_DEV; return __snd_dma_alloc_pages(dmab, size); } static void snd_dma_iram_free(struct snd_dma_buffer *dmab) { struct gen_pool *pool = dmab->private_data; if (pool && dmab->area) gen_pool_free(pool, (unsigned long)dmab->area, dmab->bytes); } static int snd_dma_iram_mmap(struct snd_dma_buffer *dmab, struct vm_area_struct *area) { area->vm_page_prot = pgprot_writecombine(area->vm_page_prot); return remap_pfn_range(area, area->vm_start, dmab->addr >> PAGE_SHIFT, area->vm_end - area->vm_start, area->vm_page_prot); } static const struct snd_malloc_ops snd_dma_iram_ops = { .alloc = snd_dma_iram_alloc, .free = snd_dma_iram_free, .mmap = snd_dma_iram_mmap, }; #endif /* CONFIG_GENERIC_ALLOCATOR */ /* * Coherent device pages allocator */ static void *snd_dma_dev_alloc(struct snd_dma_buffer *dmab, size_t size) { return dma_alloc_coherent(dmab->dev.dev, size, &dmab->addr, DEFAULT_GFP); } static void snd_dma_dev_free(struct snd_dma_buffer *dmab) { dma_free_coherent(dmab->dev.dev, dmab->bytes, dmab->area, dmab->addr); } static int snd_dma_dev_mmap(struct snd_dma_buffer *dmab, struct vm_area_struct *area) { return dma_mmap_coherent(dmab->dev.dev, area, dmab->area, dmab->addr, dmab->bytes); } static const struct snd_malloc_ops snd_dma_dev_ops = { .alloc = snd_dma_dev_alloc, .free = snd_dma_dev_free, .mmap = snd_dma_dev_mmap, }; /* * Write-combined pages */ #ifdef CONFIG_SND_DMA_SGBUF /* x86-specific allocations */ static void *snd_dma_wc_alloc(struct snd_dma_buffer *dmab, size_t size) { void *p = do_alloc_pages(dmab->dev.dev, size, &dmab->addr, true); if (!p) return NULL; dmab->addr = dma_map_single(dmab->dev.dev, p, size, DMA_BIDIRECTIONAL); if (dma_mapping_error(dmab->dev.dev, dmab->addr)) { do_free_pages(dmab->area, size, true); return NULL; } return p; } static void snd_dma_wc_free(struct snd_dma_buffer *dmab) { dma_unmap_single(dmab->dev.dev, dmab->addr, dmab->bytes, DMA_BIDIRECTIONAL); do_free_pages(dmab->area, dmab->bytes, true); } static int snd_dma_wc_mmap(struct snd_dma_buffer *dmab, struct vm_area_struct *area) { area->vm_page_prot = pgprot_writecombine(area->vm_page_prot); return dma_mmap_coherent(dmab->dev.dev, area, dmab->area, dmab->addr, dmab->bytes); } #else static void *snd_dma_wc_alloc(struct snd_dma_buffer *dmab, size_t size) { return dma_alloc_wc(dmab->dev.dev, size, &dmab->addr, DEFAULT_GFP); } static void snd_dma_wc_free(struct snd_dma_buffer *dmab) { dma_free_wc(dmab->dev.dev, dmab->bytes, dmab->area, dmab->addr); } static int snd_dma_wc_mmap(struct snd_dma_buffer *dmab, struct vm_area_struct *area) { return dma_mmap_wc(dmab->dev.dev, area, dmab->area, dmab->addr, dmab->bytes); } #endif static const struct snd_malloc_ops snd_dma_wc_ops = { .alloc = snd_dma_wc_alloc, .free = snd_dma_wc_free, .mmap = snd_dma_wc_mmap, }; /* * Non-contiguous pages allocator */ static void *snd_dma_noncontig_alloc(struct snd_dma_buffer *dmab, size_t size) { struct sg_table *sgt; void *p; sgt = dma_alloc_noncontiguous(dmab->dev.dev, size, dmab->dev.dir, DEFAULT_GFP, 0); if (!sgt) return NULL; dmab->dev.need_sync = dma_need_sync(dmab->dev.dev, sg_dma_address(sgt->sgl)); p = dma_vmap_noncontiguous(dmab->dev.dev, size, sgt); if (p) { dmab->private_data = sgt; /* store the first page address for convenience */ dmab->addr = snd_sgbuf_get_addr(dmab, 0); } else { dma_free_noncontiguous(dmab->dev.dev, size, sgt, dmab->dev.dir); } return p; } static void snd_dma_noncontig_free(struct snd_dma_buffer *dmab) { dma_vunmap_noncontiguous(dmab->dev.dev, dmab->area); dma_free_noncontiguous(dmab->dev.dev, dmab->bytes, dmab->private_data, dmab->dev.dir); } static int snd_dma_noncontig_mmap(struct snd_dma_buffer *dmab, struct vm_area_struct *area) { return dma_mmap_noncontiguous(dmab->dev.dev, area, dmab->bytes, dmab->private_data); } static void snd_dma_noncontig_sync(struct snd_dma_buffer *dmab, enum snd_dma_sync_mode mode) { if (mode == SNDRV_DMA_SYNC_CPU) { if (dmab->dev.dir == DMA_TO_DEVICE) return; invalidate_kernel_vmap_range(dmab->area, dmab->bytes); dma_sync_sgtable_for_cpu(dmab->dev.dev, dmab->private_data, dmab->dev.dir); } else { if (dmab->dev.dir == DMA_FROM_DEVICE) return; flush_kernel_vmap_range(dmab->area, dmab->bytes); dma_sync_sgtable_for_device(dmab->dev.dev, dmab->private_data, dmab->dev.dir); } } static inline void snd_dma_noncontig_iter_set(struct snd_dma_buffer *dmab, struct sg_page_iter *piter, size_t offset) { struct sg_table *sgt = dmab->private_data; __sg_page_iter_start(piter, sgt->sgl, sgt->orig_nents, offset >> PAGE_SHIFT); } static dma_addr_t snd_dma_noncontig_get_addr(struct snd_dma_buffer *dmab, size_t offset) { struct sg_dma_page_iter iter; snd_dma_noncontig_iter_set(dmab, &iter.base, offset); __sg_page_iter_dma_next(&iter); return sg_page_iter_dma_address(&iter) + offset % PAGE_SIZE; } static struct page *snd_dma_noncontig_get_page(struct snd_dma_buffer *dmab, size_t offset) { struct sg_page_iter iter; snd_dma_noncontig_iter_set(dmab, &iter, offset); __sg_page_iter_next(&iter); return sg_page_iter_page(&iter); } static unsigned int snd_dma_noncontig_get_chunk_size(struct snd_dma_buffer *dmab, unsigned int ofs, unsigned int size) { struct sg_dma_page_iter iter; unsigned int start, end; unsigned long addr; start = ALIGN_DOWN(ofs, PAGE_SIZE); end = ofs + size - 1; /* the last byte address */ snd_dma_noncontig_iter_set(dmab, &iter.base, start); if (!__sg_page_iter_dma_next(&iter)) return 0; /* check page continuity */ addr = sg_page_iter_dma_address(&iter); for (;;) { start += PAGE_SIZE; if (start > end) break; addr += PAGE_SIZE; if (!__sg_page_iter_dma_next(&iter) || sg_page_iter_dma_address(&iter) != addr) return start - ofs; } /* ok, all on continuous pages */ return size; } static const struct snd_malloc_ops snd_dma_noncontig_ops = { .alloc = snd_dma_noncontig_alloc, .free = snd_dma_noncontig_free, .mmap = snd_dma_noncontig_mmap, .sync = snd_dma_noncontig_sync, .get_addr = snd_dma_noncontig_get_addr, .get_page = snd_dma_noncontig_get_page, .get_chunk_size = snd_dma_noncontig_get_chunk_size, }; #ifdef CONFIG_SND_DMA_SGBUF /* Fallback SG-buffer allocations for x86 */ struct snd_dma_sg_fallback { struct sg_table sgt; /* used by get_addr - must be the first item */ size_t count; struct page **pages; unsigned int *npages; }; static void __snd_dma_sg_fallback_free(struct snd_dma_buffer *dmab, struct snd_dma_sg_fallback *sgbuf) { bool wc = dmab->dev.type == SNDRV_DMA_TYPE_DEV_WC_SG;