Total coverage: 352122 (20%) of 1832231
4 11 5 4 13 1 25 1 21 16 12 4 13 13 13 13 3 10 5 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2016-2021 Christoph Hellwig. */ #include <linux/module.h> #include <linux/compiler.h> #include <linux/fs.h> #include <linux/iomap.h> #include <linux/fiemap.h> #include <linux/pagemap.h> static int iomap_to_fiemap(struct fiemap_extent_info *fi, const struct iomap *iomap, u32 flags) { switch (iomap->type) { case IOMAP_HOLE: /* skip holes */ return 0; case IOMAP_DELALLOC: flags |= FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN; break; case IOMAP_MAPPED: break; case IOMAP_UNWRITTEN: flags |= FIEMAP_EXTENT_UNWRITTEN; break; case IOMAP_INLINE: flags |= FIEMAP_EXTENT_DATA_INLINE; break; } if (iomap->flags & IOMAP_F_MERGED) flags |= FIEMAP_EXTENT_MERGED; if (iomap->flags & IOMAP_F_SHARED) flags |= FIEMAP_EXTENT_SHARED; return fiemap_fill_next_extent(fi, iomap->offset, iomap->addr != IOMAP_NULL_ADDR ? 
iomap->addr : 0, iomap->length, flags); } static loff_t iomap_fiemap_iter(const struct iomap_iter *iter, struct fiemap_extent_info *fi, struct iomap *prev) { int ret; if (iter->iomap.type == IOMAP_HOLE) return iomap_length(iter); ret = iomap_to_fiemap(fi, prev, 0); *prev = iter->iomap; switch (ret) { case 0: /* success */ return iomap_length(iter); case 1: /* extent array full */ return 0; default: /* error */ return ret; } } int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi, u64 start, u64 len, const struct iomap_ops *ops) { struct iomap_iter iter = { .inode = inode, .pos = start, .len = len, .flags = IOMAP_REPORT, }; struct iomap prev = { .type = IOMAP_HOLE, }; int ret; ret = fiemap_prep(inode, fi, start, &iter.len, 0); if (ret) return ret; while ((ret = iomap_iter(&iter, ops)) > 0) iter.processed = iomap_fiemap_iter(&iter, fi, &prev); if (prev.type != IOMAP_HOLE) { ret = iomap_to_fiemap(fi, &prev, FIEMAP_EXTENT_LAST); if (ret < 0) return ret; } /* inode with no (attribute) mapping will give ENOENT */ if (ret < 0 && ret != -ENOENT) return ret; return 0; } EXPORT_SYMBOL_GPL(iomap_fiemap); /* legacy ->bmap interface. 0 is the error return (!) */ sector_t iomap_bmap(struct address_space *mapping, sector_t bno, const struct iomap_ops *ops) { struct iomap_iter iter = { .inode = mapping->host, .pos = (loff_t)bno << mapping->host->i_blkbits, .len = i_blocksize(mapping->host), .flags = IOMAP_REPORT, }; const unsigned int blkshift = mapping->host->i_blkbits - SECTOR_SHIFT; int ret; if (filemap_write_and_wait(mapping)) return 0; bno = 0; while ((ret = iomap_iter(&iter, ops)) > 0) { if (iter.iomap.type == IOMAP_MAPPED) bno = iomap_sector(&iter.iomap, iter.pos) >> blkshift; /* leave iter.processed unset to abort loop */ } if (ret) return 0; return bno; } EXPORT_SYMBOL_GPL(iomap_bmap);
310 310 309 308 48 310 160 52 309 48 2 310 308 310 310 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 // SPDX-License-Identifier: BSD-3-Clause /* $OpenBSD: siphash.c,v 1.3 2015/02/20 11:51:03 tedu Exp $ */ /*- * Copyright (c) 2013 Andre Oppermann <andre@FreeBSD.org> * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * SipHash is a family of PRFs SipHash-c-d where the integer parameters c and d * are the number of compression rounds and the number of finalization rounds. * A compression round is identical to a finalization round and this round * function is called SipRound. Given a 128-bit key k and a (possibly empty) * byte string m, SipHash-c-d returns a 64-bit value SipHash-c-d(k; m). * * Implemented from the paper "SipHash: a fast short-input PRF", 2012.09.18, * by Jean-Philippe Aumasson and Daniel J. 
Bernstein, * Permanent Document ID b9a943a805fbfc6fde808af9fc0ecdfa * https://131002.net/siphash/siphash.pdf * https://131002.net/siphash/ */ #include <asm/byteorder.h> #include <linux/unaligned.h> #include <linux/bitops.h> #include <linux/string.h> #include "siphash.h" static void SipHash_Rounds(SIPHASH_CTX *ctx, int rounds) { while (rounds--) { ctx->v[0] += ctx->v[1]; ctx->v[2] += ctx->v[3]; ctx->v[1] = rol64(ctx->v[1], 13); ctx->v[3] = rol64(ctx->v[3], 16); ctx->v[1] ^= ctx->v[0]; ctx->v[3] ^= ctx->v[2]; ctx->v[0] = rol64(ctx->v[0], 32); ctx->v[2] += ctx->v[1]; ctx->v[0] += ctx->v[3]; ctx->v[1] = rol64(ctx->v[1], 17); ctx->v[3] = rol64(ctx->v[3], 21); ctx->v[1] ^= ctx->v[2]; ctx->v[3] ^= ctx->v[0]; ctx->v[2] = rol64(ctx->v[2], 32); } } static void SipHash_CRounds(SIPHASH_CTX *ctx, const void *ptr, int rounds) { u64 m = get_unaligned_le64(ptr); ctx->v[3] ^= m; SipHash_Rounds(ctx, rounds); ctx->v[0] ^= m; } void SipHash_Init(SIPHASH_CTX *ctx, const SIPHASH_KEY *key) { u64 k0, k1; k0 = le64_to_cpu(key->k0); k1 = le64_to_cpu(key->k1); ctx->v[0] = 0x736f6d6570736575ULL ^ k0; ctx->v[1] = 0x646f72616e646f6dULL ^ k1; ctx->v[2] = 0x6c7967656e657261ULL ^ k0; ctx->v[3] = 0x7465646279746573ULL ^ k1; memset(ctx->buf, 0, sizeof(ctx->buf)); ctx->bytes = 0; } void SipHash_Update(SIPHASH_CTX *ctx, int rc, int rf, const void *src, size_t len) { const u8 *ptr = src; size_t left, used; if (len == 0) return; used = ctx->bytes % sizeof(ctx->buf); ctx->bytes += len; if (used > 0) { left = sizeof(ctx->buf) - used; if (len >= left) { memcpy(&ctx->buf[used], ptr, left); SipHash_CRounds(ctx, ctx->buf, rc); len -= left; ptr += left; } else { memcpy(&ctx->buf[used], ptr, len); return; } } while (len >= sizeof(ctx->buf)) { SipHash_CRounds(ctx, ptr, rc); len -= sizeof(ctx->buf); ptr += sizeof(ctx->buf); } if (len > 0) memcpy(&ctx->buf[used], ptr, len); } void SipHash_Final(void *dst, SIPHASH_CTX *ctx, int rc, int rf) { u64 r; r = SipHash_End(ctx, rc, rf); *((__le64 *) dst) = cpu_to_le64(r); } 
u64 SipHash_End(SIPHASH_CTX *ctx, int rc, int rf) { u64 r; size_t left, used; used = ctx->bytes % sizeof(ctx->buf); left = sizeof(ctx->buf) - used; memset(&ctx->buf[used], 0, left - 1); ctx->buf[7] = ctx->bytes; SipHash_CRounds(ctx, ctx->buf, rc); ctx->v[2] ^= 0xff; SipHash_Rounds(ctx, rf); r = (ctx->v[0] ^ ctx->v[1]) ^ (ctx->v[2] ^ ctx->v[3]); memset(ctx, 0, sizeof(*ctx)); return r; } u64 SipHash(const SIPHASH_KEY *key, int rc, int rf, const void *src, size_t len) { SIPHASH_CTX ctx; SipHash_Init(&ctx, key); SipHash_Update(&ctx, rc, rf, src, len); return SipHash_End(&ctx, rc, rf); }
4 10 214 485 485 483 484 3 89 89 5 8 53 5 127 22 53 53 1 51 53 11 11 98 98 98 1 1 1 1 1 5 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 
505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 /* SPDX-License-Identifier: GPL-1.0+ */ /* * Bond several ethernet interfaces into a Cisco, running 'Etherchannel'. * * Portions are (c) Copyright 1995 Simon "Guru Aleph-Null" Janes * NCM: Network and Communications Management, Inc. 
* * BUT, I'm the one who modified it for ethernet, so: * (c) Copyright 1999, Thomas Davis, tadavis@lbl.gov * */ #ifndef _NET_BONDING_H #define _NET_BONDING_H #include <linux/timer.h> #include <linux/proc_fs.h> #include <linux/if_bonding.h> #include <linux/cpumask.h> #include <linux/in6.h> #include <linux/netpoll.h> #include <linux/inetdevice.h> #include <linux/etherdevice.h> #include <linux/reciprocal_div.h> #include <linux/if_link.h> #include <net/bond_3ad.h> #include <net/bond_alb.h> #include <net/bond_options.h> #include <net/ipv6.h> #include <net/addrconf.h> #define BOND_MAX_ARP_TARGETS 16 #define BOND_MAX_NS_TARGETS BOND_MAX_ARP_TARGETS #define BOND_DEFAULT_MIIMON 100 #ifndef __long_aligned #define __long_aligned __attribute__((aligned((sizeof(long))))) #endif #define slave_info(bond_dev, slave_dev, fmt, ...) \ netdev_info(bond_dev, "(slave %s): " fmt, (slave_dev)->name, ##__VA_ARGS__) #define slave_warn(bond_dev, slave_dev, fmt, ...) \ netdev_warn(bond_dev, "(slave %s): " fmt, (slave_dev)->name, ##__VA_ARGS__) #define slave_dbg(bond_dev, slave_dev, fmt, ...) \ netdev_dbg(bond_dev, "(slave %s): " fmt, (slave_dev)->name, ##__VA_ARGS__) #define slave_err(bond_dev, slave_dev, fmt, ...) \ netdev_err(bond_dev, "(slave %s): " fmt, (slave_dev)->name, ##__VA_ARGS__) #define BOND_MODE(bond) ((bond)->params.mode) /* slave list primitives */ #define bond_slave_list(bond) (&(bond)->dev->adj_list.lower) #define bond_has_slaves(bond) !list_empty(bond_slave_list(bond)) /* IMPORTANT: bond_first/last_slave can return NULL in case of an empty list */ #define bond_first_slave(bond) \ (bond_has_slaves(bond) ? \ netdev_adjacent_get_private(bond_slave_list(bond)->next) : \ NULL) #define bond_last_slave(bond) \ (bond_has_slaves(bond) ? 
\ netdev_adjacent_get_private(bond_slave_list(bond)->prev) : \ NULL) /* Caller must have rcu_read_lock */ #define bond_first_slave_rcu(bond) \ netdev_lower_get_first_private_rcu(bond->dev) #define bond_is_first_slave(bond, pos) (pos == bond_first_slave(bond)) #define bond_is_last_slave(bond, pos) (pos == bond_last_slave(bond)) /** * bond_for_each_slave - iterate over all slaves * @bond: the bond holding this list * @pos: current slave * @iter: list_head * iterator * * Caller must hold RTNL */ #define bond_for_each_slave(bond, pos, iter) \ netdev_for_each_lower_private((bond)->dev, pos, iter) /* Caller must have rcu_read_lock */ #define bond_for_each_slave_rcu(bond, pos, iter) \ netdev_for_each_lower_private_rcu((bond)->dev, pos, iter) #define BOND_XFRM_FEATURES (NETIF_F_HW_ESP | NETIF_F_HW_ESP_TX_CSUM | \ NETIF_F_GSO_ESP) #ifdef CONFIG_NET_POLL_CONTROLLER extern atomic_t netpoll_block_tx; static inline void block_netpoll_tx(void) { atomic_inc(&netpoll_block_tx); } static inline void unblock_netpoll_tx(void) { atomic_dec(&netpoll_block_tx); } static inline int is_netpoll_tx_blocked(struct net_device *dev) { if (unlikely(netpoll_tx_running(dev))) return atomic_read(&netpoll_block_tx); return 0; } #else #define block_netpoll_tx() #define unblock_netpoll_tx() #define is_netpoll_tx_blocked(dev) (0) #endif struct bond_params { int mode; int xmit_policy; int miimon; u8 num_peer_notif; u8 missed_max; int arp_interval; int arp_validate; int arp_all_targets; int use_carrier; int fail_over_mac; int updelay; int downdelay; int peer_notif_delay; int lacp_active; int lacp_fast; unsigned int min_links; int ad_select; char primary[IFNAMSIZ]; int primary_reselect; __be32 arp_targets[BOND_MAX_ARP_TARGETS]; int tx_queues; int all_slaves_active; int resend_igmp; int lp_interval; int packets_per_slave; int tlb_dynamic_lb; struct reciprocal_value reciprocal_packets_per_slave; u16 ad_actor_sys_prio; u16 ad_user_port_key; #if IS_ENABLED(CONFIG_IPV6) struct in6_addr 
ns_targets[BOND_MAX_NS_TARGETS]; #endif int coupled_control; /* 2 bytes of padding : see ether_addr_equal_64bits() */ u8 ad_actor_system[ETH_ALEN + 2]; }; struct slave { struct net_device *dev; /* first - useful for panic debug */ struct bonding *bond; /* our master */ int delay; /* all 4 in jiffies */ unsigned long last_link_up; unsigned long last_tx; unsigned long last_rx; unsigned long target_last_arp_rx[BOND_MAX_ARP_TARGETS]; s8 link; /* one of BOND_LINK_XXXX */ s8 link_new_state; /* one of BOND_LINK_XXXX */ u8 backup:1, /* indicates backup slave. Value corresponds with BOND_STATE_ACTIVE and BOND_STATE_BACKUP */ inactive:1, /* indicates inactive slave */ rx_disabled:1, /* indicates whether slave's Rx is disabled */ should_notify:1, /* indicates whether the state changed */ should_notify_link:1; /* indicates whether the link changed */ u8 duplex; u32 original_mtu; u32 link_failure_count; u32 speed; u16 queue_id; u8 perm_hwaddr[MAX_ADDR_LEN]; int prio; struct ad_slave_info *ad_info; struct tlb_slave_info tlb_info; #ifdef CONFIG_NET_POLL_CONTROLLER struct netpoll *np; #endif struct delayed_work notify_work; struct kobject kobj; struct rtnl_link_stats64 slave_stats; }; static inline struct slave *to_slave(struct kobject *kobj) { return container_of(kobj, struct slave, kobj); } struct bond_up_slave { unsigned int count; struct rcu_head rcu; struct slave *arr[]; }; /* * Link pseudo-state only used internally by monitors */ #define BOND_LINK_NOCHANGE -1 struct bond_ipsec { struct list_head list; struct xfrm_state *xs; }; /* * Here are the locking policies for the two bonding locks: * Get rcu_read_lock when reading or RTNL when writing slave list. 
*/ struct bonding { struct net_device *dev; /* first - useful for panic debug */ struct slave __rcu *curr_active_slave; struct slave __rcu *current_arp_slave; struct slave __rcu *primary_slave; struct bond_up_slave __rcu *usable_slaves; struct bond_up_slave __rcu *all_slaves; bool force_primary; bool notifier_ctx; s32 slave_cnt; /* never change this value outside the attach/detach wrappers */ int (*recv_probe)(const struct sk_buff *, struct bonding *, struct slave *); /* mode_lock is used for mode-specific locking needs, currently used by: * 3ad mode (4) - protect against running bond_3ad_unbind_slave() and * bond_3ad_state_machine_handler() concurrently and also * the access to the state machine shared variables. * TLB mode (5) - to sync the use and modifications of its hash table * ALB mode (6) - to sync the use and modifications of its hash table */ spinlock_t mode_lock; spinlock_t stats_lock; u32 send_peer_notif; u8 igmp_retrans; #ifdef CONFIG_PROC_FS struct proc_dir_entry *proc_entry; char proc_file_name[IFNAMSIZ]; #endif /* CONFIG_PROC_FS */ struct list_head bond_list; u32 __percpu *rr_tx_counter; struct ad_bond_info ad_info; struct alb_bond_info alb_info; struct bond_params params; struct workqueue_struct *wq; struct delayed_work mii_work; struct delayed_work arp_work; struct delayed_work alb_work; struct delayed_work ad_work; struct delayed_work mcast_work; struct delayed_work slave_arr_work; #ifdef CONFIG_DEBUG_FS /* debugging support via debugfs */ struct dentry *debug_dir; #endif /* CONFIG_DEBUG_FS */ struct rtnl_link_stats64 bond_stats; #ifdef CONFIG_XFRM_OFFLOAD struct list_head ipsec_list; /* protecting ipsec_list */ struct mutex ipsec_lock; #endif /* CONFIG_XFRM_OFFLOAD */ struct bpf_prog *xdp_prog; }; #define bond_slave_get_rcu(dev) \ ((struct slave *) rcu_dereference(dev->rx_handler_data)) #define bond_slave_get_rtnl(dev) \ ((struct slave *) rtnl_dereference(dev->rx_handler_data)) void bond_queue_slave_event(struct slave *slave); void 
bond_lower_state_changed(struct slave *slave); struct bond_vlan_tag { __be16 vlan_proto; unsigned short vlan_id; }; /* * Returns NULL if the net_device does not belong to any of the bond's slaves * * Caller must hold bond lock for read */ static inline struct slave *bond_get_slave_by_dev(struct bonding *bond, struct net_device *slave_dev) { return netdev_lower_dev_get_private(bond->dev, slave_dev); } static inline struct bonding *bond_get_bond_by_slave(struct slave *slave) { return slave->bond; } static inline bool bond_should_override_tx_queue(struct bonding *bond) { return BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP || BOND_MODE(bond) == BOND_MODE_ROUNDROBIN; } static inline bool bond_is_lb(const struct bonding *bond) { return BOND_MODE(bond) == BOND_MODE_TLB || BOND_MODE(bond) == BOND_MODE_ALB; } static inline bool bond_needs_speed_duplex(const struct bonding *bond) { return BOND_MODE(bond) == BOND_MODE_8023AD || bond_is_lb(bond); } static inline bool bond_is_nondyn_tlb(const struct bonding *bond) { return (bond_is_lb(bond) && bond->params.tlb_dynamic_lb == 0); } static inline bool bond_mode_can_use_xmit_hash(const struct bonding *bond) { return (BOND_MODE(bond) == BOND_MODE_8023AD || BOND_MODE(bond) == BOND_MODE_XOR || BOND_MODE(bond) == BOND_MODE_TLB || BOND_MODE(bond) == BOND_MODE_ALB); } static inline bool bond_mode_uses_xmit_hash(const struct bonding *bond) { return (BOND_MODE(bond) == BOND_MODE_8023AD || BOND_MODE(bond) == BOND_MODE_XOR || bond_is_nondyn_tlb(bond)); } static inline bool bond_mode_uses_arp(int mode) { return mode != BOND_MODE_8023AD && mode != BOND_MODE_TLB && mode != BOND_MODE_ALB; } static inline bool bond_mode_uses_primary(int mode) { return mode == BOND_MODE_ACTIVEBACKUP || mode == BOND_MODE_TLB || mode == BOND_MODE_ALB; } static inline bool bond_uses_primary(struct bonding *bond) { return bond_mode_uses_primary(BOND_MODE(bond)); } static inline struct net_device *bond_option_active_slave_get_rcu(struct bonding *bond) { struct slave 
*slave = rcu_dereference_rtnl(bond->curr_active_slave); return bond_uses_primary(bond) && slave ? slave->dev : NULL; } static inline bool bond_slave_is_up(struct slave *slave) { return netif_running(slave->dev) && netif_carrier_ok(slave->dev); } static inline void bond_set_active_slave(struct slave *slave) { if (slave->backup) { slave->backup = 0; bond_queue_slave_event(slave); bond_lower_state_changed(slave); } } static inline void bond_set_backup_slave(struct slave *slave) { if (!slave->backup) { slave->backup = 1; bond_queue_slave_event(slave); bond_lower_state_changed(slave); } } static inline void bond_set_slave_state(struct slave *slave, int slave_state, bool notify) { if (slave->backup == slave_state) return; slave->backup = slave_state; if (notify) { bond_lower_state_changed(slave); bond_queue_slave_event(slave); slave->should_notify = 0; } else { if (slave->should_notify) slave->should_notify = 0; else slave->should_notify = 1; } } static inline void bond_slave_state_change(struct bonding *bond) { struct list_head *iter; struct slave *tmp; bond_for_each_slave(bond, tmp, iter) { if (tmp->link == BOND_LINK_UP) bond_set_active_slave(tmp); else if (tmp->link == BOND_LINK_DOWN) bond_set_backup_slave(tmp); } } static inline void bond_slave_state_notify(struct bonding *bond) { struct list_head *iter; struct slave *tmp; bond_for_each_slave(bond, tmp, iter) { if (tmp->should_notify) { bond_lower_state_changed(tmp); tmp->should_notify = 0; } } } static inline int bond_slave_state(struct slave *slave) { return slave->backup; } static inline bool bond_is_active_slave(struct slave *slave) { return !bond_slave_state(slave); } static inline bool bond_slave_can_tx(struct slave *slave) { return bond_slave_is_up(slave) && slave->link == BOND_LINK_UP && bond_is_active_slave(slave); } static inline bool bond_is_active_slave_dev(const struct net_device *slave_dev) { struct slave *slave; bool active; rcu_read_lock(); slave = bond_slave_get_rcu(slave_dev); active = 
bond_is_active_slave(slave); rcu_read_unlock(); return active; } static inline void bond_hw_addr_copy(u8 *dst, const u8 *src, unsigned int len) { if (len == ETH_ALEN) { ether_addr_copy(dst, src); return; } memcpy(dst, src, len); } #define BOND_PRI_RESELECT_ALWAYS 0 #define BOND_PRI_RESELECT_BETTER 1 #define BOND_PRI_RESELECT_FAILURE 2 #define BOND_FOM_NONE 0 #define BOND_FOM_ACTIVE 1 #define BOND_FOM_FOLLOW 2 #define BOND_ARP_TARGETS_ANY 0 #define BOND_ARP_TARGETS_ALL 1 #define BOND_ARP_VALIDATE_NONE 0 #define BOND_ARP_VALIDATE_ACTIVE (1 << BOND_STATE_ACTIVE) #define BOND_ARP_VALIDATE_BACKUP (1 << BOND_STATE_BACKUP) #define BOND_ARP_VALIDATE_ALL (BOND_ARP_VALIDATE_ACTIVE | \ BOND_ARP_VALIDATE_BACKUP) #define BOND_ARP_FILTER (BOND_ARP_VALIDATE_ALL + 1) #define BOND_ARP_FILTER_ACTIVE (BOND_ARP_VALIDATE_ACTIVE | \ BOND_ARP_FILTER) #define BOND_ARP_FILTER_BACKUP (BOND_ARP_VALIDATE_BACKUP | \ BOND_ARP_FILTER) #define BOND_SLAVE_NOTIFY_NOW true #define BOND_SLAVE_NOTIFY_LATER false static inline int slave_do_arp_validate(struct bonding *bond, struct slave *slave) { return bond->params.arp_validate & (1 << bond_slave_state(slave)); } static inline int slave_do_arp_validate_only(struct bonding *bond) { return bond->params.arp_validate & BOND_ARP_FILTER; } static inline int bond_is_ip_target_ok(__be32 addr) { return !ipv4_is_lbcast(addr) && !ipv4_is_zeronet(addr); } #if IS_ENABLED(CONFIG_IPV6) static inline int bond_is_ip6_target_ok(struct in6_addr *addr) { return !ipv6_addr_any(addr) && !ipv6_addr_loopback(addr) && !ipv6_addr_is_multicast(addr); } #endif /* Get the oldest arp which we've received on this slave for bond's * arp_targets. 
*/ static inline unsigned long slave_oldest_target_arp_rx(struct bonding *bond, struct slave *slave) { int i = 1; unsigned long ret = slave->target_last_arp_rx[0]; for (; (i < BOND_MAX_ARP_TARGETS) && bond->params.arp_targets[i]; i++) if (time_before(slave->target_last_arp_rx[i], ret)) ret = slave->target_last_arp_rx[i]; return ret; } static inline unsigned long slave_last_rx(struct bonding *bond, struct slave *slave) { if (bond->params.arp_all_targets == BOND_ARP_TARGETS_ALL) return slave_oldest_target_arp_rx(bond, slave); return slave->last_rx; } static inline void slave_update_last_tx(struct slave *slave) { WRITE_ONCE(slave->last_tx, jiffies); } static inline unsigned long slave_last_tx(struct slave *slave) { return READ_ONCE(slave->last_tx); } #ifdef CONFIG_NET_POLL_CONTROLLER static inline netdev_tx_t bond_netpoll_send_skb(const struct slave *slave, struct sk_buff *skb) { return netpoll_send_skb(slave->np, skb); } #else static inline netdev_tx_t bond_netpoll_send_skb(const struct slave *slave, struct sk_buff *skb) { BUG(); return NETDEV_TX_OK; } #endif static inline void bond_set_slave_inactive_flags(struct slave *slave, bool notify) { if (!bond_is_lb(slave->bond)) bond_set_slave_state(slave, BOND_STATE_BACKUP, notify); if (!slave->bond->params.all_slaves_active) slave->inactive = 1; if (BOND_MODE(slave->bond) == BOND_MODE_8023AD) slave->rx_disabled = 1; } static inline void bond_set_slave_tx_disabled_flags(struct slave *slave, bool notify) { bond_set_slave_state(slave, BOND_STATE_BACKUP, notify); } static inline void bond_set_slave_active_flags(struct slave *slave, bool notify) { bond_set_slave_state(slave, BOND_STATE_ACTIVE, notify); slave->inactive = 0; if (BOND_MODE(slave->bond) == BOND_MODE_8023AD) slave->rx_disabled = 0; } static inline void bond_set_slave_rx_enabled_flags(struct slave *slave, bool notify) { slave->rx_disabled = 0; } static inline bool bond_is_slave_inactive(struct slave *slave) { return slave->inactive; } static inline bool 
bond_is_slave_rx_disabled(struct slave *slave) { return slave->rx_disabled; } static inline void bond_propose_link_state(struct slave *slave, int state) { slave->link_new_state = state; } static inline void bond_commit_link_state(struct slave *slave, bool notify) { if (slave->link_new_state == BOND_LINK_NOCHANGE) return; slave->link = slave->link_new_state; if (notify) { bond_queue_slave_event(slave); bond_lower_state_changed(slave); slave->should_notify_link = 0; } else { if (slave->should_notify_link) slave->should_notify_link = 0; else slave->should_notify_link = 1; } } static inline void bond_set_slave_link_state(struct slave *slave, int state, bool notify) { bond_propose_link_state(slave, state); bond_commit_link_state(slave, notify); } static inline void bond_slave_link_notify(struct bonding *bond) { struct list_head *iter; struct slave *tmp; bond_for_each_slave(bond, tmp, iter) { if (tmp->should_notify_link) { bond_queue_slave_event(tmp); bond_lower_state_changed(tmp); tmp->should_notify_link = 0; } } } static inline __be32 bond_confirm_addr(struct net_device *dev, __be32 dst, __be32 local) { struct in_device *in_dev; __be32 addr = 0; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); if (in_dev) addr = inet_confirm_addr(dev_net(dev), in_dev, dst, local, RT_SCOPE_HOST); rcu_read_unlock(); return addr; } struct bond_net { struct net *net; /* Associated network namespace */ struct list_head dev_list; #ifdef CONFIG_PROC_FS struct proc_dir_entry *proc_dir; #endif struct class_attribute class_attr_bonding_masters; }; int bond_rcv_validate(const struct sk_buff *skb, struct bonding *bond, struct slave *slave); netdev_tx_t bond_dev_queue_xmit(struct bonding *bond, struct sk_buff *skb, struct net_device *slave_dev); int bond_create(struct net *net, const char *name); int bond_create_sysfs(struct bond_net *net); void bond_destroy_sysfs(struct bond_net *net); void bond_prepare_sysfs_group(struct bonding *bond); int bond_sysfs_slave_add(struct slave *slave); void 
bond_sysfs_slave_del(struct slave *slave); void bond_xdp_set_features(struct net_device *bond_dev); int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, struct netlink_ext_ack *extack); int bond_release(struct net_device *bond_dev, struct net_device *slave_dev); u32 bond_xmit_hash(struct bonding *bond, struct sk_buff *skb); int bond_set_carrier(struct bonding *bond); void bond_select_active_slave(struct bonding *bond); void bond_change_active_slave(struct bonding *bond, struct slave *new_active); void bond_create_debugfs(void); void bond_destroy_debugfs(void); void bond_debug_register(struct bonding *bond); void bond_debug_unregister(struct bonding *bond); void bond_debug_reregister(struct bonding *bond); const char *bond_mode_name(int mode); void bond_setup(struct net_device *bond_dev); unsigned int bond_get_num_tx_queues(void); int bond_netlink_init(void); void bond_netlink_fini(void); struct net_device *bond_option_active_slave_get_rcu(struct bonding *bond); const char *bond_slave_link_status(s8 link); struct bond_vlan_tag *bond_verify_device_path(struct net_device *start_dev, struct net_device *end_dev, int level); int bond_update_slave_arr(struct bonding *bond, struct slave *skipslave); void bond_slave_arr_work_rearm(struct bonding *bond, unsigned long delay); void bond_work_init_all(struct bonding *bond); #ifdef CONFIG_PROC_FS void bond_create_proc_entry(struct bonding *bond); void bond_remove_proc_entry(struct bonding *bond); void bond_create_proc_dir(struct bond_net *bn); void bond_destroy_proc_dir(struct bond_net *bn); #else static inline void bond_create_proc_entry(struct bonding *bond) { } static inline void bond_remove_proc_entry(struct bonding *bond) { } static inline void bond_create_proc_dir(struct bond_net *bn) { } static inline void bond_destroy_proc_dir(struct bond_net *bn) { } #endif static inline struct slave *bond_slave_has_mac(struct bonding *bond, const u8 *mac) { struct list_head *iter; struct slave *tmp; 
bond_for_each_slave(bond, tmp, iter) if (ether_addr_equal_64bits(mac, tmp->dev->dev_addr)) return tmp; return NULL; } /* Caller must hold rcu_read_lock() for read */ static inline bool bond_slave_has_mac_rcu(struct bonding *bond, const u8 *mac) { struct list_head *iter; struct slave *tmp; bond_for_each_slave_rcu(bond, tmp, iter) if (ether_addr_equal_64bits(mac, tmp->dev->dev_addr)) return true; return false; } /* Check if the ip is present in arp ip list, or first free slot if ip == 0 * Returns -1 if not found, index if found */ static inline int bond_get_targets_ip(__be32 *targets, __be32 ip) { int i; for (i = 0; i < BOND_MAX_ARP_TARGETS; i++) if (targets[i] == ip) return i; else if (targets[i] == 0) break; return -1; } #if IS_ENABLED(CONFIG_IPV6) static inline int bond_get_targets_ip6(struct in6_addr *targets, struct in6_addr *ip) { struct in6_addr mcaddr; int i; for (i = 0; i < BOND_MAX_NS_TARGETS; i++) { addrconf_addr_solict_mult(&targets[i], &mcaddr); if ((ipv6_addr_equal(&targets[i], ip)) || (ipv6_addr_equal(&mcaddr, ip))) return i; else if (ipv6_addr_any(&targets[i])) break; } return -1; } #endif /* exported from bond_main.c */ extern unsigned int bond_net_id; /* exported from bond_netlink.c */ extern struct rtnl_link_ops bond_link_ops; /* exported from bond_sysfs_slave.c */ extern const struct sysfs_ops slave_sysfs_ops; /* exported from bond_3ad.c */ extern const u8 lacpdu_mcast_addr[]; static inline netdev_tx_t bond_tx_drop(struct net_device *dev, struct sk_buff *skb) { dev_core_stats_tx_dropped_inc(dev); dev_kfree_skb_any(skb); return NET_XMIT_DROP; } #endif /* _NET_BONDING_H */
42 339 42 296 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef LINUX_EXPORTFS_H #define LINUX_EXPORTFS_H 1 #include <linux/types.h> #include <linux/path.h> struct dentry; struct iattr; struct inode; struct iomap; struct super_block; struct vfsmount; /* limit the handle size to NFSv4 handle size now */ #define MAX_HANDLE_SZ 128 /* * The fileid_type identifies how the file within the filesystem is encoded. * In theory this is freely set and parsed by the filesystem, but we try to * stick to conventions so we can share some generic code and don't confuse * sniffers like ethereal/wireshark. * * The filesystem must not use the value '0' or '0xff'. 
*/ enum fid_type { /* * The root, or export point, of the filesystem. * (Never actually passed down to the filesystem. */ FILEID_ROOT = 0, /* * 32bit inode number, 32 bit generation number. */ FILEID_INO32_GEN = 1, /* * 32bit inode number, 32 bit generation number, * 32 bit parent directory inode number. */ FILEID_INO32_GEN_PARENT = 2, /* * 64 bit object ID, 64 bit root object ID, * 32 bit generation number. */ FILEID_BTRFS_WITHOUT_PARENT = 0x4d, /* * 64 bit object ID, 64 bit root object ID, * 32 bit generation number, * 64 bit parent object ID, 32 bit parent generation. */ FILEID_BTRFS_WITH_PARENT = 0x4e, /* * 64 bit object ID, 64 bit root object ID, * 32 bit generation number, * 64 bit parent object ID, 32 bit parent generation, * 64 bit parent root object ID. */ FILEID_BTRFS_WITH_PARENT_ROOT = 0x4f, /* * 32 bit block number, 16 bit partition reference, * 16 bit unused, 32 bit generation number. */ FILEID_UDF_WITHOUT_PARENT = 0x51, /* * 32 bit block number, 16 bit partition reference, * 16 bit unused, 32 bit generation number, * 32 bit parent block number, 32 bit parent generation number */ FILEID_UDF_WITH_PARENT = 0x52, /* * 64 bit checkpoint number, 64 bit inode number, * 32 bit generation number. */ FILEID_NILFS_WITHOUT_PARENT = 0x61, /* * 64 bit checkpoint number, 64 bit inode number, * 32 bit generation number, 32 bit parent generation. * 64 bit parent inode number. */ FILEID_NILFS_WITH_PARENT = 0x62, /* * 32 bit generation number, 40 bit i_pos. */ FILEID_FAT_WITHOUT_PARENT = 0x71, /* * 32 bit generation number, 40 bit i_pos, * 32 bit parent generation number, 40 bit parent i_pos */ FILEID_FAT_WITH_PARENT = 0x72, /* * 64 bit inode number, 32 bit generation number. */ FILEID_INO64_GEN = 0x81, /* * 64 bit inode number, 32 bit generation number, * 64 bit parent inode number, 32 bit parent generation. 
*/ FILEID_INO64_GEN_PARENT = 0x82, /* * 128 bit child FID (struct lu_fid) * 128 bit parent FID (struct lu_fid) */ FILEID_LUSTRE = 0x97, /* * 64 bit inode number, 32 bit subvolume, 32 bit generation number: */ FILEID_BCACHEFS_WITHOUT_PARENT = 0xb1, FILEID_BCACHEFS_WITH_PARENT = 0xb2, /* * 64 bit unique kernfs id */ FILEID_KERNFS = 0xfe, /* * Filesystems must not use 0xff file ID. */ FILEID_INVALID = 0xff, }; struct fid { union { struct { u32 ino; u32 gen; u32 parent_ino; u32 parent_gen; } i32; struct { u64 ino; u32 gen; } __packed i64; struct { u32 block; u16 partref; u16 parent_partref; u32 generation; u32 parent_block; u32 parent_generation; } udf; DECLARE_FLEX_ARRAY(__u32, raw); }; }; enum handle_to_path_flags { HANDLE_CHECK_PERMS = (1 << 0), HANDLE_CHECK_SUBTREE = (1 << 1), }; struct handle_to_path_ctx { struct path root; enum handle_to_path_flags flags; unsigned int fh_flags; }; #define EXPORT_FH_CONNECTABLE 0x1 /* Encode file handle with parent */ #define EXPORT_FH_FID 0x2 /* File handle may be non-decodeable */ #define EXPORT_FH_DIR_ONLY 0x4 /* Only decode file handle for a directory */ /* * Filesystems use only lower 8 bits of file_handle type for fid_type. * name_to_handle_at() uses upper 16 bits of type as user flags to be * interpreted by open_by_handle_at(). 
*/ #define FILEID_USER_FLAGS_MASK 0xffff0000 #define FILEID_USER_FLAGS(type) ((type) & FILEID_USER_FLAGS_MASK) /* Flags supported in encoded handle_type that is exported to user */ #define FILEID_IS_CONNECTABLE 0x10000 #define FILEID_IS_DIR 0x20000 #define FILEID_VALID_USER_FLAGS (FILEID_IS_CONNECTABLE | FILEID_IS_DIR) /** * struct export_operations - for nfsd to communicate with file systems * @encode_fh: encode a file handle fragment from a dentry * @fh_to_dentry: find the implied object and get a dentry for it * @fh_to_parent: find the implied object's parent and get a dentry for it * @get_name: find the name for a given inode in a given directory * @get_parent: find the parent of a given directory * @commit_metadata: commit metadata changes to stable storage * * See Documentation/filesystems/nfs/exporting.rst for details on how to use * this interface correctly. * * encode_fh: * @encode_fh should store in the file handle fragment @fh (using at most * @max_len bytes) information that can be used by @decode_fh to recover the * file referred to by the &struct dentry @de. If @flag has CONNECTABLE bit * set, the encode_fh() should store sufficient information so that a good * attempt can be made to find not only the file but also it's place in the * filesystem. This typically means storing a reference to de->d_parent in * the filehandle fragment. encode_fh() should return the fileid_type on * success and on error returns 255 (if the space needed to encode fh is * greater than @max_len*4 bytes). On error @max_len contains the minimum * size(in 4 byte unit) needed to encode the file handle. * * fh_to_dentry: * @fh_to_dentry is given a &struct super_block (@sb) and a file handle * fragment (@fh, @fh_len). It should return a &struct dentry which refers * to the same file that the file handle fragment refers to. If it cannot, * it should return a %NULL pointer if the file cannot be found, or an * %ERR_PTR error code of %ENOMEM if a memory allocation failure occurred. 
* Any other error code is treated like %NULL, and will cause an %ESTALE error * for callers of exportfs_decode_fh(). * Any suitable dentry can be returned including, if necessary, a new dentry * created with d_alloc_root. The caller can then find any other extant * dentries by following the d_alias links. * * fh_to_parent: * Same as @fh_to_dentry, except that it returns a pointer to the parent * dentry if it was encoded into the filehandle fragment by @encode_fh. * * get_name: * @get_name should find a name for the given @child in the given @parent * directory. The name should be stored in the @name (with the * understanding that it is already pointing to a %NAME_MAX+1 sized * buffer. get_name() should return %0 on success, a negative error code * or error. @get_name will be called without @parent->i_mutex held. * * get_parent: * @get_parent should find the parent directory for the given @child which * is also a directory. In the event that it cannot be found, or storage * space cannot be allocated, a %ERR_PTR should be returned. * * permission: * Allow filesystems to specify a custom permission function. * * open: * Allow filesystems to specify a custom open function. * * commit_metadata: * @commit_metadata should commit metadata changes to stable storage. 
* * Locking rules: * get_parent is called with child->d_inode->i_mutex down * get_name is not (which is possibly inconsistent) */ struct export_operations { int (*encode_fh)(struct inode *inode, __u32 *fh, int *max_len, struct inode *parent); struct dentry * (*fh_to_dentry)(struct super_block *sb, struct fid *fid, int fh_len, int fh_type); struct dentry * (*fh_to_parent)(struct super_block *sb, struct fid *fid, int fh_len, int fh_type); int (*get_name)(struct dentry *parent, char *name, struct dentry *child); struct dentry * (*get_parent)(struct dentry *child); int (*commit_metadata)(struct inode *inode); int (*get_uuid)(struct super_block *sb, u8 *buf, u32 *len, u64 *offset); int (*map_blocks)(struct inode *inode, loff_t offset, u64 len, struct iomap *iomap, bool write, u32 *device_generation); int (*commit_blocks)(struct inode *inode, struct iomap *iomaps, int nr_iomaps, struct iattr *iattr); int (*permission)(struct handle_to_path_ctx *ctx, unsigned int oflags); struct file * (*open)(struct path *path, unsigned int oflags); #define EXPORT_OP_NOWCC (0x1) /* don't collect v3 wcc data */ #define EXPORT_OP_NOSUBTREECHK (0x2) /* no subtree checking */ #define EXPORT_OP_CLOSE_BEFORE_UNLINK (0x4) /* close files before unlink */ #define EXPORT_OP_REMOTE_FS (0x8) /* Filesystem is remote */ #define EXPORT_OP_NOATOMIC_ATTR (0x10) /* Filesystem cannot supply atomic attribute updates */ #define EXPORT_OP_FLUSH_ON_CLOSE (0x20) /* fs flushes file data on close */ #define EXPORT_OP_ASYNC_LOCK (0x40) /* fs can do async lock request */ unsigned long flags; }; extern int exportfs_encode_inode_fh(struct inode *inode, struct fid *fid, int *max_len, struct inode *parent, int flags); extern int exportfs_encode_fh(struct dentry *dentry, struct fid *fid, int *max_len, int flags); static inline bool exportfs_can_encode_fid(const struct export_operations *nop) { return !nop || nop->encode_fh; } static inline bool exportfs_can_decode_fh(const struct export_operations *nop) { return nop && 
nop->fh_to_dentry; } static inline bool exportfs_can_encode_fh(const struct export_operations *nop, int fh_flags) { /* * If a non-decodeable file handle was requested, we only need to make * sure that filesystem did not opt-out of encoding fid. */ if (fh_flags & EXPORT_FH_FID) return exportfs_can_encode_fid(nop); /* * If a decodeable file handle was requested, we need to make sure that * filesystem can also decode file handles. */ return exportfs_can_decode_fh(nop); } static inline int exportfs_encode_fid(struct inode *inode, struct fid *fid, int *max_len) { return exportfs_encode_inode_fh(inode, fid, max_len, NULL, EXPORT_FH_FID); } extern struct dentry *exportfs_decode_fh_raw(struct vfsmount *mnt, struct fid *fid, int fh_len, int fileid_type, unsigned int flags, int (*acceptable)(void *, struct dentry *), void *context); extern struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, int fh_len, int fileid_type, int (*acceptable)(void *, struct dentry *), void *context); /* * Generic helpers for filesystems. */ int generic_encode_ino32_fh(struct inode *inode, __u32 *fh, int *max_len, struct inode *parent); struct dentry *generic_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type, struct inode *(*get_inode) (struct super_block *sb, u64 ino, u32 gen)); struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type, struct inode *(*get_inode) (struct super_block *sb, u64 ino, u32 gen)); #endif /* LINUX_EXPORTFS_H */
67 67 67 40 64 64 67 57 19 5 5 67 4 4 2 2 2 2 4 3 2 4 67 4 4 67 66 67 55 55 55 70 1 5 69 67 34 2 55 55 55 55 269 205 1 54 55 238 238 70 66 68 70 34 50 18 34 50 49 49 14 34 18 3 1 18 9 9 1 1 8 49 43 9 34 34 34 34 34 34 34 1 34 2 32 1 34 34 9 9 1 1 34 34 34 34 34 34 34 34 1 34 34 34 34 34 34 65 29 37 37 34 34 13 1 34 1 1 26 26 23 3 24 3 5 22 1 2 23 1 1 1 32 32 34 34 65 65 79 79 64 64 3 66 66 34 41 34 9 34 34 33 33 4 1 65 1 65 66 25 78 4 78 4 78 78 69 3 66 69 69 78 78 4 145 144 78 78 77 65 1 3 3 3 1 66 38 40 40 38 2 39 39 39 40 67 67 40 38 67 1 40 67 40 67 40 40 40 40 1 40 40 4 40 40 40 2 38 1 38 4 38 38 66 65 4 4 66 66 39 39 39 67 20 3 2 39 39 39 63 53 10 53 10 68 68 68 68 68 66 29 67 55 55 65 66 66 59 59 1 59 59 12 59 58 1 59 59 116 2 117 59 12 59 57 59 59 59 59 1 57 1 1 1 59 58 59 59 59 58 12 59 1 78 78 1 57 59 58 57 60 60 26 12 12 60 68 68 3 29 68 67 25 56 56 56 56 56 56 56 77 27 78 12 5 75 76 60 57 68 60 57 80 78 3 80 3 78 67 78 78 66 57 56 60 67 60 66 66 60 68 78 2 78 2 78 2 76 81 81 2 79 80 79 79 80 81 80 34 68 7 81 81 2 80 76 9 63 71 71 71 73 79 79 66 66 67 67 67 67 59 59 58 264 214 69 66 39 39 39 39 3 39 39 2 63 50 35 35 33 33 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 
240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 
740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 
1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 
1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 
1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 
2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 
2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 
3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 
3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 
3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 
4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2009 Oracle. All rights reserved. */ #include <linux/sched.h> #include <linux/pagemap.h> #include <linux/writeback.h> #include <linux/blkdev.h> #include <linux/rbtree.h> #include <linux/slab.h> #include <linux/error-injection.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" #include "volumes.h" #include "locking.h" #include "btrfs_inode.h" #include "async-thread.h" #include "free-space-cache.h" #include "qgroup.h" #include "print-tree.h" #include "delalloc-space.h" #include "block-group.h" #include "backref.h" #include "misc.h" #include "subpage.h" #include "zoned.h" #include "inode-item.h" #include "space-info.h" #include "fs.h" #include "accessors.h" #include "extent-tree.h" #include "root-tree.h" #include "file-item.h" #include "relocation.h" #include "super.h" #include "tree-checker.h" #include "raid-stripe-tree.h" /* * Relocation overview * * [What does relocation do] * * The objective of relocation is to relocate all extents of the target block * group to other block groups. * This is utilized by resize (shrink only), profile converting, compacting * space, or balance routine to spread chunks over devices. * * Before | After * ------------------------------------------------------------------ * BG A: 10 data extents | BG A: deleted * BG B: 2 data extents | BG B: 10 data extents (2 old + 8 relocated) * BG C: 1 extents | BG C: 3 data extents (1 old + 2 relocated) * * [How does relocation work] * * 1. Mark the target block group read-only * New extents won't be allocated from the target block group. 
*
 * 2.1 Record each extent in the target block group
 *     To build a proper map of extents to be relocated.
 *
 * 2.2 Build data reloc tree and reloc trees
 *     Data reloc tree will contain an inode, recording all newly relocated
 *     data extents.
 *     There will be only one data reloc tree for one data block group.
 *
 *     Reloc tree will be a special snapshot of its source tree, containing
 *     relocated tree blocks.
 *     Each tree referring to a tree block in target block group will get its
 *     reloc tree built.
 *
 * 2.3 Swap source tree with its corresponding reloc tree
 *     Each involved tree only refers to new extents after swap.
 *
 * 3. Cleanup reloc trees and data reloc tree.
 *    As old extents in the target block group are still referenced by reloc
 *    trees, we need to clean them up before really freeing the target block
 *    group.
 *
 * The main complexity is in steps 2.2 and 2.3.
 *
 * The entry point of relocation is relocate_block_group() function.
 */

#define RELOCATION_RESERVED_NODES	256
/*
 * Map the address of a tree root to a tree.
 *
 * Entries are keyed by @bytenr and live in mapping_tree::rb_root; the code in
 * this file looks them up and inserts them with rb_simple_search() and
 * rb_simple_insert().
 */
struct mapping_node {
	struct {
		struct rb_node rb_node;
		u64 bytenr;
	}; /* Use rb_simple_node for search/insert */
	/* Payload pointer; the users in this file store a struct btrfs_root */
	void *data;
};

/* Red-black tree of mapping_node entries; @lock guards @rb_root. */
struct mapping_tree {
	struct rb_root rb_root;
	spinlock_t lock;
};

/*
 * Represent a tree block to process.
 */
struct tree_block {
	struct {
		struct rb_node rb_node;
		u64 bytenr;
	}; /* Use rb_simple_node for search/insert */
	u64 owner;
	struct btrfs_key key;
	u8 level;
	/* NOTE(review): presumably set once @key has been filled in - confirm */
	bool key_ready;
};

/* Capacity of file_extent_cluster::boundary[] */
#define MAX_EXTENTS 128

struct file_extent_cluster {
	u64 start;
	u64 end;
	u64 boundary[MAX_EXTENTS];
	/* NOTE(review): presumably the number of used boundary[] slots - confirm */
	unsigned int nr;
	u64 owning_root;
};

/* Stages of data relocation.
*/ enum reloc_stage { MOVE_DATA_EXTENTS, UPDATE_DATA_PTRS }; struct reloc_control { /* block group to relocate */ struct btrfs_block_group *block_group; /* extent tree */ struct btrfs_root *extent_root; /* inode for moving data */ struct inode *data_inode; struct btrfs_block_rsv *block_rsv; struct btrfs_backref_cache backref_cache; struct file_extent_cluster cluster; /* tree blocks have been processed */ struct extent_io_tree processed_blocks; /* map start of tree root to corresponding reloc tree */ struct mapping_tree reloc_root_tree; /* list of reloc trees */ struct list_head reloc_roots; /* list of subvolume trees that get relocated */ struct list_head dirty_subvol_roots; /* size of metadata reservation for merging reloc trees */ u64 merging_rsv_size; /* size of relocated tree nodes */ u64 nodes_relocated; /* reserved size for block group relocation*/ u64 reserved_bytes; u64 search_start; u64 extents_found; enum reloc_stage stage; bool create_reloc_tree; bool merge_reloc_tree; bool found_file_extent; }; static void mark_block_processed(struct reloc_control *rc, struct btrfs_backref_node *node) { u32 blocksize; if (node->level == 0 || in_range(node->bytenr, rc->block_group->start, rc->block_group->length)) { blocksize = rc->extent_root->fs_info->nodesize; set_extent_bit(&rc->processed_blocks, node->bytenr, node->bytenr + blocksize - 1, EXTENT_DIRTY, NULL); } node->processed = 1; } /* * walk up backref nodes until reach node presents tree root */ static struct btrfs_backref_node *walk_up_backref( struct btrfs_backref_node *node, struct btrfs_backref_edge *edges[], int *index) { struct btrfs_backref_edge *edge; int idx = *index; while (!list_empty(&node->upper)) { edge = list_entry(node->upper.next, struct btrfs_backref_edge, list[LOWER]); edges[idx++] = edge; node = edge->node[UPPER]; } BUG_ON(node->detached); *index = idx; return node; } /* * walk down backref nodes to find start of next reference path */ static struct btrfs_backref_node *walk_down_backref( 
struct btrfs_backref_edge *edges[], int *index) { struct btrfs_backref_edge *edge; struct btrfs_backref_node *lower; int idx = *index; while (idx > 0) { edge = edges[idx - 1]; lower = edge->node[LOWER]; if (list_is_last(&edge->list[LOWER], &lower->upper)) { idx--; continue; } edge = list_entry(edge->list[LOWER].next, struct btrfs_backref_edge, list[LOWER]); edges[idx - 1] = edge; *index = idx; return edge->node[UPPER]; } *index = 0; return NULL; } static bool reloc_root_is_dead(const struct btrfs_root *root) { /* * Pair with set_bit/clear_bit in clean_dirty_subvols and * btrfs_update_reloc_root. We need to see the updated bit before * trying to access reloc_root */ smp_rmb(); if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state)) return true; return false; } /* * Check if this subvolume tree has valid reloc tree. * * Reloc tree after swap is considered dead, thus not considered as valid. * This is enough for most callers, as they don't distinguish dead reloc root * from no reloc root. But btrfs_should_ignore_reloc_root() below is a * special case. */ static bool have_reloc_root(const struct btrfs_root *root) { if (reloc_root_is_dead(root)) return false; if (!root->reloc_root) return false; return true; } bool btrfs_should_ignore_reloc_root(const struct btrfs_root *root) { struct btrfs_root *reloc_root; if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) return false; /* This root has been merged with its reloc tree, we can ignore it */ if (reloc_root_is_dead(root)) return true; reloc_root = root->reloc_root; if (!reloc_root) return false; if (btrfs_header_generation(reloc_root->commit_root) == root->fs_info->running_transaction->transid) return false; /* * If there is reloc tree and it was created in previous transaction * backref lookup can find the reloc tree, so backref node for the fs * tree root is useless for relocation. 
*/ return true; } /* * find reloc tree by address of tree root */ struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr) { struct reloc_control *rc = fs_info->reloc_ctl; struct rb_node *rb_node; struct mapping_node *node; struct btrfs_root *root = NULL; ASSERT(rc); spin_lock(&rc->reloc_root_tree.lock); rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root, bytenr); if (rb_node) { node = rb_entry(rb_node, struct mapping_node, rb_node); root = node->data; } spin_unlock(&rc->reloc_root_tree.lock); return btrfs_grab_root(root); } /* * For useless nodes, do two major clean ups: * * - Cleanup the children edges and nodes * If child node is also orphan (no parent) during cleanup, then the child * node will also be cleaned up. * * - Freeing up leaves (level 0), keeps nodes detached * For nodes, the node is still cached as "detached" * * Return false if @node is not in the @useless_nodes list. * Return true if @node is in the @useless_nodes list. */ static bool handle_useless_nodes(struct reloc_control *rc, struct btrfs_backref_node *node) { struct btrfs_backref_cache *cache = &rc->backref_cache; struct list_head *useless_node = &cache->useless_node; bool ret = false; while (!list_empty(useless_node)) { struct btrfs_backref_node *cur; cur = list_first_entry(useless_node, struct btrfs_backref_node, list); list_del_init(&cur->list); /* Only tree root nodes can be added to @useless_nodes */ ASSERT(list_empty(&cur->upper)); if (cur == node) ret = true; /* Cleanup the lower edges */ while (!list_empty(&cur->lower)) { struct btrfs_backref_edge *edge; struct btrfs_backref_node *lower; edge = list_entry(cur->lower.next, struct btrfs_backref_edge, list[UPPER]); list_del(&edge->list[UPPER]); list_del(&edge->list[LOWER]); lower = edge->node[LOWER]; btrfs_backref_free_edge(cache, edge); /* Child node is also orphan, queue for cleanup */ if (list_empty(&lower->upper)) list_add(&lower->list, useless_node); } /* Mark this block processed for relocation */ 
mark_block_processed(rc, cur); /* * Backref nodes for tree leaves are deleted from the cache. * Backref nodes for upper level tree blocks are left in the * cache to avoid unnecessary backref lookup. */ if (cur->level > 0) { cur->detached = 1; } else { rb_erase(&cur->rb_node, &cache->rb_root); btrfs_backref_free_node(cache, cur); } } return ret; } /* * Build backref tree for a given tree block. Root of the backref tree * corresponds the tree block, leaves of the backref tree correspond roots of * b-trees that reference the tree block. * * The basic idea of this function is check backrefs of a given block to find * upper level blocks that reference the block, and then check backrefs of * these upper level blocks recursively. The recursion stops when tree root is * reached or backrefs for the block is cached. * * NOTE: if we find that backrefs for a block are cached, we know backrefs for * all upper level blocks that directly/indirectly reference the block are also * cached. */ static noinline_for_stack struct btrfs_backref_node *build_backref_tree( struct btrfs_trans_handle *trans, struct reloc_control *rc, struct btrfs_key *node_key, int level, u64 bytenr) { struct btrfs_backref_iter *iter; struct btrfs_backref_cache *cache = &rc->backref_cache; /* For searching parent of TREE_BLOCK_REF */ struct btrfs_path *path; struct btrfs_backref_node *cur; struct btrfs_backref_node *node = NULL; struct btrfs_backref_edge *edge; int ret; iter = btrfs_backref_iter_alloc(rc->extent_root->fs_info); if (!iter) return ERR_PTR(-ENOMEM); path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto out; } node = btrfs_backref_alloc_node(cache, bytenr, level); if (!node) { ret = -ENOMEM; goto out; } cur = node; /* Breadth-first search to build backref cache */ do { ret = btrfs_backref_add_tree_node(trans, cache, path, iter, node_key, cur); if (ret < 0) goto out; edge = list_first_entry_or_null(&cache->pending_edge, struct btrfs_backref_edge, list[UPPER]); /* * The pending list isn't 
empty, take the first block to * process */ if (edge) { list_del_init(&edge->list[UPPER]); cur = edge->node[UPPER]; } } while (edge); /* Finish the upper linkage of newly added edges/nodes */ ret = btrfs_backref_finish_upper_links(cache, node); if (ret < 0) goto out; if (handle_useless_nodes(rc, node)) node = NULL; out: btrfs_free_path(iter->path); kfree(iter); btrfs_free_path(path); if (ret) { btrfs_backref_error_cleanup(cache, node); return ERR_PTR(ret); } ASSERT(!node || !node->detached); ASSERT(list_empty(&cache->useless_node) && list_empty(&cache->pending_edge)); return node; } /* * helper to add 'address of tree root -> reloc tree' mapping */ static int __add_reloc_root(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; struct rb_node *rb_node; struct mapping_node *node; struct reloc_control *rc = fs_info->reloc_ctl; node = kmalloc(sizeof(*node), GFP_NOFS); if (!node) return -ENOMEM; node->bytenr = root->commit_root->start; node->data = root; spin_lock(&rc->reloc_root_tree.lock); rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root, node->bytenr, &node->rb_node); spin_unlock(&rc->reloc_root_tree.lock); if (rb_node) { btrfs_err(fs_info, "Duplicate root found for start=%llu while inserting into relocation tree", node->bytenr); return -EEXIST; } list_add_tail(&root->root_list, &rc->reloc_roots); return 0; } /* * helper to delete the 'address of tree root -> reloc tree' * mapping */ static void __del_reloc_root(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; struct rb_node *rb_node; struct mapping_node *node = NULL; struct reloc_control *rc = fs_info->reloc_ctl; bool put_ref = false; if (rc && root->node) { spin_lock(&rc->reloc_root_tree.lock); rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root, root->commit_root->start); if (rb_node) { node = rb_entry(rb_node, struct mapping_node, rb_node); rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root); RB_CLEAR_NODE(&node->rb_node); } 
spin_unlock(&rc->reloc_root_tree.lock); ASSERT(!node || (struct btrfs_root *)node->data == root); } /* * We only put the reloc root here if it's on the list. There's a lot * of places where the pattern is to splice the rc->reloc_roots, process * the reloc roots, and then add the reloc root back onto * rc->reloc_roots. If we call __del_reloc_root while it's off of the * list we don't want the reference being dropped, because the guy * messing with the list is in charge of the reference. */ spin_lock(&fs_info->trans_lock); if (!list_empty(&root->root_list)) { put_ref = true; list_del_init(&root->root_list); } spin_unlock(&fs_info->trans_lock); if (put_ref) btrfs_put_root(root); kfree(node); } /* * helper to update the 'address of tree root -> reloc tree' * mapping */ static int __update_reloc_root(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; struct rb_node *rb_node; struct mapping_node *node = NULL; struct reloc_control *rc = fs_info->reloc_ctl; spin_lock(&rc->reloc_root_tree.lock); rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root, root->commit_root->start); if (rb_node) { node = rb_entry(rb_node, struct mapping_node, rb_node); rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root); } spin_unlock(&rc->reloc_root_tree.lock); if (!node) return 0; BUG_ON((struct btrfs_root *)node->data != root); spin_lock(&rc->reloc_root_tree.lock); node->bytenr = root->node->start; rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root, node->bytenr, &node->rb_node); spin_unlock(&rc->reloc_root_tree.lock); if (rb_node) btrfs_backref_panic(fs_info, node->bytenr, -EEXIST); return 0; } static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *reloc_root; struct extent_buffer *eb; struct btrfs_root_item *root_item; struct btrfs_key root_key; int ret = 0; bool must_abort = false; root_item = kmalloc(sizeof(*root_item), GFP_NOFS); 
if (!root_item) return ERR_PTR(-ENOMEM); root_key.objectid = BTRFS_TREE_RELOC_OBJECTID; root_key.type = BTRFS_ROOT_ITEM_KEY; root_key.offset = objectid; if (btrfs_root_id(root) == objectid) { u64 commit_root_gen; /* called by btrfs_init_reloc_root */ ret = btrfs_copy_root(trans, root, root->commit_root, &eb, BTRFS_TREE_RELOC_OBJECTID); if (ret) goto fail; /* * Set the last_snapshot field to the generation of the commit * root - like this ctree.c:btrfs_block_can_be_shared() behaves * correctly (returns true) when the relocation root is created * either inside the critical section of a transaction commit * (through transaction.c:qgroup_account_snapshot()) and when * it's created before the transaction commit is started. */ commit_root_gen = btrfs_header_generation(root->commit_root); btrfs_set_root_last_snapshot(&root->root_item, commit_root_gen); } else { /* * called by btrfs_reloc_post_snapshot_hook. * the source tree is a reloc tree, all tree blocks * modified after it was created have RELOC flag * set in their headers. so it's OK to not update * the 'last_snapshot'. */ ret = btrfs_copy_root(trans, root, root->node, &eb, BTRFS_TREE_RELOC_OBJECTID); if (ret) goto fail; } /* * We have changed references at this point, we must abort the * transaction if anything fails. 
*/ must_abort = true; memcpy(root_item, &root->root_item, sizeof(*root_item)); btrfs_set_root_bytenr(root_item, eb->start); btrfs_set_root_level(root_item, btrfs_header_level(eb)); btrfs_set_root_generation(root_item, trans->transid); if (btrfs_root_id(root) == objectid) { btrfs_set_root_refs(root_item, 0); memset(&root_item->drop_progress, 0, sizeof(struct btrfs_disk_key)); btrfs_set_root_drop_level(root_item, 0); } btrfs_tree_unlock(eb); free_extent_buffer(eb); ret = btrfs_insert_root(trans, fs_info->tree_root, &root_key, root_item); if (ret) goto fail; kfree(root_item); reloc_root = btrfs_read_tree_root(fs_info->tree_root, &root_key); if (IS_ERR(reloc_root)) { ret = PTR_ERR(reloc_root); goto abort; } set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state); btrfs_set_root_last_trans(reloc_root, trans->transid); return reloc_root; fail: kfree(root_item); abort: if (must_abort) btrfs_abort_transaction(trans, ret); return ERR_PTR(ret); } /* * create reloc tree for a given fs tree. reloc tree is just a * snapshot of the fs tree with special root objectid. * * The reloc_root comes out of here with two references, one for * root->reloc_root, and another for being on the rc->reloc_roots list. */ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *reloc_root; struct reloc_control *rc = fs_info->reloc_ctl; struct btrfs_block_rsv *rsv; int clear_rsv = 0; int ret; if (!rc) return 0; /* * The subvolume has reloc tree but the swap is finished, no need to * create/update the dead reloc tree */ if (reloc_root_is_dead(root)) return 0; /* * This is subtle but important. We do not do * record_root_in_transaction for reloc roots, instead we record their * corresponding fs root, and then here we update the last trans for the * reloc root. This means that we have to do this for the entire life * of the reloc root, regardless of which stage of the relocation we are * in. 
*/ if (root->reloc_root) { reloc_root = root->reloc_root; btrfs_set_root_last_trans(reloc_root, trans->transid); return 0; } /* * We are merging reloc roots, we do not need new reloc trees. Also * reloc trees never need their own reloc tree. */ if (!rc->create_reloc_tree || btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) return 0; if (!trans->reloc_reserved) { rsv = trans->block_rsv; trans->block_rsv = rc->block_rsv; clear_rsv = 1; } reloc_root = create_reloc_root(trans, root, btrfs_root_id(root)); if (clear_rsv) trans->block_rsv = rsv; if (IS_ERR(reloc_root)) return PTR_ERR(reloc_root); ret = __add_reloc_root(reloc_root); ASSERT(ret != -EEXIST); if (ret) { /* Pairs with create_reloc_root */ btrfs_put_root(reloc_root); return ret; } root->reloc_root = btrfs_grab_root(reloc_root); return 0; } /* * update root item of reloc tree */ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *reloc_root; struct btrfs_root_item *root_item; int ret; if (!have_reloc_root(root)) return 0; reloc_root = root->reloc_root; root_item = &reloc_root->root_item; /* * We are probably ok here, but __del_reloc_root() will drop its ref of * the root. We have the ref for root->reloc_root, but just in case * hold it while we update the reloc root. */ btrfs_grab_root(reloc_root); /* root->reloc_root will stay until current relocation finished */ if (fs_info->reloc_ctl && fs_info->reloc_ctl->merge_reloc_tree && btrfs_root_refs(root_item) == 0) { set_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state); /* * Mark the tree as dead before we change reloc_root so * have_reloc_root will not touch it from now on. 
*/ smp_wmb(); __del_reloc_root(reloc_root); } if (reloc_root->commit_root != reloc_root->node) { __update_reloc_root(reloc_root); btrfs_set_root_node(root_item, reloc_root->node); free_extent_buffer(reloc_root->commit_root); reloc_root->commit_root = btrfs_root_node(reloc_root); } ret = btrfs_update_root(trans, fs_info->tree_root, &reloc_root->root_key, root_item); btrfs_put_root(reloc_root); return ret; } /* * get new location of data */ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr, u64 bytenr, u64 num_bytes) { struct btrfs_root *root = BTRFS_I(reloc_inode)->root; struct btrfs_path *path; struct btrfs_file_extent_item *fi; struct extent_buffer *leaf; int ret; path = btrfs_alloc_path(); if (!path) return -ENOMEM; bytenr -= BTRFS_I(reloc_inode)->reloc_block_group_start; ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(BTRFS_I(reloc_inode)), bytenr, 0); if (ret < 0) goto out; if (ret > 0) { ret = -ENOENT; goto out; } leaf = path->nodes[0]; fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); BUG_ON(btrfs_file_extent_offset(leaf, fi) || btrfs_file_extent_compression(leaf, fi) || btrfs_file_extent_encryption(leaf, fi) || btrfs_file_extent_other_encoding(leaf, fi)); if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) { ret = -EINVAL; goto out; } *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); ret = 0; out: btrfs_free_path(path); return ret; } /* * update file extent items in the tree leaf to point to * the new locations. 
*/
/*
 * Does nothing (returns 0) unless rc->stage is UPDATE_DATA_PTRS.
 *
 * For every non-inline file extent item in @leaf whose disk bytenr lies
 * inside rc->block_group, the new location is looked up through
 * get_new_location(), the item's disk bytenr is rewritten, a reference is
 * added for the new extent and the reference on the old extent is dropped.
 *
 * Returns 0 on success or a negative errno. A failure of
 * btrfs_inc_extent_ref() or btrfs_free_extent() aborts the transaction.
 */
static noinline_for_stack
int replace_file_extents(struct btrfs_trans_handle *trans,
			 struct reloc_control *rc,
			 struct btrfs_root *root,
			 struct extent_buffer *leaf)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_key key;
	struct btrfs_file_extent_item *fi;
	struct btrfs_inode *inode = NULL;
	u64 parent;
	u64 bytenr;
	u64 new_bytenr = 0;
	u64 num_bytes;
	u64 end;
	u32 nritems;
	u32 i;
	int ret = 0;
	int first = 1;

	if (rc->stage != UPDATE_DATA_PTRS)
		return 0;

	/* reloc trees always use full backref */
	if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
		parent = leaf->start;
	else
		parent = 0;

	nritems = btrfs_header_nritems(leaf);
	for (i = 0; i < nritems; i++) {
		struct btrfs_ref ref = { 0 };

		cond_resched();
		btrfs_item_key_to_cpu(leaf, &key, i);
		if (key.type != BTRFS_EXTENT_DATA_KEY)
			continue;
		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
		if (btrfs_file_extent_type(leaf, fi) ==
		    BTRFS_FILE_EXTENT_INLINE)
			continue;
		bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
		num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
		/* No disk extent behind this item, nothing to relocate */
		if (bytenr == 0)
			continue;
		if (!in_range(bytenr, rc->block_group->start,
			      rc->block_group->length))
			continue;

		/*
		 * if we are modifying block in fs tree, wait for read_folio
		 * to complete and drop the extent cache
		 */
		if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID) {
			/*
			 * Reuse the inode across items; a new lookup is done
			 * only once the item's objectid moves past it.
			 */
			if (first) {
				inode = btrfs_find_first_inode(root, key.objectid);
				first = 0;
			} else if (inode && btrfs_ino(inode) < key.objectid) {
				btrfs_add_delayed_iput(inode);
				inode = btrfs_find_first_inode(root, key.objectid);
			}
			if (inode && btrfs_ino(inode) == key.objectid) {
				struct extent_state *cached_state = NULL;

				end = key.offset +
				      btrfs_file_extent_num_bytes(leaf, fi);
				WARN_ON(!IS_ALIGNED(key.offset,
						    fs_info->sectorsize));
				WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
				end--;
				/* Take mmap lock to serialize with reflinks. */
				/* Both locks are trylocks; on contention the item is skipped */
				if (!down_read_trylock(&inode->i_mmap_lock))
					continue;
				ret = try_lock_extent(&inode->io_tree, key.offset,
						      end, &cached_state);
				if (!ret) {
					up_read(&inode->i_mmap_lock);
					continue;
				}

				btrfs_drop_extent_map_range(inode, key.offset, end, true);
				unlock_extent(&inode->io_tree, key.offset, end,
					      &cached_state);
				up_read(&inode->i_mmap_lock);
			}
		}

		ret = get_new_location(rc->data_inode, &new_bytenr,
				       bytenr, num_bytes);
		if (ret) {
			/*
			 * Don't have to abort since we've not changed anything
			 * in the file extent yet.
			 */
			break;
		}

		btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);

		key.offset -= btrfs_file_extent_offset(leaf, fi);
		ref.action = BTRFS_ADD_DELAYED_REF;
		ref.bytenr = new_bytenr;
		ref.num_bytes = num_bytes;
		ref.parent = parent;
		ref.owning_root = btrfs_root_id(root);
		ref.ref_root = btrfs_header_owner(leaf);
		btrfs_init_data_ref(&ref, key.objectid, key.offset,
				    btrfs_root_id(root), false);
		ret = btrfs_inc_extent_ref(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		ref.action = BTRFS_DROP_DELAYED_REF;
		ref.bytenr = bytenr;
		ref.num_bytes = num_bytes;
		ref.parent = parent;
		ref.owning_root = btrfs_root_id(root);
		ref.ref_root = btrfs_header_owner(leaf);
		btrfs_init_data_ref(&ref, key.objectid, key.offset,
				    btrfs_root_id(root), false);
		ret = btrfs_free_extent(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}
	}
	if (inode)
		btrfs_add_delayed_iput(inode);
	return ret;
}

/*
 * Compare the node key at @slot of @eb with the node key at
 * path->slots[@level] of path->nodes[@level].
 *
 * Returns the memcmp() result of the two on-disk keys, i.e. 0 when equal.
 */
static noinline_for_stack int memcmp_node_keys(const struct extent_buffer *eb,
					       int slot, const struct btrfs_path *path,
					       int level)
{
	struct btrfs_disk_key key1;
	struct btrfs_disk_key key2;
	btrfs_node_key(eb, &key1, slot);
	btrfs_node_key(path->nodes[level], &key2, path->slots[level]);
	return memcmp(&key1, &key2, sizeof(key1));
}

/*
 * try to replace tree blocks in fs tree with the new blocks
 * in reloc tree. tree blocks haven't been modified since the
 * reloc tree was create can be replaced.
 *
 * if a block was replaced, level of the block + 1 is returned.
* if no block got replaced, 0 is returned. if there are other
 * errors, a negative error number is returned.
 */
/*
 * @dest:         fs tree to modify (asserted not to be a reloc tree)
 * @src:          reloc tree supplying the new blocks (asserted to be one)
 * @path:         path whose nodes/slots supply the reloc tree side of the
 *                comparison; it is released and re-searched in @src before a
 *                swap
 * @next_key:     optional output; reset to the maximum key, then set to the
 *                key following each visited slot that has a successor
 * @lowest_level: the descent in @dest stops at this level
 * @max_level:    above this level no reloc tree pointer is looked at
 *
 * The first pass runs without COW; once a swappable pointer is found the walk
 * restarts from the root with COW enabled.
 */
static noinline_for_stack
int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc,
		 struct btrfs_root *dest, struct btrfs_root *src,
		 struct btrfs_path *path, struct btrfs_key *next_key,
		 int lowest_level, int max_level)
{
	struct btrfs_fs_info *fs_info = dest->fs_info;
	struct extent_buffer *eb;
	struct extent_buffer *parent;
	struct btrfs_ref ref = { 0 };
	struct btrfs_key key;
	u64 old_bytenr;
	u64 new_bytenr;
	u64 old_ptr_gen;
	u64 new_ptr_gen;
	u64 last_snapshot;
	u32 blocksize;
	int cow = 0;
	int level;
	int ret;
	int slot;

	ASSERT(btrfs_root_id(src) == BTRFS_TREE_RELOC_OBJECTID);
	ASSERT(btrfs_root_id(dest) != BTRFS_TREE_RELOC_OBJECTID);

	last_snapshot = btrfs_root_last_snapshot(&src->root_item);
again:
	slot = path->slots[lowest_level];
	btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);

	eb = btrfs_lock_root_node(dest);
	level = btrfs_header_level(eb);

	if (level < lowest_level) {
		btrfs_tree_unlock(eb);
		free_extent_buffer(eb);
		return 0;
	}

	if (cow) {
		ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb,
				      BTRFS_NESTING_COW);
		if (ret) {
			btrfs_tree_unlock(eb);
			free_extent_buffer(eb);
			return ret;
		}
	}

	if (next_key) {
		next_key->objectid = (u64)-1;
		next_key->type = (u8)-1;
		next_key->offset = (u64)-1;
	}

	parent = eb;
	while (1) {
		level = btrfs_header_level(parent);
		ASSERT(level >= lowest_level);

		ret = btrfs_bin_search(parent, 0, &key, &slot);
		if (ret < 0)
			break;
		/* No exact match: use the preceding slot */
		if (ret && slot > 0)
			slot--;

		if (next_key && slot + 1 < btrfs_header_nritems(parent))
			btrfs_node_key_to_cpu(parent, next_key, slot + 1);

		old_bytenr = btrfs_node_blockptr(parent, slot);
		blocksize = fs_info->nodesize;
		old_ptr_gen = btrfs_node_ptr_generation(parent, slot);

		if (level <= max_level) {
			eb = path->nodes[level];
			new_bytenr = btrfs_node_blockptr(eb,
							path->slots[level]);
			new_ptr_gen = btrfs_node_ptr_generation(eb,
							path->slots[level]);
		} else {
			new_bytenr = 0;
			new_ptr_gen = 0;
		}

		if (WARN_ON(new_bytenr > 0 && new_bytenr == old_bytenr)) {
			ret = level;
			break;
		}

		/* Pointer cannot be swapped at this level: descend or stop */
		if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
		    memcmp_node_keys(parent, slot, path, level)) {
			if (level <= lowest_level) {
				ret = 0;
				break;
			}

			eb = btrfs_read_node_slot(parent, slot);
			if (IS_ERR(eb)) {
				ret = PTR_ERR(eb);
				break;
			}
			btrfs_tree_lock(eb);
			if (cow) {
				ret = btrfs_cow_block(trans, dest, eb, parent,
						      slot, &eb,
						      BTRFS_NESTING_COW);
				if (ret) {
					btrfs_tree_unlock(eb);
					free_extent_buffer(eb);
					break;
				}
			}

			btrfs_tree_unlock(parent);
			free_extent_buffer(parent);

			parent = eb;
			continue;
		}

		/* Found a swappable pointer on the read-only pass: redo with COW */
		if (!cow) {
			btrfs_tree_unlock(parent);
			free_extent_buffer(parent);
			cow = 1;
			goto again;
		}

		btrfs_node_key_to_cpu(path->nodes[level], &key,
				      path->slots[level]);
		btrfs_release_path(path);

		path->lowest_level = level;
		set_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &src->state);
		ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
		clear_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &src->state);
		path->lowest_level = 0;
		if (ret) {
			if (ret > 0)
				ret = -ENOENT;
			break;
		}

		/*
		 * Inform qgroup to trace both subtrees.
		 *
		 * We must trace both trees.
		 * 1) Tree reloc subtree
		 *    If not traced, we will leak data numbers
		 * 2) Fs subtree
		 *    If not traced, we will double count old data
		 *
		 * We don't scan the subtree right now, but only record
		 * the swapped tree blocks.
		 * The real subtree rescan is delayed until we have new
		 * CoW on the subtree root node before transaction commit.
		 */
		ret = btrfs_qgroup_add_swapped_blocks(dest,
				rc->block_group, parent, slot,
				path->nodes[level], path->slots[level],
				last_snapshot);
		if (ret < 0)
			break;
		/*
		 * swap blocks in fs tree and reloc tree.
		 */
		btrfs_set_node_blockptr(parent, slot, new_bytenr);
		btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);

		btrfs_set_node_blockptr(path->nodes[level],
					path->slots[level], old_bytenr);
		btrfs_set_node_ptr_generation(path->nodes[level],
					      path->slots[level], old_ptr_gen);

		/* The old block is now referenced from the reloc tree node */
		ref.action = BTRFS_ADD_DELAYED_REF;
		ref.bytenr = old_bytenr;
		ref.num_bytes = blocksize;
		ref.parent = path->nodes[level]->start;
		ref.owning_root = btrfs_root_id(src);
		ref.ref_root = btrfs_root_id(src);
		btrfs_init_tree_ref(&ref, level - 1, 0, true);
		ret = btrfs_inc_extent_ref(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		/* The new block is now referenced from the fs tree */
		ref.action = BTRFS_ADD_DELAYED_REF;
		ref.bytenr = new_bytenr;
		ref.num_bytes = blocksize;
		ref.parent = 0;
		ref.owning_root = btrfs_root_id(dest);
		ref.ref_root = btrfs_root_id(dest);
		btrfs_init_tree_ref(&ref, level - 1, 0, true);
		ret = btrfs_inc_extent_ref(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		/* We don't know the real owning_root, use 0. */
		ref.action = BTRFS_DROP_DELAYED_REF;
		ref.bytenr = new_bytenr;
		ref.num_bytes = blocksize;
		ref.parent = path->nodes[level]->start;
		ref.owning_root = 0;
		ref.ref_root = btrfs_root_id(src);
		btrfs_init_tree_ref(&ref, level - 1, 0, true);
		ret = btrfs_free_extent(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		/* We don't know the real owning_root, use 0. */
		ref.action = BTRFS_DROP_DELAYED_REF;
		ref.bytenr = old_bytenr;
		ref.num_bytes = blocksize;
		ref.parent = 0;
		ref.owning_root = 0;
		ref.ref_root = btrfs_root_id(dest);
		btrfs_init_tree_ref(&ref, level - 1, 0, true);
		ret = btrfs_free_extent(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		btrfs_unlock_up_safe(path, 0);

		/* A block at (level - 1) was replaced; report its level + 1 */
		ret = level;
		break;
	}
	btrfs_tree_unlock(parent);
	free_extent_buffer(parent);
	return ret;
}

/*
 * helper to find next relocated block in reloc tree
 *
 * Releases the path nodes below *@level, then walks upwards advancing the
 * slot at each level until a pointer with a generation newer than the root's
 * last_snapshot is found. Exhausted nodes are released on the way.
 *
 * Returns 0 with *@level set to the level of that pointer, or 1 if no such
 * pointer is left.
 */
static noinline_for_stack
int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
		       int *level)
{
	struct extent_buffer *eb;
	int i;
	u64 last_snapshot;
	u32 nritems;

	last_snapshot = btrfs_root_last_snapshot(&root->root_item);

	for (i = 0; i < *level; i++) {
		free_extent_buffer(path->nodes[i]);
		path->nodes[i] = NULL;
	}

	for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
		eb = path->nodes[i];
		nritems = btrfs_header_nritems(eb);
		while (path->slots[i] + 1 < nritems) {
			path->slots[i]++;
			if (btrfs_node_ptr_generation(eb, path->slots[i]) <=
			    last_snapshot)
				continue;

			*level = i;
			return 0;
		}
		free_extent_buffer(path->nodes[i]);
		path->nodes[i] = NULL;
	}
	return 1;
}

/*
 * walk down reloc tree to find relocated block of lowest level
 *
 * Starting at *@level, pointers whose generation is not newer than the root's
 * last_snapshot are skipped and the first newer one is followed downwards.
 *
 * Returns 0 with *@level updated when a position was found, 1 when the
 * starting level has no newer pointer left (or *@level is 0), or a negative
 * errno if reading a child node fails.
 */
static noinline_for_stack
int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
			 int *level)
{
	struct extent_buffer *eb = NULL;
	int i;
	u64 ptr_gen = 0;
	u64 last_snapshot;
	u32 nritems;

	last_snapshot = btrfs_root_last_snapshot(&root->root_item);

	for (i = *level; i > 0; i--) {
		eb = path->nodes[i];
		nritems = btrfs_header_nritems(eb);
		while (path->slots[i] < nritems) {
			ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]);
			if (ptr_gen > last_snapshot)
				break;
			path->slots[i]++;
		}
		if (path->slots[i] >= nritems) {
			if (i == *level)
				break;
			/* Child has nothing newer; stay on the level above */
			*level = i + 1;
			return 0;
		}
		if (i == 1) {
			*level = i;
			return 0;
		}

		eb = btrfs_read_node_slot(eb, path->slots[i]);
		if (IS_ERR(eb))
			return PTR_ERR(eb);
		BUG_ON(btrfs_header_level(eb) != i - 1);
		path->nodes[i - 1] = eb;
		path->slots[i - 1] = 0;
	}
	return 1;
}

/*
 * invalidate extent cache for file extents whose key in range of
 * [min_key, max_key)
 *
 * Only regular files are handled. Always returns 0.
 */
static int invalidate_extent_cache(struct btrfs_root *root,
				   const struct btrfs_key *min_key,
				   const struct btrfs_key *max_key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_inode *inode = NULL;
	u64 objectid;
	u64 start, end;
	u64 ino;

	objectid = min_key->objectid;
	while (1) {
		struct extent_state *cached_state = NULL;

		cond_resched();
		/* Drop the inode taken in the previous iteration, if any */
		if (inode)
			iput(&inode->vfs_inode);

		if (objectid > max_key->objectid)
			break;

		inode = btrfs_find_first_inode(root, objectid);
		if (!inode)
			break;
		ino = btrfs_ino(inode);

		if (ino > max_key->objectid) {
			iput(&inode->vfs_inode);
			break;
		}

		objectid = ino + 1;
		if (!S_ISREG(inode->vfs_inode.i_mode))
			continue;

		if (unlikely(min_key->objectid == ino)) {
			if (min_key->type > BTRFS_EXTENT_DATA_KEY)
				continue;
			if (min_key->type < BTRFS_EXTENT_DATA_KEY)
				start = 0;
			else {
				start = min_key->offset;
				WARN_ON(!IS_ALIGNED(start, fs_info->sectorsize));
			}
		} else {
			start = 0;
		}

		if (unlikely(max_key->objectid == ino)) {
			if (max_key->type < BTRFS_EXTENT_DATA_KEY)
				continue;
			if (max_key->type > BTRFS_EXTENT_DATA_KEY) {
				end = (u64)-1;
			} else {
				if (max_key->offset == 0)
					continue;
				end = max_key->offset;
				WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
				/* max_key is exclusive, the locked range is inclusive */
				end--;
			}
		} else {
			end = (u64)-1;
		}

		/* the lock_extent waits for read_folio to complete */
		lock_extent(&inode->io_tree, start, end, &cached_state);
		btrfs_drop_extent_map_range(inode, start, end, true);
		unlock_extent(&inode->io_tree, start, end, &cached_state);
	}
	return 0;
}

/*
 * Find the node key that follows the current position of @path, looking at
 * @level first and then at each higher level that has a node in the path.
 *
 * Returns 0 with @key filled in, or 1 if there is no following key.
 */
static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key)

{
	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level])
			break;
		if (path->slots[level] + 1 <
		    btrfs_header_nritems(path->nodes[level])) {
			btrfs_node_key_to_cpu(path->nodes[level], key,
					      path->slots[level] + 1);
			return 0;
		}
		level++;
	}
	return 1;
}

/*
 * Insert current subvolume into reloc_control::dirty_subvol_roots
 */
/*
 * Before queueing, the reloc root item of @root is reset (drop_progress
 * zeroed, drop level 0, refs 0) and written back through
 * btrfs_update_reloc_root(). @root is added to the list only if it is not
 * already on it, and a reference is grabbed for the list.
 *
 * Returns 0 on success or the error from btrfs_update_reloc_root().
 */
static int insert_dirty_subvol(struct btrfs_trans_handle *trans,
			       struct reloc_control *rc,
			       struct btrfs_root *root)
{
	struct btrfs_root *reloc_root = root->reloc_root;
	struct btrfs_root_item *reloc_root_item;
	int ret;

	/* @root must be a subvolume tree root with a valid reloc tree */
	ASSERT(btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID);
	ASSERT(reloc_root);

	reloc_root_item = &reloc_root->root_item;
	memset(&reloc_root_item->drop_progress, 0,
		sizeof(reloc_root_item->drop_progress));
	btrfs_set_root_drop_level(reloc_root_item, 0);
	btrfs_set_root_refs(reloc_root_item, 0);
	ret = btrfs_update_reloc_root(trans, root);
	if (ret)
		return ret;

	/* Queue the root once; the list holds its own reference. */
	if (list_empty(&root->reloc_dirty_list)) {
		btrfs_grab_root(root);
		list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots);
	}

	return 0;
}

/*
 * Drain reloc_control::dirty_subvol_roots.
 *
 * For a subvolume root: unlink it from the list, detach its reloc root,
 * clear BTRFS_ROOT_DEAD_RELOC_TREE, drop the detached reloc tree with
 * btrfs_drop_snapshot() and put the reference on the subvolume root. For an
 * entry that is itself a reloc tree, the tree is dropped directly.
 *
 * Returns 0, or the first negative error returned by btrfs_drop_snapshot();
 * the remaining entries are still processed after an error.
 */
static int clean_dirty_subvols(struct reloc_control *rc)
{
	struct btrfs_root *root;
	struct btrfs_root *next;
	int ret = 0;
	int ret2;

	list_for_each_entry_safe(root, next, &rc->dirty_subvol_roots,
				 reloc_dirty_list) {
		if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID) {
			/* Merged subvolume, cleanup its reloc root */
			struct btrfs_root *reloc_root = root->reloc_root;

			list_del_init(&root->reloc_dirty_list);
			root->reloc_root = NULL;
			/*
			 * Need barrier to ensure clear_bit() only happens after
			 * root->reloc_root = NULL. Pairs with have_reloc_root.
			 */
			smp_wmb();
			clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
			if (reloc_root) {
				/*
				 * btrfs_drop_snapshot drops our ref we hold for
				 * ->reloc_root.  If it fails however we must
				 * drop the ref ourselves.
				 */
				ret2 = btrfs_drop_snapshot(reloc_root, 0, 1);
				if (ret2 < 0) {
					btrfs_put_root(reloc_root);
					/* Keep only the first error seen. */
					if (!ret)
						ret = ret2;
				}
			}
			btrfs_put_root(root);
		} else {
			/* Orphan reloc tree, just clean it up */
			ret2 = btrfs_drop_snapshot(root, 0, 1);
			if (ret2 < 0) {
				btrfs_put_root(root);
				if (!ret)
					ret = ret2;
			}
		}
	}
	return ret;
}

/*
 * merge the relocated tree blocks in reloc tree with corresponding
 * fs tree.
*/
static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
					       struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
	struct btrfs_key key;
	struct btrfs_key next_key;
	struct btrfs_trans_handle *trans = NULL;
	struct btrfs_root *reloc_root;
	struct btrfs_root_item *root_item;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	int reserve_level;
	int level;
	int max_level;
	int replaced = 0;
	int ret = 0;
	u32 min_reserved;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = READA_FORWARD;

	reloc_root = root->reloc_root;
	root_item = &reloc_root->root_item;

	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
		/* No saved progress: start from the reloc tree root node. */
		level = btrfs_root_level(root_item);
		atomic_inc(&reloc_root->node->refs);
		path->nodes[level] = reloc_root->node;
		path->slots[level] = 0;
	} else {
		/* Resume from the key and level saved in drop_progress. */
		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);

		level = btrfs_root_drop_level(root_item);
		BUG_ON(level == 0);
		path->lowest_level = level;
		ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
		path->lowest_level = 0;
		if (ret < 0) {
			btrfs_free_path(path);
			return ret;
		}

		btrfs_node_key_to_cpu(path->nodes[level], &next_key,
				      path->slots[level]);
		WARN_ON(memcmp(&key, &next_key, sizeof(key)));

		btrfs_unlock_up_safe(path, 0);
	}

	/*
	 * In merge_reloc_root(), we modify the upper level pointer to swap the
	 * tree blocks between reloc tree and subvolume tree.  Thus for tree
	 * block COW, we COW at most from level 1 to root level for each tree.
	 *
	 * Thus the needed metadata size is at most root_level * nodesize,
	 * and * 2 since we have two trees to COW.
	 */
	reserve_level = max_t(int, 1, btrfs_root_level(root_item));
	min_reserved = fs_info->nodesize * reserve_level * 2;
	memset(&next_key, 0, sizeof(next_key));

	/* One transaction per iteration; progress is saved before each ends. */
	while (1) {
		ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv,
					     min_reserved,
					     BTRFS_RESERVE_FLUSH_LIMIT);
		if (ret)
			goto out;
		trans = btrfs_start_transaction(root, 0);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			/* The exit path must not end an error pointer. */
			trans = NULL;
			goto out;
		}

		/*
		 * At this point we no longer have a reloc_control, so we can't
		 * depend on btrfs_init_reloc_root to update our last_trans.
		 *
		 * But that's ok, we started the trans handle on our
		 * corresponding fs_root, which means it's been added to the
		 * dirty list.  At commit time we'll still call
		 * btrfs_update_reloc_root() and update our root item
		 * appropriately.
		 */
		btrfs_set_root_last_trans(reloc_root, trans->transid);
		trans->block_rsv = rc->block_rsv;

		replaced = 0;
		max_level = level;

		ret = walk_down_reloc_tree(reloc_root, path, &level);
		if (ret < 0)
			goto out;
		if (ret > 0)
			break;

		if (!find_next_key(path, level, &key) &&
		    btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
			ret = 0;
		} else {
			ret = replace_path(trans, rc, root, reloc_root, path,
					   &next_key, level, max_level);
		}
		if (ret < 0)
			goto out;
		if (ret > 0) {
			/* A positive return is used as the new level. */
			level = ret;
			btrfs_node_key_to_cpu(path->nodes[level], &key,
					      path->slots[level]);
			replaced = 1;
		}

		ret = walk_up_reloc_tree(reloc_root, path, &level);
		if (ret > 0)
			break;

		BUG_ON(level == 0);
		/*
		 * save the merging progress in the drop_progress.
		 * this is OK since root refs == 1 in this case.
		 */
		btrfs_node_key(path->nodes[level], &root_item->drop_progress,
			       path->slots[level]);
		btrfs_set_root_drop_level(root_item, level);

		btrfs_end_transaction_throttle(trans);
		trans = NULL;

		btrfs_btree_balance_dirty(fs_info);

		if (replaced && rc->stage == UPDATE_DATA_PTRS)
			invalidate_extent_cache(root, &key, &next_key);
	}

	/*
	 * handle the case only one block in the fs tree need to be
	 * relocated and the block is tree root.
	 */
	leaf = btrfs_lock_root_node(root);
	ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf,
			      BTRFS_NESTING_COW);
	btrfs_tree_unlock(leaf);
	free_extent_buffer(leaf);
out:
	btrfs_free_path(path);

	/* Only a fully merged root is queued for cleanup. */
	if (ret == 0) {
		ret = insert_dirty_subvol(trans, rc, root);
		if (ret)
			btrfs_abort_transaction(trans, ret);
	}

	if (trans)
		btrfs_end_transaction_throttle(trans);

	btrfs_btree_balance_dirty(fs_info);

	if (replaced && rc->stage == UPDATE_DATA_PTRS)
		invalidate_extent_cache(root, &key, &next_key);

	return ret;
}

/*
 * Reserve space for merging and mark every reloc root for merging.
 *
 * @err is the error state of the caller so far. When it is non-zero no space
 * is reserved, the root item refs are left untouched and the transaction is
 * ended instead of committed. Returns @err, the first error hit here, or the
 * result of the transaction commit.
 */
static noinline_for_stack
int prepare_to_merge(struct reloc_control *rc, int err)
{
	struct btrfs_root *root = rc->extent_root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct btrfs_trans_handle *trans;
	LIST_HEAD(reloc_roots);
	u64 num_bytes = 0;
	int ret;

	mutex_lock(&fs_info->reloc_mutex);
	rc->merging_rsv_size += fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
	rc->merging_rsv_size += rc->nodes_relocated * 2;
	mutex_unlock(&fs_info->reloc_mutex);

again:
	if (!err) {
		num_bytes = rc->merging_rsv_size;
		ret = btrfs_block_rsv_add(fs_info, rc->block_rsv, num_bytes,
					  BTRFS_RESERVE_FLUSH_ALL);
		if (ret)
			err = ret;
	}

	trans = btrfs_join_transaction(rc->extent_root);
	if (IS_ERR(trans)) {
		if (!err)
			btrfs_block_rsv_release(fs_info, rc->block_rsv,
						num_bytes, NULL);
		return PTR_ERR(trans);
	}

	if (!err) {
		/* The wanted size changed meanwhile: release and retry. */
		if (num_bytes != rc->merging_rsv_size) {
			btrfs_end_transaction(trans);
			btrfs_block_rsv_release(fs_info, rc->block_rsv,
						num_bytes, NULL);
			goto again;
		}
	}

	rc->merge_reloc_tree = true;

	while (!list_empty(&rc->reloc_roots)) {
		reloc_root = list_entry(rc->reloc_roots.next,
					struct btrfs_root, root_list);
		list_del_init(&reloc_root->root_list);

		root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
				false);
		if (IS_ERR(root)) {
			/*
			 * Even if we have an error we need this reloc root
			 * back on our list so we can clean up properly.
*/
			list_add(&reloc_root->root_list, &reloc_roots);
			btrfs_abort_transaction(trans, (int)PTR_ERR(root));
			if (!err)
				err = PTR_ERR(root);
			break;
		}

		/* The fs root must point back at this very reloc root. */
		if (unlikely(root->reloc_root != reloc_root)) {
			if (root->reloc_root) {
				btrfs_err(fs_info,
"reloc tree mismatch, root %lld has reloc root key (%lld %u %llu) gen %llu, expect reloc root key (%lld %u %llu) gen %llu",
					  btrfs_root_id(root),
					  btrfs_root_id(root->reloc_root),
					  root->reloc_root->root_key.type,
					  root->reloc_root->root_key.offset,
					  btrfs_root_generation(
						  &root->reloc_root->root_item),
					  btrfs_root_id(reloc_root),
					  reloc_root->root_key.type,
					  reloc_root->root_key.offset,
					  btrfs_root_generation(
						  &reloc_root->root_item));
			} else {
				btrfs_err(fs_info,
"reloc tree mismatch, root %lld has no reloc root, expect reloc root key (%lld %u %llu) gen %llu",
					  btrfs_root_id(root),
					  btrfs_root_id(reloc_root),
					  reloc_root->root_key.type,
					  reloc_root->root_key.offset,
					  btrfs_root_generation(
						  &reloc_root->root_item));
			}
			/* Keep the reloc root on the local list for cleanup. */
			list_add(&reloc_root->root_list, &reloc_roots);
			btrfs_put_root(root);
			btrfs_abort_transaction(trans, -EUCLEAN);
			if (!err)
				err = -EUCLEAN;
			break;
		}

		/*
		 * set reference count to 1, so btrfs_recover_relocation
		 * knows it should resume merging
		 */
		if (!err)
			btrfs_set_root_refs(&reloc_root->root_item, 1);
		ret = btrfs_update_reloc_root(trans, root);

		/*
		 * Even if we have an error we need this reloc root back on our
		 * list so we can clean up properly.
*/ list_add(&reloc_root->root_list, &reloc_roots); btrfs_put_root(root); if (ret) { btrfs_abort_transaction(trans, ret); if (!err) err = ret; break; } } list_splice(&reloc_roots, &rc->reloc_roots); if (!err) err = btrfs_commit_transaction(trans); else btrfs_end_transaction(trans); return err; } static noinline_for_stack void free_reloc_roots(struct list_head *list) { struct btrfs_root *reloc_root, *tmp; list_for_each_entry_safe(reloc_root, tmp, list, root_list) __del_reloc_root(reloc_root); } static noinline_for_stack void merge_reloc_roots(struct reloc_control *rc) { struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; struct btrfs_root *root; struct btrfs_root *reloc_root; LIST_HEAD(reloc_roots); int found = 0; int ret = 0; again: root = rc->extent_root; /* * this serializes us with btrfs_record_root_in_transaction, * we have to make sure nobody is in the middle of * adding their roots to the list while we are * doing this splice */ mutex_lock(&fs_info->reloc_mutex); list_splice_init(&rc->reloc_roots, &reloc_roots); mutex_unlock(&fs_info->reloc_mutex); while (!list_empty(&reloc_roots)) { found = 1; reloc_root = list_entry(reloc_roots.next, struct btrfs_root, root_list); root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false); if (btrfs_root_refs(&reloc_root->root_item) > 0) { if (WARN_ON(IS_ERR(root))) { /* * For recovery we read the fs roots on mount, * and if we didn't find the root then we marked * the reloc root as a garbage root. For normal * relocation obviously the root should exist in * memory. However there's no reason we can't * handle the error properly here just in case. */ ret = PTR_ERR(root); goto out; } if (WARN_ON(root->reloc_root != reloc_root)) { /* * This can happen if on-disk metadata has some * corruption, e.g. bad reloc tree key offset. 
*/ ret = -EINVAL; goto out; } ret = merge_reloc_root(rc, root); btrfs_put_root(root); if (ret) { if (list_empty(&reloc_root->root_list)) list_add_tail(&reloc_root->root_list, &reloc_roots); goto out; } } else { if (!IS_ERR(root)) { if (root->reloc_root == reloc_root) { root->reloc_root = NULL; btrfs_put_root(reloc_root); } clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state); btrfs_put_root(root); } list_del_init(&reloc_root->root_list); /* Don't forget to queue this reloc root for cleanup */ list_add_tail(&reloc_root->reloc_dirty_list, &rc->dirty_subvol_roots); } } if (found) { found = 0; goto again; } out: if (ret) { btrfs_handle_fs_error(fs_info, ret, NULL); free_reloc_roots(&reloc_roots); /* new reloc root may be added */ mutex_lock(&fs_info->reloc_mutex); list_splice_init(&rc->reloc_roots, &reloc_roots); mutex_unlock(&fs_info->reloc_mutex); free_reloc_roots(&reloc_roots); } /* * We used to have * * BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); * * here, but it's wrong. If we fail to start the transaction in * prepare_to_merge() we will have only 0 ref reloc roots, none of which * have actually been removed from the reloc_root_tree rb tree. This is * fine because we're bailing here, and we hold a reference on the root * for the list that holds it, so these roots will be cleaned up when we * do the reloc_dirty_list afterwards. Meanwhile the root->reloc_root * will be cleaned up on unmount. * * The remaining nodes will be cleaned up by free_reloc_control. 
*/ } static void free_block_list(struct rb_root *blocks) { struct tree_block *block; struct rb_node *rb_node; while ((rb_node = rb_first(blocks))) { block = rb_entry(rb_node, struct tree_block, rb_node); rb_erase(rb_node, blocks); kfree(block); } } static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans, struct btrfs_root *reloc_root) { struct btrfs_fs_info *fs_info = reloc_root->fs_info; struct btrfs_root *root; int ret; if (btrfs_get_root_last_trans(reloc_root) == trans->transid) return 0; root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false); /* * This should succeed, since we can't have a reloc root without having * already looked up the actual root and created the reloc root for this * root. * * However if there's some sort of corruption where we have a ref to a * reloc root without a corresponding root this could return ENOENT. */ if (IS_ERR(root)) { ASSERT(0); return PTR_ERR(root); } if (root->reloc_root != reloc_root) { ASSERT(0); btrfs_err(fs_info, "root %llu has two reloc roots associated with it", reloc_root->root_key.offset); btrfs_put_root(root); return -EUCLEAN; } ret = btrfs_record_root_in_trans(trans, root); btrfs_put_root(root); return ret; } static noinline_for_stack struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, struct reloc_control *rc, struct btrfs_backref_node *node, struct btrfs_backref_edge *edges[]) { struct btrfs_backref_node *next; struct btrfs_root *root; int index = 0; int ret; next = walk_up_backref(node, edges, &index); root = next->root; /* * If there is no root, then our references for this block are * incomplete, as we should be able to walk all the way up to a block * that is owned by a root. * * This path is only for SHAREABLE roots, so if we come upon a * non-SHAREABLE root then we have backrefs that resolve improperly. * * Both of these cases indicate file system corruption, or a bug in the * backref walking code. 
*/
	if (unlikely(!root)) {
		btrfs_err(trans->fs_info,
			  "bytenr %llu doesn't have a backref path ending in a root",
			  node->bytenr);
		return ERR_PTR(-EUCLEAN);
	}
	if (unlikely(!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))) {
		btrfs_err(trans->fs_info,
			  "bytenr %llu has multiple refs with one ending in a non-shareable root",
			  node->bytenr);
		return ERR_PTR(-EUCLEAN);
	}

	/* Already a reloc tree: record its fs root and use it as is. */
	if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) {
		ret = record_reloc_root_in_trans(trans, root);
		if (ret)
			return ERR_PTR(ret);
		goto found;
	}

	ret = btrfs_record_root_in_trans(trans, root);
	if (ret)
		return ERR_PTR(ret);
	root = root->reloc_root;

	/*
	 * We could have raced with another thread which failed, so
	 * root->reloc_root may not be set, return ENOENT in this case.
	 */
	if (!root)
		return ERR_PTR(-ENOENT);

	if (next->new_bytenr) {
		/*
		 * We just created the reloc root, so we shouldn't have
		 * ->new_bytenr set yet. If it is then we have multiple roots
		 * pointing at the same bytenr which indicates corruption, or
		 * we've made a mistake in the backref walking code.
		 */
		ASSERT(next->new_bytenr == 0);
		btrfs_err(trans->fs_info,
			  "bytenr %llu possibly has multiple roots pointing at the same bytenr %llu",
			  node->bytenr, next->bytenr);
		return ERR_PTR(-EUCLEAN);
	}

	/* Switch the top backref node over to the reloc root. */
	next->new_bytenr = root->node->start;
	btrfs_put_root(next->root);
	next->root = btrfs_grab_root(root);
	ASSERT(next->root);
	mark_block_processed(rc, next);
found:
	next = node;
	/* setup backref node path for btrfs_reloc_cow_block */
	while (1) {
		rc->backref_cache.path[next->level] = next;
		if (--index < 0)
			break;
		next = edges[index]->node[UPPER];
	}
	return root;
}

/*
 * Select a tree root for relocation.
 *
 * Return NULL if the block is not shareable. We should use do_relocation() in
 * this case.
 *
 * Return a tree root pointer if the block is shareable.
 * Return -ENOENT if the block is root of reloc tree.
*/
static noinline_for_stack
struct btrfs_root *select_one_root(struct btrfs_backref_node *node)
{
	struct btrfs_backref_node *next;
	struct btrfs_root *root;
	struct btrfs_root *fs_root = NULL;
	struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
	int index = 0;

	next = node;
	while (1) {
		cond_resched();
		next = walk_up_backref(next, edges, &index);
		root = next->root;

		/*
		 * This can occur if we have incomplete extent refs leading all
		 * the way up a particular path, in this case return -EUCLEAN.
		 */
		if (!root)
			return ERR_PTR(-EUCLEAN);

		/* No other choice for non-shareable tree */
		if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
			return root;

		/* Remember the last root seen that is not a reloc tree. */
		if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID)
			fs_root = root;

		/* The walk moved above @node: NULL is returned. */
		if (next != node)
			return NULL;

		next = walk_down_backref(edges, &index);
		if (!next || next->level <= node->level)
			break;
	}

	if (!fs_root)
		return ERR_PTR(-ENOENT);
	return fs_root;
}

/*
 * Sum up nodesize for every not yet processed backref node reachable from
 * @node by following upper edges. @node itself must not be processed.
 */
static noinline_for_stack u64 calcu_metadata_size(struct reloc_control *rc,
						  struct btrfs_backref_node *node)
{
	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
	struct btrfs_backref_node *next = node;
	struct btrfs_backref_edge *edge;
	struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
	u64 num_bytes = 0;
	int index = 0;

	BUG_ON(node->processed);

	while (next) {
		cond_resched();
		/* Climb along the first upper edge, counting as we go. */
		while (1) {
			if (next->processed)
				break;

			num_bytes += fs_info->nodesize;

			if (list_empty(&next->upper))
				break;

			edge = list_entry(next->upper.next,
					struct btrfs_backref_edge, list[LOWER]);
			edges[index++] = edge;
			next = edge->node[UPPER];
		}
		next = walk_down_backref(edges, &index);
	}
	return num_bytes;
}

/*
 * Refill rc->block_rsv with @num_bytes and make @trans use that reservation.
 *
 * rc->reserved_bytes is increased by @num_bytes before the refill is tried.
 * Returns 0 on success. When the refill fails, the size of the block
 * reservation is enlarged and -EAGAIN is returned.
 */
static int refill_metadata_space(struct btrfs_trans_handle *trans,
				 struct reloc_control *rc, u64 num_bytes)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int ret;

	trans->block_rsv = rc->block_rsv;
	rc->reserved_bytes += num_bytes;

	/*
	 * We are under a transaction here so we can only do limited flushing.
	 * If we get an enospc just kick back -EAGAIN so we know to drop the
	 * transaction and try to refill when we can flush all the things.
	 */
	ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv, num_bytes,
				     BTRFS_RESERVE_FLUSH_LIMIT);
	if (ret) {
		u64 tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES;

		/* Double until the value exceeds what was asked for so far. */
		while (tmp <= rc->reserved_bytes)
			tmp <<= 1;
		/*
		 * only one thread can access block_rsv at this point,
		 * so we don't need hold lock to protect block_rsv.
		 * we expand more reservation size here to allow enough
		 * space for relocation and we will return earlier in
		 * enospc case.
		 */
		rc->block_rsv->size = tmp + fs_info->nodesize *
				      RELOCATION_RESERVED_NODES;
		return -EAGAIN;
	}

	return 0;
}

/*
 * Reserve twice the size computed by calcu_metadata_size() for @node.
 * Returns the result of refill_metadata_space().
 */
static int reserve_metadata_space(struct btrfs_trans_handle *trans,
				  struct reloc_control *rc,
				  struct btrfs_backref_node *node)
{
	u64 num_bytes;

	num_bytes = calcu_metadata_size(rc, node) * 2;
	return refill_metadata_space(trans, rc, num_bytes);
}

/*
 * relocate a block tree, and then update pointers in upper level
 * blocks that reference the block to point to the new location.
 *
 * if called by link_to_upper, the block has already been relocated.
 * in that case this function just updates pointers.
 */
static int do_relocation(struct btrfs_trans_handle *trans,
			 struct reloc_control *rc,
			 struct btrfs_backref_node *node,
			 struct btrfs_key *key,
			 struct btrfs_path *path, int lowest)
{
	struct btrfs_backref_node *upper;
	struct btrfs_backref_edge *edge;
	struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
	struct btrfs_root *root;
	struct extent_buffer *eb;
	u32 blocksize;
	u64 bytenr;
	int slot;
	int ret = 0;

	/*
	 * If we are lowest then this is the first time we're processing this
	 * block, and thus shouldn't have an eb associated with it yet.
*/
	ASSERT(!lowest || !node->eb);

	path->lowest_level = node->level + 1;
	rc->backref_cache.path[node->level] = node;
	/* Handle every upper node that references this block. */
	list_for_each_entry(edge, &node->upper, list[LOWER]) {
		cond_resched();

		upper = edge->node[UPPER];
		root = select_reloc_root(trans, rc, upper, edges);
		if (IS_ERR(root)) {
			ret = PTR_ERR(root);
			goto next;
		}

		if (upper->eb && !upper->locked) {
			if (!lowest) {
				ret = btrfs_bin_search(upper->eb, 0, key, &slot);
				if (ret < 0)
					goto next;
				BUG_ON(ret);
				bytenr = btrfs_node_blockptr(upper->eb, slot);
				/* Pointer already refers to the new block. */
				if (node->eb->start == bytenr)
					goto next;
			}
			btrfs_backref_drop_node_buffer(upper);
		}

		if (!upper->eb) {
			ret = btrfs_search_slot(trans, root, key, path, 0, 1);
			if (ret) {
				if (ret > 0)
					ret = -ENOENT;

				btrfs_release_path(path);
				break;
			}

			if (!upper->eb) {
				/* Take the buffer over from the path. */
				upper->eb = path->nodes[upper->level];
				path->nodes[upper->level] = NULL;
			} else {
				BUG_ON(upper->eb != path->nodes[upper->level]);
			}

			upper->locked = 1;
			path->locks[upper->level] = 0;

			slot = path->slots[upper->level];
			btrfs_release_path(path);
		} else {
			ret = btrfs_bin_search(upper->eb, 0, key, &slot);
			if (ret < 0)
				goto next;
			BUG_ON(ret);
		}

		bytenr = btrfs_node_blockptr(upper->eb, slot);
		if (lowest) {
			if (bytenr != node->bytenr) {
				btrfs_err(root->fs_info,
		"lowest leaf/node mismatch: bytenr %llu node->bytenr %llu slot %d upper %llu",
					  bytenr, node->bytenr, slot,
					  upper->eb->start);
				ret = -EIO;
				goto next;
			}
		} else {
			if (node->eb->start == bytenr)
				goto next;
		}

		blocksize = root->fs_info->nodesize;
		eb = btrfs_read_node_slot(upper->eb, slot);
		if (IS_ERR(eb)) {
			ret = PTR_ERR(eb);
			goto next;
		}
		btrfs_tree_lock(eb);

		if (!node->eb) {
			ret = btrfs_cow_block(trans, root, eb, upper->eb,
					      slot, &eb, BTRFS_NESTING_COW);
			btrfs_tree_unlock(eb);
			free_extent_buffer(eb);
			if (ret < 0)
				goto next;
			/*
			 * We've just COWed this block, it should have updated
			 * the correct backref node entry.
			 */
			ASSERT(node->eb == eb);
		} else {
			struct btrfs_ref ref = {
				.action = BTRFS_ADD_DELAYED_REF,
				.bytenr = node->eb->start,
				.num_bytes = blocksize,
				.parent = upper->eb->start,
				.owning_root = btrfs_header_owner(upper->eb),
				.ref_root = btrfs_header_owner(upper->eb),
			};

			/* Point the upper node at the already relocated block. */
			btrfs_set_node_blockptr(upper->eb, slot,
						node->eb->start);
			btrfs_set_node_ptr_generation(upper->eb, slot,
						      trans->transid);
			btrfs_mark_buffer_dirty(trans, upper->eb);

			btrfs_init_tree_ref(&ref, node->level,
					    btrfs_root_id(root), false);
			ret = btrfs_inc_extent_ref(trans, &ref);
			if (!ret)
				ret = btrfs_drop_subtree(trans, root, eb,
							 upper->eb);
			if (ret)
				btrfs_abort_transaction(trans, ret);
		}
next:
		if (!upper->pending)
			btrfs_backref_drop_node_buffer(upper);
		else
			btrfs_backref_unlock_node_buffer(upper);
		if (ret)
			break;
	}

	if (!ret && node->pending) {
		btrfs_backref_drop_node_buffer(node);
		list_del_init(&node->list);
		node->pending = 0;
	}

	path->lowest_level = 0;

	/*
	 * We should have allocated all of our space in the block rsv and thus
	 * shouldn't ENOSPC.
	 */
	ASSERT(ret != -ENOSPC);
	return ret;
}

/*
 * Update the pointers to @node in its upper blocks, using the first key
 * stored in the node's extent buffer. Returns the do_relocation() result.
 */
static int link_to_upper(struct btrfs_trans_handle *trans,
			 struct reloc_control *rc,
			 struct btrfs_backref_node *node,
			 struct btrfs_path *path)
{
	struct btrfs_key key;

	btrfs_node_key_to_cpu(node->eb, &key, 0);
	return do_relocation(trans, rc, node, &key, path, 0);
}

/*
 * Process the pending backref nodes of every level.
 *
 * Each pending node is moved to a temporary list and, as long as no error
 * has been seen (including the incoming @err), linked to its upper blocks.
 * The temporary list is spliced back onto the level's pending list
 * afterwards. Returns @err or the first negative link_to_upper() result.
 */
static int finish_pending_nodes(struct btrfs_trans_handle *trans,
				struct reloc_control *rc,
				struct btrfs_path *path, int err)
{
	LIST_HEAD(list);
	struct btrfs_backref_cache *cache = &rc->backref_cache;
	struct btrfs_backref_node *node;
	int level;
	int ret;

	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
		while (!list_empty(&cache->pending[level])) {
			node = list_entry(cache->pending[level].next,
					  struct btrfs_backref_node, list);
			list_move_tail(&node->list, &list);
			BUG_ON(!node->pending);

			if (!err) {
				ret = link_to_upper(trans, rc, node, path);
				if (ret < 0)
					err = ret;
			}
		}
		list_splice_init(&list, &cache->pending[level]);
	}
	return err;
}

/*
 * mark a block and all blocks directly/indirectly reference the block
 * as processed.
*/ static void update_processed_blocks(struct reloc_control *rc, struct btrfs_backref_node *node) { struct btrfs_backref_node *next = node; struct btrfs_backref_edge *edge; struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1]; int index = 0; while (next) { cond_resched(); while (1) { if (next->processed) break; mark_block_processed(rc, next); if (list_empty(&next->upper)) break; edge = list_entry(next->upper.next, struct btrfs_backref_edge, list[LOWER]); edges[index++] = edge; next = edge->node[UPPER]; } next = walk_down_backref(edges, &index); } } static int tree_block_processed(u64 bytenr, struct reloc_control *rc) { u32 blocksize = rc->extent_root->fs_info->nodesize; if (test_range_bit(&rc->processed_blocks, bytenr, bytenr + blocksize - 1, EXTENT_DIRTY, NULL)) return 1; return 0; } static int get_tree_block_key(struct btrfs_fs_info *fs_info, struct tree_block *block) { struct btrfs_tree_parent_check check = { .level = block->level, .owner_root = block->owner, .transid = block->key.offset }; struct extent_buffer *eb; eb = read_tree_block(fs_info, block->bytenr, &check); if (IS_ERR(eb)) return PTR_ERR(eb); if (!extent_buffer_uptodate(eb)) { free_extent_buffer(eb); return -EIO; } if (block->level == 0) btrfs_item_key_to_cpu(eb, &block->key, 0); else btrfs_node_key_to_cpu(eb, &block->key, 0); free_extent_buffer(eb); block->key_ready = true; return 0; } /* * helper function to relocate a tree block */ static int relocate_tree_block(struct btrfs_trans_handle *trans, struct reloc_control *rc, struct btrfs_backref_node *node, struct btrfs_key *key, struct btrfs_path *path) { struct btrfs_root *root; int ret = 0; if (!node) return 0; /* * If we fail here we want to drop our backref_node because we are going * to start over and regenerate the tree for it. 
*/
	ret = reserve_metadata_space(trans, rc, node);
	if (ret)
		goto out;

	BUG_ON(node->processed);
	root = select_one_root(node);
	if (IS_ERR(root)) {
		ret = PTR_ERR(root);

		/* See explanation in select_one_root for the -EUCLEAN case. */
		ASSERT(ret == -ENOENT);
		if (ret == -ENOENT) {
			/* Not an error: just mark the blocks as processed. */
			ret = 0;
			update_processed_blocks(rc, node);
		}
		goto out;
	}

	if (root) {
		if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
			/*
			 * This block was the root block of a root, and this is
			 * the first time we're processing the block and thus it
			 * should not have had the ->new_bytenr modified.
			 *
			 * However in the case of corruption we could have
			 * multiple refs pointing to the same block improperly,
			 * and thus we would trip over these checks.  ASSERT()
			 * for the developer case, because it could indicate a
			 * bug in the backref code, however error out for a
			 * normal user in the case of corruption.
			 */
			ASSERT(node->new_bytenr == 0);
			if (node->new_bytenr) {
				btrfs_err(root->fs_info,
				  "bytenr %llu has improper references to it",
					  node->bytenr);
				ret = -EUCLEAN;
				goto out;
			}
			ret = btrfs_record_root_in_trans(trans, root);
			if (ret)
				goto out;
			/*
			 * Another thread could have failed, need to check if we
			 * have reloc_root actually set.
			 */
			if (!root->reloc_root) {
				ret = -ENOENT;
				goto out;
			}
			/* Switch the node over to the reloc root. */
			root = root->reloc_root;
			node->new_bytenr = root->node->start;
			btrfs_put_root(node->root);
			node->root = btrfs_grab_root(root);
			ASSERT(node->root);
		} else {
			btrfs_err(root->fs_info,
				  "bytenr %llu resolved to a non-shareable root",
				  node->bytenr);
			ret = -EUCLEAN;
			goto out;
		}
		if (!ret)
			update_processed_blocks(rc, node);
	} else {
		ret = do_relocation(trans, rc, node, key, path, 1);
	}
out:
	/* Leaf nodes and failed nodes are dropped from the backref cache. */
	if (ret || node->level == 0)
		btrfs_backref_cleanup_node(&rc->backref_cache, node);
	return ret;
}

/*
 * Relocate a block by searching down to it with COW enabled, without
 * building a backref tree.
 *
 * Space for the levels between the owner root's node and the block is
 * refilled first. A positive search result is turned into 0. Returns 0 on
 * success or a negative errno.
 */
static int relocate_cowonly_block(struct btrfs_trans_handle *trans,
				  struct reloc_control *rc, struct tree_block *block,
				  struct btrfs_path *path)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *root;
	u64 num_bytes;
	int nr_levels;
	int ret;

	root = btrfs_get_fs_root(fs_info, block->owner, true);
	if (IS_ERR(root))
		return PTR_ERR(root);

	nr_levels = max(btrfs_header_level(root->node) - block->level, 0) + 1;

	num_bytes = fs_info->nodesize * nr_levels;
	ret = refill_metadata_space(trans, rc, num_bytes);
	if (ret) {
		btrfs_put_root(root);
		return ret;
	}
	path->lowest_level = block->level;
	if (root == root->fs_info->chunk_root)
		btrfs_reserve_chunk_metadata(trans, false);

	ret = btrfs_search_slot(trans, root, &block->key, path, 0, 1);
	path->lowest_level = 0;
	btrfs_release_path(path);

	if (root == root->fs_info->chunk_root)
		btrfs_trans_release_chunk_metadata(trans);
	if (ret > 0)
		ret = 0;
	btrfs_put_root(root);

	return ret;
}

/*
 * relocate a list of blocks
 *
 * Runs three passes over @blocks: start readahead for blocks whose key is
 * not ready, read those keys, then relocate each block. Pending backref
 * nodes are finished afterwards unless reading a key failed. @blocks is
 * always emptied and freed before returning.
 */
static noinline_for_stack
int relocate_tree_blocks(struct btrfs_trans_handle *trans,
			 struct reloc_control *rc, struct rb_root *blocks)
{
	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
	struct btrfs_backref_node *node;
	struct btrfs_path *path;
	struct tree_block *block;
	struct tree_block *next;
	int ret = 0;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out_free_blocks;
	}

	/* Kick in readahead for tree blocks with missing keys */
	rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
		if (!block->key_ready)
			btrfs_readahead_tree_block(fs_info, block->bytenr,
						   block->owner, 0,
						   block->level);
	}

	/* Get first keys */
	rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
		if (!block->key_ready) {
			ret = get_tree_block_key(fs_info, block);
			if (ret)
				goto out_free_path;
		}
	}

	/* Do tree relocation */
	rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
		/*
		 * For COWonly blocks, or the data reloc tree, we only need to
		 * COW down to the block, there's no need to generate a backref
		 * tree.
		 */
		if (block->owner &&
		    (!is_fstree(block->owner) ||
		     block->owner == BTRFS_DATA_RELOC_TREE_OBJECTID)) {
			ret = relocate_cowonly_block(trans, rc, block, path);
			if (ret)
				break;
			continue;
		}

		node = build_backref_tree(trans, rc, &block->key,
					  block->level, block->bytenr);
		if (IS_ERR(node)) {
			ret = PTR_ERR(node);
			goto out;
		}

		ret = relocate_tree_block(trans, rc, node, &block->key,
					  path);
		if (ret < 0)
			break;
	}
out:
	ret = finish_pending_nodes(trans, rc, path, ret);

out_free_path:
	btrfs_free_path(path);
out_free_blocks:
	free_block_list(blocks);
	return ret;
}

/*
 * Preallocate file extents in the data relocation inode for the current
 * cluster, one range per cluster boundary.
 */
static noinline_for_stack int prealloc_file_extent_cluster(struct reloc_control *rc)
{
	const struct file_extent_cluster *cluster = &rc->cluster;
	struct btrfs_inode *inode = BTRFS_I(rc->data_inode);
	u64 alloc_hint = 0;
	u64 start;
	u64 end;
	u64 offset = inode->reloc_block_group_start;
	u64 num_bytes;
	int nr;
	int ret = 0;
	u64 i_size = i_size_read(&inode->vfs_inode);
	u64 prealloc_start = cluster->start - offset;
	u64 prealloc_end = cluster->end - offset;
	u64 cur_offset = prealloc_start;

	/*
	 * For subpage case, previous i_size may not be aligned to PAGE_SIZE.
	 * This means the range [i_size, PAGE_END + 1) is filled with zeros by
	 * btrfs_do_readpage() call of previously relocated file cluster.
*
	 * If the current cluster starts in the above range, btrfs_do_readpage()
	 * will skip the read, and relocate_one_folio() will later writeback
	 * the padding zeros as new data, causing data corruption.
	 *
	 * Here we have to manually invalidate the range (i_size, PAGE_END + 1).
	 */
	if (!PAGE_ALIGNED(i_size)) {
		struct address_space *mapping = inode->vfs_inode.i_mapping;
		struct btrfs_fs_info *fs_info = inode->root->fs_info;
		const u32 sectorsize = fs_info->sectorsize;
		struct folio *folio;

		ASSERT(sectorsize < PAGE_SIZE);
		ASSERT(IS_ALIGNED(i_size, sectorsize));

		/*
		 * Subpage can't handle page with DIRTY but without UPTODATE
		 * bit as it can lead to the following deadlock:
		 *
		 * btrfs_read_folio()
		 * | Page already *locked*
		 * |- btrfs_lock_and_flush_ordered_range()
		 *    |- btrfs_start_ordered_extent()
		 *       |- extent_write_cache_pages()
		 *          |- lock_page()
		 *             We try to lock the page we already hold.
		 *
		 * Here we just writeback the whole data reloc inode, so that
		 * we will be ensured to have no dirty range in the page, and
		 * are safe to clear the uptodate bits.
		 *
		 * This shouldn't cause too much overhead, as we need to write
		 * the data back anyway.
		 */
		ret = filemap_write_and_wait(mapping);
		if (ret < 0)
			return ret;

		clear_extent_bits(&inode->io_tree, i_size,
				  round_up(i_size, PAGE_SIZE) - 1,
				  EXTENT_UPTODATE);
		folio = filemap_lock_folio(mapping, i_size >> PAGE_SHIFT);
		/*
		 * If page is freed we don't need to do anything then, as we
		 * will re-read the whole page anyway.
		 */
		if (!IS_ERR(folio)) {
			btrfs_subpage_clear_uptodate(fs_info, folio, i_size,
					round_up(i_size, PAGE_SIZE) - i_size);
			folio_unlock(folio);
			folio_put(folio);
		}
	}

	BUG_ON(cluster->start != cluster->boundary[0]);
	ret = btrfs_alloc_data_chunk_ondemand(inode,
					      prealloc_end + 1 - prealloc_start);
	if (ret)
		return ret;

	btrfs_inode_lock(inode, 0);
	for (nr = 0; nr < cluster->nr; nr++) {
		struct extent_state *cached_state = NULL;

		/* Each range runs up to the next boundary or the cluster end. */
		start = cluster->boundary[nr] - offset;
		if (nr + 1 < cluster->nr)
			end = cluster->boundary[nr + 1] - 1 - offset;
		else
			end = cluster->end - offset;

		lock_extent(&inode->io_tree, start, end, &cached_state);
		num_bytes = end + 1 - start;
		ret = btrfs_prealloc_file_range(&inode->vfs_inode, 0, start,
						num_bytes, num_bytes,
						end + 1, &alloc_hint);
		cur_offset = end + 1;
		unlock_extent(&inode->io_tree, start, end, &cached_state);
		if (ret)
			break;
	}
	btrfs_inode_unlock(inode, 0);

	/* Give back the reservation for the part that was not reached. */
	if (cur_offset < prealloc_end)
		btrfs_free_reserved_data_space_noquota(inode->root->fs_info,
					       prealloc_end + 1 - cur_offset);
	return ret;
}

/*
 * Install a single pinned extent map covering the current cluster in the
 * data relocation inode, replacing whatever was mapped in that range.
 *
 * Returns 0 on success, -ENOMEM if the extent map cannot be allocated, or
 * the error from btrfs_replace_extent_map_range().
 */
static noinline_for_stack int setup_relocation_extent_mapping(struct reloc_control *rc)
{
	struct btrfs_inode *inode = BTRFS_I(rc->data_inode);
	struct extent_map *em;
	struct extent_state *cached_state = NULL;
	u64 offset = inode->reloc_block_group_start;
	u64 start = rc->cluster.start - offset;
	u64 end = rc->cluster.end - offset;
	int ret = 0;

	em = alloc_extent_map();
	if (!em)
		return -ENOMEM;

	em->start = start;
	em->len = end + 1 - start;
	em->disk_bytenr = rc->cluster.start;
	em->disk_num_bytes = em->len;
	em->ram_bytes = em->len;
	em->flags |= EXTENT_FLAG_PINNED;

	lock_extent(&inode->io_tree, start, end, &cached_state);
	ret = btrfs_replace_extent_map_range(inode, em, false);
	unlock_extent(&inode->io_tree, start, end, &cached_state);
	free_extent_map(em);

	return ret;
}

/*
 * Allow error injection to test balance/relocation cancellation
 *
 * Returns non-zero when a balance cancel or a reloc cancel was requested, or
 * when the current task has a fatal signal pending.
 */
noinline int btrfs_should_cancel_balance(const struct btrfs_fs_info *fs_info)
{
	return atomic_read(&fs_info->balance_cancel_req) ||
		atomic_read(&fs_info->reloc_cancel_req) ||
		fatal_signal_pending(current);
}
ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE);

/*
 * Return the inclusive end of extent number @cluster_nr inside @cluster:
 * the byte before the next boundary, or the cluster end for the last extent.
 */
static u64 get_cluster_boundary_end(const struct file_extent_cluster *cluster,
				    int cluster_nr)
{
	/* Last extent, use cluster end directly */
	if (cluster_nr >= cluster->nr - 1)
		return cluster->end;

	/* Use next boundary start */
	return cluster->boundary[cluster_nr + 1] - 1;
}

static int relocate_one_folio(struct reloc_control *rc,
			      struct file_ra_state *ra,
			      int *cluster_nr, unsigned long index)
{
	const struct file_extent_cluster *cluster = &rc->cluster;
	struct inode *inode = rc->data_inode;
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	u64 offset = BTRFS_I(inode)->reloc_block_group_start;
	const unsigned long last_index = (cluster->end - offset) >> PAGE_SHIFT;
	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
	struct folio *folio;
	u64 folio_start;
	u64 folio_end;
	u64 cur;
	int ret;
	const bool use_rst = btrfs_need_stripe_tree_update(fs_info, rc->block_group->flags);

	ASSERT(index <= last_index);
again:
	folio = filemap_lock_folio(inode->i_mapping, index);
	if (IS_ERR(folio)) {

		/*
		 * On relocation we're doing readahead on the relocation inode,
		 * but if the filesystem is backed by a RAID stripe tree we can
		 * get ENOENT (e.g. due to preallocated extents not being
		 * mapped in the RST) from the lookup.
		 *
		 * But readahead doesn't handle the error and submits invalid
		 * reads to the device, causing a assertion failures.
*/ if (!use_rst) page_cache_sync_readahead(inode->i_mapping, ra, NULL, index, last_index + 1 - index); folio = __filemap_get_folio(inode->i_mapping, index, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask); if (IS_ERR(folio)) return PTR_ERR(folio); } WARN_ON(folio_order(folio)); if (folio_test_readahead(folio) && !use_rst) page_cache_async_readahead(inode->i_mapping, ra, NULL, folio, last_index + 1 - index); if (!folio_test_uptodate(folio)) { btrfs_read_folio(NULL, folio); folio_lock(folio); if (!folio_test_uptodate(folio)) { ret = -EIO; goto release_folio; } if (folio->mapping != inode->i_mapping) { folio_unlock(folio); folio_put(folio); goto again; } } /* * We could have lost folio private when we dropped the lock to read the * folio above, make sure we set_folio_extent_mapped() here so we have any * of the subpage blocksize stuff we need in place. */ ret = set_folio_extent_mapped(folio); if (ret < 0) goto release_folio; folio_start = folio_pos(folio); folio_end = folio_start + PAGE_SIZE - 1; /* * Start from the cluster, as for subpage case, the cluster can start * inside the folio. 
*/ cur = max(folio_start, cluster->boundary[*cluster_nr] - offset); while (cur <= folio_end) { struct extent_state *cached_state = NULL; u64 extent_start = cluster->boundary[*cluster_nr] - offset; u64 extent_end = get_cluster_boundary_end(cluster, *cluster_nr) - offset; u64 clamped_start = max(folio_start, extent_start); u64 clamped_end = min(folio_end, extent_end); u32 clamped_len = clamped_end + 1 - clamped_start; /* Reserve metadata for this range */ ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), clamped_len, clamped_len, false); if (ret) goto release_folio; /* Mark the range delalloc and dirty for later writeback */ lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, &cached_state); ret = btrfs_set_extent_delalloc(BTRFS_I(inode), clamped_start, clamped_end, 0, &cached_state); if (ret) { clear_extent_bit(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, EXTENT_LOCKED | EXTENT_BOUNDARY, &cached_state); btrfs_delalloc_release_metadata(BTRFS_I(inode), clamped_len, true); btrfs_delalloc_release_extents(BTRFS_I(inode), clamped_len); goto release_folio; } btrfs_folio_set_dirty(fs_info, folio, clamped_start, clamped_len); /* * Set the boundary if it's inside the folio. * Data relocation requires the destination extents to have the * same size as the source. * EXTENT_BOUNDARY bit prevents current extent from being merged * with previous extent. */ if (in_range(cluster->boundary[*cluster_nr] - offset, folio_start, PAGE_SIZE)) { u64 boundary_start = cluster->boundary[*cluster_nr] - offset; u64 boundary_end = boundary_start + fs_info->sectorsize - 1; set_extent_bit(&BTRFS_I(inode)->io_tree, boundary_start, boundary_end, EXTENT_BOUNDARY, NULL); } unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, &cached_state); btrfs_delalloc_release_extents(BTRFS_I(inode), clamped_len); cur += clamped_len; /* Crossed extent end, go to next extent */ if (cur >= extent_end) { (*cluster_nr)++; /* Just finished the last extent of the cluster, exit. 
*/ if (*cluster_nr >= cluster->nr) break; } } folio_unlock(folio); folio_put(folio); balance_dirty_pages_ratelimited(inode->i_mapping); btrfs_throttle(fs_info); if (btrfs_should_cancel_balance(fs_info)) ret = -ECANCELED; return ret; release_folio: folio_unlock(folio); folio_put(folio); return ret; } static int relocate_file_extent_cluster(struct reloc_control *rc) { struct inode *inode = rc->data_inode; const struct file_extent_cluster *cluster = &rc->cluster; u64 offset = BTRFS_I(inode)->reloc_block_group_start; unsigned long index; unsigned long last_index; struct file_ra_state *ra; int cluster_nr = 0; int ret = 0; if (!cluster->nr) return 0; ra = kzalloc(sizeof(*ra), GFP_NOFS); if (!ra) return -ENOMEM; ret = prealloc_file_extent_cluster(rc); if (ret) goto out; file_ra_state_init(ra, inode->i_mapping); ret = setup_relocation_extent_mapping(rc); if (ret) goto out; last_index = (cluster->end - offset) >> PAGE_SHIFT; for (index = (cluster->start - offset) >> PAGE_SHIFT; index <= last_index && !ret; index++) ret = relocate_one_folio(rc, ra, &cluster_nr, index); if (ret == 0) WARN_ON(cluster_nr != cluster->nr); out: kfree(ra); return ret; } static noinline_for_stack int relocate_data_extent(struct reloc_control *rc, const struct btrfs_key *extent_key) { struct inode *inode = rc->data_inode; struct file_extent_cluster *cluster = &rc->cluster; int ret; struct btrfs_root *root = BTRFS_I(inode)->root; if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) { ret = relocate_file_extent_cluster(rc); if (ret) return ret; cluster->nr = 0; } /* * Under simple quotas, we set root->relocation_src_root when we find * the extent. If adjacent extents have different owners, we can't merge * them while relocating. Handle this by storing the owning root that * started a cluster and if we see an extent from a different root break * cluster formation (just like the above case of non-adjacent extents). 
* * Without simple quotas, relocation_src_root is always 0, so we should * never see a mismatch, and it should have no effect on relocation * clusters. */ if (cluster->nr > 0 && cluster->owning_root != root->relocation_src_root) { u64 tmp = root->relocation_src_root; /* * root->relocation_src_root is the state that actually affects * the preallocation we do here, so set it to the root owning * the cluster we need to relocate. */ root->relocation_src_root = cluster->owning_root; ret = relocate_file_extent_cluster(rc); if (ret) return ret; cluster->nr = 0; /* And reset it back for the current extent's owning root. */ root->relocation_src_root = tmp; } if (!cluster->nr) { cluster->start = extent_key->objectid; cluster->owning_root = root->relocation_src_root; } else BUG_ON(cluster->nr >= MAX_EXTENTS); cluster->end = extent_key->objectid + extent_key->offset - 1; cluster->boundary[cluster->nr] = extent_key->objectid; cluster->nr++; if (cluster->nr >= MAX_EXTENTS) { ret = relocate_file_extent_cluster(rc); if (ret) return ret; cluster->nr = 0; } return 0; } /* * helper to add a tree block to the list. 
* the major work is getting the generation and level of the block
 */
/*
 * @rc:         relocation control, used for fs_info and error reporting
 * @extent_key: key of the extent item (EXTENT_ITEM or METADATA_ITEM) that
 *              @path points at
 * @path:       path positioned on the extent item; it is released here on
 *              every path except the -EINVAL return
 * @blocks:     rb-tree, keyed by bytenr, that receives the new tree_block
 *
 * Return 0 on success, -EINVAL for an invalid inline ref type, -EUCLEAN when
 * the item is not a recognized tree block backref, or -ENOMEM.
 */
static int add_tree_block(struct reloc_control *rc,
			  const struct btrfs_key *extent_key,
			  struct btrfs_path *path,
			  struct rb_root *blocks)
{
	struct extent_buffer *eb;
	struct btrfs_extent_item *ei;
	struct btrfs_tree_block_info *bi;
	struct tree_block *block;
	struct rb_node *rb_node;
	u32 item_size;
	int level = -1;
	u64 generation;
	u64 owner = 0;

	eb =  path->nodes[0];
	item_size = btrfs_item_size(eb, path->slots[0]);

	if (extent_key->type == BTRFS_METADATA_ITEM_KEY ||
	    item_size >= sizeof(*ei) + sizeof(*bi)) {
		unsigned long ptr = 0, end;

		ei = btrfs_item_ptr(eb, path->slots[0],
				struct btrfs_extent_item);
		end = (unsigned long)ei + item_size;
		if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) {
			/* Level comes from the tree_block_info after the item. */
			bi = (struct btrfs_tree_block_info *)(ei + 1);
			level = btrfs_tree_block_level(eb, bi);
			ptr = (unsigned long)(bi + 1);
		} else {
			/* For METADATA_ITEM keys the key offset is the level. */
			level = (int)extent_key->offset;
			ptr = (unsigned long)(ei + 1);
		}
		generation = btrfs_extent_generation(eb, ei);

		/*
		 * We're reading random blocks without knowing their owner ahead
		 * of time.  This is ok most of the time, as all reloc roots and
		 * fs roots have the same lock type.  However normal trees do
		 * not, and the only way to know ahead of time is to read the
		 * inline ref offset.  We know it's an fs root if
		 *
		 * 1. There's more than one ref.
		 * 2. There's a SHARED_DATA_REF_KEY set.
		 * 3. FULL_BACKREF is set on the flags.
		 *
		 * Otherwise it's safe to assume that the ref offset == the
		 * owner of this block, so we can use that when calling
		 * read_tree_block.
		 */
		if (btrfs_extent_refs(eb, ei) == 1 &&
		    !(btrfs_extent_flags(eb, ei) &
		      BTRFS_BLOCK_FLAG_FULL_BACKREF) &&
		    ptr < end) {
			struct btrfs_extent_inline_ref *iref;
			int type;

			iref = (struct btrfs_extent_inline_ref *)ptr;
			type = btrfs_get_extent_inline_ref_type(eb, iref,
							BTRFS_REF_TYPE_BLOCK);
			if (type == BTRFS_REF_TYPE_INVALID)
				return -EINVAL;
			if (type == BTRFS_TREE_BLOCK_REF_KEY)
				owner = btrfs_extent_inline_ref_offset(eb, iref);
		}
	} else {
		btrfs_print_leaf(eb);
		btrfs_err(rc->block_group->fs_info,
			  "unrecognized tree backref at tree block %llu slot %u",
			  eb->start, path->slots[0]);
		btrfs_release_path(path);
		return -EUCLEAN;
	}

	btrfs_release_path(path);

	BUG_ON(level == -1);

	block = kmalloc(sizeof(*block), GFP_NOFS);
	if (!block)
		return -ENOMEM;

	block->bytenr = extent_key->objectid;
	/* key_ready is false, so the key fields carry nodesize and generation. */
	block->key.objectid = rc->extent_root->fs_info->nodesize;
	block->key.offset = generation;
	block->level = level;
	block->key_ready = false;
	block->owner = owner;

	/* A non-NULL return means the bytenr was already in the tree. */
	rb_node = rb_simple_insert(blocks, block->bytenr, &block->rb_node);
	if (rb_node)
		btrfs_backref_panic(rc->extent_root->fs_info, block->bytenr,
				    -EEXIST);

	return 0;
}

/*
 * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY
 *
 * Looks up the extent item of the tree block at @bytenr in the commit root of
 * the extent tree and passes it to add_tree_block().  Nothing is done when the
 * block was already processed or is already present in @blocks.
 *
 * Return 0 on success or when the block is skipped, -ENOMEM, -EINVAL when no
 * matching extent item exists, or another negative error from the search.
 */
static int __add_tree_block(struct reloc_control *rc,
			    u64 bytenr, u32 blocksize,
			    struct rb_root *blocks)
{
	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret;
	bool skinny = btrfs_fs_incompat(fs_info, SKINNY_METADATA);

	if (tree_block_processed(bytenr, rc))
		return 0;

	if (rb_simple_search(blocks, bytenr))
		return 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
again:
	key.objectid = bytenr;
	if (skinny) {
		key.type = BTRFS_METADATA_ITEM_KEY;
		key.offset = (u64)-1;
	} else {
		key.type = BTRFS_EXTENT_ITEM_KEY;
		key.offset = blocksize;
	}

	path->search_commit_root = 1;
	path->skip_locking = 1;
	ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	if (ret > 0 && skinny) {
		/*
		 * No exact match for the (u64)-1 offset: the item we want, if
		 * it exists, sits in the previous slot.
		 */
		if (path->slots[0]) {
			path->slots[0]--;
			btrfs_item_key_to_cpu(path->nodes[0], &key,
					      path->slots[0]);
			if (key.objectid == bytenr &&
			    (key.type == BTRFS_METADATA_ITEM_KEY ||
			     (key.type == BTRFS_EXTENT_ITEM_KEY &&
			      key.offset == blocksize)))
				ret = 0;
		}

		/* Still not found: retry once with a plain EXTENT_ITEM key. */
		if (ret) {
			skinny = false;
			btrfs_release_path(path);
			goto again;
		}
	}
	if (ret) {
		ASSERT(ret == 1);
		btrfs_print_leaf(path->nodes[0]);
		btrfs_err(fs_info,
	     "tree block extent item (%llu) is not found in extent tree",
		     bytenr);
		WARN_ON(1);
		ret = -EINVAL;
		goto out;
	}

	ret = add_tree_block(rc, &key, path, blocks);
out:
	btrfs_free_path(path);
	return ret;
}

/*
 * Truncate the free space cache inode of @block_group.
 *
 * @inode: the cache inode, or NULL to look it up by @ino in the tree root
 * @ino:   inode number used for the lookup when @inode is NULL
 *
 * The inode reference is dropped with iput() before returning, whether it was
 * passed in or looked up here.
 *
 * Return 0 on success, -ENOENT if the inode lookup fails, or a negative error
 * from the space check, the transaction join or the truncation.
 */
static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
				    struct btrfs_block_group *block_group,
				    struct inode *inode,
				    u64 ino)
{
	struct btrfs_root *root = fs_info->tree_root;
	struct btrfs_trans_handle *trans;
	int ret = 0;

	if (inode)
		goto truncate;

	inode = btrfs_iget(ino, root);
	if (IS_ERR(inode))
		return -ENOENT;

truncate:
	ret = btrfs_check_trunc_cache_free_space(fs_info,
						 &fs_info->global_block_rsv);
	if (ret)
		goto out;

	trans = btrfs_join_transaction(root);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out;
	}

	ret = btrfs_truncate_free_space_cache(trans, block_group, inode);

	btrfs_end_transaction(trans);
	btrfs_btree_balance_dirty(fs_info);
out:
	iput(inode);
	return ret;
}

/*
 * Locate the free space cache EXTENT_DATA in root tree leaf and delete the
 * cache inode, to avoid free space cache data extent blocking data relocation.
*/
/*
 * @leaf:        tree leaf that references the data extent
 * @block_group: block group whose cache is truncated
 * @data_bytenr: disk bytenr of the data extent being looked for
 *
 * Return 0 if @leaf is not owned by the root tree or the cache was truncated,
 * -ENOENT if no matching file extent item is found in the leaf, or the error
 * from delete_block_group_cache().
 */
static int delete_v1_space_cache(struct extent_buffer *leaf,
				 struct btrfs_block_group *block_group,
				 u64 data_bytenr)
{
	u64 space_cache_ino;
	struct btrfs_file_extent_item *ei;
	struct btrfs_key key;
	bool found = false;
	int i;
	int ret;

	if (btrfs_header_owner(leaf) != BTRFS_ROOT_TREE_OBJECTID)
		return 0;

	for (i = 0; i < btrfs_header_nritems(leaf); i++) {
		u8 type;

		btrfs_item_key_to_cpu(leaf, &key, i);
		if (key.type != BTRFS_EXTENT_DATA_KEY)
			continue;
		ei = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
		type = btrfs_file_extent_type(leaf, ei);

		/* The key objectid of the matching item is the cache inode. */
		if ((type == BTRFS_FILE_EXTENT_REG ||
		     type == BTRFS_FILE_EXTENT_PREALLOC) &&
		    btrfs_file_extent_disk_bytenr(leaf, ei) == data_bytenr) {
			found = true;
			space_cache_ino = key.objectid;
			break;
		}
	}
	if (!found)
		return -ENOENT;
	ret = delete_block_group_cache(leaf->fs_info, block_group, NULL,
					space_cache_ino);
	return ret;
}

/*
 * helper to find all tree blocks that reference a given data extent
 *
 * @path is released first.  Every leaf returned by btrfs_find_all_leafs() for
 * the extent is read, checked for a v1 free space cache reference (which is
 * then deleted) and added to @blocks.  On error the collected @blocks list is
 * freed.
 *
 * Return a negative errno on failure, otherwise the non-negative result of
 * the last step that ran.
 */
static noinline_for_stack int add_data_references(struct reloc_control *rc,
						  const struct btrfs_key *extent_key,
						  struct btrfs_path *path,
						  struct rb_root *blocks)
{
	struct btrfs_backref_walk_ctx ctx = { 0 };
	struct ulist_iterator leaf_uiter;
	struct ulist_node *ref_node = NULL;
	const u32 blocksize = rc->extent_root->fs_info->nodesize;
	int ret = 0;

	btrfs_release_path(path);

	ctx.bytenr = extent_key->objectid;
	ctx.skip_inode_ref_list = true;
	ctx.fs_info = rc->extent_root->fs_info;

	ret = btrfs_find_all_leafs(&ctx);
	if (ret < 0)
		return ret;

	ULIST_ITER_INIT(&leaf_uiter);
	while ((ref_node = ulist_next(ctx.refs, &leaf_uiter))) {
		struct btrfs_tree_parent_check check = { 0 };
		struct extent_buffer *eb;

		eb = read_tree_block(ctx.fs_info, ref_node->val, &check);
		if (IS_ERR(eb)) {
			ret = PTR_ERR(eb);
			break;
		}
		ret = delete_v1_space_cache(eb, rc->block_group,
					    extent_key->objectid);
		free_extent_buffer(eb);
		if (ret < 0)
			break;
		ret = __add_tree_block(rc, ref_node->val, blocksize, blocks);
		if (ret < 0)
			break;
	}
	if (ret < 0)
		free_block_list(blocks);
	ulist_free(ctx.refs);
	return ret;
}

/*
 * helper to find next unprocessed extent
 *
 * Searches the commit root of the extent tree from rc->search_start for the
 * next EXTENT_ITEM or METADATA_ITEM inside the block group that is not
 * covered by an EXTENT_DIRTY range in rc->processed_blocks.
 *
 * Return 0 with @extent_key filled in, @path left pointing at the item and
 * rc->search_start advanced past it; a positive value when there is nothing
 * left to process (with @path released); a negative errno on error.
 */
static noinline_for_stack
int find_next_extent(struct reloc_control *rc, struct btrfs_path *path,
		     struct btrfs_key *extent_key)
{
	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
	struct btrfs_key key;
	struct extent_buffer *leaf;
	u64 start, end, last;
	int ret;

	/* First byte past the end of the block group. */
	last = rc->block_group->start + rc->block_group->length;
	while (1) {
		bool block_found;

		cond_resched();
		if (rc->search_start >= last) {
			ret = 1;
			break;
		}

		key.objectid = rc->search_start;
		key.type = BTRFS_EXTENT_ITEM_KEY;
		key.offset = 0;

		path->search_commit_root = 1;
		path->skip_locking = 1;
		ret = btrfs_search_slot(NULL, rc->extent_root, &key, path,
					0, 0);
		if (ret < 0)
			break;
next:
		leaf = path->nodes[0];
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(rc->extent_root, path);
			if (ret != 0)
				break;
			leaf = path->nodes[0];
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid >= last) {
			ret = 1;
			break;
		}

		/* Skip items that are not extent or metadata items. */
		if (key.type != BTRFS_EXTENT_ITEM_KEY &&
		    key.type != BTRFS_METADATA_ITEM_KEY) {
			path->slots[0]++;
			goto next;
		}

		/* Skip extents that end at or before the search start. */
		if (key.type == BTRFS_EXTENT_ITEM_KEY &&
		    key.objectid + key.offset <= rc->search_start) {
			path->slots[0]++;
			goto next;
		}

		if (key.type == BTRFS_METADATA_ITEM_KEY &&
		    key.objectid + fs_info->nodesize <=
		    rc->search_start) {
			path->slots[0]++;
			goto next;
		}

		block_found = find_first_extent_bit(&rc->processed_blocks,
						    key.objectid, &start, &end,
						    EXTENT_DIRTY, NULL);

		if (block_found && start <= key.objectid) {
			/* Already processed: resume after the dirty range. */
			btrfs_release_path(path);
			rc->search_start = end + 1;
		} else {
			if (key.type == BTRFS_EXTENT_ITEM_KEY)
				rc->search_start = key.objectid + key.offset;
			else
				rc->search_start = key.objectid +
					fs_info->nodesize;
			memcpy(extent_key, &key, sizeof(key));
			return 0;
		}
	}
	btrfs_release_path(path);
	return ret;
}

/* Publish @rc as fs_info->reloc_ctl, under reloc_mutex. */
static void set_reloc_control(struct reloc_control *rc)
{
	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;

	mutex_lock(&fs_info->reloc_mutex);
	fs_info->reloc_ctl = rc;
	mutex_unlock(&fs_info->reloc_mutex);
}

/* Reset fs_info->reloc_ctl to NULL, under reloc_mutex. */
static void unset_reloc_control(struct reloc_control *rc)
{
	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;

	mutex_lock(&fs_info->reloc_mutex);
	fs_info->reloc_ctl = NULL;
	mutex_unlock(&fs_info->reloc_mutex);
}

/*
 * Set up @rc for one relocation pass.
 *
 * Allocates the temporary block reserve, resets the cluster and the per-pass
 * counters, refills the reserve to nodesize * RELOCATION_RESERVED_NODES,
 * publishes @rc as the active reloc control and commits a transaction.  If
 * joining or committing the transaction fails, the reloc control is
 * unpublished again.
 *
 * Return 0 on success, -ENOMEM if the block reserve cannot be allocated, or
 * the error from the refill, the transaction join or the commit.
 */
static noinline_for_stack
int prepare_to_relocate(struct reloc_control *rc)
{
	struct btrfs_trans_handle *trans;
	int ret;

	rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root->fs_info,
					      BTRFS_BLOCK_RSV_TEMP);
	if (!rc->block_rsv)
		return -ENOMEM;

	memset(&rc->cluster, 0, sizeof(rc->cluster));
	rc->search_start = rc->block_group->start;
	rc->extents_found = 0;
	rc->nodes_relocated = 0;
	rc->merging_rsv_size = 0;
	rc->reserved_bytes = 0;
	rc->block_rsv->size = rc->extent_root->fs_info->nodesize *
			      RELOCATION_RESERVED_NODES;
	ret = btrfs_block_rsv_refill(rc->extent_root->fs_info,
				     rc->block_rsv, rc->block_rsv->size,
				     BTRFS_RESERVE_FLUSH_ALL);
	if (ret)
		return ret;

	rc->create_reloc_tree = true;
	set_reloc_control(rc);

	trans = btrfs_join_transaction(rc->extent_root);
	if (IS_ERR(trans)) {
		unset_reloc_control(rc);
		/*
		 * extent tree is not a ref_cow tree and has no reloc_root to
		 * cleanup.  And callers are responsible to free the above
		 * block rsv.
		 */
		return PTR_ERR(trans);
	}

	ret = btrfs_commit_transaction(trans);
	if (ret)
		unset_reloc_control(rc);

	return ret;
}

/*
 * Run one relocation pass over rc->block_group.
 *
 * Iterates over the extent items returned by find_next_extent().  Tree blocks,
 * and in the UPDATE_DATA_PTRS stage the tree blocks referencing data extents,
 * are collected and handed to relocate_tree_blocks().  In the
 * MOVE_DATA_EXTENTS stage data extents are passed to relocate_data_extent().
 * After the loop the pending cluster is relocated, the reloc trees are merged
 * and a transaction is committed.
 *
 * Return 0 on success, -ECANCELED if a cancellation was requested, or another
 * negative errno.
 */
static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
{
	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
	struct rb_root blocks = RB_ROOT;
	struct btrfs_key key;
	struct btrfs_trans_handle *trans = NULL;
	struct btrfs_path *path;
	struct btrfs_extent_item *ei;
	u64 flags;
	int ret;
	int err = 0;
	int progress = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = READA_FORWARD;

	ret = prepare_to_relocate(rc);
	if (ret) {
		err = ret;
		goto out_free;
	}

	while (1) {
		rc->reserved_bytes = 0;
		ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv,
					     rc->block_rsv->size,
					     BTRFS_RESERVE_FLUSH_ALL);
		if (ret) {
			err = ret;
			break;
		}
		progress++;
		trans = btrfs_start_transaction(rc->extent_root, 0);
		if (IS_ERR(trans)) {
			err = PTR_ERR(trans);
			trans = NULL;
			break;
		}
restart:
		/* The backref cache is only kept within a single transaction. */
		if (rc->backref_cache.last_trans != trans->transid)
			btrfs_backref_release_cache(&rc->backref_cache);
		rc->backref_cache.last_trans = trans->transid;

		ret = find_next_extent(rc, path, &key);
		if (ret < 0)
			err = ret;
		if (ret != 0)
			break;

		rc->extents_found++;

		ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
				    struct btrfs_extent_item);
		flags = btrfs_extent_flags(path->nodes[0], ei);

		/*
		 * If we are relocating a simple quota owned extent item, we
		 * need to note the owner on the reloc data root so that when
		 * we allocate the replacement item, we can attribute it to the
		 * correct eventual owner (rather than the reloc data root).
		 */
		if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) {
			struct btrfs_root *root = BTRFS_I(rc->data_inode)->root;
			u64 owning_root_id = btrfs_get_extent_owner_root(fs_info,
								 path->nodes[0],
								 path->slots[0]);

			root->relocation_src_root = owning_root_id;
		}

		if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
			ret = add_tree_block(rc, &key, path, &blocks);
		} else if (rc->stage == UPDATE_DATA_PTRS &&
			   (flags & BTRFS_EXTENT_FLAG_DATA)) {
			ret = add_data_references(rc, &key, path, &blocks);
		} else {
			btrfs_release_path(path);
			ret = 0;
		}
		if (ret < 0) {
			err = ret;
			break;
		}

		if (!RB_EMPTY_ROOT(&blocks)) {
			ret = relocate_tree_blocks(trans, rc, &blocks);
			if (ret < 0) {
				if (ret != -EAGAIN) {
					err = ret;
					break;
				}
				/* -EAGAIN: revisit this extent on the next loop. */
				rc->extents_found--;
				rc->search_start = key.objectid;
			}
		}

		btrfs_end_transaction_throttle(trans);
		btrfs_btree_balance_dirty(fs_info);
		trans = NULL;

		if (rc->stage == MOVE_DATA_EXTENTS &&
		    (flags & BTRFS_EXTENT_FLAG_DATA)) {
			rc->found_file_extent = true;
			ret = relocate_data_extent(rc, &key);
			if (ret < 0) {
				err = ret;
				break;
			}
		}
		if (btrfs_should_cancel_balance(fs_info)) {
			err = -ECANCELED;
			break;
		}
	}
	/*
	 * On -ENOSPC with a transaction still open, try to force a chunk
	 * allocation and, if that reports 1, re-enter the loop at restart.
	 */
	if (trans && progress && err == -ENOSPC) {
		ret = btrfs_force_chunk_alloc(trans, rc->block_group->flags);
		if (ret == 1) {
			err = 0;
			progress = 0;
			goto restart;
		}
	}

	btrfs_release_path(path);
	clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY);

	if (trans) {
		btrfs_end_transaction_throttle(trans);
		btrfs_btree_balance_dirty(fs_info);
	}

	/* Relocate whatever is still queued in the cluster. */
	if (!err) {
		ret = relocate_file_extent_cluster(rc);
		if (ret < 0)
			err = ret;
	}

	rc->create_reloc_tree = false;
	set_reloc_control(rc);

	btrfs_backref_release_cache(&rc->backref_cache);
	btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL);

	/*
	 * Even in the case when the relocation is cancelled, we should all go
	 * through prepare_to_merge() and merge_reloc_roots().
	 *
	 * For error (including cancelled balance), prepare_to_merge() will
	 * mark all reloc trees orphan, then queue them for cleanup in
	 * merge_reloc_roots()
	 */
	err = prepare_to_merge(rc, err);

	merge_reloc_roots(rc);

	rc->merge_reloc_tree = false;
	unset_reloc_control(rc);
	btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL);

	/* get rid of pinned extents */
	trans = btrfs_join_transaction(rc->extent_root);
	if (IS_ERR(trans)) {
		err = PTR_ERR(trans);
		goto out_free;
	}
	ret = btrfs_commit_transaction(trans);
	if (ret && !err)
		err = ret;
out_free:
	ret = clean_dirty_subvols(rc);
	if (ret < 0 && !err)
		err = ret;
	btrfs_free_block_rsv(fs_info, rc->block_rsv);
	btrfs_free_path(path);
	return err;
}

/*
 * Insert an empty inode item for @objectid into @root and initialize it as a
 * zero-sized regular file (mode S_IFREG | 0600, generation 1) with the
 * NOCOMPRESS and PREALLOC inode flags set.
 *
 * Return 0 on success, -ENOMEM, or the error from btrfs_insert_empty_inode().
 */
static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root, u64 objectid)
{
	struct btrfs_path *path;
	struct btrfs_inode_item *item;
	struct extent_buffer *leaf;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_insert_empty_inode(trans, root, path, objectid);
	if (ret)
		goto out;

	leaf = path->nodes[0];
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
	memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
	btrfs_set_inode_generation(leaf, item, 1);
	btrfs_set_inode_size(leaf, item, 0);
	btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
	btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
					  BTRFS_INODE_PREALLOC);
out:
	btrfs_free_path(path);
	return ret;
}

/*
 * Delete the inode item of @objectid from @root.
 *
 * Any failure, including a failed path allocation or a missing item, aborts
 * the transaction.
 */
static void delete_orphan_inode(struct btrfs_trans_handle *trans,
				struct btrfs_root *root, u64 objectid)
{
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret = 0;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	key.objectid = objectid;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret) {
		/* A positive return means the item does not exist. */
		if (ret > 0)
			ret = -ENOENT;
		goto out;
	}
	ret = btrfs_del_item(trans, root, path);
out:
	if (ret)
		btrfs_abort_transaction(trans, ret);
	btrfs_free_path(path);
}

/*
 * helper to create
inode for data relocation. * the inode is in data relocation tree and its link count is 0 */ static noinline_for_stack struct inode *create_reloc_inode( struct btrfs_fs_info *fs_info, const struct btrfs_block_group *group) { struct inode *inode = NULL; struct btrfs_trans_handle *trans; struct btrfs_root *root; u64 objectid; int ret = 0; root = btrfs_grab_root(fs_info->data_reloc_root); trans = btrfs_start_transaction(root, 6); if (IS_ERR(trans)) { btrfs_put_root(root); return ERR_CAST(trans); } ret = btrfs_get_free_objectid(root, &objectid); if (ret) goto out; ret = __insert_orphan_inode(trans, root, objectid); if (ret) goto out; inode = btrfs_iget(objectid, root); if (IS_ERR(inode)) { delete_orphan_inode(trans, root, objectid); ret = PTR_ERR(inode); inode = NULL; goto out; } BTRFS_I(inode)->reloc_block_group_start = group->start; ret = btrfs_orphan_add(trans, BTRFS_I(inode)); out: btrfs_put_root(root); btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); if (ret) { iput(inode); inode = ERR_PTR(ret); } return inode; } /* * Mark start of chunk relocation that is cancellable. Check if the cancellation * has been requested meanwhile and don't start in that case. * * Return: * 0 success * -EINPROGRESS operation is already in progress, that's probably a bug * -ECANCELED cancellation request was set before the operation started */ static int reloc_chunk_start(struct btrfs_fs_info *fs_info) { if (test_and_set_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) { /* This should not happen */ btrfs_err(fs_info, "reloc already running, cannot start"); return -EINPROGRESS; } if (atomic_read(&fs_info->reloc_cancel_req) > 0) { btrfs_info(fs_info, "chunk relocation canceled on start"); /* * On cancel, clear all requests but let the caller mark * the end after cleanup operations. */ atomic_set(&fs_info->reloc_cancel_req, 0); return -ECANCELED; } return 0; } /* * Mark end of chunk relocation that is cancellable and wake any waiters. 
*/ static void reloc_chunk_end(struct btrfs_fs_info *fs_info) { /* Requested after start, clear bit first so any waiters can continue */ if (atomic_read(&fs_info->reloc_cancel_req) > 0) btrfs_info(fs_info, "chunk relocation canceled during operation"); clear_and_wake_up_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags); atomic_set(&fs_info->reloc_cancel_req, 0); } static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info) { struct reloc_control *rc; rc = kzalloc(sizeof(*rc), GFP_NOFS); if (!rc) return NULL; INIT_LIST_HEAD(&rc->reloc_roots); INIT_LIST_HEAD(&rc->dirty_subvol_roots); btrfs_backref_init_cache(fs_info, &rc->backref_cache, true); rc->reloc_root_tree.rb_root = RB_ROOT; spin_lock_init(&rc->reloc_root_tree.lock); extent_io_tree_init(fs_info, &rc->processed_blocks, IO_TREE_RELOC_BLOCKS); return rc; } static void free_reloc_control(struct reloc_control *rc) { struct mapping_node *node, *tmp; free_reloc_roots(&rc->reloc_roots); rbtree_postorder_for_each_entry_safe(node, tmp, &rc->reloc_root_tree.rb_root, rb_node) kfree(node); kfree(rc); } /* * Print the block group being relocated */ static void describe_relocation(struct btrfs_block_group *block_group) { char buf[128] = {'\0'}; btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf)); btrfs_info(block_group->fs_info, "relocating block group %llu flags %s", block_group->start, buf); } static const char *stage_to_string(enum reloc_stage stage) { if (stage == MOVE_DATA_EXTENTS) return "move data extents"; if (stage == UPDATE_DATA_PTRS) return "update data pointers"; return "unknown"; } /* * function to relocate all extents in a block group. 
*/
/*
 * @fs_info:     filesystem the block group belongs to
 * @group_start: start bytenr used to look up the block group
 *
 * The block group is set read-only, its free space cache inode (if any) is
 * truncated, a data relocation inode is created, and relocate_block_group()
 * is then run repeatedly until a pass finds no extents.
 *
 * Return 0 on success, -EINTR if the filesystem is closing, -ENOENT if the
 * block group does not exist, -ETXTBSY if it is pinned by a swapfile,
 * -ENOMEM, or another negative errno from the steps above.  A non-zero result
 * of wait_on_bit() is returned as is.
 */
int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
{
	struct btrfs_block_group *bg;
	struct btrfs_root *extent_root = btrfs_extent_root(fs_info, group_start);
	struct reloc_control *rc;
	struct inode *inode;
	struct btrfs_path *path;
	int ret;
	int rw = 0;
	int err = 0;

	/*
	 * This only gets set if we had a half-deleted snapshot on mount.  We
	 * cannot allow relocation to start while we're still trying to clean up
	 * these pending deletions.
	 */
	ret = wait_on_bit(&fs_info->flags, BTRFS_FS_UNFINISHED_DROPS, TASK_INTERRUPTIBLE);
	if (ret)
		return ret;

	/* We may have been woken up by close_ctree, so bail if we're closing. */
	if (btrfs_fs_closing(fs_info))
		return -EINTR;

	bg = btrfs_lookup_block_group(fs_info, group_start);
	if (!bg)
		return -ENOENT;

	/*
	 * Relocation of a data block group creates ordered extents.  Without
	 * sb_start_write(), we can freeze the filesystem while unfinished
	 * ordered extents are left. Such ordered extents can cause a deadlock
	 * e.g. when syncfs() is waiting for their completion but they can't
	 * finish because they block when joining a transaction, due to the
	 * fact that the freeze locks are being held in write mode.
	 */
	if (bg->flags & BTRFS_BLOCK_GROUP_DATA)
		ASSERT(sb_write_started(fs_info->sb));

	if (btrfs_pinned_by_swapfile(fs_info, bg)) {
		btrfs_put_block_group(bg);
		return -ETXTBSY;
	}

	rc = alloc_reloc_control(fs_info);
	if (!rc) {
		btrfs_put_block_group(bg);
		return -ENOMEM;
	}

	ret = reloc_chunk_start(fs_info);
	if (ret < 0) {
		err = ret;
		goto out_put_bg;
	}

	rc->extent_root = extent_root;
	rc->block_group = bg;

	ret = btrfs_inc_block_group_ro(rc->block_group, true);
	if (ret) {
		err = ret;
		goto out;
	}
	/* From here on an error has to undo the read-only state. */
	rw = 1;

	path = btrfs_alloc_path();
	if (!path) {
		err = -ENOMEM;
		goto out;
	}

	inode = lookup_free_space_inode(rc->block_group, path);
	btrfs_free_path(path);

	if (!IS_ERR(inode))
		ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0);
	else
		ret = PTR_ERR(inode);

	/* -ENOENT is not treated as an error here. */
	if (ret && ret != -ENOENT) {
		err = ret;
		goto out;
	}

	rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
	if (IS_ERR(rc->data_inode)) {
		err = PTR_ERR(rc->data_inode);
		rc->data_inode = NULL;
		goto out;
	}

	describe_relocation(rc->block_group);

	btrfs_wait_block_group_reservations(rc->block_group);
	btrfs_wait_nocow_writers(rc->block_group);
	btrfs_wait_ordered_roots(fs_info, U64_MAX, rc->block_group);

	ret = btrfs_zone_finish(rc->block_group);
	WARN_ON(ret && ret != -EAGAIN);

	while (1) {
		enum reloc_stage finishes_stage;

		mutex_lock(&fs_info->cleaner_mutex);
		ret = relocate_block_group(rc);
		mutex_unlock(&fs_info->cleaner_mutex);
		if (ret < 0)
			err = ret;

		/* Remember the stage of this pass before it may be advanced. */
		finishes_stage = rc->stage;
		/*
		 * We may have gotten ENOSPC after we already dirtied some
		 * extents.  If writeout happens while we're relocating a
		 * different block group we could end up hitting the
		 * BUG_ON(rc->stage == UPDATE_DATA_PTRS) in
		 * btrfs_reloc_cow_block.  Make sure we write everything out
		 * properly so we don't trip over this problem, and then break
		 * out of the loop if we hit an error.
		 */
		if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
			ret = btrfs_wait_ordered_range(BTRFS_I(rc->data_inode), 0,
						       (u64)-1);
			if (ret)
				err = ret;
			invalidate_mapping_pages(rc->data_inode->i_mapping,
						 0, -1);
			rc->stage = UPDATE_DATA_PTRS;
		}

		if (err < 0)
			goto out;

		/* A pass that found nothing means the block group is done. */
		if (rc->extents_found == 0)
			break;

		btrfs_info(fs_info, "found %llu extents, stage: %s",
			   rc->extents_found,
			   stage_to_string(finishes_stage));
	}

	WARN_ON(rc->block_group->pinned > 0);
	WARN_ON(rc->block_group->reserved > 0);
	WARN_ON(rc->block_group->used > 0);
out:
	if (err && rw)
		btrfs_dec_block_group_ro(rc->block_group);
	iput(rc->data_inode);
out_put_bg:
	btrfs_put_block_group(bg);
	reloc_chunk_end(fs_info);
	free_reloc_control(rc);
	return err;
}

/*
 * Update the root item of @root in the tree root with its drop progress
 * zeroed, its drop level set to 0 and its reference count set to 0.
 *
 * Return the error from btrfs_end_transaction() if that fails, otherwise the
 * result of btrfs_update_root(); a failure to start the transaction is
 * returned directly.
 */
static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_trans_handle *trans;
	int ret, err;

	trans = btrfs_start_transaction(fs_info->tree_root, 0);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	memset(&root->root_item.drop_progress, 0,
		sizeof(root->root_item.drop_progress));
	btrfs_set_root_drop_level(&root->root_item, 0);
	btrfs_set_root_refs(&root->root_item, 0);
	ret = btrfs_update_root(trans, fs_info->tree_root,
				&root->root_key, &root->root_item);

	err = btrfs_end_transaction(trans);
	if (err)
		return err;
	return ret;
}

/*
 * recover relocation interrupted by system crash.
 *
 * this function resumes merging reloc trees with corresponding fs trees.
* this is important for keeping the sharing of tree blocks
 */
/*
 * Works in two phases.  First, all root items with objectid
 * BTRFS_TREE_RELOC_OBJECTID are read from the tree root, walking from the
 * highest offset downwards; a reloc root whose fs root no longer exists is
 * marked as garbage.  Second, if any reloc root was found, each one that
 * still has references is attached to its fs root, and the reloc trees are
 * merged.  On success the orphans of the data relocation tree are cleaned up.
 *
 * Return 0 on success or a negative errno.
 */
int btrfs_recover_relocation(struct btrfs_fs_info *fs_info)
{
	LIST_HEAD(reloc_roots);
	struct btrfs_key key;
	struct btrfs_root *fs_root;
	struct btrfs_root *reloc_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct reloc_control *rc = NULL;
	struct btrfs_trans_handle *trans;
	int ret2;
	int ret = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = READA_BACK;

	/* Start from the highest possible offset and walk backwards. */
	key.objectid = BTRFS_TREE_RELOC_OBJECTID;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = (u64)-1;

	while (1) {
		ret = btrfs_search_slot(NULL, fs_info->tree_root, &key,
					path, 0, 0);
		if (ret < 0)
			goto out;
		if (ret > 0) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
		ret = 0;
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		btrfs_release_path(path);

		if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
		    key.type != BTRFS_ROOT_ITEM_KEY)
			break;

		reloc_root = btrfs_read_tree_root(fs_info->tree_root, &key);
		if (IS_ERR(reloc_root)) {
			ret = PTR_ERR(reloc_root);
			goto out;
		}

		set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state);
		list_add(&reloc_root->root_list, &reloc_roots);

		if (btrfs_root_refs(&reloc_root->root_item) > 0) {
			fs_root = btrfs_get_fs_root(fs_info,
					reloc_root->root_key.offset, false);
			if (IS_ERR(fs_root)) {
				ret = PTR_ERR(fs_root);
				if (ret != -ENOENT)
					goto out;
				/* The fs root is gone: mark the reloc root as garbage. */
				ret = mark_garbage_root(reloc_root);
				if (ret < 0)
					goto out;
				ret = 0;
			} else {
				btrfs_put_root(fs_root);
			}
		}

		if (key.offset == 0)
			break;

		key.offset--;
	}
	btrfs_release_path(path);

	if (list_empty(&reloc_roots))
		goto out;

	rc = alloc_reloc_control(fs_info);
	if (!rc) {
		ret = -ENOMEM;
		goto out;
	}

	ret = reloc_chunk_start(fs_info);
	if (ret < 0)
		goto out_end;

	rc->extent_root = btrfs_extent_root(fs_info, 0);

	set_reloc_control(rc);

	trans = btrfs_join_transaction(rc->extent_root);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out_unset;
	}

	rc->merge_reloc_tree = true;

	while (!list_empty(&reloc_roots)) {
		reloc_root = list_entry(reloc_roots.next,
					struct btrfs_root, root_list);
		list_del(&reloc_root->root_list);

		/* Roots without references only go onto the rc list. */
		if (btrfs_root_refs(&reloc_root->root_item) == 0) {
			list_add_tail(&reloc_root->root_list,
				      &rc->reloc_roots);
			continue;
		}

		fs_root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
					    false);
		if (IS_ERR(fs_root)) {
			ret = PTR_ERR(fs_root);
			/* Put it back so that free_reloc_roots() below sees it. */
			list_add_tail(&reloc_root->root_list, &reloc_roots);
			btrfs_end_transaction(trans);
			goto out_unset;
		}

		ret = __add_reloc_root(reloc_root);
		ASSERT(ret != -EEXIST);
		if (ret) {
			list_add_tail(&reloc_root->root_list, &reloc_roots);
			btrfs_put_root(fs_root);
			btrfs_end_transaction(trans);
			goto out_unset;
		}
		fs_root->reloc_root = btrfs_grab_root(reloc_root);
		btrfs_put_root(fs_root);
	}

	ret = btrfs_commit_transaction(trans);
	if (ret)
		goto out_unset;

	merge_reloc_roots(rc);

	unset_reloc_control(rc);

	trans = btrfs_join_transaction(rc->extent_root);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out_clean;
	}
	ret = btrfs_commit_transaction(trans);
out_clean:
	ret2 = clean_dirty_subvols(rc);
	if (ret2 < 0 && !ret)
		ret = ret2;
out_unset:
	unset_reloc_control(rc);
out_end:
	reloc_chunk_end(fs_info);
	free_reloc_control(rc);
out:
	free_reloc_roots(&reloc_roots);

	btrfs_free_path(path);

	if (ret == 0) {
		/* cleanup orphan inode in data relocation tree */
		fs_root = btrfs_grab_root(fs_info->data_reloc_root);
		ASSERT(fs_root);
		ret = btrfs_orphan_cleanup(fs_root);
		btrfs_put_root(fs_root);
	}
	return ret;
}

/*
 * helper to add ordered checksum for data relocation.
 *
 * cloning checksum properly handles the nodatasum extents.
 * it also saves CPU time to re-calculate the checksum.
*/
/*
 * Looks up the existing checksums covering the ordered extent's source range
 * (file_offset + reloc_block_group_start, num_bytes long), rebases each one
 * onto ordered->disk_bytenr and attaches it to @ordered.
 *
 * Returns 0 on success.  If the lookup fails, the ordered extent is marked
 * as errored and the negative lookup result is returned.
 */
int btrfs_reloc_clone_csums(struct btrfs_ordered_extent *ordered)
{
	struct btrfs_inode *inode = ordered->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	u64 disk_bytenr = ordered->file_offset + inode->reloc_block_group_start;
	struct btrfs_root *csum_root = btrfs_csum_root(fs_info, disk_bytenr);
	LIST_HEAD(list);
	int ret;

	ret = btrfs_lookup_csums_list(csum_root, disk_bytenr,
				      disk_bytenr + ordered->num_bytes - 1,
				      &list, false);
	if (ret < 0) {
		btrfs_mark_ordered_extent_error(ordered);
		return ret;
	}

	while (!list_empty(&list)) {
		struct btrfs_ordered_sum *sums =
			list_entry(list.next, struct btrfs_ordered_sum, list);

		list_del_init(&sums->list);

		/*
		 * We need to offset the new_bytenr based on where the csum is.
		 * We need to do this because we will read in entire prealloc
		 * extents but we may have written to say the middle of the
		 * prealloc extent, so we need to make sure the csum goes with
		 * the right disk offset.
		 *
		 * We can do this because the data reloc inode refers strictly
		 * to the on disk bytes, so we don't have to worry about
		 * disk_len vs real len like with real inodes since it's all
		 * disk length.
		 */
		sums->logical = ordered->disk_bytenr + sums->logical - disk_bytenr;
		btrfs_add_ordered_sum(ordered, sums);
	}

	return 0;
}

/*
 * Relocation hook invoked with a tree block @buf of @root and its COWed
 * copy @cow.
 *
 * Does nothing and returns 0 when no relocation control is installed.
 * For a reloc tree root while rc->create_reloc_tree is set, the backref
 * cache node for the block's level is switched over to @cow and queued on
 * the pending list.  For a first-time COW of a leaf during the
 * UPDATE_DATA_PTRS stage, file extents are replaced via
 * replace_file_extents().
 *
 * Returns 0, -EUCLEAN if the cached backref node does not describe @buf,
 * or the result of replace_file_extents().
 */
int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root,
			  const struct extent_buffer *buf,
			  struct extent_buffer *cow)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct reloc_control *rc;
	struct btrfs_backref_node *node;
	int first_cow = 0;
	int level;
	int ret = 0;

	rc = fs_info->reloc_ctl;
	if (!rc)
		return 0;

	BUG_ON(rc->stage == UPDATE_DATA_PTRS && btrfs_is_data_reloc_root(root));

	level = btrfs_header_level(buf);
	/* "First COW": the block is not newer than the root's last snapshot. */
	if (btrfs_header_generation(buf) <=
	    btrfs_root_last_snapshot(&root->root_item))
		first_cow = 1;

	if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID && rc->create_reloc_tree) {
		WARN_ON(!first_cow && level == 0);

		node = rc->backref_cache.path[level];

		/*
		 * If node->bytenr != buf->start and node->new_bytenr !=
		 * buf->start then we've got the wrong backref node for what we
		 * expected to see here and the cache is incorrect.
		 */
		if (unlikely(node->bytenr != buf->start && node->new_bytenr != buf->start)) {
			btrfs_err(fs_info,
"bytenr %llu was found but our backref cache was expecting %llu or %llu",
				  buf->start, node->bytenr, node->new_bytenr);
			return -EUCLEAN;
		}

		/* Swap the node's buffer for @cow, taking a reference on it. */
		btrfs_backref_drop_node_buffer(node);
		atomic_inc(&cow->refs);
		node->eb = cow;
		node->new_bytenr = cow->start;

		if (!node->pending) {
			list_move_tail(&node->list,
				       &rc->backref_cache.pending[level]);
			node->pending = 1;
		}

		if (first_cow)
			mark_block_processed(rc, node);

		/* Only interior nodes are counted in nodes_relocated. */
		if (first_cow && level > 0)
			rc->nodes_relocated += buf->len;
	}

	if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS)
		ret = replace_file_extents(trans, rc, root, cow);
	return ret;
}

/*
 * called before creating snapshot.
it calculates metadata reservation
 * required for relocating tree blocks in the snapshot
 */
void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
			      u64 *bytes_to_reserve)
{
	struct btrfs_root *snap_src = pending->root;
	struct reloc_control *rc = snap_src->fs_info->reloc_ctl;
	struct btrfs_root *reloc_root;

	/*
	 * Extra space is only needed while a relocation is active, the
	 * snapshotted root has a reloc root, and reloc trees are being merged.
	 */
	if (!rc || !have_reloc_root(snap_src) || !rc->merge_reloc_tree)
		return;

	reloc_root = snap_src->reloc_root;
	BUG_ON(btrfs_root_refs(&reloc_root->root_item) == 0);

	/*
	 * While trees are being merged, the worst case costs twice the size
	 * of the relocated tree nodes: one half to COW the reloc tree and
	 * one half to COW the fs tree.  The reloc tree's share is released
	 * when that tree is dropped, but COWing the fs tree of a fresh
	 * snapshot may consume more than it frees, so reserve that much on
	 * top of the caller's request.
	 */
	*bytes_to_reserve += rc->nodes_relocated;
}

/*
 * called after snapshot is created. migrate block reservation
 * and create reloc root for the newly created snapshot
 *
 * This is similar to btrfs_init_reloc_root(), we come out of here with two
 * references held on the reloc_root, one for root->reloc_root and one for
 * rc->reloc_roots.
*/ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, struct btrfs_pending_snapshot *pending) { struct btrfs_root *root = pending->root; struct btrfs_root *reloc_root; struct btrfs_root *new_root; struct reloc_control *rc = root->fs_info->reloc_ctl; int ret; if (!rc || !have_reloc_root(root)) return 0; rc = root->fs_info->reloc_ctl; rc->merging_rsv_size += rc->nodes_relocated; if (rc->merge_reloc_tree) { ret = btrfs_block_rsv_migrate(&pending->block_rsv, rc->block_rsv, rc->nodes_relocated, true); if (ret) return ret; } new_root = pending->snap; reloc_root = create_reloc_root(trans, root->reloc_root, btrfs_root_id(new_root)); if (IS_ERR(reloc_root)) return PTR_ERR(reloc_root); ret = __add_reloc_root(reloc_root); ASSERT(ret != -EEXIST); if (ret) { /* Pairs with create_reloc_root */ btrfs_put_root(reloc_root); return ret; } new_root->reloc_root = btrfs_grab_root(reloc_root); return 0; } /* * Get the current bytenr for the block group which is being relocated. * * Return U64_MAX if no running relocation. */ u64 btrfs_get_reloc_bg_bytenr(const struct btrfs_fs_info *fs_info) { u64 logical = U64_MAX; lockdep_assert_held(&fs_info->reloc_mutex); if (fs_info->reloc_ctl && fs_info->reloc_ctl->block_group) logical = fs_info->reloc_ctl->block_group->start; return logical; }
43 47 35 34 35 35 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136
/*
 * Copyright © 2017 Red Hat
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *
 */
#ifndef __DRM_SYNCOBJ_H__
#define __DRM_SYNCOBJ_H__

#include <linux/dma-fence.h>
#include <linux/dma-fence-chain.h>

struct drm_file;

/**
 * struct drm_syncobj - sync object.
 *
 * This structure defines a generic sync object which wraps a &dma_fence.
 */
struct drm_syncobj {
	/**
	 * @refcount: Reference count of this object.
	 */
	struct kref refcount;
	/**
	 * @fence:
	 * NULL or a pointer to the fence bound to this object.
	 *
	 * This field should not be used directly. Use drm_syncobj_fence_get()
	 * and drm_syncobj_replace_fence() instead.
	 */
	struct dma_fence __rcu *fence;
	/**
	 * @cb_list: List of callbacks to call when the &fence gets replaced.
	 */
	struct list_head cb_list;
	/**
	 * @ev_fd_list: List of registered eventfd.
	 */
	struct list_head ev_fd_list;
	/**
	 * @lock: Protects &cb_list and &ev_fd_list, and write-locks &fence.
	 */
	spinlock_t lock;
	/**
	 * @file: A file backing for this syncobj.
	 */
	struct file *file;
};

/* Release callback handed to kref_put() by drm_syncobj_put(). */
void drm_syncobj_free(struct kref *kref);

/**
 * drm_syncobj_get - acquire a syncobj reference
 * @obj: sync object
 *
 * This acquires an additional reference to @obj. It is illegal to call this
 * without already holding a reference. No locks required.
 */
static inline void
drm_syncobj_get(struct drm_syncobj *obj)
{
	kref_get(&obj->refcount);
}

/**
 * drm_syncobj_put - release a reference to a sync object.
 * @obj: sync object.
 *
 * Drops one reference on @obj; drm_syncobj_free() is the release function
 * passed to kref_put().
 */
static inline void
drm_syncobj_put(struct drm_syncobj *obj)
{
	kref_put(&obj->refcount, drm_syncobj_free);
}

/**
 * drm_syncobj_fence_get - get a reference to a fence in a sync object
 * @syncobj: sync object.
 *
 * This acquires additional reference to &drm_syncobj.fence contained in
 * @syncobj, if not NULL. It is illegal to call this without already holding a
 * reference. No locks required.
 *
 * Returns:
 * Either the fence of @syncobj or NULL if there's none.
 */
static inline struct dma_fence *
drm_syncobj_fence_get(struct drm_syncobj *syncobj)
{
	struct dma_fence *fence;

	/* The fence pointer is __rcu; read it under the RCU read lock. */
	rcu_read_lock();
	fence = dma_fence_get_rcu_safe(&syncobj->fence);
	rcu_read_unlock();

	return fence;
}

struct drm_syncobj *drm_syncobj_find(struct drm_file *file_private,
				     u32 handle);
void drm_syncobj_add_point(struct drm_syncobj *syncobj,
			   struct dma_fence_chain *chain,
			   struct dma_fence *fence,
			   uint64_t point);
void drm_syncobj_replace_fence(struct drm_syncobj *syncobj,
			       struct dma_fence *fence);
int drm_syncobj_find_fence(struct drm_file *file_private,
			   u32 handle, u64 point, u64 flags,
			   struct dma_fence **fence);
/* NOTE(review): repeats the drm_syncobj_free() declaration made above. */
void drm_syncobj_free(struct kref *kref);
int drm_syncobj_create(struct drm_syncobj **out_syncobj, uint32_t flags,
		       struct dma_fence *fence);
int drm_syncobj_get_handle(struct drm_file *file_private,
			   struct drm_syncobj *syncobj, u32 *handle);
int drm_syncobj_get_fd(struct drm_syncobj *syncobj, int *p_fd);

#endif
6 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
#ifndef _TCP_DCTCP_H
#define _TCP_DCTCP_H

/*
 * Mirror the CE state into the socket's ECN flags: set
 * TCP_ECN_DEMAND_CWR when @ce_state is 1, clear it for any other value.
 */
static inline void dctcp_ece_ack_cwr(struct sock *sk, u32 ce_state)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (ce_state != 1) {
		tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
		return;
	}

	tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
}

/* Minimal DCTCP CE state machine:
 *
 * S:	0 <- last pkt was non-CE
 *	1 <- last pkt was CE
 *
 * @evt selects the new state, @ce_state holds the previous state and is
 * updated in place, and @prior_rcv_nxt is refreshed from the socket's
 * rcv_nxt on every call.
 */
static inline void dctcp_ece_ack_update(struct sock *sk, enum tcp_ca_event evt,
					u32 *prior_rcv_nxt, u32 *ce_state)
{
	const u32 next_state = (evt == CA_EVENT_ECN_IS_CE);
	const u32 prev_state = *ce_state;

	if (prev_state != next_state) {
		/* The CE state flipped.  A delayed ACK that is still pending
		 * goes out first, carrying the previous CE state; after that
		 * an immediate ACK is requested to report the new state.
		 */
		if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) {
			dctcp_ece_ack_cwr(sk, prev_state);
			__tcp_send_ack(sk, *prior_rcv_nxt);
		}
		inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
	}

	*prior_rcv_nxt = tcp_sk(sk)->rcv_nxt;
	*ce_state = next_state;
	dctcp_ece_ack_cwr(sk, next_state);
}

#endif
2 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 #undef TRACE_SYSTEM #define TRACE_SYSTEM irq_matrix #if !defined(_TRACE_IRQ_MATRIX_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_IRQ_MATRIX_H #include <linux/tracepoint.h> struct irq_matrix; struct cpumap; DECLARE_EVENT_CLASS(irq_matrix_global, TP_PROTO(struct irq_matrix *matrix), TP_ARGS(matrix), TP_STRUCT__entry( __field( unsigned int, online_maps ) __field( unsigned int, global_available ) __field( unsigned int, global_reserved ) __field( unsigned int, total_allocated ) ), TP_fast_assign( __entry->online_maps = matrix->online_maps; __entry->global_available = matrix->global_available; __entry->global_reserved = matrix->global_reserved; __entry->total_allocated = matrix->total_allocated; ), TP_printk("online_maps=%d global_avl=%u, global_rsvd=%u, total_alloc=%u", __entry->online_maps, __entry->global_available, __entry->global_reserved, __entry->total_allocated) ); DECLARE_EVENT_CLASS(irq_matrix_global_update, TP_PROTO(int bit, struct irq_matrix *matrix), TP_ARGS(bit, matrix), TP_STRUCT__entry( __field( int, bit ) __field( unsigned int, online_maps ) __field( unsigned int, global_available ) __field( unsigned int, global_reserved ) __field( unsigned int, total_allocated ) ), TP_fast_assign( __entry->bit = bit; __entry->online_maps = matrix->online_maps; 
__entry->global_available = matrix->global_available; __entry->global_reserved = matrix->global_reserved; __entry->total_allocated = matrix->total_allocated; ), TP_printk("bit=%d online_maps=%d global_avl=%u, global_rsvd=%u, total_alloc=%u", __entry->bit, __entry->online_maps, __entry->global_available, __entry->global_reserved, __entry->total_allocated) ); DECLARE_EVENT_CLASS(irq_matrix_cpu, TP_PROTO(int bit, unsigned int cpu, struct irq_matrix *matrix, struct cpumap *cmap), TP_ARGS(bit, cpu, matrix, cmap), TP_STRUCT__entry( __field( int, bit ) __field( unsigned int, cpu ) __field( bool, online ) __field( unsigned int, available ) __field( unsigned int, allocated ) __field( unsigned int, managed ) __field( unsigned int, online_maps ) __field( unsigned int, global_available ) __field( unsigned int, global_reserved ) __field( unsigned int, total_allocated ) ), TP_fast_assign( __entry->bit = bit; __entry->cpu = cpu; __entry->online = cmap->online; __entry->available = cmap->available; __entry->allocated = cmap->allocated; __entry->managed = cmap->managed; __entry->online_maps = matrix->online_maps; __entry->global_available = matrix->global_available; __entry->global_reserved = matrix->global_reserved; __entry->total_allocated = matrix->total_allocated; ), TP_printk("bit=%d cpu=%u online=%d avl=%u alloc=%u managed=%u online_maps=%u global_avl=%u, global_rsvd=%u, total_alloc=%u", __entry->bit, __entry->cpu, __entry->online, __entry->available, __entry->allocated, __entry->managed, __entry->online_maps, __entry->global_available, __entry->global_reserved, __entry->total_allocated) ); DEFINE_EVENT(irq_matrix_global, irq_matrix_online, TP_PROTO(struct irq_matrix *matrix), TP_ARGS(matrix) ); DEFINE_EVENT(irq_matrix_global, irq_matrix_offline, TP_PROTO(struct irq_matrix *matrix), TP_ARGS(matrix) ); DEFINE_EVENT(irq_matrix_global, irq_matrix_reserve, TP_PROTO(struct irq_matrix *matrix), TP_ARGS(matrix) ); DEFINE_EVENT(irq_matrix_global, irq_matrix_remove_reserved, 
TP_PROTO(struct irq_matrix *matrix), TP_ARGS(matrix) ); DEFINE_EVENT(irq_matrix_global_update, irq_matrix_assign_system, TP_PROTO(int bit, struct irq_matrix *matrix), TP_ARGS(bit, matrix) ); DEFINE_EVENT(irq_matrix_cpu, irq_matrix_alloc_reserved, TP_PROTO(int bit, unsigned int cpu, struct irq_matrix *matrix, struct cpumap *cmap), TP_ARGS(bit, cpu, matrix, cmap) ); DEFINE_EVENT(irq_matrix_cpu, irq_matrix_reserve_managed, TP_PROTO(int bit, unsigned int cpu, struct irq_matrix *matrix, struct cpumap *cmap), TP_ARGS(bit, cpu, matrix, cmap) ); DEFINE_EVENT(irq_matrix_cpu, irq_matrix_remove_managed, TP_PROTO(int bit, unsigned int cpu, struct irq_matrix *matrix, struct cpumap *cmap), TP_ARGS(bit, cpu, matrix, cmap) ); DEFINE_EVENT(irq_matrix_cpu, irq_matrix_alloc_managed, TP_PROTO(int bit, unsigned int cpu, struct irq_matrix *matrix, struct cpumap *cmap), TP_ARGS(bit, cpu, matrix, cmap) ); DEFINE_EVENT(irq_matrix_cpu, irq_matrix_assign, TP_PROTO(int bit, unsigned int cpu, struct irq_matrix *matrix, struct cpumap *cmap), TP_ARGS(bit, cpu, matrix, cmap) ); DEFINE_EVENT(irq_matrix_cpu, irq_matrix_alloc, TP_PROTO(int bit, unsigned int cpu, struct irq_matrix *matrix, struct cpumap *cmap), TP_ARGS(bit, cpu, matrix, cmap) ); DEFINE_EVENT(irq_matrix_cpu, irq_matrix_free, TP_PROTO(int bit, unsigned int cpu, struct irq_matrix *matrix, struct cpumap *cmap), TP_ARGS(bit, cpu, matrix, cmap) ); #endif /* _TRACE_IRQ_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
23 23 23 10 10 10 183 9 175 183 171 12 12 6 6 5 1 4 4 4 7 4 4 4 6 6 6 6 6 7 7 7 10 7 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 
506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 // SPDX-License-Identifier: GPL-2.0 /* * arch-independent dma-mapping routines * * Copyright (c) 2006 SUSE 
Linux Products GmbH * Copyright (c) 2006 Tejun Heo <teheo@suse.de> */ #include <linux/memblock.h> /* for max_pfn */ #include <linux/acpi.h> #include <linux/dma-map-ops.h> #include <linux/export.h> #include <linux/gfp.h> #include <linux/iommu-dma.h> #include <linux/kmsan.h> #include <linux/of_device.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include "debug.h" #include "direct.h" #define CREATE_TRACE_POINTS #include <trace/events/dma.h> #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \ defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) bool dma_default_coherent = IS_ENABLED(CONFIG_ARCH_DMA_DEFAULT_COHERENT); #endif /* * Managed DMA API */ struct dma_devres { size_t size; void *vaddr; dma_addr_t dma_handle; unsigned long attrs; }; static void dmam_release(struct device *dev, void *res) { struct dma_devres *this = res; dma_free_attrs(dev, this->size, this->vaddr, this->dma_handle, this->attrs); } static int dmam_match(struct device *dev, void *res, void *match_data) { struct dma_devres *this = res, *match = match_data; if (this->vaddr == match->vaddr) { WARN_ON(this->size != match->size || this->dma_handle != match->dma_handle); return 1; } return 0; } /** * dmam_free_coherent - Managed dma_free_coherent() * @dev: Device to free coherent memory for * @size: Size of allocation * @vaddr: Virtual address of the memory to free * @dma_handle: DMA handle of the memory to free * * Managed dma_free_coherent(). 
*/ void dmam_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle) { struct dma_devres match_data = { size, vaddr, dma_handle }; WARN_ON(devres_destroy(dev, dmam_release, dmam_match, &match_data)); dma_free_coherent(dev, size, vaddr, dma_handle); } EXPORT_SYMBOL(dmam_free_coherent); /** * dmam_alloc_attrs - Managed dma_alloc_attrs() * @dev: Device to allocate non_coherent memory for * @size: Size of allocation * @dma_handle: Out argument for allocated DMA handle * @gfp: Allocation flags * @attrs: Flags in the DMA_ATTR_* namespace. * * Managed dma_alloc_attrs(). Memory allocated using this function will be * automatically released on driver detach. * * RETURNS: * Pointer to allocated memory on success, NULL on failure. */ void *dmam_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) { struct dma_devres *dr; void *vaddr; dr = devres_alloc(dmam_release, sizeof(*dr), gfp); if (!dr) return NULL; vaddr = dma_alloc_attrs(dev, size, dma_handle, gfp, attrs); if (!vaddr) { devres_free(dr); return NULL; } dr->vaddr = vaddr; dr->dma_handle = *dma_handle; dr->size = size; dr->attrs = attrs; devres_add(dev, dr); return vaddr; } EXPORT_SYMBOL(dmam_alloc_attrs); static bool dma_go_direct(struct device *dev, dma_addr_t mask, const struct dma_map_ops *ops) { if (use_dma_iommu(dev)) return false; if (likely(!ops)) return true; #ifdef CONFIG_DMA_OPS_BYPASS if (dev->dma_ops_bypass) return min_not_zero(mask, dev->bus_dma_limit) >= dma_direct_get_required_mask(dev); #endif return false; } /* * Check if the devices uses a direct mapping for streaming DMA operations. * This allows IOMMU drivers to set a bypass mode if the DMA mask is large * enough. 
*/
/* Direct-mapping decision for coherent allocations (coherent_dma_mask). */
static inline bool dma_alloc_direct(struct device *dev,
		const struct dma_map_ops *ops)
{
	return dma_go_direct(dev, dev->coherent_dma_mask, ops);
}

/*
 * Direct-mapping decision for streaming mappings.  Dereferences
 * dev->dma_mask, so callers must ensure it is non-NULL.
 */
static inline bool dma_map_direct(struct device *dev,
		const struct dma_map_ops *ops)
{
	return dma_go_direct(dev, *dev->dma_mask, ops);
}

/*
 * Map @size bytes at @offset within @page for DMA by @dev.
 *
 * Dispatches to the direct, IOMMU or ops->map_page implementation, then
 * runs the KMSAN, trace and DMA-debug hooks.  Returns the resulting DMA
 * address, or DMA_MAPPING_ERROR (with a one-time warning) when the device
 * has no DMA mask.  An invalid @dir is a BUG.
 */
dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page,
		size_t offset, size_t size, enum dma_data_direction dir,
		unsigned long attrs)
{
	const struct dma_map_ops *ops = get_dma_ops(dev);
	dma_addr_t addr;

	BUG_ON(!valid_dma_direction(dir));

	if (WARN_ON_ONCE(!dev->dma_mask))
		return DMA_MAPPING_ERROR;

	if (dma_map_direct(dev, ops) ||
	    arch_dma_map_page_direct(dev, page_to_phys(page) + offset + size))
		addr = dma_direct_map_page(dev, page, offset, size, dir, attrs);
	else if (use_dma_iommu(dev))
		addr = iommu_dma_map_page(dev, page, offset, size, dir, attrs);
	else
		addr = ops->map_page(dev, page, offset, size, dir, attrs);
	kmsan_handle_dma(page, offset, size, dir);
	trace_dma_map_page(dev, page_to_phys(page) + offset, addr, size, dir,
			   attrs);
	debug_dma_map_page(dev, page, offset, size, dir, addr, attrs);

	return addr;
}
EXPORT_SYMBOL(dma_map_page_attrs);

/*
 * Undo a mapping created by dma_map_page_attrs(), using the same
 * direct / IOMMU / ops dispatch, then run the trace and DMA-debug hooks.
 */
void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size,
		enum dma_data_direction dir, unsigned long attrs)
{
	const struct dma_map_ops *ops = get_dma_ops(dev);

	BUG_ON(!valid_dma_direction(dir));
	if (dma_map_direct(dev, ops) ||
	    arch_dma_unmap_page_direct(dev, addr + size))
		dma_direct_unmap_page(dev, addr, size, dir, attrs);
	else if (use_dma_iommu(dev))
		iommu_dma_unmap_page(dev, addr, size, dir, attrs);
	else
		ops->unmap_page(dev, addr, size, dir, attrs);
	trace_dma_unmap_page(dev, addr, size, dir, attrs);
	debug_dma_unmap_page(dev, addr, size, dir);
}
EXPORT_SYMBOL(dma_unmap_page_attrs);

/*
 * Common scatterlist mapping helper.
 *
 * Returns the positive number of mapped entries on success and 0 when the
 * device has no DMA mask.  A non-positive result from the backend is passed
 * through if it is -EINVAL, -ENOMEM, -EIO or -EREMOTEIO; any other value
 * (including 0) triggers a one-time warning and is reported as -EIO.
 */
static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
	 int nents, enum dma_data_direction dir, unsigned long attrs)
{
	const struct dma_map_ops *ops = get_dma_ops(dev);
	int ents;

	BUG_ON(!valid_dma_direction(dir));

	if (WARN_ON_ONCE(!dev->dma_mask))
		return 0;

	if (dma_map_direct(dev, ops) ||
	    arch_dma_map_sg_direct(dev, sg, nents))
		ents = dma_direct_map_sg(dev, sg, nents, dir, attrs);
	else if (use_dma_iommu(dev))
		ents = iommu_dma_map_sg(dev, sg, nents, dir, attrs);
	else
		ents = ops->map_sg(dev, sg, nents, dir, attrs);

	if (ents > 0) {
		kmsan_handle_dma_sg(sg, nents, dir);
		trace_dma_map_sg(dev, sg, nents, ents, dir, attrs);
		debug_dma_map_sg(dev, sg, nents, ents, dir, attrs);
	} else if (WARN_ON_ONCE(ents != -EINVAL && ents != -ENOMEM &&
				ents != -EIO && ents != -EREMOTEIO)) {
		trace_dma_map_sg_err(dev, sg, nents, ents, dir, attrs);
		return -EIO;
	}

	return ents;
}

/**
 * dma_map_sg_attrs - Map the given buffer for DMA
 * @dev:	The device for which to perform the DMA operation
 * @sg:		The scatterlist describing the buffer
 * @nents:	Number of entries to map
 * @dir:	DMA direction
 * @attrs:	Optional DMA attributes for the map operation
 *
 * Maps a buffer described by a scatterlist passed in the sg argument with
 * nents segments for the @dir DMA operation by the @dev device.
 *
 * Returns the number of mapped entries (which can be less than nents)
 * on success. Zero is returned for any error.
 *
 * dma_unmap_sg_attrs() should be used to unmap the buffer with the
 * original sg and original nents (not the value returned by this function).
 */
unsigned int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
		    int nents, enum dma_data_direction dir, unsigned long attrs)
{
	int ret;

	ret = __dma_map_sg_attrs(dev, sg, nents, dir, attrs);
	/* Collapse every negative errno into the legacy "0 means error". */
	if (ret < 0)
		return 0;
	return ret;
}
EXPORT_SYMBOL(dma_map_sg_attrs);

/**
 * dma_map_sgtable - Map the given buffer for DMA
 * @dev:	The device for which to perform the DMA operation
 * @sgt:	The sg_table object describing the buffer
 * @dir:	DMA direction
 * @attrs:	Optional DMA attributes for the map operation
 *
 * Maps a buffer described by a scatterlist stored in the given sg_table
 * object for the @dir DMA operation by the @dev device.
After success, the
 * ownership for the buffer is transferred to the DMA domain.  One has to
 * call dma_sync_sgtable_for_cpu() or dma_unmap_sgtable() to move the
 * ownership of the buffer back to the CPU domain before touching the
 * buffer by the CPU.
 *
 * Returns 0 on success or a negative error code on error. The following
 * error codes are supported with the given meaning:
 *
 *   -EINVAL		An invalid argument, unaligned access or other error
 *			in usage. Will not succeed if retried.
 *   -ENOMEM		Insufficient resources (like memory or IOVA space) to
 *			complete the mapping. Should succeed if retried later.
 *   -EIO		Legacy error code with an unknown meaning. eg. this is
 *			returned if a lower level call returned
 *			DMA_MAPPING_ERROR.
 *   -EREMOTEIO		The DMA device cannot access P2PDMA memory specified
 *			in the sg_table. This will not succeed if retried.
 */
int dma_map_sgtable(struct device *dev, struct sg_table *sgt,
		    enum dma_data_direction dir, unsigned long attrs)
{
	int nents;

	/* Map orig_nents entries; the mapped count is stored in sgt->nents. */
	nents = __dma_map_sg_attrs(dev, sgt->sgl, sgt->orig_nents, dir, attrs);
	if (nents < 0)
		return nents;
	sgt->nents = nents;
	return 0;
}
EXPORT_SYMBOL_GPL(dma_map_sgtable);

/*
 * Unmap a scatterlist.  Unlike the map side, the trace and DMA-debug hooks
 * run before the backend is called, and a missing ops->unmap_sg is
 * tolerated.
 */
void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg,
				      int nents, enum dma_data_direction dir,
				      unsigned long attrs)
{
	const struct dma_map_ops *ops = get_dma_ops(dev);

	BUG_ON(!valid_dma_direction(dir));
	trace_dma_unmap_sg(dev, sg, nents, dir, attrs);
	debug_dma_unmap_sg(dev, sg, nents, dir);
	if (dma_map_direct(dev, ops) ||
	    arch_dma_unmap_sg_direct(dev, sg, nents))
		dma_direct_unmap_sg(dev, sg, nents, dir, attrs);
	else if (use_dma_iommu(dev))
		iommu_dma_unmap_sg(dev, sg, nents, dir, attrs);
	else if (ops->unmap_sg)
		ops->unmap_sg(dev, sg, nents, dir, attrs);
}
EXPORT_SYMBOL(dma_unmap_sg_attrs);

/*
 * Map a physical address range for DMA by @dev.
 *
 * Returns the DMA address, or DMA_MAPPING_ERROR when the device has no DMA
 * mask or when none of the direct, IOMMU or ops->map_resource paths
 * applies (addr keeps its initial value in that case).
 */
dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr,
		size_t size, enum dma_data_direction dir, unsigned long attrs)
{
	const struct dma_map_ops *ops = get_dma_ops(dev);
	dma_addr_t addr = DMA_MAPPING_ERROR;

	BUG_ON(!valid_dma_direction(dir));

	if (WARN_ON_ONCE(!dev->dma_mask))
		return DMA_MAPPING_ERROR;

	if (dma_map_direct(dev, ops))
		addr = dma_direct_map_resource(dev, phys_addr, size, dir, attrs);
	else if (use_dma_iommu(dev))
		addr = iommu_dma_map_resource(dev, phys_addr, size, dir, attrs);
	else if (ops->map_resource)
		addr = ops->map_resource(dev, phys_addr, size, dir, attrs);

	trace_dma_map_resource(dev, phys_addr, addr, size, dir, attrs);
	debug_dma_map_resource(dev, phys_addr, size, dir, addr, attrs);
	return addr;
}
EXPORT_SYMBOL(dma_map_resource);

/*
 * Undo dma_map_resource().  The direct-mapping case needs no backend work;
 * the trace and DMA-debug hooks run in every case.
 */
void dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size,
		enum dma_data_direction dir, unsigned long attrs)
{
	const struct dma_map_ops *ops = get_dma_ops(dev);

	BUG_ON(!valid_dma_direction(dir));
	if (dma_map_direct(dev, ops))
		; /* nothing to do: uncached and no swiotlb */
	else if (use_dma_iommu(dev))
		iommu_dma_unmap_resource(dev, addr, size, dir, attrs);
	else if (ops->unmap_resource)
		ops->unmap_resource(dev, addr, size, dir, attrs);
	trace_dma_unmap_resource(dev, addr, size, dir, attrs);
	debug_dma_unmap_resource(dev, addr, size, dir);
}
EXPORT_SYMBOL(dma_unmap_resource);

#ifdef CONFIG_DMA_NEED_SYNC
/*
 * The four __dma_sync_* helpers below share one shape: validate @dir,
 * dispatch to the direct, IOMMU or (optional) ops callback, then run the
 * matching trace and DMA-debug hooks.
 */
void __dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size,
		enum dma_data_direction dir)
{
	const struct dma_map_ops *ops = get_dma_ops(dev);

	BUG_ON(!valid_dma_direction(dir));
	if (dma_map_direct(dev, ops))
		dma_direct_sync_single_for_cpu(dev, addr, size, dir);
	else if (use_dma_iommu(dev))
		iommu_dma_sync_single_for_cpu(dev, addr, size, dir);
	else if (ops->sync_single_for_cpu)
		ops->sync_single_for_cpu(dev, addr, size, dir);
	trace_dma_sync_single_for_cpu(dev, addr, size, dir);
	debug_dma_sync_single_for_cpu(dev, addr, size, dir);
}
EXPORT_SYMBOL(__dma_sync_single_for_cpu);

void __dma_sync_single_for_device(struct device *dev, dma_addr_t addr,
		size_t size, enum dma_data_direction dir)
{
	const struct dma_map_ops *ops = get_dma_ops(dev);

	BUG_ON(!valid_dma_direction(dir));
	if (dma_map_direct(dev, ops))
		dma_direct_sync_single_for_device(dev, addr, size, dir);
	else if (use_dma_iommu(dev))
		iommu_dma_sync_single_for_device(dev, addr, size, dir);
	else if (ops->sync_single_for_device)
		ops->sync_single_for_device(dev, addr, size, dir);
	trace_dma_sync_single_for_device(dev, addr, size, dir);
	debug_dma_sync_single_for_device(dev, addr, size, dir);
}
EXPORT_SYMBOL(__dma_sync_single_for_device);

void __dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
		    int nelems, enum dma_data_direction dir)
{
	const struct dma_map_ops *ops = get_dma_ops(dev);

	BUG_ON(!valid_dma_direction(dir));
	if (dma_map_direct(dev, ops))
		dma_direct_sync_sg_for_cpu(dev, sg, nelems, dir);
	else if (use_dma_iommu(dev))
		iommu_dma_sync_sg_for_cpu(dev, sg, nelems, dir);
	else if (ops->sync_sg_for_cpu)
		ops->sync_sg_for_cpu(dev, sg, nelems, dir);
	trace_dma_sync_sg_for_cpu(dev, sg, nelems, dir);
	debug_dma_sync_sg_for_cpu(dev, sg, nelems, dir);
}
EXPORT_SYMBOL(__dma_sync_sg_for_cpu);

void __dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
		       int nelems, enum dma_data_direction dir)
{
	const struct dma_map_ops *ops = get_dma_ops(dev);

	BUG_ON(!valid_dma_direction(dir));
	if (dma_map_direct(dev, ops))
		dma_direct_sync_sg_for_device(dev, sg, nelems, dir);
	else if (use_dma_iommu(dev))
		iommu_dma_sync_sg_for_device(dev, sg, nelems, dir);
	else if (ops->sync_sg_for_device)
		ops->sync_sg_for_device(dev, sg, nelems, dir);
	trace_dma_sync_sg_for_device(dev, sg, nelems, dir);
	debug_dma_sync_sg_for_device(dev, sg, nelems, dir);
}
EXPORT_SYMBOL(__dma_sync_sg_for_device);

/*
 * Report whether @dma_addr needs explicit synchronisation.  Only the
 * direct-mapping path is checked per address; every other path answers
 * true.
 */
bool __dma_need_sync(struct device *dev, dma_addr_t dma_addr)
{
	const struct dma_map_ops *ops = get_dma_ops(dev);

	if (dma_map_direct(dev, ops))
		/*
		 * dma_skip_sync could've been reset on first SWIOTLB buffer
		 * mapping, but @dma_addr is not necessary an SWIOTLB buffer.
		 * In this case, fall back to more granular check.
		 */
		return dma_direct_need_sync(dev, dma_addr);
	return true;
}
EXPORT_SYMBOL_GPL(__dma_need_sync);

/* Initialise dev->dma_skip_sync from the device's DMA configuration. */
static void dma_setup_need_sync(struct device *dev)
{
	const struct dma_map_ops *ops = get_dma_ops(dev);

	if (dma_map_direct(dev, ops) || use_dma_iommu(dev))
		/*
		 * dma_skip_sync will be reset to %false on first SWIOTLB buffer
		 * mapping, if any. During the device initialization, it's
		 * enough to check only for the DMA coherence.
		 */
		dev->dma_skip_sync = dev_is_dma_coherent(dev);
	else if (!ops->sync_single_for_device && !ops->sync_single_for_cpu &&
		 !ops->sync_sg_for_device && !ops->sync_sg_for_cpu)
		/*
		 * Synchronization is not possible when none of DMA sync ops
		 * is set.
		 */
		dev->dma_skip_sync = true;
	else
		dev->dma_skip_sync = false;
}
#else /* !CONFIG_DMA_NEED_SYNC */
/* No-op stub when CONFIG_DMA_NEED_SYNC is disabled. */
static inline void dma_setup_need_sync(struct device *dev) { }
#endif /* !CONFIG_DMA_NEED_SYNC */

/*
 * The whole dma_get_sgtable() idea is fundamentally unsafe - it seems
 * that the intention is to allow exporting memory allocated via the
 * coherent DMA APIs through the dma_buf API, which only accepts a
 * scattertable.  This presents a couple of problems:
 * 1. Not all memory allocated via the coherent DMA APIs is backed by
 *    a struct page
 * 2. Passing coherent DMA memory into the streaming APIs is not allowed
 *    as we will try to flush the memory through a different alias to that
 *    actually being used (and the flushes are redundant.)
*/ int dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs) { const struct dma_map_ops *ops = get_dma_ops(dev); if (dma_alloc_direct(dev, ops)) return dma_direct_get_sgtable(dev, sgt, cpu_addr, dma_addr, size, attrs); if (use_dma_iommu(dev)) return iommu_dma_get_sgtable(dev, sgt, cpu_addr, dma_addr, size, attrs); if (!ops->get_sgtable) return -ENXIO; return ops->get_sgtable(dev, sgt, cpu_addr, dma_addr, size, attrs); } EXPORT_SYMBOL(dma_get_sgtable_attrs); #ifdef CONFIG_MMU /* * Return the page attributes used for mapping dma_alloc_* memory, either in * kernel space if remapping is needed, or to userspace through dma_mmap_*. */ pgprot_t dma_pgprot(struct device *dev, pgprot_t prot, unsigned long attrs) { if (dev_is_dma_coherent(dev)) return prot; #ifdef CONFIG_ARCH_HAS_DMA_WRITE_COMBINE if (attrs & DMA_ATTR_WRITE_COMBINE) return pgprot_writecombine(prot); #endif return pgprot_dmacoherent(prot); } #endif /* CONFIG_MMU */ /** * dma_can_mmap - check if a given device supports dma_mmap_* * @dev: device to check * * Returns %true if @dev supports dma_mmap_coherent() and dma_mmap_attrs() to * map DMA allocations to userspace. 
*/ bool dma_can_mmap(struct device *dev) { const struct dma_map_ops *ops = get_dma_ops(dev); if (dma_alloc_direct(dev, ops)) return dma_direct_can_mmap(dev); if (use_dma_iommu(dev)) return true; return ops->mmap != NULL; } EXPORT_SYMBOL_GPL(dma_can_mmap); /** * dma_mmap_attrs - map a coherent DMA allocation into user space * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices * @vma: vm_area_struct describing requested user mapping * @cpu_addr: kernel CPU-view address returned from dma_alloc_attrs * @dma_addr: device-view address returned from dma_alloc_attrs * @size: size of memory originally requested in dma_alloc_attrs * @attrs: attributes of mapping properties requested in dma_alloc_attrs * * Map a coherent DMA buffer previously allocated by dma_alloc_attrs into user * space. The coherent DMA buffer must not be freed by the driver until the * user space mapping has been released. */ int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs) { const struct dma_map_ops *ops = get_dma_ops(dev); if (dma_alloc_direct(dev, ops)) return dma_direct_mmap(dev, vma, cpu_addr, dma_addr, size, attrs); if (use_dma_iommu(dev)) return iommu_dma_mmap(dev, vma, cpu_addr, dma_addr, size, attrs); if (!ops->mmap) return -ENXIO; return ops->mmap(dev, vma, cpu_addr, dma_addr, size, attrs); } EXPORT_SYMBOL(dma_mmap_attrs); u64 dma_get_required_mask(struct device *dev) { const struct dma_map_ops *ops = get_dma_ops(dev); if (dma_alloc_direct(dev, ops)) return dma_direct_get_required_mask(dev); if (use_dma_iommu(dev)) return DMA_BIT_MASK(32); if (ops->get_required_mask) return ops->get_required_mask(dev); /* * We require every DMA ops implementation to at least support a 32-bit * DMA mask (and use bounce buffering if that isn't supported in * hardware). 
As the direct mapping code has its own routine to * actually report an optimal mask we default to 32-bit here as that * is the right thing for most IOMMUs, and at least not actively * harmful in general. */ return DMA_BIT_MASK(32); } EXPORT_SYMBOL_GPL(dma_get_required_mask); void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flag, unsigned long attrs) { const struct dma_map_ops *ops = get_dma_ops(dev); void *cpu_addr; WARN_ON_ONCE(!dev->coherent_dma_mask); /* * DMA allocations can never be turned back into a page pointer, so * requesting compound pages doesn't make sense (and can't even be * supported at all by various backends). */ if (WARN_ON_ONCE(flag & __GFP_COMP)) return NULL; if (dma_alloc_from_dev_coherent(dev, size, dma_handle, &cpu_addr)) { trace_dma_alloc(dev, cpu_addr, *dma_handle, size, DMA_BIDIRECTIONAL, flag, attrs); return cpu_addr; } /* let the implementation decide on the zone to allocate from: */ flag &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM); if (dma_alloc_direct(dev, ops)) { cpu_addr = dma_direct_alloc(dev, size, dma_handle, flag, attrs); } else if (use_dma_iommu(dev)) { cpu_addr = iommu_dma_alloc(dev, size, dma_handle, flag, attrs); } else if (ops->alloc) { cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs); } else { trace_dma_alloc(dev, NULL, 0, size, DMA_BIDIRECTIONAL, flag, attrs); return NULL; } trace_dma_alloc(dev, cpu_addr, *dma_handle, size, DMA_BIDIRECTIONAL, flag, attrs); debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr, attrs); return cpu_addr; } EXPORT_SYMBOL(dma_alloc_attrs); void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_handle, unsigned long attrs) { const struct dma_map_ops *ops = get_dma_ops(dev); if (dma_release_from_dev_coherent(dev, get_order(size), cpu_addr)) return; /* * On non-coherent platforms which implement DMA-coherent buffers via * non-cacheable remaps, ops->free() may call vunmap(). 
Thus getting * this far in IRQ context is a) at risk of a BUG_ON() or trying to * sleep on some machines, and b) an indication that the driver is * probably misusing the coherent API anyway. */ WARN_ON(irqs_disabled()); trace_dma_free(dev, cpu_addr, dma_handle, size, DMA_BIDIRECTIONAL, attrs); if (!cpu_addr) return; debug_dma_free_coherent(dev, size, cpu_addr, dma_handle); if (dma_alloc_direct(dev, ops)) dma_direct_free(dev, size, cpu_addr, dma_handle, attrs); else if (use_dma_iommu(dev)) iommu_dma_free(dev, size, cpu_addr, dma_handle, attrs); else if (ops->free) ops->free(dev, size, cpu_addr, dma_handle, attrs); } EXPORT_SYMBOL(dma_free_attrs); static struct page *__dma_alloc_pages(struct device *dev, size_t size, dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp) { const struct dma_map_ops *ops = get_dma_ops(dev); if (WARN_ON_ONCE(!dev->coherent_dma_mask)) return NULL; if (WARN_ON_ONCE(gfp & (__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM))) return NULL; if (WARN_ON_ONCE(gfp & __GFP_COMP)) return NULL; size = PAGE_ALIGN(size); if (dma_alloc_direct(dev, ops)) return dma_direct_alloc_pages(dev, size, dma_handle, dir, gfp); if (use_dma_iommu(dev)) return dma_common_alloc_pages(dev, size, dma_handle, dir, gfp); if (!ops->alloc_pages_op) return NULL; return ops->alloc_pages_op(dev, size, dma_handle, dir, gfp); } struct page *dma_alloc_pages(struct device *dev, size_t size, dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp) { struct page *page = __dma_alloc_pages(dev, size, dma_handle, dir, gfp); if (page) { trace_dma_alloc_pages(dev, page_to_virt(page), *dma_handle, size, dir, gfp, 0); debug_dma_map_page(dev, page, 0, size, dir, *dma_handle, 0); } else { trace_dma_alloc_pages(dev, NULL, 0, size, dir, gfp, 0); } return page; } EXPORT_SYMBOL_GPL(dma_alloc_pages); static void __dma_free_pages(struct device *dev, size_t size, struct page *page, dma_addr_t dma_handle, enum dma_data_direction dir) { const struct dma_map_ops *ops = get_dma_ops(dev); size = 
PAGE_ALIGN(size); if (dma_alloc_direct(dev, ops)) dma_direct_free_pages(dev, size, page, dma_handle, dir); else if (use_dma_iommu(dev)) dma_common_free_pages(dev, size, page, dma_handle, dir); else if (ops->free_pages) ops->free_pages(dev, size, page, dma_handle, dir); } void dma_free_pages(struct device *dev, size_t size, struct page *page, dma_addr_t dma_handle, enum dma_data_direction dir) { trace_dma_free_pages(dev, page_to_virt(page), dma_handle, size, dir, 0); debug_dma_unmap_page(dev, dma_handle, size, dir); __dma_free_pages(dev, size, page, dma_handle, dir); } EXPORT_SYMBOL_GPL(dma_free_pages); int dma_mmap_pages(struct device *dev, struct vm_area_struct *vma, size_t size, struct page *page) { unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT; if (vma->vm_pgoff >= count || vma_pages(vma) > count - vma->vm_pgoff) return -ENXIO; return remap_pfn_range(vma, vma->vm_start, page_to_pfn(page) + vma->vm_pgoff, vma_pages(vma) << PAGE_SHIFT, vma->vm_page_prot); } EXPORT_SYMBOL_GPL(dma_mmap_pages); static struct sg_table *alloc_single_sgt(struct device *dev, size_t size, enum dma_data_direction dir, gfp_t gfp) { struct sg_table *sgt; struct page *page; sgt = kmalloc(sizeof(*sgt), gfp); if (!sgt) return NULL; if (sg_alloc_table(sgt, 1, gfp)) goto out_free_sgt; page = __dma_alloc_pages(dev, size, &sgt->sgl->dma_address, dir, gfp); if (!page) goto out_free_table; sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0); sg_dma_len(sgt->sgl) = sgt->sgl->length; return sgt; out_free_table: sg_free_table(sgt); out_free_sgt: kfree(sgt); return NULL; } struct sg_table *dma_alloc_noncontiguous(struct device *dev, size_t size, enum dma_data_direction dir, gfp_t gfp, unsigned long attrs) { struct sg_table *sgt; if (WARN_ON_ONCE(attrs & ~DMA_ATTR_ALLOC_SINGLE_PAGES)) return NULL; if (WARN_ON_ONCE(gfp & __GFP_COMP)) return NULL; if (use_dma_iommu(dev)) sgt = iommu_dma_alloc_noncontiguous(dev, size, dir, gfp, attrs); else sgt = alloc_single_sgt(dev, size, dir, gfp); if (sgt) { 
sgt->nents = 1; trace_dma_alloc_sgt(dev, sgt, size, dir, gfp, attrs); debug_dma_map_sg(dev, sgt->sgl, sgt->orig_nents, 1, dir, attrs); } else { trace_dma_alloc_sgt_err(dev, NULL, 0, size, dir, gfp, attrs); } return sgt; } EXPORT_SYMBOL_GPL(dma_alloc_noncontiguous); static void free_single_sgt(struct device *dev, size_t size, struct sg_table *sgt, enum dma_data_direction dir) { __dma_free_pages(dev, size, sg_page(sgt->sgl), sgt->sgl->dma_address, dir); sg_free_table(sgt); kfree(sgt); } void dma_free_noncontiguous(struct device *dev, size_t size, struct sg_table *sgt, enum dma_data_direction dir) { trace_dma_free_sgt(dev, sgt, size, dir); debug_dma_unmap_sg(dev, sgt->sgl, sgt->orig_nents, dir); if (use_dma_iommu(dev)) iommu_dma_free_noncontiguous(dev, size, sgt, dir); else free_single_sgt(dev, size, sgt, dir); } EXPORT_SYMBOL_GPL(dma_free_noncontiguous); void *dma_vmap_noncontiguous(struct device *dev, size_t size, struct sg_table *sgt) { if (use_dma_iommu(dev)) return iommu_dma_vmap_noncontiguous(dev, size, sgt); return page_address(sg_page(sgt->sgl)); } EXPORT_SYMBOL_GPL(dma_vmap_noncontiguous); void dma_vunmap_noncontiguous(struct device *dev, void *vaddr) { if (use_dma_iommu(dev)) iommu_dma_vunmap_noncontiguous(dev, vaddr); } EXPORT_SYMBOL_GPL(dma_vunmap_noncontiguous); int dma_mmap_noncontiguous(struct device *dev, struct vm_area_struct *vma, size_t size, struct sg_table *sgt) { if (use_dma_iommu(dev)) return iommu_dma_mmap_noncontiguous(dev, vma, size, sgt); return dma_mmap_pages(dev, vma, size, sg_page(sgt->sgl)); } EXPORT_SYMBOL_GPL(dma_mmap_noncontiguous); static int dma_supported(struct device *dev, u64 mask) { const struct dma_map_ops *ops = get_dma_ops(dev); if (use_dma_iommu(dev)) { if (WARN_ON(ops)) return false; return true; } /* * ->dma_supported sets and clears the bypass flag, so ignore it here * and always call into the method if there is one. 
*/ if (ops) { if (!ops->dma_supported) return true; return ops->dma_supported(dev, mask); } return dma_direct_supported(dev, mask); } bool dma_pci_p2pdma_supported(struct device *dev) { const struct dma_map_ops *ops = get_dma_ops(dev); /* * Note: dma_ops_bypass is not checked here because P2PDMA should * not be used with dma mapping ops that do not have support even * if the specific device is bypassing them. */ /* if ops is not set, dma direct and default IOMMU support P2PDMA */ return !ops; } EXPORT_SYMBOL_GPL(dma_pci_p2pdma_supported); int dma_set_mask(struct device *dev, u64 mask) { /* * Truncate the mask to the actually supported dma_addr_t width to * avoid generating unsupportable addresses. */ mask = (dma_addr_t)mask; if (!dev->dma_mask || !dma_supported(dev, mask)) return -EIO; arch_dma_set_mask(dev, mask); *dev->dma_mask = mask; dma_setup_need_sync(dev); return 0; } EXPORT_SYMBOL(dma_set_mask); int dma_set_coherent_mask(struct device *dev, u64 mask) { /* * Truncate the mask to the actually supported dma_addr_t width to * avoid generating unsupportable addresses. */ mask = (dma_addr_t)mask; if (!dma_supported(dev, mask)) return -EIO; dev->coherent_dma_mask = mask; return 0; } EXPORT_SYMBOL(dma_set_coherent_mask); /** * dma_addressing_limited - return if the device is addressing limited * @dev: device to check * * Return %true if the devices DMA mask is too small to address all memory in * the system, else %false. Lack of addressing bits is the prime reason for * bounce buffering, but might not be the only one. 
*/ bool dma_addressing_limited(struct device *dev) { const struct dma_map_ops *ops = get_dma_ops(dev); if (min_not_zero(dma_get_mask(dev), dev->bus_dma_limit) < dma_get_required_mask(dev)) return true; if (unlikely(ops) || use_dma_iommu(dev)) return false; return !dma_direct_all_ram_mapped(dev); } EXPORT_SYMBOL_GPL(dma_addressing_limited); size_t dma_max_mapping_size(struct device *dev) { const struct dma_map_ops *ops = get_dma_ops(dev); size_t size = SIZE_MAX; if (dma_map_direct(dev, ops)) size = dma_direct_max_mapping_size(dev); else if (use_dma_iommu(dev)) size = iommu_dma_max_mapping_size(dev); else if (ops && ops->max_mapping_size) size = ops->max_mapping_size(dev); return size; } EXPORT_SYMBOL_GPL(dma_max_mapping_size); size_t dma_opt_mapping_size(struct device *dev) { const struct dma_map_ops *ops = get_dma_ops(dev); size_t size = SIZE_MAX; if (use_dma_iommu(dev)) size = iommu_dma_opt_mapping_size(); else if (ops && ops->opt_mapping_size) size = ops->opt_mapping_size(); return min(dma_max_mapping_size(dev), size); } EXPORT_SYMBOL_GPL(dma_opt_mapping_size); unsigned long dma_get_merge_boundary(struct device *dev) { const struct dma_map_ops *ops = get_dma_ops(dev); if (use_dma_iommu(dev)) return iommu_dma_get_merge_boundary(dev); if (!ops || !ops->get_merge_boundary) return 0; /* can't merge */ return ops->get_merge_boundary(dev); } EXPORT_SYMBOL_GPL(dma_get_merge_boundary);
107 350 107 353 400 398 401 399 58 58 36 30 58 58 58 56 20 20 42 42 41 42 368 370 354 332 15 15 369 370 369 370 15 369 370 363 6 42 3 3 354 356 343 3 19 18 3 3 3 396 396 396 396 396 394 53 380 265 315 401 398 663 666 666 401 399 106 6 352 3 396 635 638 41 42 42 103 16 24 73 42 42 42 400 60 413 398 401 60 398 400 16 16 16 379 379 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 
445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 
945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 // SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2016 Oracle. All Rights Reserved. * Author: Darrick J. 
Wong <darrick.wong@oracle.com> */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_defer.h" #include "xfs_trans.h" #include "xfs_trans_priv.h" #include "xfs_buf_item.h" #include "xfs_inode.h" #include "xfs_inode_item.h" #include "xfs_trace.h" #include "xfs_icache.h" #include "xfs_log.h" #include "xfs_log_priv.h" #include "xfs_rmap.h" #include "xfs_refcount.h" #include "xfs_bmap.h" #include "xfs_alloc.h" #include "xfs_buf.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" #include "xfs_attr.h" #include "xfs_exchmaps.h" static struct kmem_cache *xfs_defer_pending_cache; /* * Deferred Operations in XFS * * Due to the way locking rules work in XFS, certain transactions (block * mapping and unmapping, typically) have permanent reservations so that * we can roll the transaction to adhere to AG locking order rules and * to unlock buffers between metadata updates. Prior to rmap/reflink, * the mapping code had a mechanism to perform these deferrals for * extents that were going to be freed; this code makes that facility * more generic. * * When adding the reverse mapping and reflink features, it became * necessary to perform complex remapping multi-transactions to comply * with AG locking order rules, and to be able to spread a single * refcount update operation (an operation on an n-block extent can * update as many as n records!) among multiple transactions. XFS can * roll a transaction to facilitate this, but using this facility * requires us to log "intent" items in case log recovery needs to * redo the operation, and to log "done" items to indicate that redo * is not necessary. * * Deferred work is tracked in xfs_defer_pending items. Each pending * item tracks one type of deferred work. 
Incoming work items (which * have not yet had an intent logged) are attached to a pending item * on the dop_intake list, where they wait for the caller to finish * the deferred operations. * * Finishing a set of deferred operations is an involved process. To * start, we define "rolling a deferred-op transaction" as follows: * * > For each xfs_defer_pending item on the dop_intake list, * - Sort the work items in AG order. XFS locking * order rules require us to lock buffers in AG order. * - Create a log intent item for that type. * - Attach it to the pending item. * - Move the pending item from the dop_intake list to the * dop_pending list. * > Roll the transaction. * * NOTE: To avoid exceeding the transaction reservation, we limit the * number of items that we attach to a given xfs_defer_pending. * * The actual finishing process looks like this: * * > For each xfs_defer_pending in the dop_pending list, * - Roll the deferred-op transaction as above. * - Create a log done item for that type, and attach it to the * log intent item. * - For each work item attached to the log intent item, * * Perform the described action. * * Attach the work item to the log done item. * * If the result of doing the work was -EAGAIN, ->finish_item * wants a new transaction. See the "Requesting a Fresh * Transaction while Finishing Deferred Work" section below for * details. * * The key here is that we must log an intent item for all pending * work items every time we roll the transaction, and that we must log * a done item as soon as the work is completed. With this mechanism * we can perform complex remapping operations, chaining intent items * as needed. * * Requesting a Fresh Transaction while Finishing Deferred Work * * If ->finish_item decides that it needs a fresh transaction to * finish the work, it must ask its caller (xfs_defer_finish) for a * continuation.
The most likely cause of this circumstance is the * refcount adjust functions deciding that they've logged enough items * to be at risk of exceeding the transaction reservation. * * To get a fresh transaction, we want to log the existing log done * item to prevent the log intent item from replaying, immediately log * a new log intent item with the unfinished work items, roll the * transaction, and re-call ->finish_item wherever it left off. The * log done item and the new log intent item must be in the same * transaction or atomicity cannot be guaranteed; defer_finish ensures * that this happens. * * This requires some coordination between ->finish_item and * defer_finish. Upon deciding to request a new transaction, * ->finish_item should update the current work item to reflect the * unfinished work. Next, it should reset the log done item's list * count to the number of items finished, and return -EAGAIN. * defer_finish sees the -EAGAIN, logs the new log intent item * with the remaining work items, and leaves the xfs_defer_pending * item at the head of the dop_work queue. Then it rolls the * transaction and picks up processing where it left off. It is * required that ->finish_item must be careful to leave enough * transaction reservation to fit the new log intent item.
* * This is an example of remapping the extent (E, E+B) into file X at * offset A and dealing with the extent (C, C+B) already being mapped * there: * +-------------------------------------------------+ * | Unmap file X startblock C offset A length B | t0 * | Intent to reduce refcount for extent (C, B) | * | Intent to remove rmap (X, C, A, B) | * | Intent to free extent (D, 1) (bmbt block) | * | Intent to map (X, A, B) at startblock E | * +-------------------------------------------------+ * | Map file X startblock E offset A length B | t1 * | Done mapping (X, E, A, B) | * | Intent to increase refcount for extent (E, B) | * | Intent to add rmap (X, E, A, B) | * +-------------------------------------------------+ * | Reduce refcount for extent (C, B) | t2 * | Done reducing refcount for extent (C, 9) | * | Intent to reduce refcount for extent (C+9, B-9) | * | (ran out of space after 9 refcount updates) | * +-------------------------------------------------+ * | Reduce refcount for extent (C+9, B-9) | t3 * | Done reducing refcount for extent (C+9, B-9) | * | Increase refcount for extent (E, B) | * | Done increasing refcount for extent (E, B) | * | Intent to free extent (C, B) | * | Intent to free extent (F, 1) (refcountbt block) | * | Intent to remove rmap (F, 1, REFC) | * +-------------------------------------------------+ * | Remove rmap (X, C, A, B) | t4 * | Done removing rmap (X, C, A, B) | * | Add rmap (X, E, A, B) | * | Done adding rmap (X, E, A, B) | * | Remove rmap (F, 1, REFC) | * | Done removing rmap (F, 1, REFC) | * +-------------------------------------------------+ * | Free extent (C, B) | t5 * | Done freeing extent (C, B) | * | Free extent (D, 1) | * | Done freeing extent (D, 1) | * | Free extent (F, 1) | * | Done freeing extent (F, 1) | * +-------------------------------------------------+ * * If we should crash before t2 commits, log recovery replays * the following intent items: * * - Intent to reduce refcount for extent (C, B) * - Intent to remove
rmap (X, C, A, B) * - Intent to free extent (D, 1) (bmbt block) * - Intent to increase refcount for extent (E, B) * - Intent to add rmap (X, E, A, B) * * In the process of recovering, it should also generate and take care * of these intent items: * * - Intent to free extent (C, B) * - Intent to free extent (F, 1) (refcountbt block) * - Intent to remove rmap (F, 1, REFC) * * Note that the continuation requested between t2 and t3 is likely to * reoccur. */ STATIC struct xfs_log_item * xfs_defer_barrier_create_intent( struct xfs_trans *tp, struct list_head *items, unsigned int count, bool sort) { return NULL; } STATIC void xfs_defer_barrier_abort_intent( struct xfs_log_item *intent) { /* empty */ } STATIC struct xfs_log_item * xfs_defer_barrier_create_done( struct xfs_trans *tp, struct xfs_log_item *intent, unsigned int count) { return NULL; } STATIC int xfs_defer_barrier_finish_item( struct xfs_trans *tp, struct xfs_log_item *done, struct list_head *item, struct xfs_btree_cur **state) { ASSERT(0); return -EFSCORRUPTED; } STATIC void xfs_defer_barrier_cancel_item( struct list_head *item) { ASSERT(0); } static const struct xfs_defer_op_type xfs_barrier_defer_type = { .max_items = 1, .create_intent = xfs_defer_barrier_create_intent, .abort_intent = xfs_defer_barrier_abort_intent, .create_done = xfs_defer_barrier_create_done, .finish_item = xfs_defer_barrier_finish_item, .cancel_item = xfs_defer_barrier_cancel_item, }; /* Create a log intent done item for a log intent item. */ static inline void xfs_defer_create_done( struct xfs_trans *tp, struct xfs_defer_pending *dfp) { struct xfs_log_item *lip; /* If there is no log intent item, there can be no log done item. */ if (!dfp->dfp_intent) return; /* * Mark the transaction dirty, even on error. This ensures the * transaction is aborted, which: * * 1.) releases the log intent item and frees the log done item * 2.) 
shuts down the filesystem */ tp->t_flags |= XFS_TRANS_DIRTY; lip = dfp->dfp_ops->create_done(tp, dfp->dfp_intent, dfp->dfp_count); if (!lip) return; tp->t_flags |= XFS_TRANS_HAS_INTENT_DONE; xfs_trans_add_item(tp, lip); set_bit(XFS_LI_DIRTY, &lip->li_flags); dfp->dfp_done = lip; } /* * Ensure there's a log intent item associated with this deferred work item if * the operation must be restarted on crash. Returns 1 if there's a log item; * 0 if there isn't; or a negative errno. */ static int xfs_defer_create_intent( struct xfs_trans *tp, struct xfs_defer_pending *dfp, bool sort) { struct xfs_log_item *lip; if (dfp->dfp_intent) return 1; lip = dfp->dfp_ops->create_intent(tp, &dfp->dfp_work, dfp->dfp_count, sort); if (!lip) return 0; if (IS_ERR(lip)) return PTR_ERR(lip); tp->t_flags |= XFS_TRANS_DIRTY; xfs_trans_add_item(tp, lip); set_bit(XFS_LI_DIRTY, &lip->li_flags); dfp->dfp_intent = lip; return 1; } /* * For each pending item in the intake list, log its intent item and the * associated extents, then add the entire intake list to the end of * the pending list. * * Returns 1 if at least one log item was associated with the deferred work; * 0 if there are no log items; or a negative errno. 
*/ static int xfs_defer_create_intents( struct xfs_trans *tp) { struct xfs_defer_pending *dfp; int ret = 0; list_for_each_entry(dfp, &tp->t_dfops, dfp_list) { int ret2; trace_xfs_defer_create_intent(tp->t_mountp, dfp); ret2 = xfs_defer_create_intent(tp, dfp, true); if (ret2 < 0) return ret2; ret |= ret2; } return ret; } static inline void xfs_defer_pending_abort( struct xfs_mount *mp, struct xfs_defer_pending *dfp) { trace_xfs_defer_pending_abort(mp, dfp); if (dfp->dfp_intent && !dfp->dfp_done) { dfp->dfp_ops->abort_intent(dfp->dfp_intent); dfp->dfp_intent = NULL; } } static inline void xfs_defer_pending_cancel_work( struct xfs_mount *mp, struct xfs_defer_pending *dfp) { struct list_head *pwi; struct list_head *n; trace_xfs_defer_cancel_list(mp, dfp); list_del(&dfp->dfp_list); list_for_each_safe(pwi, n, &dfp->dfp_work) { list_del(pwi); dfp->dfp_count--; trace_xfs_defer_cancel_item(mp, dfp, pwi); dfp->dfp_ops->cancel_item(pwi); } ASSERT(dfp->dfp_count == 0); kmem_cache_free(xfs_defer_pending_cache, dfp); } STATIC void xfs_defer_pending_abort_list( struct xfs_mount *mp, struct list_head *dop_list) { struct xfs_defer_pending *dfp; /* Abort intent items that don't have a done item. */ list_for_each_entry(dfp, dop_list, dfp_list) xfs_defer_pending_abort(mp, dfp); } /* Abort all the intents that were committed. */ STATIC void xfs_defer_trans_abort( struct xfs_trans *tp, struct list_head *dop_pending) { trace_xfs_defer_trans_abort(tp, _RET_IP_); xfs_defer_pending_abort_list(tp->t_mountp, dop_pending); } /* * Capture resources that the caller said not to release ("held") when the * transaction commits. Caller is responsible for zero-initializing @dres. 
*/ static int xfs_defer_save_resources( struct xfs_defer_resources *dres, struct xfs_trans *tp) { struct xfs_buf_log_item *bli; struct xfs_inode_log_item *ili; struct xfs_log_item *lip; BUILD_BUG_ON(NBBY * sizeof(dres->dr_ordered) < XFS_DEFER_OPS_NR_BUFS); list_for_each_entry(lip, &tp->t_items, li_trans) { switch (lip->li_type) { case XFS_LI_BUF: bli = container_of(lip, struct xfs_buf_log_item, bli_item); if (bli->bli_flags & XFS_BLI_HOLD) { if (dres->dr_bufs >= XFS_DEFER_OPS_NR_BUFS) { ASSERT(0); return -EFSCORRUPTED; } if (bli->bli_flags & XFS_BLI_ORDERED) dres->dr_ordered |= (1U << dres->dr_bufs); else xfs_trans_dirty_buf(tp, bli->bli_buf); dres->dr_bp[dres->dr_bufs++] = bli->bli_buf; } break; case XFS_LI_INODE: ili = container_of(lip, struct xfs_inode_log_item, ili_item); if (ili->ili_lock_flags == 0) { if (dres->dr_inos >= XFS_DEFER_OPS_NR_INODES) { ASSERT(0); return -EFSCORRUPTED; } xfs_trans_log_inode(tp, ili->ili_inode, XFS_ILOG_CORE); dres->dr_ip[dres->dr_inos++] = ili->ili_inode; } break; default: break; } } return 0; } /* Attach the held resources to the transaction. */ static void xfs_defer_restore_resources( struct xfs_trans *tp, struct xfs_defer_resources *dres) { unsigned short i; /* Rejoin the joined inodes. */ for (i = 0; i < dres->dr_inos; i++) xfs_trans_ijoin(tp, dres->dr_ip[i], 0); /* Rejoin the buffers and dirty them so the log moves forward. */ for (i = 0; i < dres->dr_bufs; i++) { xfs_trans_bjoin(tp, dres->dr_bp[i]); if (dres->dr_ordered & (1U << i)) xfs_trans_ordered_buf(tp, dres->dr_bp[i]); xfs_trans_bhold(tp, dres->dr_bp[i]); } } /* Roll a transaction so we can do some deferred op processing. */ STATIC int xfs_defer_trans_roll( struct xfs_trans **tpp) { struct xfs_defer_resources dres = { }; int error; error = xfs_defer_save_resources(&dres, *tpp); if (error) return error; trace_xfs_defer_trans_roll(*tpp, _RET_IP_); /* * Roll the transaction. Rolling always given a new transaction (even * if committing the old one fails!) 
to hand back to the caller, so we * join the held resources to the new transaction so that we always * return with the held resources joined to @tpp, no matter what * happened. */ error = xfs_trans_roll(tpp); xfs_defer_restore_resources(*tpp, &dres); if (error) trace_xfs_defer_trans_roll_error(*tpp, error); return error; } /* * Free up any items left in the list. */ static void xfs_defer_cancel_list( struct xfs_mount *mp, struct list_head *dop_list) { struct xfs_defer_pending *dfp; struct xfs_defer_pending *pli; /* * Free the pending items. Caller should already have arranged * for the intent items to be released. */ list_for_each_entry_safe(dfp, pli, dop_list, dfp_list) xfs_defer_pending_cancel_work(mp, dfp); } static inline void xfs_defer_relog_intent( struct xfs_trans *tp, struct xfs_defer_pending *dfp) { struct xfs_log_item *lip; xfs_defer_create_done(tp, dfp); lip = dfp->dfp_ops->relog_intent(tp, dfp->dfp_intent, dfp->dfp_done); if (lip) { xfs_trans_add_item(tp, lip); set_bit(XFS_LI_DIRTY, &lip->li_flags); } dfp->dfp_done = NULL; dfp->dfp_intent = lip; } /* * Prevent a log intent item from pinning the tail of the log by logging a * done item to release the intent item; and then log a new intent item. * The caller should provide a fresh transaction and roll it after we're done. */ static void xfs_defer_relog( struct xfs_trans **tpp, struct list_head *dfops) { struct xlog *log = (*tpp)->t_mountp->m_log; struct xfs_defer_pending *dfp; xfs_lsn_t threshold_lsn = NULLCOMMITLSN; ASSERT((*tpp)->t_flags & XFS_TRANS_PERM_LOG_RES); list_for_each_entry(dfp, dfops, dfp_list) { /* * If the log intent item for this deferred op is not a part of * the current log checkpoint, relog the intent item to keep * the log tail moving forward. We're ok with this being racy * because an incorrect decision means we'll be a little slower * at pushing the tail. 
*/ if (dfp->dfp_intent == NULL || xfs_log_item_in_current_chkpt(dfp->dfp_intent)) continue; /* * Figure out where we need the tail to be in order to maintain * the minimum required free space in the log. Only sample * the log threshold once per call. */ if (threshold_lsn == NULLCOMMITLSN) { threshold_lsn = xfs_ail_get_push_target(log->l_ailp); if (threshold_lsn == NULLCOMMITLSN) break; } if (XFS_LSN_CMP(dfp->dfp_intent->li_lsn, threshold_lsn) >= 0) continue; trace_xfs_defer_relog_intent((*tpp)->t_mountp, dfp); XFS_STATS_INC((*tpp)->t_mountp, defer_relog); xfs_defer_relog_intent(*tpp, dfp); } } /* * Log an intent-done item for the first pending intent, and finish the work * items. */ int xfs_defer_finish_one( struct xfs_trans *tp, struct xfs_defer_pending *dfp) { const struct xfs_defer_op_type *ops = dfp->dfp_ops; struct xfs_btree_cur *state = NULL; struct list_head *li, *n; int error; trace_xfs_defer_pending_finish(tp->t_mountp, dfp); xfs_defer_create_done(tp, dfp); list_for_each_safe(li, n, &dfp->dfp_work) { list_del(li); dfp->dfp_count--; trace_xfs_defer_finish_item(tp->t_mountp, dfp, li); error = ops->finish_item(tp, dfp->dfp_done, li, &state); if (error == -EAGAIN) { int ret; /* * Caller wants a fresh transaction; put the work item * back on the list and log a new log intent item to * replace the old one. See "Requesting a Fresh * Transaction while Finishing Deferred Work" above. */ list_add(li, &dfp->dfp_work); dfp->dfp_count++; dfp->dfp_done = NULL; dfp->dfp_intent = NULL; ret = xfs_defer_create_intent(tp, dfp, false); if (ret < 0) error = ret; } if (error) goto out; } /* Done with the dfp, free it. */ list_del(&dfp->dfp_list); kmem_cache_free(xfs_defer_pending_cache, dfp); out: if (ops->finish_cleanup) ops->finish_cleanup(tp, state, error); return error; } /* Move all paused deferred work from @tp to @paused_list. 
*/
static void
xfs_defer_isolate_paused(
	struct xfs_trans		*tp,
	struct list_head		*paused_list)
{
	struct xfs_defer_pending	*dfp;
	struct xfs_defer_pending	*pli;

	list_for_each_entry_safe(dfp, pli, &tp->t_dfops, dfp_list) {
		if (!(dfp->dfp_flags & XFS_DEFER_PAUSED))
			continue;

		list_move_tail(&dfp->dfp_list, paused_list);
		trace_xfs_defer_isolate_paused(tp->t_mountp, dfp);
	}
}

/*
 * Finish all the pending work.  This involves logging intent items for
 * any work items that wandered in since the last transaction roll (if
 * one has even happened), rolling the transaction, and finishing the
 * work items in the first item on the logged-and-pending list.
 *
 * Paused items are set aside while the loop runs and are requeued on the
 * outgoing transaction on success.  On failure, all outstanding intents are
 * aborted, the filesystem is shut down, the remaining work is cancelled, and
 * the error is returned.
 */
int
xfs_defer_finish_noroll(
	struct xfs_trans		**tp)
{
	/* Non-NULL once a pending item has been picked in an earlier pass. */
	struct xfs_defer_pending	*dfp = NULL;
	int				error = 0;
	LIST_HEAD(dop_pending);
	LIST_HEAD(dop_paused);

	ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);

	trace_xfs_defer_finish(*tp, _RET_IP_);

	/* Until we run out of pending work to finish... */
	while (!list_empty(&dop_pending) || !list_empty(&(*tp)->t_dfops)) {
		/*
		 * Deferred items that are created in the process of finishing
		 * other deferred work items should be queued at the head of
		 * the pending list, which puts them ahead of the deferred work
		 * that was created by the caller.  This keeps the number of
		 * pending work items to a minimum, which decreases the amount
		 * of time that any one intent item can stick around in memory,
		 * pinning the log tail.
		 */
		int has_intents = xfs_defer_create_intents(*tp);

		xfs_defer_isolate_paused(*tp, &dop_paused);

		list_splice_init(&(*tp)->t_dfops, &dop_pending);

		/*
		 * The error check comes after the splice so that the shutdown
		 * path finds every item on dop_pending.
		 */
		if (has_intents < 0) {
			error = has_intents;
			goto out_shutdown;
		}
		if (has_intents || dfp) {
			error = xfs_defer_trans_roll(tp);
			if (error)
				goto out_shutdown;

			/* Relog intent items to keep the log moving. */
			xfs_defer_relog(tp, &dop_pending);
			xfs_defer_relog(tp, &dop_paused);

			/* Relogging dirtied the transaction; roll it again. */
			if ((*tp)->t_flags & XFS_TRANS_DIRTY) {
				error = xfs_defer_trans_roll(tp);
				if (error)
					goto out_shutdown;
			}
		}

		dfp = list_first_entry_or_null(&dop_pending,
				struct xfs_defer_pending, dfp_list);
		if (!dfp)
			break;
		/* -EAGAIN is not fatal: the item stays queued for another pass. */
		error = xfs_defer_finish_one(*tp, dfp);
		if (error && error != -EAGAIN)
			goto out_shutdown;
	}

	/* Requeue the paused items in the outgoing transaction. */
	list_splice_tail_init(&dop_paused, &(*tp)->t_dfops);

	trace_xfs_defer_finish_done(*tp, _RET_IP_);
	return 0;

out_shutdown:
	/* Paused items are torn down together with the pending ones. */
	list_splice_tail_init(&dop_paused, &dop_pending);
	xfs_defer_trans_abort(*tp, &dop_pending);
	xfs_force_shutdown((*tp)->t_mountp, SHUTDOWN_CORRUPT_INCORE);
	trace_xfs_defer_finish_error(*tp, error);
	xfs_defer_cancel_list((*tp)->t_mountp, &dop_pending);
	xfs_defer_cancel(*tp);
	return error;
}

/*
 * Finish all pending deferred work and hand back a clean transaction with
 * XFS_TRANS_LOWMODE cleared.  Returns 0 or a negative errno.
 */
int
xfs_defer_finish(
	struct xfs_trans	**tp)
{
#ifdef DEBUG
	struct xfs_defer_pending *dfp;
#endif
	int			error;

	/*
	 * Finish and roll the transaction once more to avoid returning to the
	 * caller with a dirty transaction.
	 */
	error = xfs_defer_finish_noroll(tp);
	if (error)
		return error;
	if ((*tp)->t_flags & XFS_TRANS_DIRTY) {
		error = xfs_defer_trans_roll(tp);
		if (error) {
			xfs_force_shutdown((*tp)->t_mountp,
					   SHUTDOWN_CORRUPT_INCORE);
			return error;
		}
	}

	/* Reset LOWMODE now that we've finished all the dfops. */
#ifdef DEBUG
	/* Anything still queued at this point must be a paused item. */
	list_for_each_entry(dfp, &(*tp)->t_dfops, dfp_list)
		ASSERT(dfp->dfp_flags & XFS_DEFER_PAUSED);
#endif
	(*tp)->t_flags &= ~XFS_TRANS_LOWMODE;
	return 0;
}

/*
 * Abort the intents of, and then cancel, all deferred work still attached
 * to @tp.
 */
void
xfs_defer_cancel(
	struct xfs_trans	*tp)
{
	struct xfs_mount	*mp = tp->t_mountp;

	trace_xfs_defer_cancel(tp, _RET_IP_);
	xfs_defer_trans_abort(tp, &tp->t_dfops);
	xfs_defer_cancel_list(mp, &tp->t_dfops);
}

/*
 * Return the last pending work item attached to this transaction if it matches
 * the deferred op type.
*/ static inline struct xfs_defer_pending * xfs_defer_find_last( struct xfs_trans *tp, const struct xfs_defer_op_type *ops) { struct xfs_defer_pending *dfp = NULL; /* No dfops at all? */ if (list_empty(&tp->t_dfops)) return NULL; dfp = list_last_entry(&tp->t_dfops, struct xfs_defer_pending, dfp_list); /* Wrong type? */ if (dfp->dfp_ops != ops) return NULL; return dfp; } /* * Decide if we can add a deferred work item to the last dfops item attached * to the transaction. */ static inline bool xfs_defer_can_append( struct xfs_defer_pending *dfp, const struct xfs_defer_op_type *ops) { /* Already logged? */ if (dfp->dfp_intent) return false; /* Paused items cannot absorb more work */ if (dfp->dfp_flags & XFS_DEFER_PAUSED) return NULL; /* Already full? */ if (ops->max_items && dfp->dfp_count >= ops->max_items) return false; return true; } /* Create a new pending item at the end of the transaction list. */ static inline struct xfs_defer_pending * xfs_defer_alloc( struct list_head *dfops, const struct xfs_defer_op_type *ops) { struct xfs_defer_pending *dfp; dfp = kmem_cache_zalloc(xfs_defer_pending_cache, GFP_KERNEL | __GFP_NOFAIL); dfp->dfp_ops = ops; INIT_LIST_HEAD(&dfp->dfp_work); list_add_tail(&dfp->dfp_list, dfops); return dfp; } /* Add an item for later deferred processing. */ struct xfs_defer_pending * xfs_defer_add( struct xfs_trans *tp, struct list_head *li, const struct xfs_defer_op_type *ops) { struct xfs_defer_pending *dfp = NULL; ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); if (!ops->finish_item) { ASSERT(ops->finish_item != NULL); xfs_force_shutdown(tp->t_mountp, SHUTDOWN_CORRUPT_INCORE); return NULL; } dfp = xfs_defer_find_last(tp, ops); if (!dfp || !xfs_defer_can_append(dfp, ops)) dfp = xfs_defer_alloc(&tp->t_dfops, ops); xfs_defer_add_item(dfp, li); trace_xfs_defer_add_item(tp->t_mountp, dfp, li); return dfp; } /* * Add a defer ops barrier to force two otherwise adjacent deferred work items * to be tracked separately and have separate log items. 
*/ void xfs_defer_add_barrier( struct xfs_trans *tp) { struct xfs_defer_pending *dfp; ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); /* If the last defer op added was a barrier, we're done. */ dfp = xfs_defer_find_last(tp, &xfs_barrier_defer_type); if (dfp) return; xfs_defer_alloc(&tp->t_dfops, &xfs_barrier_defer_type); trace_xfs_defer_add_item(tp->t_mountp, dfp, NULL); } /* * Create a pending deferred work item to replay the recovered intent item * and add it to the list. */ void xfs_defer_start_recovery( struct xfs_log_item *lip, struct list_head *r_dfops, const struct xfs_defer_op_type *ops) { struct xfs_defer_pending *dfp = xfs_defer_alloc(r_dfops, ops); dfp->dfp_intent = lip; } /* * Cancel a deferred work item created to recover a log intent item. @dfp * will be freed after this function returns. */ void xfs_defer_cancel_recovery( struct xfs_mount *mp, struct xfs_defer_pending *dfp) { xfs_defer_pending_abort(mp, dfp); xfs_defer_pending_cancel_work(mp, dfp); } /* Replay the deferred work item created from a recovered log intent item. */ int xfs_defer_finish_recovery( struct xfs_mount *mp, struct xfs_defer_pending *dfp, struct list_head *capture_list) { const struct xfs_defer_op_type *ops = dfp->dfp_ops; int error; /* dfp is freed by recover_work and must not be accessed afterwards */ error = ops->recover_work(dfp, capture_list); if (error) trace_xlog_intent_recovery_failed(mp, ops, error); return error; } /* * Move deferred ops from one transaction to another and reset the source to * initial state. This is primarily used to carry state forward across * transaction rolls with pending dfops. */ void xfs_defer_move( struct xfs_trans *dtp, struct xfs_trans *stp) { list_splice_init(&stp->t_dfops, &dtp->t_dfops); /* * Low free space mode was historically controlled by a dfops field. * This meant that low mode state potentially carried across multiple * transaction rolls. Transfer low mode on a dfops move to preserve * that behavior. 
*/ dtp->t_flags |= (stp->t_flags & XFS_TRANS_LOWMODE); stp->t_flags &= ~XFS_TRANS_LOWMODE; } /* * Prepare a chain of fresh deferred ops work items to be completed later. Log * recovery requires the ability to put off until later the actual finishing * work so that it can process unfinished items recovered from the log in * correct order. * * Create and log intent items for all the work that we're capturing so that we * can be assured that the items will get replayed if the system goes down * before log recovery gets a chance to finish the work it put off. The entire * deferred ops state is transferred to the capture structure and the * transaction is then ready for the caller to commit it. If there are no * intent items to capture, this function returns NULL. * * If capture_ip is not NULL, the capture structure will obtain an extra * reference to the inode. */ static struct xfs_defer_capture * xfs_defer_ops_capture( struct xfs_trans *tp) { struct xfs_defer_capture *dfc; unsigned short i; int error; if (list_empty(&tp->t_dfops)) return NULL; error = xfs_defer_create_intents(tp); if (error < 0) return ERR_PTR(error); /* Create an object to capture the defer ops. */ dfc = kzalloc(sizeof(*dfc), GFP_KERNEL | __GFP_NOFAIL); INIT_LIST_HEAD(&dfc->dfc_list); INIT_LIST_HEAD(&dfc->dfc_dfops); /* Move the dfops chain and transaction state to the capture struct. */ list_splice_init(&tp->t_dfops, &dfc->dfc_dfops); dfc->dfc_tpflags = tp->t_flags & XFS_TRANS_LOWMODE; tp->t_flags &= ~XFS_TRANS_LOWMODE; /* Capture the remaining block reservations along with the dfops. */ dfc->dfc_blkres = tp->t_blk_res - tp->t_blk_res_used; dfc->dfc_rtxres = tp->t_rtx_res - tp->t_rtx_res_used; /* Preserve the log reservation size. */ dfc->dfc_logres = tp->t_log_res; error = xfs_defer_save_resources(&dfc->dfc_held, tp); if (error) { /* * Resource capture should never fail, but if it does, we * still have to shut down the log and release things * properly. 
*/ xfs_force_shutdown(tp->t_mountp, SHUTDOWN_CORRUPT_INCORE); } /* * Grab extra references to the inodes and buffers because callers are * expected to release their held references after we commit the * transaction. */ for (i = 0; i < dfc->dfc_held.dr_inos; i++) { xfs_assert_ilocked(dfc->dfc_held.dr_ip[i], XFS_ILOCK_EXCL); ihold(VFS_I(dfc->dfc_held.dr_ip[i])); } for (i = 0; i < dfc->dfc_held.dr_bufs; i++) xfs_buf_hold(dfc->dfc_held.dr_bp[i]); return dfc; } /* Release all resources that we used to capture deferred ops. */ void xfs_defer_ops_capture_abort( struct xfs_mount *mp, struct xfs_defer_capture *dfc) { unsigned short i; xfs_defer_pending_abort_list(mp, &dfc->dfc_dfops); xfs_defer_cancel_list(mp, &dfc->dfc_dfops); for (i = 0; i < dfc->dfc_held.dr_bufs; i++) xfs_buf_relse(dfc->dfc_held.dr_bp[i]); for (i = 0; i < dfc->dfc_held.dr_inos; i++) xfs_irele(dfc->dfc_held.dr_ip[i]); kfree(dfc); } /* * Capture any deferred ops and commit the transaction. This is the last step * needed to finish a log intent item that we recovered from the log. If any * of the deferred ops operate on an inode, the caller must pass in that inode * so that the reference can be transferred to the capture structure. The * caller must hold ILOCK_EXCL on the inode, and must unlock it before calling * xfs_defer_ops_continue. */ int xfs_defer_ops_capture_and_commit( struct xfs_trans *tp, struct list_head *capture_list) { struct xfs_mount *mp = tp->t_mountp; struct xfs_defer_capture *dfc; int error; /* If we don't capture anything, commit transaction and exit. */ dfc = xfs_defer_ops_capture(tp); if (IS_ERR(dfc)) { xfs_trans_cancel(tp); return PTR_ERR(dfc); } if (!dfc) return xfs_trans_commit(tp); /* Commit the transaction and add the capture structure to the list. 
*/ error = xfs_trans_commit(tp); if (error) { xfs_defer_ops_capture_abort(mp, dfc); return error; } list_add_tail(&dfc->dfc_list, capture_list); return 0; } /* * Attach a chain of captured deferred ops to a new transaction and free the * capture structure. If an inode was captured, it will be passed back to the * caller with ILOCK_EXCL held and joined to the transaction with lockflags==0. * The caller now owns the inode reference. */ void xfs_defer_ops_continue( struct xfs_defer_capture *dfc, struct xfs_trans *tp, struct xfs_defer_resources *dres) { unsigned int i; ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); ASSERT(!(tp->t_flags & XFS_TRANS_DIRTY)); /* Lock the captured resources to the new transaction. */ if (dfc->dfc_held.dr_inos > 2) { xfs_sort_inodes(dfc->dfc_held.dr_ip, dfc->dfc_held.dr_inos); xfs_lock_inodes(dfc->dfc_held.dr_ip, dfc->dfc_held.dr_inos, XFS_ILOCK_EXCL); } else if (dfc->dfc_held.dr_inos == 2) xfs_lock_two_inodes(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL, dfc->dfc_held.dr_ip[1], XFS_ILOCK_EXCL); else if (dfc->dfc_held.dr_inos == 1) xfs_ilock(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL); for (i = 0; i < dfc->dfc_held.dr_bufs; i++) xfs_buf_lock(dfc->dfc_held.dr_bp[i]); /* Join the captured resources to the new transaction. */ xfs_defer_restore_resources(tp, &dfc->dfc_held); memcpy(dres, &dfc->dfc_held, sizeof(struct xfs_defer_resources)); dres->dr_bufs = 0; /* Move captured dfops chain and state to the transaction. */ list_splice_init(&dfc->dfc_dfops, &tp->t_dfops); tp->t_flags |= dfc->dfc_tpflags; kfree(dfc); } /* Release the resources captured and continued during recovery. 
*/ void xfs_defer_resources_rele( struct xfs_defer_resources *dres) { unsigned short i; for (i = 0; i < dres->dr_inos; i++) { xfs_iunlock(dres->dr_ip[i], XFS_ILOCK_EXCL); xfs_irele(dres->dr_ip[i]); dres->dr_ip[i] = NULL; } for (i = 0; i < dres->dr_bufs; i++) { xfs_buf_relse(dres->dr_bp[i]); dres->dr_bp[i] = NULL; } dres->dr_inos = 0; dres->dr_bufs = 0; dres->dr_ordered = 0; } static inline int __init xfs_defer_init_cache(void) { xfs_defer_pending_cache = kmem_cache_create("xfs_defer_pending", sizeof(struct xfs_defer_pending), 0, 0, NULL); return xfs_defer_pending_cache != NULL ? 0 : -ENOMEM; } static inline void xfs_defer_destroy_cache(void) { kmem_cache_destroy(xfs_defer_pending_cache); xfs_defer_pending_cache = NULL; } /* Set up caches for deferred work items. */ int __init xfs_defer_init_item_caches(void) { int error; error = xfs_defer_init_cache(); if (error) return error; error = xfs_rmap_intent_init_cache(); if (error) goto err; error = xfs_refcount_intent_init_cache(); if (error) goto err; error = xfs_bmap_intent_init_cache(); if (error) goto err; error = xfs_extfree_intent_init_cache(); if (error) goto err; error = xfs_attr_intent_init_cache(); if (error) goto err; error = xfs_exchmaps_intent_init_cache(); if (error) goto err; return 0; err: xfs_defer_destroy_item_caches(); return error; } /* Destroy all the deferred work item caches, if they've been allocated. */ void xfs_defer_destroy_item_caches(void) { xfs_exchmaps_intent_destroy_cache(); xfs_attr_intent_destroy_cache(); xfs_extfree_intent_destroy_cache(); xfs_bmap_intent_destroy_cache(); xfs_refcount_intent_destroy_cache(); xfs_rmap_intent_destroy_cache(); xfs_defer_destroy_cache(); } /* * Mark a deferred work item so that it will be requeued indefinitely without * being finished. Caller must ensure there are no data dependencies on this * work item in the meantime. 
*/ void xfs_defer_item_pause( struct xfs_trans *tp, struct xfs_defer_pending *dfp) { ASSERT(!(dfp->dfp_flags & XFS_DEFER_PAUSED)); dfp->dfp_flags |= XFS_DEFER_PAUSED; trace_xfs_defer_item_pause(tp->t_mountp, dfp); } /* * Release a paused deferred work item so that it will be finished during the * next transaction roll. */ void xfs_defer_item_unpause( struct xfs_trans *tp, struct xfs_defer_pending *dfp) { ASSERT(dfp->dfp_flags & XFS_DEFER_PAUSED); dfp->dfp_flags &= ~XFS_DEFER_PAUSED; trace_xfs_defer_item_unpause(tp->t_mountp, dfp); }
29 359 312 313 314 95 96 21 137 2 136 9 9 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2011 Red Hat, Inc.  All Rights Reserved.
 */

#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_error.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"

/*
 * XFS logging functions
 */

/*
 * Emit one message at @level, prefixed with "XFS (<s_id>): " when @mp has a
 * superblock attached, or plain "XFS: " otherwise.
 */
static void
__xfs_printk(
	const char		*level,
	const struct xfs_mount	*mp,
	struct va_format	*vaf)
{
	if (mp && mp->m_super) {
		printk("%sXFS (%s): %pV\n", level, mp->m_super->s_id, vaf);
		return;
	}
	printk("%sXFS: %pV\n", level, vaf);
}

/*
 * Print a formatted message at @kern_level and, for messages at
 * LOGLEVEL_ERR or more severe, dump a stack trace when xfs_error_level is at
 * least XFS_ERRLEVEL_HIGH.
 */
void
xfs_printk_level(
	const char *kern_level,
	const struct xfs_mount *mp,
	const char *fmt, ...)
{
	struct va_format	vaf;
	va_list			args;
	int			level;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;

	__xfs_printk(kern_level, mp, &vaf);

	va_end(args);

	/*
	 * NOTE(review): this parses @kern_level itself as an integer.  If
	 * callers pass the usual KERN_* prefixes, which presumably start with
	 * a non-digit header byte, kstrtoint() would fail and the stack trace
	 * would never be emitted -- confirm against the callers.
	 */
	if (!kstrtoint(kern_level, 0, &level) &&
	    level <= LOGLEVEL_ERR &&
	    xfs_error_level >= XFS_ERRLEVEL_HIGH)
		xfs_stack_trace();
}

/*
 * Print an alert-level message.  If @panic_tag matches a bit set in
 * xfs_panic_mask, an extra notice is logged first and BUG is triggered after
 * the message has been printed.
 */
void
_xfs_alert_tag(
	const struct xfs_mount	*mp,
	uint32_t		panic_tag,
	const char		*fmt, ...)
{
	struct va_format	vaf;
	va_list			args;
	int			do_panic = 0;

	if (xfs_panic_mask && (xfs_panic_mask & panic_tag)) {
		xfs_alert(mp, "Transforming an alert into a BUG.");
		do_panic = 1;
	}

	va_start(args, fmt);

	vaf.fmt = fmt;
	vaf.va = &args;

	__xfs_printk(KERN_ALERT, mp, &vaf);
	va_end(args);

	/* Deferred until here so the alert itself reaches the log first. */
	BUG_ON(do_panic);
}

/* Report a failed assertion as a warning and trigger WARN_ON. */
void
asswarn(
	struct xfs_mount	*mp,
	char			*expr,
	char			*file,
	int			line)
{
	xfs_warn(mp, "Assertion failed: %s, file: %s, line: %d",
		expr, file, line);
	WARN_ON(1);
}

/*
 * Report a failed assertion at emergency level, then either BUG or WARN
 * depending on xfs_globals.bug_on_assert.
 */
void
assfail(
	struct xfs_mount	*mp,
	char			*expr,
	char			*file,
	int			line)
{
	xfs_emerg(mp, "Assertion failed: %s, file: %s, line: %d",
		expr, file, line);
	if (xfs_globals.bug_on_assert)
		BUG();
	else
		WARN_ON(1);
}

/* Hex-dump @length bytes starting at @p to the log at alert level. */
void
xfs_hex_dump(const void *p, int length)
{
	print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_OFFSET, 16, 1, p, length, 1);
}

/*
 * Print an alert-level message about a buffer, subject to the rate limit
 * state kept in the buffer's target.  @rlmsg is passed to the rate limiter.
 */
void
xfs_buf_alert_ratelimited(
	struct xfs_buf		*bp,
	const char		*rlmsg,
	const char		*fmt,
	...)
{
	struct xfs_mount	*mp = bp->b_mount;
	struct va_format	vaf;
	va_list			args;

	/* use the more aggressive per-target rate limit for buffers */
	if (!___ratelimit(&bp->b_target->bt_ioerror_rl, rlmsg))
		return;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;
	__xfs_printk(KERN_ALERT, mp, &vaf);
	va_end(args);
}

/*
 * Warn that the experimental feature @feat is enabled on @mp.  Whether the
 * warning is actually printed is decided by xfs_should_warn() using the
 * feature's opstate value.
 */
void
xfs_warn_experimental(
	struct xfs_mount		*mp,
	enum xfs_experimental_feat	feat)
{
	/* Per-feature display name and opstate value, indexed by @feat. */
	static const struct {
		const char		*name;
		long			opstate;
	} features[] = {
		[XFS_EXPERIMENTAL_PNFS] = {
			.opstate	= XFS_OPSTATE_WARNED_PNFS,
			.name		= "pNFS",
		},
		[XFS_EXPERIMENTAL_SCRUB] = {
			.opstate	= XFS_OPSTATE_WARNED_SCRUB,
			.name		= "online scrub",
		},
		[XFS_EXPERIMENTAL_SHRINK] = {
			.opstate	= XFS_OPSTATE_WARNED_SHRINK,
			.name		= "online shrink",
		},
		[XFS_EXPERIMENTAL_LARP] = {
			.opstate	= XFS_OPSTATE_WARNED_LARP,
			.name		= "logged extended attributes",
		},
		[XFS_EXPERIMENTAL_LBS] = {
			.opstate	= XFS_OPSTATE_WARNED_LBS,
			.name		= "large block size",
		},
		[XFS_EXPERIMENTAL_EXCHRANGE] = {
			.opstate	= XFS_OPSTATE_WARNED_EXCHRANGE,
			.name		= "exchange range",
		},
		[XFS_EXPERIMENTAL_PPTR] = {
			.opstate	= XFS_OPSTATE_WARNED_PPTR,
			.name		= "parent pointer",
		},
		[XFS_EXPERIMENTAL_METADIR] = {
			.opstate	= XFS_OPSTATE_WARNED_METADIR,
			.name		= "metadata directory tree",
		},
	};
	ASSERT(feat >= 0 && feat < XFS_EXPERIMENTAL_MAX);
	/* Build fails if a feature is added without a table entry. */
	BUILD_BUG_ON(ARRAY_SIZE(features) != XFS_EXPERIMENTAL_MAX);

	if (xfs_should_warn(mp, features[feat].opstate))
		xfs_warn(mp,
 "EXPERIMENTAL %s feature enabled. Use at your own risk!",
				features[feat].name);
}
21 13 1 7 17 3 1 22 149 7 23 16 152 16 143 3 183 41 106 1 49 50 50 48 38 140 20 120 7 143 143 141 1 128 1 127 177 177 140 6 61 74 4 126 2 5 1 122 1 127 1 413 24 1 25 141 249 248 247 249 56 9 7 133 133 65 7 191 22 178 1 37 80 60 38 122 18 13 127 126 127 26 7 7 3 4 4 3 1 3 1 1 2 2 3 1 9 9 9 9 9 4 9 4 9 4 5 9 9 9 1 1 7 7 7 7 7 7 105 105 56 13 92 81 4 77 25 281 283 281 274 6 274 277 10 267 266 265 184 9 1 13 1 1 38 3 4 1 9 243 1 10 11 162 11 1 2 6 1 38 38 38 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 
413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 
913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 // SPDX-License-Identifier: GPL-2.0-or-later /* * Internet Control Message Protocol (ICMPv6) * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * * Based on net/ipv4/icmp.c * * RFC 1885 */ /* * Changes: * * Andi Kleen : exception handling * Andi Kleen add rate limits. never reply to a icmp. * add more length checks and other fixes. * yoshfuji : ensure to sent parameter problem for * fragments. 
*	YOSHIFUJI Hideaki @USAGI:	added sysctl for icmp rate limit.
 *	Randy Dunlap and
 *	YOSHIFUJI Hideaki @USAGI:	Per-interface statistics support
 *	Kazunori MIYAZAWA @USAGI:       change output process to use ip6_append_data
 */

#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/module.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/netfilter.h>
#include <linux/slab.h>

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/icmpv6.h>

#include <net/ip.h>
#include <net/sock.h>

#include <net/ipv6.h>
#include <net/ip6_checksum.h>
#include <net/ping.h>
#include <net/protocol.h>
#include <net/raw.h>
#include <net/rawv6.h>
#include <net/seg6.h>
#include <net/transp_v6.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/inet_common.h>
#include <net/dsfield.h>
#include <net/l3mdev.h>

#include <linux/uaccess.h>

/* One control socket per CPU, used as the sender of kernel-generated ICMPv6. */
static DEFINE_PER_CPU(struct sock *, ipv6_icmp_sk);

/*
 * Error handler registered for IPPROTO_ICMPV6 itself (see icmpv6_protocol).
 *
 * Updates the path MTU on Packet Too Big, processes redirects, and, for
 * error-class types whose embedded ICMPv6 header is an echo request, hands
 * the error to ping_err().  Always returns 0.
 */
static int icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
		       u8 type, u8 code, int offset, __be32 info)
{
	/* icmpv6_notify checks 8 bytes can be pulled, icmp6hdr is 8 bytes */
	struct icmp6hdr *icmp6 = (struct icmp6hdr *) (skb->data + offset);
	struct net *net = dev_net(skb->dev);

	if (type == ICMPV6_PKT_TOOBIG)
		ip6_update_pmtu(skb, net, info, skb->dev->ifindex, 0, sock_net_uid(net, NULL));
	else if (type == NDISC_REDIRECT)
		ip6_redirect(skb, net, skb->dev->ifindex, 0,
			     sock_net_uid(net, NULL));

	/* Only error messages (INFOMSG bit clear) are relayed to ping sockets. */
	if (!(type & ICMPV6_INFOMSG_MASK))
		if (icmp6->icmp6_type == ICMPV6_ECHO_REQUEST)
			ping_err(skb, offset, ntohl(info));

	return 0;
}

static int icmpv6_rcv(struct sk_buff *skb);

/* Protocol descriptor installed by icmpv6_init() for IPPROTO_ICMPV6. */
static const struct inet6_protocol icmpv6_protocol = {
	.handler	=	icmpv6_rcv,
	.err_handler	=	icmpv6_err,
	.flags		=	INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
};

/*
 * Grab this CPU's ICMPv6 control socket and bind it to @net.
 *
 * Returns the socket with its slock held, or NULL when the lock is
 * already taken (trylock failure).
 *
 * Called with BH disabled
 */
static struct sock *icmpv6_xmit_lock(struct net *net)
{
	struct sock *sk;

	sk = this_cpu_read(ipv6_icmp_sk);
	if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
		/* This can happen if the output path (f.e. SIT or
		 * ip6ip6 tunnel) signals dst_link_failure() for an
		 * outgoing ICMP6 packet.
		 */
		return NULL;
	}
	sock_net_set(sk, net);
	return sk;
}

/* Undo icmpv6_xmit_lock(): point the socket back at init_net and unlock. */
static void icmpv6_xmit_unlock(struct sock *sk)
{
	sock_net_set(sk, &init_net);
	spin_unlock(&sk->sk_lock.slock);
}

/*
 * Figure out, may we reply to this packet with icmp error.
 *
 * We do not reply, if:
 *	- it was icmp error message.
 *	- it is truncated, so that it is known, that protocol is ICMPV6
 *	  (i.e. in the middle of some exthdr)
 *
 *	--ANK (980726)
 *
 * Returns true when no error may be generated for @skb.
 */

static bool is_ineligible(const struct sk_buff *skb)
{
	/* Offset of the first byte after the fixed IPv6 header. */
	int ptr = (u8 *)(ipv6_hdr(skb) + 1) - skb->data;
	int len = skb->len - ptr;
	__u8 nexthdr = ipv6_hdr(skb)->nexthdr;
	__be16 frag_off;

	if (len < 0)
		return true;

	ptr = ipv6_skip_exthdr(skb, ptr, &nexthdr, &frag_off);
	if (ptr < 0)
		return false;
	if (nexthdr == IPPROTO_ICMPV6) {
		u8 _type, *tp;
		tp = skb_header_pointer(skb,
			ptr+offsetof(struct icmp6hdr, icmp6_type),
			sizeof(_type), &_type);

		/* Based on RFC 8200, Section 4.5 Fragment Header, return
		 * false if this is a fragment packet with no icmp header info.
		 */
		if (!tp && frag_off != 0)
			return false;
		else if (!tp || !(*tp & ICMPV6_INFOMSG_MASK))
			return true;
	}
	return false;
}

/*
 * Returns true when @type is exempt from rate limiting: either it is
 * above ICMPV6_MSG_MAX or its bit is clear in the per-netns ratemask.
 */
static bool icmpv6_mask_allow(struct net *net, int type)
{
	if (type > ICMPV6_MSG_MAX)
		return true;

	/* Limit if icmp type is set in ratemask. */
	if (!test_bit(type, net->ipv6.sysctl.icmpv6_ratemask))
		return true;

	return false;
}

/*
 * Global rate-limit gate.
 *
 * Returns true if the message may be sent.  *@apply_ratelimit is set to
 * true only when the type is subject to limiting and icmp_global_allow()
 * granted it; callers pass that flag on to icmpv6_xrlim_allow().  On
 * refusal the ICMP_MIB_RATELIMITGLOBAL counter is bumped.
 */
static bool icmpv6_global_allow(struct net *net, int type,
				bool *apply_ratelimit)
{
	if (icmpv6_mask_allow(net, type))
		return true;

	if (icmp_global_allow(net)) {
		*apply_ratelimit = true;
		return true;
	}
	__ICMP_INC_STATS(net, ICMP_MIB_RATELIMITGLOBAL);
	return false;
}

/*
 * Check the ICMP output rate limit
 *
 * Returns true immediately when @apply_ratelimit is false.  Otherwise the
 * output route for @fl6 is looked up: a route error denies, a loopback
 * device allows, and anything else is decided by the per-peer limiter.
 * A grant consumes a global token via icmp_global_consume(); a denial
 * increments ICMP6_MIB_RATELIMITHOST.
 */
static bool icmpv6_xrlim_allow(struct sock *sk, u8 type,
			       struct flowi6 *fl6, bool apply_ratelimit)
{
	struct net *net = sock_net(sk);
	struct dst_entry *dst;
	bool res = false;

	if (!apply_ratelimit)
		return true;

	/*
	 * Look up the output route.
	 * XXX: perhaps the expire for routing entries cloned by
	 * this lookup should be more aggressive (not longer than timeout).
	 */
	dst = ip6_route_output(net, sk, fl6);
	if (dst->error) {
		IP6_INC_STATS(net, ip6_dst_idev(dst),
			      IPSTATS_MIB_OUTNOROUTES);
	} else if (dst->dev && (dst->dev->flags&IFF_LOOPBACK)) {
		res = true;
	} else {
		struct rt6_info *rt = dst_rt6_info(dst);
		int tmo = net->ipv6.sysctl.icmpv6_time;
		struct inet_peer *peer;

		/* Give more bandwidth to wider prefixes. */
		if (rt->rt6i_dst.plen < 128)
			tmo >>= ((128 - rt->rt6i_dst.plen)>>5);

		rcu_read_lock();
		peer = inet_getpeer_v6(net->ipv6.peers, &fl6->daddr);
		res = inet_peer_xrlim_allow(peer, tmo);
		rcu_read_unlock();
	}
	if (!res)
		__ICMP6_INC_STATS(net, ip6_dst_idev(dst),
				  ICMP6_MIB_RATELIMITHOST);
	else
		icmp_global_consume(net);
	dst_release(dst);
	return res;
}

/*
 * Returns true when the output route for @fl6 resolves without error and
 * carries a preferred source address other than the unspecified address.
 * @type is not used by the body.
 */
static bool icmpv6_rt_has_prefsrc(struct sock *sk, u8 type,
				  struct flowi6 *fl6)
{
	struct net *net = sock_net(sk);
	struct dst_entry *dst;
	bool res = false;

	dst = ip6_route_output(net, sk, fl6);
	if (!dst->error) {
		struct rt6_info *rt = dst_rt6_info(dst);
		struct in6_addr prefsrc;

		rt6_get_prefsrc(rt, &prefsrc);
		res = !ipv6_addr_any(&prefsrc);
	}
	dst_release(dst);
	return res;
}

/*
 *	an inline helper for the "simple" if statement below
 *	checks if parameter problem report is caused by an
 *	unrecognized IPv6 option that has the Option Type
 *	highest-order two bits set to 10
 *
 *	@offset is relative to the network header.  Returns true as well
 *	when the option byte cannot be read from the skb.
 */

static bool opt_unrec(struct sk_buff *skb, __u32 offset)
{
	u8 _optval, *op;

	offset += skb_network_offset(skb);
	op = skb_header_pointer(skb, offset, sizeof(_optval), &_optval);
	if (!op)
		return true;
	return (*op & 0xC0) == 0x80;
}

/*
 * Finalise the ICMPv6 header of the message queued on @sk and transmit it.
 *
 * Copies @thdr into the first queued skb, computes the checksum over the
 * header plus the checksums accumulated in every queued skb (pseudo-header
 * from @fl6, total length @len), then pushes the pending frames.  Does
 * nothing if the write queue is empty.
 */
void icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6,
				struct icmp6hdr *thdr, int len)
{
	struct sk_buff *skb;
	struct icmp6hdr *icmp6h;

	skb = skb_peek(&sk->sk_write_queue);
	if (!skb)
		return;

	icmp6h = icmp6_hdr(skb);
	memcpy(icmp6h, thdr, sizeof(struct icmp6hdr));
	icmp6h->icmp6_cksum = 0;

	if (skb_queue_len(&sk->sk_write_queue) == 1) {
		/* Single skb: fold the header into that skb's own csum. */
		skb->csum = csum_partial(icmp6h,
					sizeof(struct icmp6hdr), skb->csum);
		icmp6h->icmp6_cksum = csum_ipv6_magic(&fl6->saddr,
						      &fl6->daddr,
						      len, fl6->flowi6_proto,
						      skb->csum);
	} else {
		__wsum tmp_csum = 0;

		/* Several skbs: sum their partial checksums first. */
		skb_queue_walk(&sk->sk_write_queue, skb) {
			tmp_csum = csum_add(tmp_csum, skb->csum);
		}

		tmp_csum = csum_partial(icmp6h,
					sizeof(struct icmp6hdr), tmp_csum);
		icmp6h->icmp6_cksum = csum_ipv6_magic(&fl6->saddr,
						      &fl6->daddr,
						      len, fl6->flowi6_proto,
						      tmp_csum);
	}
	ip6_push_pending_frames(sk);
}

/* Cursor handed to icmpv6_getfrag() through ip6_append_data(). */
struct icmpv6_msg {
	struct sk_buff	*skb;		/* packet whose bytes are copied */
	int		offset;		/* start offset within that packet */
	uint8_t		type;		/* ICMPv6 type being generated */
};

/*
 * getfrag callback: copy @len bytes of the original packet (starting at
 * msg->offset + @offset) into @to and accumulate the checksum in @skb.
 * For error-class types the original skb's conntrack is attached to the
 * new skb.  Always returns 0.
 */
static int icmpv6_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
{
	struct icmpv6_msg *msg = (struct icmpv6_msg *) from;
	struct sk_buff *org_skb = msg->skb;
	__wsum csum;

	csum = skb_copy_and_csum_bits(org_skb, msg->offset + offset,
				      to, len);
	skb->csum = csum_block_add(skb->csum, csum, odd);
	if (!(msg->type & ICMPV6_INFOMSG_MASK))
		nf_ct_attach(skb, org_skb);
	return 0;
}

#if IS_ENABLED(CONFIG_IPV6_MIP6)
/*
 * If @opt records a Home Address destination option, swap the IPv6 source
 * address of @skb with the address carried in that option.
 */
static void mip6_addr_swap(struct sk_buff *skb, const struct inet6_skb_parm *opt)
{
	struct ipv6hdr *iph = ipv6_hdr(skb);
	struct ipv6_destopt_hao *hao;
	int off;

	if (opt->dsthao) {
		off = ipv6_find_tlv(skb, opt->dsthao, IPV6_TLV_HAO);
		if (likely(off >= 0)) {
			hao = (struct ipv6_destopt_hao *)
					(skb_network_header(skb) + off);
			swap(iph->saddr, hao->addr);
		}
	}
}
#else
static inline void mip6_addr_swap(struct sk_buff *skb, const struct inet6_skb_parm *opt) {}
#endif

/*
 * Resolve the route (including xfrm policy) used to send an ICMPv6 error.
 *
 * Returns a dst entry on success or an ERR_PTR().  When the forward xfrm
 * lookup yields no transformed route (same dst returned, or -EPERM), a
 * second lookup is attempted on the flow decoded in reverse from the
 * offending packet with XFRM_LOOKUP_ICMP.
 */
static struct dst_entry *icmpv6_route_lookup(struct net *net,
					     struct sk_buff *skb,
					     struct sock *sk,
					     struct flowi6 *fl6)
{
	struct dst_entry *dst, *dst2;
	struct flowi6 fl2;
	int err;

	err = ip6_dst_lookup(net, sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);

	/*
	 * We won't send icmp if the destination is known
	 * anycast unless we need to treat anycast as unicast.
	 */
	if (!READ_ONCE(net->ipv6.sysctl.icmpv6_error_anycast_as_unicast) &&
	    ipv6_anycast_destination(dst, &fl6->daddr)) {
		net_dbg_ratelimited("icmp6_send: acast source\n");
		dst_release(dst);
		return ERR_PTR(-EINVAL);
	}

	/* No need to clone since we're just using its address. */
	dst2 = dst;

	dst = xfrm_lookup(net, dst, flowi6_to_flowi(fl6), sk, 0);
	if (!IS_ERR(dst)) {
		if (dst != dst2)
			return dst;
	} else {
		/* -EPERM falls through to the reverse lookup with dst == NULL. */
		if (PTR_ERR(dst) == -EPERM)
			dst = NULL;
		else
			return dst;
	}

	err = xfrm_decode_session_reverse(net, skb, flowi6_to_flowi(&fl2), AF_INET6);
	if (err)
		goto relookup_failed;

	err = ip6_dst_lookup(net, sk, &dst2, &fl2);
	if (err)
		goto relookup_failed;

	dst2 = xfrm_lookup(net, dst2, flowi6_to_flowi(&fl2), sk, XFRM_LOOKUP_ICMP);
	if (!IS_ERR(dst2)) {
		dst_release(dst);
		dst = dst2;
	} else {
		err = PTR_ERR(dst2);
		if (err == -EPERM) {
			dst_release(dst);
			return dst2;
		} else
			goto relookup_failed;
	}

relookup_failed:
	/* Fall back to the forward-lookup result if there is one. */
	if (dst)
		return dst;
	return ERR_PTR(err);
}

/*
 * Pick the device an incoming packet should be attributed to.  Returns
 * skb->dev unless that is the loopback device or an L3 master, in which
 * case the device of the attached route's inet6_dev is used when present.
 */
static struct net_device *icmp6_dev(const struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;

	/* for local traffic to local address, skb dev is the loopback
	 * device. Check if there is a dst attached to the skb and if so
	 * get the real device index. Same is needed for replies to a link
	 * local address on a device enslaved to an L3 master device
	 */
	if (unlikely(dev->ifindex == LOOPBACK_IFINDEX || netif_is_l3_master(skb->dev))) {
		const struct rt6_info *rt6 = skb_rt6_info(skb);

		/* The destination could be an external IP in Ext Hdr (SRv6, RPL, etc.),
		 * and ip6_null_entry could be set to skb if no route is found.
		 */
		if (rt6 && rt6->rt6i_idev)
			dev = rt6->rt6i_idev->dev;
	}

	return dev;
}

/* Interface index of the device chosen by icmp6_dev(). */
static int icmp6_iif(const struct sk_buff *skb)
{
	return icmp6_dev(skb)->ifindex;
}

/*
 *	Send an ICMP message in response to a packet in error
 *
 *	@skb:		offending packet
 *	@type, @code:	ICMPv6 type/code to emit
 *	@info:		host-order value stored in the header's pointer field
 *	@force_saddr:	if non-NULL, overrides the chosen source address
 *	@parm:		control block of the offending packet (iif, dsthao)
 *
 *	Returns nothing; every failure path silently drops the reply.
 */
void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
		const struct in6_addr *force_saddr,
		const struct inet6_skb_parm *parm)
{
	struct inet6_dev *idev = NULL;
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct sock *sk;
	struct net *net;
	struct ipv6_pinfo *np;
	const struct in6_addr *saddr = NULL;
	bool apply_ratelimit = false;
	struct dst_entry *dst;
	struct icmp6hdr tmp_hdr;
	struct flowi6 fl6;
	struct icmpv6_msg msg;
	struct ipcm6_cookie ipc6;
	int iif = 0;
	int addr_type = 0;
	int len;
	u32 mark;

	/* The whole IPv6 header must lie inside the skb's linear data. */
	if ((u8 *)hdr < skb->head ||
	    (skb_network_header(skb) + sizeof(*hdr)) > skb_tail_pointer(skb))
		return;

	if (!skb->dev)
		return;
	net = dev_net(skb->dev);
	mark = IP6_REPLY_MARK(net, skb->mark);
	/*
	 *	Make sure we respect the rules
	 *	i.e. RFC 1885 2.4(e)
	 *	Rule (e.1) is enforced by not using icmp6_send
	 *	in any code that processes icmp errors.
	 */
	addr_type = ipv6_addr_type(&hdr->daddr);

	if (ipv6_chk_addr(net, &hdr->daddr, skb->dev, 0) ||
	    ipv6_chk_acast_addr_src(net, skb->dev, &hdr->daddr))
		saddr = &hdr->daddr;

	/*
	 *	Dest addr check
	 */

	if (addr_type & IPV6_ADDR_MULTICAST || skb->pkt_type != PACKET_HOST) {
		if (type != ICMPV6_PKT_TOOBIG &&
		    !(type == ICMPV6_PARAMPROB &&
		      code == ICMPV6_UNK_OPTION &&
		      (opt_unrec(skb, info))))
			return;

		saddr = NULL;
	}

	addr_type = ipv6_addr_type(&hdr->saddr);

	/*
	 *	Source addr check
	 */

	if (__ipv6_addr_needs_scope_id(addr_type)) {
		iif = icmp6_iif(skb);
	} else {
		/*
		 * The source device is used for looking up which routing table
		 * to use for sending an ICMP error.
		 */
		iif = l3mdev_master_ifindex(skb->dev);
	}

	/*
	 *	Must not send error if the source does not uniquely
	 *	identify a single node (RFC2463 Section 2.4).
	 *	We check unspecified / multicast addresses here,
	 *	and anycast addresses will be checked later.
	 */
	if ((addr_type == IPV6_ADDR_ANY) || (addr_type & IPV6_ADDR_MULTICAST)) {
		net_dbg_ratelimited("icmp6_send: addr_any/mcast source [%pI6c > %pI6c]\n",
				    &hdr->saddr, &hdr->daddr);
		return;
	}

	/*
	 *	Never answer to an ICMP packet.
	 */
	if (is_ineligible(skb)) {
		net_dbg_ratelimited("icmp6_send: no reply to icmp error [%pI6c > %pI6c]\n",
				    &hdr->saddr, &hdr->daddr);
		return;
	}

	/* Needed by both icmpv6_global_allow and icmpv6_xmit_lock */
	local_bh_disable();

	/* Check global sysctl_icmp_msgs_per_sec ratelimit */
	if (!(skb->dev->flags & IFF_LOOPBACK) &&
	    !icmpv6_global_allow(net, type, &apply_ratelimit))
		goto out_bh_enable;

	mip6_addr_swap(skb, parm);

	sk = icmpv6_xmit_lock(net);
	if (!sk)
		goto out_bh_enable;

	/* The reply goes back to the source of the offending packet. */
	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_proto = IPPROTO_ICMPV6;
	fl6.daddr = hdr->saddr;
	if (force_saddr)
		saddr = force_saddr;
	if (saddr) {
		fl6.saddr = *saddr;
	} else if (!icmpv6_rt_has_prefsrc(sk, type, &fl6)) {
		/* select a more meaningful saddr from input if */
		struct net_device *in_netdev;

		in_netdev = dev_get_by_index(net, parm->iif);
		if (in_netdev) {
			ipv6_dev_get_saddr(net, in_netdev, &fl6.daddr,
					   inet6_sk(sk)->srcprefs,
					   &fl6.saddr);
			dev_put(in_netdev);
		}
	}
	fl6.flowi6_mark = mark;
	fl6.flowi6_oif = iif;
	fl6.fl6_icmp_type = type;
	fl6.fl6_icmp_code = code;
	fl6.flowi6_uid = sock_net_uid(net, NULL);
	fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, NULL);
	security_skb_classify_flow(skb, flowi6_to_flowi_common(&fl6));

	np = inet6_sk(sk);

	if (!icmpv6_xrlim_allow(sk, type, &fl6, apply_ratelimit))
		goto out;

	tmp_hdr.icmp6_type = type;
	tmp_hdr.icmp6_code = code;
	tmp_hdr.icmp6_cksum = 0;
	tmp_hdr.icmp6_pointer = htonl(info);

	/* No oif chosen yet: fall back to the socket's mcast/ucast default. */
	if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr))
		fl6.flowi6_oif = READ_ONCE(np->mcast_oif);
	else if (!fl6.flowi6_oif)
		fl6.flowi6_oif = READ_ONCE(np->ucast_oif);

	ipcm6_init_sk(&ipc6, sk);
	ipc6.sockc.mark = mark;
	fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel);

	dst = icmpv6_route_lookup(net, skb, sk, &fl6);
	if (IS_ERR(dst))
		goto out;

	ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);

	msg.skb = skb;
	msg.offset = skb_network_offset(skb);
	msg.type = type;

	/* Quote as much of the packet as fits in a minimum-MTU reply. */
	len = skb->len - msg.offset;
	len = min_t(unsigned int, len, IPV6_MIN_MTU - sizeof(struct ipv6hdr) - sizeof(struct icmp6hdr));
	/*
	 * NOTE(review): len has just been clamped through an unsigned
	 * min_t(), so a negative difference would already have been
	 * converted before this test - confirm whether it can still fire.
	 */
	if (len < 0) {
		net_dbg_ratelimited("icmp: len problem [%pI6c > %pI6c]\n",
				    &hdr->saddr, &hdr->daddr);
		goto out_dst_release;
	}

	rcu_read_lock();
	idev = __in6_dev_get(skb->dev);

	if (ip6_append_data(sk, icmpv6_getfrag, &msg,
			    len + sizeof(struct icmp6hdr),
			    sizeof(struct icmp6hdr),
			    &ipc6, &fl6, dst_rt6_info(dst),
			    MSG_DONTWAIT)) {
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS);
		ip6_flush_pending_frames(sk);
	} else {
		icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr,
					   len + sizeof(struct icmp6hdr));
	}
	rcu_read_unlock();
out_dst_release:
	dst_release(dst);
out:
	icmpv6_xmit_unlock(sk);
out_bh_enable:
	local_bh_enable();
}
EXPORT_SYMBOL(icmp6_send);

/* Slightly more convenient version of icmp6_send with drop reasons.
 * Sends a Parameter Problem with @code and pointer @pos, then frees @skb
 * with @reason.
 */
void icmpv6_param_prob_reason(struct sk_buff *skb, u8 code, int pos,
			      enum skb_drop_reason reason)
{
	icmp6_send(skb, ICMPV6_PARAMPROB, code, pos, NULL, IP6CB(skb));
	kfree_skb_reason(skb, reason);
}

/* Generate icmpv6 with type/code ICMPV6_DEST_UNREACH/ICMPV6_ADDR_UNREACH
 * if sufficient data bytes are available
 * @nhs is the size of the tunnel header(s) :
 *  Either an IPv4 header for SIT encap
 *         an IPv4 header + GRE header for GRE encap
 *
 * When @type is ICMP_TIME_EXCEEDED a Time Exceeded / hop-limit message is
 * sent instead.  Returns 0 once icmp6_send() has been called, 1 when the
 * packet is too short or no working copy could be allocated.
 */
int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type,
			       unsigned int data_len)
{
	struct in6_addr temp_saddr;
	struct rt6_info *rt;
	struct sk_buff *skb2;
	u32 info = 0;

	if (!pskb_may_pull(skb, nhs + sizeof(struct ipv6hdr) + 8))
		return 1;

	/* RFC 4884 (partial) support for ICMP extensions */
	if (data_len < 128 || (data_len & 7) || skb->len < data_len)
		data_len = 0;

	/* A private copy is needed when the payload will be rewritten below. */
	skb2 = data_len ? skb_copy(skb, GFP_ATOMIC) : skb_clone(skb, GFP_ATOMIC);

	if (!skb2)
		return 1;

	skb_dst_drop(skb2);
	skb_pull(skb2, nhs);
	skb_reset_network_header(skb2);

	rt = rt6_lookup(dev_net(skb->dev), &ipv6_hdr(skb2)->saddr, NULL, 0,
			skb, 0);

	if (rt && rt->dst.dev)
		skb2->dev = rt->dst.dev;

	/* Source of the generated error: v4-mapped form of the outer IPv4 source. */
	ipv6_addr_set_v4mapped(ip_hdr(skb)->saddr, &temp_saddr);

	if (data_len) {
		/* RFC 4884 (partial) support :
		 * insert 0 padding at the end, before the extensions
		 */
		__skb_push(skb2, nhs);
		skb_reset_network_header(skb2);
		memmove(skb2->data, skb2->data + nhs, data_len - nhs);
		memset(skb2->data + data_len - nhs, 0, nhs);
		/* RFC 4884 4.5 : Length is measured in 64-bit words,
		 * and stored in reserved[0]
		 */
		info = (data_len/8) << 24;
	}
	if (type == ICMP_TIME_EXCEEDED)
		icmp6_send(skb2, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
			   info, &temp_saddr, IP6CB(skb2));
	else
		icmp6_send(skb2, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH,
			   info, &temp_saddr, IP6CB(skb2));
	if (rt)
		ip6_rt_put(rt);

	kfree_skb(skb2);

	return 0;
}
EXPORT_SYMBOL(ip6_err_gen_icmpv6_unreach);

/*
 * Answer an (extended) echo request.
 *
 * Returns SKB_CONSUMED when the reply was queued for transmission;
 * otherwise the drop reason declared by SKB_DR() is returned unchanged.
 */
static enum skb_drop_reason icmpv6_echo_reply(struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);
	struct sock *sk;
	struct inet6_dev *idev;
	struct ipv6_pinfo *np;
	const struct in6_addr *saddr = NULL;
	struct icmp6hdr *icmph = icmp6_hdr(skb);
	bool apply_ratelimit = false;
	struct icmp6hdr tmp_hdr;
	struct flowi6 fl6;
	struct icmpv6_msg msg;
	struct dst_entry *dst;
	struct ipcm6_cookie ipc6;
	u32 mark = IP6_REPLY_MARK(net, skb->mark);
	SKB_DR(reason);
	bool acast;
	u8 type;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr) &&
	    net->ipv6.sysctl.icmpv6_echo_ignore_multicast)
		return reason;

	saddr = &ipv6_hdr(skb)->daddr;

	acast = ipv6_anycast_destination(skb_dst(skb), saddr);
	if (acast && net->ipv6.sysctl.icmpv6_echo_ignore_anycast)
		return reason;

	/* Reply from the request's destination only if it was unicast
	 * (or anycast with anycast_src_echo_reply enabled).
	 */
	if (!ipv6_unicast_destination(skb) &&
	    !(net->ipv6.sysctl.anycast_src_echo_reply && acast))
		saddr = NULL;

	if (icmph->icmp6_type == ICMPV6_EXT_ECHO_REQUEST)
		type = ICMPV6_EXT_ECHO_REPLY;
	else
		type = ICMPV6_ECHO_REPLY;

	/* The reply header is the request header with the type replaced. */
	memcpy(&tmp_hdr, icmph, sizeof(tmp_hdr));
	tmp_hdr.icmp6_type = type;

	memset(&fl6, 0, sizeof(fl6));
	if (net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_ICMPV6_ECHO_REPLIES)
		fl6.flowlabel = ip6_flowlabel(ipv6_hdr(skb));

	fl6.flowi6_proto = IPPROTO_ICMPV6;
	fl6.daddr = ipv6_hdr(skb)->saddr;
	if (saddr)
		fl6.saddr = *saddr;
	fl6.flowi6_oif = icmp6_iif(skb);
	fl6.fl6_icmp_type = type;
	fl6.flowi6_mark = mark;
	fl6.flowi6_uid = sock_net_uid(net, NULL);
	security_skb_classify_flow(skb, flowi6_to_flowi_common(&fl6));

	local_bh_disable();
	sk = icmpv6_xmit_lock(net);
	if (!sk)
		goto out_bh_enable;
	np = inet6_sk(sk);

	if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr))
		fl6.flowi6_oif = READ_ONCE(np->mcast_oif);
	else if (!fl6.flowi6_oif)
		fl6.flowi6_oif = READ_ONCE(np->ucast_oif);

	if (ip6_dst_lookup(net, sk, &dst, &fl6))
		goto out;
	dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), sk, 0);
	if (IS_ERR(dst))
		goto out;

	/* Check the ratelimit */
	if ((!(skb->dev->flags & IFF_LOOPBACK) &&
	    !icmpv6_global_allow(net, ICMPV6_ECHO_REPLY, &apply_ratelimit)) ||
	    !icmpv6_xrlim_allow(sk, ICMPV6_ECHO_REPLY, &fl6, apply_ratelimit))
		goto out_dst_release;

	idev = __in6_dev_get(skb->dev);

	msg.skb = skb;
	msg.offset = 0;
	msg.type = type;

	ipcm6_init_sk(&ipc6, sk);
	ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
	ipc6.tclass = ipv6_get_dsfield(ipv6_hdr(skb));
	ipc6.sockc.mark = mark;

	if (icmph->icmp6_type == ICMPV6_EXT_ECHO_REQUEST)
		if (!icmp_build_probe(skb, (struct icmphdr *)&tmp_hdr))
			goto out_dst_release;

	if (ip6_append_data(sk, icmpv6_getfrag, &msg,
			    skb->len + sizeof(struct icmp6hdr),
			    sizeof(struct icmp6hdr), &ipc6, &fl6,
			    dst_rt6_info(dst), MSG_DONTWAIT)) {
		__ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS);
		ip6_flush_pending_frames(sk);
	} else {
		icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr,
					   skb->len + sizeof(struct icmp6hdr));
		reason = SKB_CONSUMED;
	}
out_dst_release:
	dst_release(dst);
out:
	icmpv6_xmit_unlock(sk);
out_bh_enable:
	local_bh_enable();
	return reason;
}

/*
 * Deliver a received ICMPv6 error to the upper-layer protocol named by
 * the packet embedded in it, and to matching raw sockets.
 *
 * Returns SKB_CONSUMED after dispatch, or a drop reason (and bumps
 * ICMP6_MIB_INERRORS) when the embedded headers cannot be pulled/parsed.
 */
enum skb_drop_reason icmpv6_notify(struct sk_buff *skb, u8 type,
				   u8 code, __be32 info)
{
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(skb->dev);
	const struct inet6_protocol *ipprot;
	enum skb_drop_reason reason;
	int inner_offset;
	__be16 frag_off;
	u8 nexthdr;

	reason = pskb_may_pull_reason(skb, sizeof(struct ipv6hdr));
	if (reason != SKB_NOT_DROPPED_YET)
		goto out;

	seg6_icmp_srh(skb, opt);

	nexthdr = ((struct ipv6hdr *)skb->data)->nexthdr;
	if (ipv6_ext_hdr(nexthdr)) {
		/* now skip over extension headers */
		inner_offset = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr),
						&nexthdr, &frag_off);
		if (inner_offset < 0) {
			SKB_DR_SET(reason, IPV6_BAD_EXTHDR);
			goto out;
		}
	} else {
		inner_offset = sizeof(struct ipv6hdr);
	}

	/* Checking header including 8 bytes of inner protocol header. */
	reason = pskb_may_pull_reason(skb, inner_offset + 8);
	if (reason != SKB_NOT_DROPPED_YET)
		goto out;

	/* BUGGG_FUTURE: we should try to parse exthdrs in this packet.
	   Without this we will not able f.e. to make source routed
	   pmtu discovery.
	   Corresponding argument (opt) to notifiers is already added.
	   --ANK (980726)
	 */

	ipprot = rcu_dereference(inet6_protos[nexthdr]);
	if (ipprot && ipprot->err_handler)
		ipprot->err_handler(skb, opt, type, code, inner_offset, info);

	raw6_icmp_error(skb, nexthdr, type, code, inner_offset, info);
	return SKB_CONSUMED;

out:
	__ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS);
	return reason;
}

/*
 *	Handle icmp messages
 *
 *	Receive handler for IPPROTO_ICMPV6.  Validates policy and checksum,
 *	dispatches on the message type, and releases @skb itself on every
 *	path except the two MLD event cases, which return right after
 *	handing the skb to igmp6_event_*().  Always returns 0.
 */

static int icmpv6_rcv(struct sk_buff *skb)
{
	enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
	struct net *net = dev_net(skb->dev);
	struct net_device *dev = icmp6_dev(skb);
	struct inet6_dev *idev = __in6_dev_get(dev);
	const struct in6_addr *saddr, *daddr;
	struct icmp6hdr *hdr;
	u8 type;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		struct sec_path *sp = skb_sec_path(skb);
		int nh;

		if (!(sp && sp->xvec[sp->len - 1]->props.flags &
				 XFRM_STATE_ICMP)) {
			reason = SKB_DROP_REASON_XFRM_POLICY;
			goto drop_no_count;
		}

		if (!pskb_may_pull(skb, sizeof(*hdr) + sizeof(struct ipv6hdr)))
			goto drop_no_count;

		/* Re-run the policy check against the embedded packet,
		 * then restore the network header offset.
		 */
		nh = skb_network_offset(skb);
		skb_set_network_header(skb, sizeof(*hdr));

		if (!xfrm6_policy_check_reverse(NULL, XFRM_POLICY_IN,
						skb)) {
			reason = SKB_DROP_REASON_XFRM_POLICY;
			goto drop_no_count;
		}

		skb_set_network_header(skb, nh);
	}

	__ICMP6_INC_STATS(dev_net(dev), idev, ICMP6_MIB_INMSGS);

	saddr = &ipv6_hdr(skb)->saddr;
	daddr = &ipv6_hdr(skb)->daddr;

	if (skb_checksum_validate(skb, IPPROTO_ICMPV6, ip6_compute_pseudo)) {
		net_dbg_ratelimited("ICMPv6 checksum failed [%pI6c > %pI6c]\n",
				    saddr, daddr);
		goto csum_error;
	}

	if (!pskb_pull(skb, sizeof(*hdr)))
		goto discard_it;

	hdr = icmp6_hdr(skb);

	type = hdr->icmp6_type;

	ICMP6MSGIN_INC_STATS(dev_net(dev), idev, type);

	switch (type) {
	case ICMPV6_ECHO_REQUEST:
		if (!net->ipv6.sysctl.icmpv6_echo_ignore_all)
			reason = icmpv6_echo_reply(skb);
		break;
	case ICMPV6_EXT_ECHO_REQUEST:
		if (!net->ipv6.sysctl.icmpv6_echo_ignore_all &&
		    READ_ONCE(net->ipv4.sysctl_icmp_echo_enable_probe))
			reason = icmpv6_echo_reply(skb);
		break;

	case ICMPV6_ECHO_REPLY:
		reason = ping_rcv(skb);
		break;

	case ICMPV6_EXT_ECHO_REPLY:
		reason = ping_rcv(skb);
		break;

	case ICMPV6_PKT_TOOBIG:
		/* BUGGG_FUTURE: if packet contains rthdr, we cannot update
		   standard destination cache. Seems, only "advanced"
		   destination cache will allow to solve this problem
		   --ANK (980726)
		 */
		if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
			goto discard_it;
		hdr = icmp6_hdr(skb);

		/* to notify */
		fallthrough;
	case ICMPV6_DEST_UNREACH:
	case ICMPV6_TIME_EXCEED:
	case ICMPV6_PARAMPROB:
		reason = icmpv6_notify(skb, type, hdr->icmp6_code,
				       hdr->icmp6_mtu);
		break;

	case NDISC_ROUTER_SOLICITATION:
	case NDISC_ROUTER_ADVERTISEMENT:
	case NDISC_NEIGHBOUR_SOLICITATION:
	case NDISC_NEIGHBOUR_ADVERTISEMENT:
	case NDISC_REDIRECT:
		reason = ndisc_rcv(skb);
		break;

	case ICMPV6_MGM_QUERY:
		igmp6_event_query(skb);
		return 0;

	case ICMPV6_MGM_REPORT:
		igmp6_event_report(skb);
		return 0;

	case ICMPV6_MGM_REDUCTION:
	case ICMPV6_NI_QUERY:
	case ICMPV6_NI_REPLY:
	case ICMPV6_MLD2_REPORT:
	case ICMPV6_DHAAD_REQUEST:
	case ICMPV6_DHAAD_REPLY:
	case ICMPV6_MOBILE_PREFIX_SOL:
	case ICMPV6_MOBILE_PREFIX_ADV:
		break;

	default:
		/* informational */
		if (type & ICMPV6_INFOMSG_MASK)
			break;

		net_dbg_ratelimited("icmpv6: msg of unknown type [%pI6c > %pI6c]\n",
				    saddr, daddr);

		/*
		 * error of unknown type.
		 * must pass to upper level
		 */

		reason = icmpv6_notify(skb, type, hdr->icmp6_code,
				       hdr->icmp6_mtu);
	}

	/* until the v6 path can be better sorted assume failure and
	 * preserve the status quo behaviour for the rest of the paths to here
	 */
	if (reason)
		kfree_skb_reason(skb, reason);
	else
		consume_skb(skb);

	return 0;

csum_error:
	reason = SKB_DROP_REASON_ICMP_CSUM;
	__ICMP6_INC_STATS(dev_net(dev), idev, ICMP6_MIB_CSUMERRORS);
discard_it:
	__ICMP6_INC_STATS(dev_net(dev), idev, ICMP6_MIB_INERRORS);
drop_no_count:
	kfree_skb_reason(skb, reason);
	return 0;
}

/*
 * Initialise @fl6 for an ICMPv6 flow of @type (code 0) between @saddr and
 * @daddr on output interface @oif, then let the security module classify
 * it against @sk.
 */
void icmpv6_flow_init(const struct sock *sk, struct flowi6 *fl6, u8 type,
		      const struct in6_addr *saddr,
		      const struct in6_addr *daddr, int oif)
{
	memset(fl6, 0, sizeof(*fl6));
	fl6->saddr = *saddr;
	fl6->daddr = *daddr;
	fl6->flowi6_proto	= IPPROTO_ICMPV6;
	fl6->fl6_icmp_type	= type;
	fl6->fl6_icmp_code	= 0;
	fl6->flowi6_oif		= oif;
	security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6));
}

/*
 * Create one control socket per possible CPU, register the ICMPv6
 * protocol handler and register icmp6_send() as the ICMP sender.
 *
 * Returns 0 on success or a negative errno.
 */
int __init icmpv6_init(void)
{
	struct sock *sk;
	int err, i;

	for_each_possible_cpu(i) {
		err = inet_ctl_sock_create(&sk, PF_INET6,
					   SOCK_RAW, IPPROTO_ICMPV6, &init_net);
		if (err < 0) {
			pr_err("Failed to initialize the ICMP6 control socket (err %d)\n",
			       err);
			/*
			 * NOTE(review): sockets already created for earlier
			 * CPUs are not released on this path - confirm this
			 * is acceptable for an __init failure.
			 */
			return err;
		}

		per_cpu(ipv6_icmp_sk, i) = sk;

		/* Enough space for 2 64K ICMP packets, including
		 * sk_buff struct overhead.
		 */
		sk->sk_sndbuf = 2 * SKB_TRUESIZE(64 * 1024);
	}

	err = -EAGAIN;
	if (inet6_add_protocol(&icmpv6_protocol, IPPROTO_ICMPV6) < 0)
		goto fail;

	err = inet6_register_icmp_sender(icmp6_send);
	if (err)
		goto sender_reg_err;
	return 0;

sender_reg_err:
	inet6_del_protocol(&icmpv6_protocol, IPPROTO_ICMPV6);
fail:
	pr_err("Failed to register ICMP6 protocol\n");
	return err;
}

/* Unregister the sender hook and the protocol handler set up by icmpv6_init(). */
void icmpv6_cleanup(void)
{
	inet6_unregister_icmp_sender(icmp6_send);
	inet6_del_protocol(&icmpv6_protocol, IPPROTO_ICMPV6);
}


/* Destination Unreachable code -> (errno, fatal) map, indexed by ICMPv6 code. */
static const struct icmp6_err {
	int err;
	int fatal;
} tab_unreach[] = {
	{	/* NOROUTE */
		.err	= ENETUNREACH,
		.fatal	= 0,
	},
	{	/* ADM_PROHIBITED */
		.err	= EACCES,
		.fatal	= 1,
	},
	{	/* Was NOT_NEIGHBOUR, now reserved */
		.err	= EHOSTUNREACH,
		.fatal	= 0,
	},
	{	/* ADDR_UNREACH	*/
		.err	= EHOSTUNREACH,
		.fatal	= 0,
	},
	{	/* PORT_UNREACH	*/
		.err	= ECONNREFUSED,
		.fatal	= 1,
	},
	{	/* POLICY_FAIL */
		.err	= EACCES,
		.fatal	= 1,
	},
	{	/* REJECT_ROUTE	*/
		.err	= EACCES,
		.fatal	= 1,
	},
};

/*
 * Translate an ICMPv6 error (@type, @code) into a positive errno stored
 * in *@err.  Returns non-zero when the error is to be treated as fatal.
 * Unknown types yield EPROTO / non-fatal; unknown Destination Unreachable
 * codes yield EPROTO / fatal.
 */
int icmpv6_err_convert(u8 type, u8 code, int *err)
{
	int fatal = 0;

	*err = EPROTO;

	switch (type) {
	case ICMPV6_DEST_UNREACH:
		fatal = 1;
		if (code < ARRAY_SIZE(tab_unreach)) {
			*err  = tab_unreach[code].err;
			fatal = tab_unreach[code].fatal;
		}
		break;

	case ICMPV6_PKT_TOOBIG:
		*err = EMSGSIZE;
		break;

	case ICMPV6_PARAMPROB:
		*err = EPROTO;
		fatal = 1;
		break;

	case ICMPV6_TIME_EXCEED:
		*err = EHOSTUNREACH;
		break;
	}

	return fatal;
}
EXPORT_SYMBOL(icmpv6_err_convert);

#ifdef CONFIG_SYSCTL
/*
 * Template for the per-netns ICMPv6 sysctl table.  The .data pointers
 * reference init_net here and are re-pointed, by index, in
 * ipv6_icmp_sysctl_init(); keep the entry order in sync with that function.
 */
static struct ctl_table ipv6_icmp_table_template[] = {
	{
		.procname	= "ratelimit",
		.data		= &init_net.ipv6.sysctl.icmpv6_time,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "echo_ignore_all",
		.data		= &init_net.ipv6.sysctl.icmpv6_echo_ignore_all,
		.maxlen		= sizeof(u8),
		.mode		= 0644,
		.proc_handler = proc_dou8vec_minmax,
	},
	{
		.procname	= "echo_ignore_multicast",
		.data		= &init_net.ipv6.sysctl.icmpv6_echo_ignore_multicast,
		.maxlen		= sizeof(u8),
		.mode		= 0644,
		.proc_handler = proc_dou8vec_minmax,
	},
	{
		.procname	= "echo_ignore_anycast",
		.data		= &init_net.ipv6.sysctl.icmpv6_echo_ignore_anycast,
		.maxlen		= sizeof(u8),
		.mode		= 0644,
		.proc_handler = proc_dou8vec_minmax,
	},
	{
		.procname	= "ratemask",
		.data		= &init_net.ipv6.sysctl.icmpv6_ratemask_ptr,
		.maxlen		= ICMPV6_MSG_MAX + 1,
		.mode		= 0644,
		.proc_handler = proc_do_large_bitmap,
	},
	{
		.procname	= "error_anycast_as_unicast",
		.data		= &init_net.ipv6.sysctl.icmpv6_error_anycast_as_unicast,
		.maxlen		= sizeof(u8),
		.mode		= 0644,
		.proc_handler	= proc_dou8vec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
};

/*
 * Duplicate the sysctl template for @net and point every entry at that
 * namespace's own variables.  Returns the new table, or NULL when the
 * allocation fails.
 */
struct ctl_table * __net_init ipv6_icmp_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_icmp_table_template,
			sizeof(ipv6_icmp_table_template),
			GFP_KERNEL);

	if (table) {
		table[0].data = &net->ipv6.sysctl.icmpv6_time;
		table[1].data = &net->ipv6.sysctl.icmpv6_echo_ignore_all;
		table[2].data = &net->ipv6.sysctl.icmpv6_echo_ignore_multicast;
		table[3].data = &net->ipv6.sysctl.icmpv6_echo_ignore_anycast;
		table[4].data = &net->ipv6.sysctl.icmpv6_ratemask_ptr;
		table[5].data = &net->ipv6.sysctl.icmpv6_error_anycast_as_unicast;
	}
	return table;
}

/* Number of entries in the ICMPv6 sysctl template. */
size_t ipv6_icmp_sysctl_table_size(void)
{
	return ARRAY_SIZE(ipv6_icmp_table_template);
}
#endif
9 9 8 1 3 8 76 35 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 // SPDX-License-Identifier: GPL-2.0-or-later /* * * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) */ #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> #include <net/ax25.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <net/sock.h> #include <linux/uaccess.h> #include <linux/fcntl.h> #include <linux/mm.h> #include <linux/interrupt.h> /* * The default broadcast address of an interface is QST-0; the default address * is LINUX-1. The null address is defined as a callsign of all spaces with * an SSID of zero. 
*/ const ax25_address ax25_bcast = {{'Q' << 1, 'S' << 1, 'T' << 1, ' ' << 1, ' ' << 1, ' ' << 1, 0 << 1}}; const ax25_address ax25_defaddr = {{'L' << 1, 'I' << 1, 'N' << 1, 'U' << 1, 'X' << 1, ' ' << 1, 1 << 1}}; const ax25_address null_ax25_address = {{' ' << 1, ' ' << 1, ' ' << 1, ' ' << 1, ' ' << 1, ' ' << 1, 0 << 1}}; EXPORT_SYMBOL_GPL(ax25_bcast); EXPORT_SYMBOL_GPL(ax25_defaddr); EXPORT_SYMBOL(null_ax25_address); /* * ax25 -> ascii conversion */ char *ax2asc(char *buf, const ax25_address *a) { char c, *s; int n; for (n = 0, s = buf; n < 6; n++) { c = (a->ax25_call[n] >> 1) & 0x7F; if (c != ' ') *s++ = c; } *s++ = '-'; if ((n = ((a->ax25_call[6] >> 1) & 0x0F)) > 9) { *s++ = '1'; n -= 10; } *s++ = n + '0'; *s++ = '\0'; if (*buf == '\0' || *buf == '-') return "*"; return buf; } EXPORT_SYMBOL(ax2asc); /* * ascii -> ax25 conversion */ void asc2ax(ax25_address *addr, const char *callsign) { const char *s; int n; for (s = callsign, n = 0; n < 6; n++) { if (*s != '\0' && *s != '-') addr->ax25_call[n] = *s++; else addr->ax25_call[n] = ' '; addr->ax25_call[n] <<= 1; addr->ax25_call[n] &= 0xFE; } if (*s++ == '\0') { addr->ax25_call[6] = 0x00; return; } addr->ax25_call[6] = *s++ - '0'; if (*s != '\0') { addr->ax25_call[6] *= 10; addr->ax25_call[6] += *s++ - '0'; } addr->ax25_call[6] <<= 1; addr->ax25_call[6] &= 0x1E; } EXPORT_SYMBOL(asc2ax); /* * Compare two ax.25 addresses */ int ax25cmp(const ax25_address *a, const ax25_address *b) { int ct = 0; while (ct < 6) { if ((a->ax25_call[ct] & 0xFE) != (b->ax25_call[ct] & 0xFE)) /* Clean off repeater bits */ return 1; ct++; } if ((a->ax25_call[ct] & 0x1E) == (b->ax25_call[ct] & 0x1E)) /* SSID without control bit */ return 0; return 2; /* Partial match */ } EXPORT_SYMBOL(ax25cmp); /* * Compare two AX.25 digipeater paths. 
*/ int ax25digicmp(const ax25_digi *digi1, const ax25_digi *digi2) { int i; if (digi1->ndigi != digi2->ndigi) return 1; if (digi1->lastrepeat != digi2->lastrepeat) return 1; for (i = 0; i < digi1->ndigi; i++) if (ax25cmp(&digi1->calls[i], &digi2->calls[i]) != 0) return 1; return 0; } /* * Given an AX.25 address pull of to, from, digi list, command/response and the start of data * */ const unsigned char *ax25_addr_parse(const unsigned char *buf, int len, ax25_address *src, ax25_address *dest, ax25_digi *digi, int *flags, int *dama) { int d = 0; if (len < 14) return NULL; if (flags != NULL) { *flags = 0; if (buf[6] & AX25_CBIT) *flags = AX25_COMMAND; if (buf[13] & AX25_CBIT) *flags = AX25_RESPONSE; } if (dama != NULL) *dama = ~buf[13] & AX25_DAMA_FLAG; /* Copy to, from */ if (dest != NULL) memcpy(dest, buf + 0, AX25_ADDR_LEN); if (src != NULL) memcpy(src, buf + 7, AX25_ADDR_LEN); buf += 2 * AX25_ADDR_LEN; len -= 2 * AX25_ADDR_LEN; digi->lastrepeat = -1; digi->ndigi = 0; while (!(buf[-1] & AX25_EBIT)) { if (d >= AX25_MAX_DIGIS) return NULL; if (len < AX25_ADDR_LEN) return NULL; memcpy(&digi->calls[d], buf, AX25_ADDR_LEN); digi->ndigi = d + 1; if (buf[6] & AX25_HBIT) { digi->repeated[d] = 1; digi->lastrepeat = d; } else { digi->repeated[d] = 0; } buf += AX25_ADDR_LEN; len -= AX25_ADDR_LEN; d++; } return buf; } /* * Assemble an AX.25 header from the bits */ int ax25_addr_build(unsigned char *buf, const ax25_address *src, const ax25_address *dest, const ax25_digi *d, int flag, int modulus) { int len = 0; int ct = 0; memcpy(buf, dest, AX25_ADDR_LEN); buf[6] &= ~(AX25_EBIT | AX25_CBIT); buf[6] |= AX25_SSSID_SPARE; if (flag == AX25_COMMAND) buf[6] |= AX25_CBIT; buf += AX25_ADDR_LEN; len += AX25_ADDR_LEN; memcpy(buf, src, AX25_ADDR_LEN); buf[6] &= ~(AX25_EBIT | AX25_CBIT); buf[6] &= ~AX25_SSSID_SPARE; if (modulus == AX25_MODULUS) buf[6] |= AX25_SSSID_SPARE; else buf[6] |= AX25_ESSID_SPARE; if (flag == AX25_RESPONSE) buf[6] |= AX25_CBIT; /* * Fast path the normal digiless 
path */ if (d == NULL || d->ndigi == 0) { buf[6] |= AX25_EBIT; return 2 * AX25_ADDR_LEN; } buf += AX25_ADDR_LEN; len += AX25_ADDR_LEN; while (ct < d->ndigi) { memcpy(buf, &d->calls[ct], AX25_ADDR_LEN); if (d->repeated[ct]) buf[6] |= AX25_HBIT; else buf[6] &= ~AX25_HBIT; buf[6] &= ~AX25_EBIT; buf[6] |= AX25_SSSID_SPARE; buf += AX25_ADDR_LEN; len += AX25_ADDR_LEN; ct++; } buf[-1] |= AX25_EBIT; return len; } int ax25_addr_size(const ax25_digi *dp) { if (dp == NULL) return 2 * AX25_ADDR_LEN; return AX25_ADDR_LEN * (2 + dp->ndigi); } /* * Reverse Digipeat List. May not pass both parameters as same struct */ void ax25_digi_invert(const ax25_digi *in, ax25_digi *out) { int ct; out->ndigi = in->ndigi; out->lastrepeat = in->ndigi - in->lastrepeat - 2; /* Invert the digipeaters */ for (ct = 0; ct < in->ndigi; ct++) { out->calls[ct] = in->calls[in->ndigi - ct - 1]; if (ct <= out->lastrepeat) { out->calls[ct].ax25_call[6] |= AX25_HBIT; out->repeated[ct] = 1; } else { out->calls[ct].ax25_call[6] &= ~AX25_HBIT; out->repeated[ct] = 0; } } }
5081 30 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM pagemap

#if !defined(_TRACE_PAGEMAP_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_PAGEMAP_H

#include <linux/tracepoint.h>
#include <linux/mm.h>

/* Bits of the "flags" value recorded by the mm_lru_insertion event. */
#define	PAGEMAP_MAPPED		0x0001u
#define PAGEMAP_ANONYMOUS	0x0002u
#define PAGEMAP_FILE		0x0004u
#define PAGEMAP_SWAPCACHE	0x0008u
#define PAGEMAP_SWAPBACKED	0x0010u
#define PAGEMAP_MAPPEDDISK	0x0020u
#define PAGEMAP_BUFFERS		0x0040u

/*
 * Build the PAGEMAP_* mask for a folio. Exactly one of ANONYMOUS/FILE is
 * set (chosen by folio_test_anon()); every other bit is set when the
 * corresponding folio test returns true.
 */
#define trace_pagemap_flags(folio) ( \
	(folio_test_anon(folio)		? PAGEMAP_ANONYMOUS  : PAGEMAP_FILE) | \
	(folio_mapped(folio)		? PAGEMAP_MAPPED     : 0) | \
	(folio_test_swapcache(folio)	? PAGEMAP_SWAPCACHE  : 0) | \
	(folio_test_swapbacked(folio)	? PAGEMAP_SWAPBACKED : 0) | \
	(folio_test_mappedtodisk(folio)	? PAGEMAP_MAPPEDDISK : 0) | \
	(folio_test_private(folio)	? PAGEMAP_BUFFERS    : 0) \
	)

/*
 * mm_lru_insertion: records the folio pointer, its pfn, the LRU list
 * reported by folio_lru_list() and the PAGEMAP_* flag mask. The flags are
 * printed as six one-character columns (M, a/f, s, b, d, B), with a blank
 * where a flag is clear.
 */
TRACE_EVENT(mm_lru_insertion,

	TP_PROTO(struct folio *folio),

	TP_ARGS(folio),

	TP_STRUCT__entry(
		__field(struct folio *,	folio	)
		__field(unsigned long,	pfn	)
		__field(enum lru_list,	lru	)
		__field(unsigned long,	flags	)
	),

	TP_fast_assign(
		__entry->folio	= folio;
		__entry->pfn	= folio_pfn(folio);
		__entry->lru	= folio_lru_list(folio);
		__entry->flags	= trace_pagemap_flags(folio);
	),

	/* Flag format is based on page-types.c formatting for pagemap */
	TP_printk("folio=%p pfn=0x%lx lru=%d flags=%s%s%s%s%s%s",
			__entry->folio,
			__entry->pfn,
			__entry->lru,
			__entry->flags & PAGEMAP_MAPPED		? "M" : " ",
			__entry->flags & PAGEMAP_ANONYMOUS	? "a" : "f",
			__entry->flags & PAGEMAP_SWAPCACHE	? "s" : " ",
			__entry->flags & PAGEMAP_SWAPBACKED	? "b" : " ",
			__entry->flags & PAGEMAP_MAPPEDDISK	? "d" : " ",
			__entry->flags & PAGEMAP_BUFFERS	? "B" : " ")
);

/* mm_lru_activate: records only the folio pointer and its pfn. */
TRACE_EVENT(mm_lru_activate,

	TP_PROTO(struct folio *folio),

	TP_ARGS(folio),

	TP_STRUCT__entry(
		__field(struct folio *,	folio	)
		__field(unsigned long,	pfn	)
	),

	TP_fast_assign(
		__entry->folio	= folio;
		__entry->pfn	= folio_pfn(folio);
	),

	TP_printk("folio=%p pfn=0x%lx", __entry->folio, __entry->pfn)
);

#endif /* _TRACE_PAGEMAP_H */

/* This part must be outside protection */
#include <trace/define_trace.h>
2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 // SPDX-License-Identifier: GPL-2.0-or-later /* * locks.c * * Userspace file locking support * * Copyright (C) 2007 Oracle. All rights reserved. */ #include <linux/fs.h> #include <linux/filelock.h> #include <linux/fcntl.h> #include <cluster/masklog.h> #include "ocfs2.h" #include "dlmglue.h" #include "file.h" #include "inode.h" #include "locks.h" static int ocfs2_do_flock(struct file *file, struct inode *inode, int cmd, struct file_lock *fl) { int ret = 0, level = 0, trylock = 0; struct ocfs2_file_private *fp = file->private_data; struct ocfs2_lock_res *lockres = &fp->fp_flock; if (lock_is_write(fl)) level = 1; if (!IS_SETLKW(cmd)) trylock = 1; mutex_lock(&fp->fp_mutex); if (lockres->l_flags & OCFS2_LOCK_ATTACHED && lockres->l_level > LKM_NLMODE) { int old_level = 0; struct file_lock request; if (lockres->l_level == LKM_EXMODE) old_level = 1; if (level == old_level) goto out; /* * Converting an existing lock is not guaranteed to be * atomic, so we can get away with simply unlocking * here and allowing the lock code to try at the new * level. 
*/ locks_init_lock(&request); request.c.flc_type = F_UNLCK; request.c.flc_flags = FL_FLOCK; locks_lock_file_wait(file, &request); ocfs2_file_unlock(file); } ret = ocfs2_file_lock(file, level, trylock); if (ret) { if (ret == -EAGAIN && trylock) ret = -EWOULDBLOCK; else mlog_errno(ret); goto out; } ret = locks_lock_file_wait(file, fl); if (ret) ocfs2_file_unlock(file); out: mutex_unlock(&fp->fp_mutex); return ret; } static int ocfs2_do_funlock(struct file *file, int cmd, struct file_lock *fl) { int ret; struct ocfs2_file_private *fp = file->private_data; mutex_lock(&fp->fp_mutex); ocfs2_file_unlock(file); ret = locks_lock_file_wait(file, fl); mutex_unlock(&fp->fp_mutex); return ret; } /* * Overall flow of ocfs2_flock() was influenced by gfs2_flock(). */ int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl) { struct inode *inode = file->f_mapping->host; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (!(fl->c.flc_flags & FL_FLOCK)) return -ENOLCK; if ((osb->s_mount_opt & OCFS2_MOUNT_LOCALFLOCKS) || ocfs2_mount_local(osb)) return locks_lock_file_wait(file, fl); if (lock_is_unlock(fl)) return ocfs2_do_funlock(file, cmd, fl); else return ocfs2_do_flock(file, inode, cmd, fl); } int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl) { struct inode *inode = file->f_mapping->host; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (!(fl->c.flc_flags & FL_POSIX)) return -ENOLCK; return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno, file, cmd, fl); }
358 27 87 16 286 3 202 185 76 52 2 50 1 1 40 40 419 419 3 3 410 411 499 501 3 482 407 479 588 2 1 1 1 589 562 406 322 4 1 1 1 17 11 17 11 1 314 1 146 4 409 3 406 406 44 15 30 406 1 401 2 2 2 2 2 2 2 1 119 101 18 118 118 118 118 478 479 479 21 175 404 403 402 22 407 406 556 552 556 555 417 417 419 419 418 13 13 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 
449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/fs_parser.h> #include "bcachefs.h" #include "compress.h" #include "disk_groups.h" #include "error.h" #include "opts.h" #include "recovery_passes.h" #include "super-io.h" #include "util.h" #define x(t, n, ...) 
[n] = #t, const char * const bch2_error_actions[] = { BCH_ERROR_ACTIONS() NULL }; const char * const bch2_fsck_fix_opts[] = { BCH_FIX_ERRORS_OPTS() NULL }; const char * const bch2_version_upgrade_opts[] = { BCH_VERSION_UPGRADE_OPTS() NULL }; const char * const bch2_sb_features[] = { BCH_SB_FEATURES() NULL }; const char * const bch2_sb_compat[] = { BCH_SB_COMPAT() NULL }; const char * const __bch2_btree_ids[] = { BCH_BTREE_IDS() NULL }; static const char * const __bch2_csum_types[] = { BCH_CSUM_TYPES() NULL }; const char * const __bch2_csum_opts[] = { BCH_CSUM_OPTS() NULL }; const char * const __bch2_compression_types[] = { BCH_COMPRESSION_TYPES() NULL }; const char * const bch2_compression_opts[] = { BCH_COMPRESSION_OPTS() NULL }; const char * const __bch2_str_hash_types[] = { BCH_STR_HASH_TYPES() NULL }; const char * const bch2_str_hash_opts[] = { BCH_STR_HASH_OPTS() NULL }; const char * const __bch2_data_types[] = { BCH_DATA_TYPES() NULL }; const char * const bch2_member_states[] = { BCH_MEMBER_STATES() NULL }; static const char * const __bch2_jset_entry_types[] = { BCH_JSET_ENTRY_TYPES() NULL }; static const char * const __bch2_fs_usage_types[] = { BCH_FS_USAGE_TYPES() NULL }; #undef x static void prt_str_opt_boundscheck(struct printbuf *out, const char * const opts[], unsigned nr, const char *type, unsigned idx) { if (idx < nr) prt_str(out, opts[idx]); else prt_printf(out, "(unknown %s %u)", type, idx); } #define PRT_STR_OPT_BOUNDSCHECKED(name, type) \ void bch2_prt_##name(struct printbuf *out, type t) \ { \ prt_str_opt_boundscheck(out, __bch2_##name##s, ARRAY_SIZE(__bch2_##name##s) - 1, #name, t);\ } PRT_STR_OPT_BOUNDSCHECKED(jset_entry_type, enum bch_jset_entry_type); PRT_STR_OPT_BOUNDSCHECKED(fs_usage_type, enum bch_fs_usage_type); PRT_STR_OPT_BOUNDSCHECKED(data_type, enum bch_data_type); PRT_STR_OPT_BOUNDSCHECKED(csum_opt, enum bch_csum_opt); PRT_STR_OPT_BOUNDSCHECKED(csum_type, enum bch_csum_type); PRT_STR_OPT_BOUNDSCHECKED(compression_type, enum 
bch_compression_type); PRT_STR_OPT_BOUNDSCHECKED(str_hash_type, enum bch_str_hash_type); static int bch2_opt_fix_errors_parse(struct bch_fs *c, const char *val, u64 *res, struct printbuf *err) { if (!val) { *res = FSCK_FIX_yes; } else { int ret = match_string(bch2_fsck_fix_opts, -1, val); if (ret < 0 && err) prt_str(err, "fix_errors: invalid selection"); if (ret < 0) return ret; *res = ret; } return 0; } static void bch2_opt_fix_errors_to_text(struct printbuf *out, struct bch_fs *c, struct bch_sb *sb, u64 v) { prt_str(out, bch2_fsck_fix_opts[v]); } #define bch2_opt_fix_errors (struct bch_opt_fn) { \ .parse = bch2_opt_fix_errors_parse, \ .to_text = bch2_opt_fix_errors_to_text, \ } const char * const bch2_d_types[BCH_DT_MAX] = { [DT_UNKNOWN] = "unknown", [DT_FIFO] = "fifo", [DT_CHR] = "chr", [DT_DIR] = "dir", [DT_BLK] = "blk", [DT_REG] = "reg", [DT_LNK] = "lnk", [DT_SOCK] = "sock", [DT_WHT] = "whiteout", [DT_SUBVOL] = "subvol", }; u64 BCH2_NO_SB_OPT(const struct bch_sb *sb) { BUG(); } void SET_BCH2_NO_SB_OPT(struct bch_sb *sb, u64 v) { BUG(); } void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src) { #define x(_name, ...) \ if (opt_defined(src, _name)) \ opt_set(*dst, _name, src._name); BCH_OPTS() #undef x } bool bch2_opt_defined_by_id(const struct bch_opts *opts, enum bch_opt_id id) { switch (id) { #define x(_name, ...) \ case Opt_##_name: \ return opt_defined(*opts, _name); BCH_OPTS() #undef x default: BUG(); } } u64 bch2_opt_get_by_id(const struct bch_opts *opts, enum bch_opt_id id) { switch (id) { #define x(_name, ...) \ case Opt_##_name: \ return opts->_name; BCH_OPTS() #undef x default: BUG(); } } void bch2_opt_set_by_id(struct bch_opts *opts, enum bch_opt_id id, u64 v) { switch (id) { #define x(_name, ...) 
\ case Opt_##_name: \ opt_set(*opts, _name, v); \ break; BCH_OPTS() #undef x default: BUG(); } } const struct bch_option bch2_opt_table[] = { #define OPT_BOOL() .type = BCH_OPT_BOOL, .min = 0, .max = 2 #define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, \ .min = _min, .max = _max #define OPT_STR(_choices) .type = BCH_OPT_STR, \ .min = 0, .max = ARRAY_SIZE(_choices) - 1, \ .choices = _choices #define OPT_STR_NOLIMIT(_choices) .type = BCH_OPT_STR, \ .min = 0, .max = U64_MAX, \ .choices = _choices #define OPT_BITFIELD(_choices) .type = BCH_OPT_BITFIELD, \ .choices = _choices #define OPT_FN(_fn) .type = BCH_OPT_FN, .fn = _fn #define x(_name, _bits, _flags, _type, _sb_opt, _default, _hint, _help) \ [Opt_##_name] = { \ .attr = { \ .name = #_name, \ .mode = (_flags) & OPT_RUNTIME ? 0644 : 0444, \ }, \ .flags = _flags, \ .hint = _hint, \ .help = _help, \ .get_sb = _sb_opt, \ .set_sb = SET_##_sb_opt, \ _type \ }, BCH_OPTS() #undef x }; int bch2_opt_lookup(const char *name) { const struct bch_option *i; for (i = bch2_opt_table; i < bch2_opt_table + ARRAY_SIZE(bch2_opt_table); i++) if (!strcmp(name, i->attr.name)) return i - bch2_opt_table; return -1; } struct synonym { const char *s1, *s2; }; static const struct synonym bch_opt_synonyms[] = { { "quota", "usrquota" }, }; static int bch2_mount_opt_lookup(const char *name) { const struct synonym *i; for (i = bch_opt_synonyms; i < bch_opt_synonyms + ARRAY_SIZE(bch_opt_synonyms); i++) if (!strcmp(name, i->s1)) name = i->s2; return bch2_opt_lookup(name); } int bch2_opt_validate(const struct bch_option *opt, u64 v, struct printbuf *err) { if (v < opt->min) { if (err) prt_printf(err, "%s: too small (min %llu)", opt->attr.name, opt->min); return -BCH_ERR_ERANGE_option_too_small; } if (opt->max && v >= opt->max) { if (err) prt_printf(err, "%s: too big (max %llu)", opt->attr.name, opt->max); return -BCH_ERR_ERANGE_option_too_big; } if ((opt->flags & OPT_SB_FIELD_SECTORS) && (v & 511)) { if (err) prt_printf(err, "%s: not a multiple of 
512", opt->attr.name); return -BCH_ERR_opt_parse_error; } if ((opt->flags & OPT_MUST_BE_POW_2) && !is_power_of_2(v)) { if (err) prt_printf(err, "%s: must be a power of two", opt->attr.name); return -BCH_ERR_opt_parse_error; } if (opt->fn.validate) return opt->fn.validate(v, err); return 0; } int bch2_opt_parse(struct bch_fs *c, const struct bch_option *opt, const char *val, u64 *res, struct printbuf *err) { ssize_t ret; switch (opt->type) { case BCH_OPT_BOOL: if (val) { ret = lookup_constant(bool_names, val, -BCH_ERR_option_not_bool); if (ret != -BCH_ERR_option_not_bool) { *res = ret; } else { if (err) prt_printf(err, "%s: must be bool", opt->attr.name); return ret; } } else { *res = 1; } break; case BCH_OPT_UINT: if (!val) { prt_printf(err, "%s: required value", opt->attr.name); return -EINVAL; } ret = opt->flags & OPT_HUMAN_READABLE ? bch2_strtou64_h(val, res) : kstrtou64(val, 10, res); if (ret < 0) { if (err) prt_printf(err, "%s: must be a number", opt->attr.name); return ret; } break; case BCH_OPT_STR: if (!val) { prt_printf(err, "%s: required value", opt->attr.name); return -EINVAL; } ret = match_string(opt->choices, -1, val); if (ret < 0) { if (err) prt_printf(err, "%s: invalid selection", opt->attr.name); return ret; } *res = ret; break; case BCH_OPT_BITFIELD: { s64 v = bch2_read_flag_list(val, opt->choices); if (v < 0) return v; *res = v; break; } case BCH_OPT_FN: ret = opt->fn.parse(c, val, res, err); if (ret == -BCH_ERR_option_needs_open_fs) return ret; if (ret < 0) { if (err) prt_printf(err, "%s: parse error", opt->attr.name); return ret; } } return bch2_opt_validate(opt, *res, err); } void bch2_opt_to_text(struct printbuf *out, struct bch_fs *c, struct bch_sb *sb, const struct bch_option *opt, u64 v, unsigned flags) { if (flags & OPT_SHOW_MOUNT_STYLE) { if (opt->type == BCH_OPT_BOOL) { prt_printf(out, "%s%s", v ? 
"" : "no", opt->attr.name); return; } prt_printf(out, "%s=", opt->attr.name); } switch (opt->type) { case BCH_OPT_BOOL: case BCH_OPT_UINT: if (opt->flags & OPT_HUMAN_READABLE) prt_human_readable_u64(out, v); else prt_printf(out, "%lli", v); break; case BCH_OPT_STR: if (v < opt->min || v >= opt->max) prt_printf(out, "(invalid option %lli)", v); else if (flags & OPT_SHOW_FULL_LIST) prt_string_option(out, opt->choices, v); else prt_str(out, opt->choices[v]); break; case BCH_OPT_BITFIELD: prt_bitflags(out, opt->choices, v); break; case BCH_OPT_FN: opt->fn.to_text(out, c, sb, v); break; default: BUG(); } } void bch2_opts_to_text(struct printbuf *out, struct bch_opts opts, struct bch_fs *c, struct bch_sb *sb, unsigned show_mask, unsigned hide_mask, unsigned flags) { bool first = true; for (enum bch_opt_id i = 0; i < bch2_opts_nr; i++) { const struct bch_option *opt = &bch2_opt_table[i]; if ((opt->flags & hide_mask) || !(opt->flags & show_mask)) continue; u64 v = bch2_opt_get_by_id(&opts, i); if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) continue; if (!first) prt_char(out, ','); first = false; bch2_opt_to_text(out, c, sb, opt, v, flags); } } int bch2_opt_check_may_set(struct bch_fs *c, int id, u64 v) { int ret = 0; switch (id) { case Opt_compression: case Opt_background_compression: ret = bch2_check_set_has_compressed_data(c, v); break; case Opt_erasure_code: if (v) bch2_check_set_feature(c, BCH_FEATURE_ec); break; } return ret; } int bch2_opts_check_may_set(struct bch_fs *c) { unsigned i; int ret; for (i = 0; i < bch2_opts_nr; i++) { ret = bch2_opt_check_may_set(c, i, bch2_opt_get_by_id(&c->opts, i)); if (ret) return ret; } return 0; } int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts, struct printbuf *parse_later, const char *name, const char *val) { struct printbuf err = PRINTBUF; u64 v; int ret, id; id = bch2_mount_opt_lookup(name); /* Check for the form "noopt", negation of a boolean opt: */ if (id < 0 && !val && !strncmp("no", name, 2)) { id 
= bch2_mount_opt_lookup(name + 2); val = "0"; } /* Unknown options are ignored: */ if (id < 0) return 0; if (!(bch2_opt_table[id].flags & OPT_MOUNT)) goto bad_opt; if (id == Opt_acl && !IS_ENABLED(CONFIG_BCACHEFS_POSIX_ACL)) goto bad_opt; if ((id == Opt_usrquota || id == Opt_grpquota) && !IS_ENABLED(CONFIG_BCACHEFS_QUOTA)) goto bad_opt; ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v, &err); if (ret == -BCH_ERR_option_needs_open_fs && parse_later) { prt_printf(parse_later, "%s=%s,", name, val); if (parse_later->allocation_failure) { ret = -ENOMEM; goto out; } ret = 0; goto out; } if (ret < 0) goto bad_val; if (opts) bch2_opt_set_by_id(opts, id, v); ret = 0; goto out; bad_opt: pr_err("Bad mount option %s", name); ret = -BCH_ERR_option_name; goto out; bad_val: pr_err("Invalid mount option %s", err.buf); ret = -BCH_ERR_option_value; out: printbuf_exit(&err); return ret; } int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, struct printbuf *parse_later, char *options) { char *copied_opts, *copied_opts_start; char *opt, *name, *val; int ret; if (!options) return 0; /* * sys_fsconfig() is now occasionally providing us with option lists * starting with a comma - weird. 
*/ if (*options == ',') options++; copied_opts = kstrdup(options, GFP_KERNEL); if (!copied_opts) return -ENOMEM; copied_opts_start = copied_opts; while ((opt = strsep(&copied_opts, ",")) != NULL) { if (!*opt) continue; name = strsep(&opt, "="); val = opt; ret = bch2_parse_one_mount_opt(c, opts, parse_later, name, val); if (ret < 0) goto out; } ret = 0; goto out; out: kfree(copied_opts_start); return ret; } u64 bch2_opt_from_sb(struct bch_sb *sb, enum bch_opt_id id) { const struct bch_option *opt = bch2_opt_table + id; u64 v; v = opt->get_sb(sb); if (opt->flags & OPT_SB_FIELD_ILOG2) v = 1ULL << v; if (opt->flags & OPT_SB_FIELD_SECTORS) v <<= 9; return v; } /* * Initial options from superblock - here we don't want any options undefined, * any options the superblock doesn't specify are set to 0: */ int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb) { unsigned id; for (id = 0; id < bch2_opts_nr; id++) { const struct bch_option *opt = bch2_opt_table + id; if (opt->get_sb == BCH2_NO_SB_OPT) continue; bch2_opt_set_by_id(opts, id, bch2_opt_from_sb(sb, id)); } return 0; } struct bch_dev_sb_opt_set { void (*set_sb)(struct bch_member *, u64); }; static const struct bch_dev_sb_opt_set bch2_dev_sb_opt_setters [] = { #define x(n, set) [Opt_##n] = { .set_sb = SET_##set }, BCH_DEV_OPT_SETTERS() #undef x }; void __bch2_opt_set_sb(struct bch_sb *sb, int dev_idx, const struct bch_option *opt, u64 v) { enum bch_opt_id id = opt - bch2_opt_table; if (opt->flags & OPT_SB_FIELD_SECTORS) v >>= 9; if (opt->flags & OPT_SB_FIELD_ILOG2) v = ilog2(v); if (opt->flags & OPT_SB_FIELD_ONE_BIAS) v++; if (opt->flags & OPT_FS) { if (opt->set_sb != SET_BCH2_NO_SB_OPT) opt->set_sb(sb, v); } if ((opt->flags & OPT_DEVICE) && dev_idx >= 0) { if (WARN(!bch2_member_exists(sb, dev_idx), "tried to set device option %s on nonexistent device %i", opt->attr.name, dev_idx)) return; struct bch_member *m = bch2_members_v2_get_mut(sb, dev_idx); const struct bch_dev_sb_opt_set *set = 
bch2_dev_sb_opt_setters + id; if (set->set_sb) set->set_sb(m, v); else pr_err("option %s cannot be set via opt_set_sb()", opt->attr.name); } } void bch2_opt_set_sb(struct bch_fs *c, struct bch_dev *ca, const struct bch_option *opt, u64 v) { mutex_lock(&c->sb_lock); __bch2_opt_set_sb(c->disk_sb.sb, ca ? ca->dev_idx : -1, opt, v); bch2_write_super(c); mutex_unlock(&c->sb_lock); } /* io opts: */ struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src) { struct bch_io_opts opts = { #define x(_name, _bits) ._name = src._name, BCH_INODE_OPTS() #undef x }; bch2_io_opts_fixups(&opts); return opts; } bool bch2_opt_is_inode_opt(enum bch_opt_id id) { static const enum bch_opt_id inode_opt_list[] = { #define x(_name, _bits) Opt_##_name, BCH_INODE_OPTS() #undef x }; unsigned i; for (i = 0; i < ARRAY_SIZE(inode_opt_list); i++) if (inode_opt_list[i] == id) return true; return false; }
2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 // SPDX-License-Identifier: GPL-2.0-only #include <linux/kernel.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/usb/input.h> #include <linux/unaligned.h> /* * Pressure-threshold modules param code from Alex Perry <alex.perry@ieee.org> */ MODULE_AUTHOR("Josh Myer <josh@joshisanerd.com>"); MODULE_DESCRIPTION("USB KB Gear JamStudio Tablet driver"); MODULE_LICENSE("GPL"); #define USB_VENDOR_ID_KBGEAR 0x084e static int kb_pressure_click = 0x10; module_param(kb_pressure_click, int, 0); MODULE_PARM_DESC(kb_pressure_click, "pressure threshold for clicks"); struct kbtab { unsigned char *data; dma_addr_t data_dma; struct input_dev *dev; struct usb_interface *intf; struct urb *irq; char phys[32]; }; static void kbtab_irq(struct urb *urb) { struct kbtab *kbtab = urb->context; unsigned char *data = kbtab->data; struct input_dev *dev = kbtab->dev; int pressure; int retval; switch (urb->status) { case 0: /* success */ break; case -ECONNRESET: case -ENOENT: case -ESHUTDOWN: /* this urb is terminated, clean up */ dev_dbg(&kbtab->intf->dev, "%s - urb shutting down with status: %d\n", __func__, urb->status); return; default: dev_dbg(&kbtab->intf->dev, "%s - nonzero urb status received: %d\n", __func__, urb->status); goto exit; } input_report_key(dev, 
BTN_TOOL_PEN, 1); input_report_abs(dev, ABS_X, get_unaligned_le16(&data[1])); input_report_abs(dev, ABS_Y, get_unaligned_le16(&data[3])); /*input_report_key(dev, BTN_TOUCH , data[0] & 0x01);*/ input_report_key(dev, BTN_RIGHT, data[0] & 0x02); pressure = data[5]; if (kb_pressure_click == -1) input_report_abs(dev, ABS_PRESSURE, pressure); else input_report_key(dev, BTN_LEFT, pressure > kb_pressure_click ? 1 : 0); input_sync(dev); exit: retval = usb_submit_urb(urb, GFP_ATOMIC); if (retval) dev_err(&kbtab->intf->dev, "%s - usb_submit_urb failed with result %d\n", __func__, retval); } static const struct usb_device_id kbtab_ids[] = { { USB_DEVICE(USB_VENDOR_ID_KBGEAR, 0x1001), .driver_info = 0 }, { } }; MODULE_DEVICE_TABLE(usb, kbtab_ids); static int kbtab_open(struct input_dev *dev) { struct kbtab *kbtab = input_get_drvdata(dev); struct usb_device *udev = interface_to_usbdev(kbtab->intf); kbtab->irq->dev = udev; if (usb_submit_urb(kbtab->irq, GFP_KERNEL)) return -EIO; return 0; } static void kbtab_close(struct input_dev *dev) { struct kbtab *kbtab = input_get_drvdata(dev); usb_kill_urb(kbtab->irq); } static int kbtab_probe(struct usb_interface *intf, const struct usb_device_id *id) { struct usb_device *dev = interface_to_usbdev(intf); struct usb_endpoint_descriptor *endpoint; struct kbtab *kbtab; struct input_dev *input_dev; int error = -ENOMEM; if (intf->cur_altsetting->desc.bNumEndpoints < 1) return -ENODEV; endpoint = &intf->cur_altsetting->endpoint[0].desc; if (!usb_endpoint_is_int_in(endpoint)) return -ENODEV; kbtab = kzalloc(sizeof(*kbtab), GFP_KERNEL); input_dev = input_allocate_device(); if (!kbtab || !input_dev) goto fail1; kbtab->data = usb_alloc_coherent(dev, 8, GFP_KERNEL, &kbtab->data_dma); if (!kbtab->data) goto fail1; kbtab->irq = usb_alloc_urb(0, GFP_KERNEL); if (!kbtab->irq) goto fail2; kbtab->intf = intf; kbtab->dev = input_dev; usb_make_path(dev, kbtab->phys, sizeof(kbtab->phys)); strlcat(kbtab->phys, "/input0", sizeof(kbtab->phys)); input_dev->name 
= "KB Gear Tablet"; input_dev->phys = kbtab->phys; usb_to_input_id(dev, &input_dev->id); input_dev->dev.parent = &intf->dev; input_set_drvdata(input_dev, kbtab); input_dev->open = kbtab_open; input_dev->close = kbtab_close; input_dev->evbit[0] |= BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS); input_dev->keybit[BIT_WORD(BTN_LEFT)] |= BIT_MASK(BTN_LEFT) | BIT_MASK(BTN_RIGHT); input_dev->keybit[BIT_WORD(BTN_DIGI)] |= BIT_MASK(BTN_TOOL_PEN) | BIT_MASK(BTN_TOUCH); input_set_abs_params(input_dev, ABS_X, 0, 0x2000, 4, 0); input_set_abs_params(input_dev, ABS_Y, 0, 0x1750, 4, 0); input_set_abs_params(input_dev, ABS_PRESSURE, 0, 0xff, 0, 0); usb_fill_int_urb(kbtab->irq, dev, usb_rcvintpipe(dev, endpoint->bEndpointAddress), kbtab->data, 8, kbtab_irq, kbtab, endpoint->bInterval); kbtab->irq->transfer_dma = kbtab->data_dma; kbtab->irq->transfer_flags |= URB_NO_TRANSFER_DMA_MAP; error = input_register_device(kbtab->dev); if (error) goto fail3; usb_set_intfdata(intf, kbtab); return 0; fail3: usb_free_urb(kbtab->irq); fail2: usb_free_coherent(dev, 8, kbtab->data, kbtab->data_dma); fail1: input_free_device(input_dev); kfree(kbtab); return error; } static void kbtab_disconnect(struct usb_interface *intf) { struct kbtab *kbtab = usb_get_intfdata(intf); struct usb_device *udev = interface_to_usbdev(intf); usb_set_intfdata(intf, NULL); input_unregister_device(kbtab->dev); usb_free_urb(kbtab->irq); usb_free_coherent(udev, 8, kbtab->data, kbtab->data_dma); kfree(kbtab); } static struct usb_driver kbtab_driver = { .name = "kbtab", .probe = kbtab_probe, .disconnect = kbtab_disconnect, .id_table = kbtab_ids, }; module_usb_driver(kbtab_driver);
5 1 1 1 3 59 1 54 6 40 2 6 1 1 1 3 1 3 3 2 1 4 2 4 2 4 56 3 53 11 1 7 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 /* * FUSE: Filesystem in Userspace * Copyright (C) 2001-2016 Miklos Szeredi <miklos@szeredi.hu> * * This program can be distributed under the terms of the GNU GPL. * See the file COPYING. */ #include "fuse_i.h" #include <linux/xattr.h> #include <linux/posix_acl_xattr.h> int fuse_setxattr(struct inode *inode, const char *name, const void *value, size_t size, int flags, unsigned int extra_flags) { struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); struct fuse_setxattr_in inarg; int err; if (fm->fc->no_setxattr) return -EOPNOTSUPP; memset(&inarg, 0, sizeof(inarg)); inarg.size = size; inarg.flags = flags; inarg.setxattr_flags = extra_flags; args.opcode = FUSE_SETXATTR; args.nodeid = get_node_id(inode); args.in_numargs = 3; args.in_args[0].size = fm->fc->setxattr_ext ? 
sizeof(inarg) : FUSE_COMPAT_SETXATTR_IN_SIZE; args.in_args[0].value = &inarg; args.in_args[1].size = strlen(name) + 1; args.in_args[1].value = name; args.in_args[2].size = size; args.in_args[2].value = value; err = fuse_simple_request(fm, &args); if (err == -ENOSYS) { fm->fc->no_setxattr = 1; err = -EOPNOTSUPP; } if (!err) fuse_update_ctime(inode); return err; } ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value, size_t size) { struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); struct fuse_getxattr_in inarg; struct fuse_getxattr_out outarg; ssize_t ret; if (fm->fc->no_getxattr) return -EOPNOTSUPP; memset(&inarg, 0, sizeof(inarg)); inarg.size = size; args.opcode = FUSE_GETXATTR; args.nodeid = get_node_id(inode); args.in_numargs = 2; args.in_args[0].size = sizeof(inarg); args.in_args[0].value = &inarg; args.in_args[1].size = strlen(name) + 1; args.in_args[1].value = name; /* This is really two different operations rolled into one */ args.out_numargs = 1; if (size) { args.out_argvar = true; args.out_args[0].size = size; args.out_args[0].value = value; } else { args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; } ret = fuse_simple_request(fm, &args); if (!ret && !size) ret = min_t(size_t, outarg.size, XATTR_SIZE_MAX); if (ret == -ENOSYS) { fm->fc->no_getxattr = 1; ret = -EOPNOTSUPP; } return ret; } static int fuse_verify_xattr_list(char *list, size_t size) { size_t origsize = size; while (size) { size_t thislen = strnlen(list, size); if (!thislen || thislen == size) return -EIO; size -= thislen + 1; list += thislen + 1; } return origsize; } ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) { struct inode *inode = d_inode(entry); struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); struct fuse_getxattr_in inarg; struct fuse_getxattr_out outarg; ssize_t ret; if (fuse_is_bad(inode)) return -EIO; if (!fuse_allow_current_process(fm->fc)) return -EACCES; if (fm->fc->no_listxattr) return 
-EOPNOTSUPP; memset(&inarg, 0, sizeof(inarg)); inarg.size = size; args.opcode = FUSE_LISTXATTR; args.nodeid = get_node_id(inode); args.in_numargs = 1; args.in_args[0].size = sizeof(inarg); args.in_args[0].value = &inarg; /* This is really two different operations rolled into one */ args.out_numargs = 1; if (size) { args.out_argvar = true; args.out_args[0].size = size; args.out_args[0].value = list; } else { args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; } ret = fuse_simple_request(fm, &args); if (!ret && !size) ret = min_t(size_t, outarg.size, XATTR_LIST_MAX); if (ret > 0 && size) ret = fuse_verify_xattr_list(list, ret); if (ret == -ENOSYS) { fm->fc->no_listxattr = 1; ret = -EOPNOTSUPP; } return ret; } int fuse_removexattr(struct inode *inode, const char *name) { struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); int err; if (fm->fc->no_removexattr) return -EOPNOTSUPP; args.opcode = FUSE_REMOVEXATTR; args.nodeid = get_node_id(inode); args.in_numargs = 2; fuse_set_zero_arg0(&args); args.in_args[1].size = strlen(name) + 1; args.in_args[1].value = name; err = fuse_simple_request(fm, &args); if (err == -ENOSYS) { fm->fc->no_removexattr = 1; err = -EOPNOTSUPP; } if (!err) fuse_update_ctime(inode); return err; } static int fuse_xattr_get(const struct xattr_handler *handler, struct dentry *dentry, struct inode *inode, const char *name, void *value, size_t size) { if (fuse_is_bad(inode)) return -EIO; return fuse_getxattr(inode, name, value, size); } static int fuse_xattr_set(const struct xattr_handler *handler, struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) { if (fuse_is_bad(inode)) return -EIO; if (!value) return fuse_removexattr(inode, name); return fuse_setxattr(inode, name, value, size, flags, 0); } static const struct xattr_handler fuse_xattr_handler = { .prefix = "", .get = fuse_xattr_get, .set = fuse_xattr_set, }; const struct xattr_handler * 
const fuse_xattr_handlers[] = { &fuse_xattr_handler, NULL };
30 30 30 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195
// SPDX-License-Identifier: GPL-2.0-or-later
/* Service connection management
 *
 * Copyright (C) 2016 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#include <linux/slab.h>
#include "ar-internal.h"

/*
 * Find a service connection under RCU conditions.
 *
 * We could use a hash table, but that is subject to bucket stuffing by an
 * attacker as the client gets to pick the epoch and cid values and would know
 * the hash function.  So, instead, we use a hash table for the peer and from
 * that an rbtree to find the service connection.  Under ordinary circumstances
 * it might be slower than a large hash table, but it is at least limited in
 * depth.
 *
 * @peer: peer whose service_conns tree is searched.
 * @skb:  incoming packet; its header epoch and cid (masked with
 *        RXRPC_CIDMASK) form the lookup key.
 *
 * Returns the matching connection, or NULL if the tree holds no entry with
 * that key.  No reference is taken on the result within this function.
 */
struct rxrpc_connection *rxrpc_find_service_conn_rcu(struct rxrpc_peer *peer,
						     struct sk_buff *skb)
{
	struct rxrpc_connection *conn = NULL;
	struct rxrpc_conn_proto k;
	struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
	struct rb_node *p;
	unsigned int seq = 1;

	/* Build the search key from the packet header */
	k.epoch	= sp->hdr.epoch;
	k.cid	= sp->hdr.cid & RXRPC_CIDMASK;

	do {
		/* Unfortunately, rbtree walking doesn't give reliable results
		 * under just the RCU read lock, so we have to check for
		 * changes.
		 */
		seq++; /* 2 on the 1st/lockless path, otherwise odd */
		read_seqbegin_or_lock(&peer->service_conn_lock, &seq);

		p = rcu_dereference_raw(peer->service_conns.rb_node);
		while (p) {
			conn = rb_entry(p, struct rxrpc_connection, service_node);

			/* Same ordering as the insert path: a node whose key
			 * is smaller than the search key sends us left.
			 */
			if (conn->proto.index_key < k.index_key)
				p = rcu_dereference_raw(p->rb_left);
			else if (conn->proto.index_key > k.index_key)
				p = rcu_dereference_raw(p->rb_right);
			else
				break;
			/* Not a match; make sure a miss yields NULL */
			conn = NULL;
		}
	} while (need_seqretry(&peer->service_conn_lock, seq));

	done_seqretry(&peer->service_conn_lock, seq);
	_leave(" = %d", conn ? conn->debug_id : -1);
	return conn;
}

/*
 * Insert a service connection into a peer's tree, thereby making it a target
 * for incoming packets.
 *
 * The whole operation runs under the write side of peer->service_conn_lock.
 * If an entry with the same key is already present, it is replaced when its
 * refcount reads zero; otherwise the function hits BUG().
 */
static void rxrpc_publish_service_conn(struct rxrpc_peer *peer,
				       struct rxrpc_connection *conn)
{
	struct rxrpc_connection *cursor = NULL;
	struct rxrpc_conn_proto k = conn->proto;
	struct rb_node **pp, *parent;

	write_seqlock(&peer->service_conn_lock);

	/* Descend to the link where the new node belongs */
	pp = &peer->service_conns.rb_node;
	parent = NULL;
	while (*pp) {
		parent = *pp;
		cursor = rb_entry(parent,
				  struct rxrpc_connection, service_node);

		if (cursor->proto.index_key < k.index_key)
			pp = &(*pp)->rb_left;
		else if (cursor->proto.index_key > k.index_key)
			pp = &(*pp)->rb_right;
		else
			goto found_extant_conn;
	}

	rb_link_node_rcu(&conn->service_node, parent, pp);
	rb_insert_color(&conn->service_node, &peer->service_conns);
conn_published:
	/* Flag tested by rxrpc_unpublish_service_conn() before erasing */
	set_bit(RXRPC_CONN_IN_SERVICE_CONNS, &conn->flags);
	write_sequnlock(&peer->service_conn_lock);
	_leave(" = %d [new]", conn->debug_id);
	return;

found_extant_conn:
	if (refcount_read(&cursor->ref) == 0)
		goto replace_old_connection;
	write_sequnlock(&peer->service_conn_lock);
	/* We should not be able to get here.  rxrpc_incoming_connection() is
	 * called in a non-reentrant context, so there can't be a race to
	 * insert a new connection.
	 */
	BUG();

replace_old_connection:
	/* The old connection is from an outdated epoch. */
	_debug("replace conn");
	rb_replace_node_rcu(&cursor->service_node,
			    &conn->service_node,
			    &peer->service_conns);
	clear_bit(RXRPC_CONN_IN_SERVICE_CONNS, &cursor->flags);
	goto conn_published;
}

/*
 * Preallocate a service connection.  The connection is placed on the proc and
 * reap lists so that we don't have to get the lock from BH context.
 *
 * Returns whatever rxrpc_alloc_connection() returned; the setup below is
 * skipped when that is NULL.
 */
struct rxrpc_connection *rxrpc_prealloc_service_connection(struct rxrpc_net *rxnet,
							   gfp_t gfp)
{
	struct rxrpc_connection *conn = rxrpc_alloc_connection(rxnet, gfp);

	if (conn) {
		/* We maintain an extra ref on the connection whilst it is on
		 * the rxrpc_connections list.
		 */
		conn->state = RXRPC_CONN_SERVICE_PREALLOC;
		refcount_set(&conn->ref, 2);

		atomic_inc(&rxnet->nr_conns);
		/* Both list insertions happen under the conn_lock write lock */
		write_lock(&rxnet->conn_lock);
		list_add_tail(&conn->link, &rxnet->service_conns);
		list_add_tail(&conn->proc_link, &rxnet->conn_proc_list);
		write_unlock(&rxnet->conn_lock);

		rxrpc_see_connection(conn, rxrpc_conn_new_service);
	}

	return conn;
}

/*
 * Set up an incoming connection.  This is called in BH context with the RCU
 * read lock held.
 *
 * Fills @conn from the header of @skb, records @sec as its security ops,
 * picks the initial state from the security index, applies a service upgrade
 * if the packet asks for one, and finally publishes the connection in the
 * tree of conn->peer.
 */
void rxrpc_new_incoming_connection(struct rxrpc_sock *rx,
				   struct rxrpc_connection *conn,
				   const struct rxrpc_security *sec,
				   struct sk_buff *skb)
{
	struct rxrpc_skb_priv *sp = rxrpc_skb(skb);

	_enter("");

	conn->proto.epoch	= sp->hdr.epoch;
	conn->proto.cid		= sp->hdr.cid & RXRPC_CIDMASK;
	conn->orig_service_id	= sp->hdr.serviceId;
	conn->service_id	= sp->hdr.serviceId;
	conn->security_ix	= sp->hdr.securityIndex;
	conn->out_clientflag	= 0;
	conn->security		= sec;
	/* A non-zero security index starts the connection as unsecured */
	if (conn->security_ix)
		conn->state	= RXRPC_CONN_SERVICE_UNSECURED;
	else
		conn->state	= RXRPC_CONN_SERVICE;

	/* See if we should upgrade the service.  This can only happen on the
	 * first packet on a new connection.  Once done, it applies to all
	 * subsequent calls on that connection.
	 */
	if (sp->hdr.userStatus == RXRPC_USERSTATUS_SERVICE_UPGRADE &&
	    conn->service_id == rx->service_upgrade.from)
		conn->service_id = rx->service_upgrade.to;

	atomic_set(&conn->active, 1);

	/* Make the connection a target for incoming packets. */
	rxrpc_publish_service_conn(conn->peer, conn);
}

/*
 * Remove the service connection from the peer's tree, thereby removing it as a
 * target for incoming packets.
 *
 * The node is erased only if RXRPC_CONN_IN_SERVICE_CONNS was set; the flag is
 * cleared by the same atomic test.
 */
void rxrpc_unpublish_service_conn(struct rxrpc_connection *conn)
{
	struct rxrpc_peer *peer = conn->peer;

	write_seqlock(&peer->service_conn_lock);
	if (test_and_clear_bit(RXRPC_CONN_IN_SERVICE_CONNS, &conn->flags))
		rb_erase(&conn->service_node, &peer->service_conns);
	write_sequnlock(&peer->service_conn_lock);
}
1 16 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 
526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 
1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 
1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2013 Trond Myklebust <Trond.Myklebust@netapp.com> */ #undef TRACE_SYSTEM #define TRACE_SYSTEM nfs #if !defined(_TRACE_NFS_H) || defined(TRACE_HEADER_MULTI_READ) 
#define _TRACE_NFS_H #include <linux/tracepoint.h> #include <linux/iversion.h> #include <trace/misc/fs.h> #include <trace/misc/nfs.h> #include <trace/misc/sunrpc.h> #define nfs_show_cache_validity(v) \ __print_flags(v, "|", \ { NFS_INO_INVALID_DATA, "INVALID_DATA" }, \ { NFS_INO_INVALID_ATIME, "INVALID_ATIME" }, \ { NFS_INO_INVALID_ACCESS, "INVALID_ACCESS" }, \ { NFS_INO_INVALID_ACL, "INVALID_ACL" }, \ { NFS_INO_REVAL_FORCED, "REVAL_FORCED" }, \ { NFS_INO_INVALID_LABEL, "INVALID_LABEL" }, \ { NFS_INO_INVALID_CHANGE, "INVALID_CHANGE" }, \ { NFS_INO_INVALID_CTIME, "INVALID_CTIME" }, \ { NFS_INO_INVALID_MTIME, "INVALID_MTIME" }, \ { NFS_INO_INVALID_SIZE, "INVALID_SIZE" }, \ { NFS_INO_INVALID_OTHER, "INVALID_OTHER" }, \ { NFS_INO_DATA_INVAL_DEFER, "DATA_INVAL_DEFER" }, \ { NFS_INO_INVALID_BLOCKS, "INVALID_BLOCKS" }, \ { NFS_INO_INVALID_XATTR, "INVALID_XATTR" }, \ { NFS_INO_INVALID_NLINK, "INVALID_NLINK" }, \ { NFS_INO_INVALID_MODE, "INVALID_MODE" }) #define nfs_show_nfsi_flags(v) \ __print_flags(v, "|", \ { BIT(NFS_INO_STALE), "STALE" }, \ { BIT(NFS_INO_ACL_LRU_SET), "ACL_LRU_SET" }, \ { BIT(NFS_INO_INVALIDATING), "INVALIDATING" }, \ { BIT(NFS_INO_LAYOUTCOMMIT), "NEED_LAYOUTCOMMIT" }, \ { BIT(NFS_INO_LAYOUTCOMMITTING), "LAYOUTCOMMIT" }, \ { BIT(NFS_INO_LAYOUTSTATS), "LAYOUTSTATS" }, \ { BIT(NFS_INO_ODIRECT), "ODIRECT" }) DECLARE_EVENT_CLASS(nfs_inode_event, TP_PROTO( const struct inode *inode ), TP_ARGS(inode), TP_STRUCT__entry( __field(dev_t, dev) __field(u32, fhandle) __field(u64, fileid) __field(u64, version) ), TP_fast_assign( const struct nfs_inode *nfsi = NFS_I(inode); __entry->dev = inode->i_sb->s_dev; __entry->fileid = nfsi->fileid; __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); __entry->version = inode_peek_iversion_raw(inode); ), TP_printk( "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu ", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, (unsigned long long)__entry->version ) ); 
/*
 * Event class for tracepoints that take an inode and a completion status.
 * In addition to the inode identity it snapshots the dirent type, size,
 * nfs_inode flags and cache_validity.
 *
 * @error is stored as a non-negative number (-error when error < 0, else 0)
 * and negated again when printed.
 */
DECLARE_EVENT_CLASS(nfs_inode_event_done,
		TP_PROTO(
			const struct inode *inode,
			int error
		),

		TP_ARGS(inode, error),

		TP_STRUCT__entry(
			__field(unsigned long, error)
			__field(dev_t, dev)
			__field(u32, fhandle)
			__field(unsigned char, type)
			__field(u64, fileid)
			__field(u64, version)
			__field(loff_t, size)
			__field(unsigned long, nfsi_flags)
			__field(unsigned long, cache_validity)
		),

		TP_fast_assign(
			const struct nfs_inode *nfsi = NFS_I(inode);
			__entry->error = error < 0 ? -error : 0;
			__entry->dev = inode->i_sb->s_dev;
			__entry->fileid = nfsi->fileid;
			__entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
			__entry->type = nfs_umode_to_dtype(inode->i_mode);
			__entry->version = inode_peek_iversion_raw(inode);
			__entry->size = i_size_read(inode);
			__entry->nfsi_flags = nfsi->flags;
			__entry->cache_validity = nfsi->cache_validity;
		),

		TP_printk(
			"error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
			"type=%u (%s) version=%llu size=%lld "
			"cache_validity=0x%lx (%s) nfs_flags=0x%lx (%s)",
			-__entry->error, show_nfs_status(__entry->error),
			MAJOR(__entry->dev), MINOR(__entry->dev),
			(unsigned long long)__entry->fileid,
			__entry->fhandle,
			__entry->type,
			show_fs_dirent_type(__entry->type),
			(unsigned long long)__entry->version,
			(long long)__entry->size,
			__entry->cache_validity,
			nfs_show_cache_validity(__entry->cache_validity),
			__entry->nfsi_flags,
			nfs_show_nfsi_flags(__entry->nfsi_flags)
		)
);

/* Instantiate a tracepoint @name of class nfs_inode_event */
#define DEFINE_NFS_INODE_EVENT(name) \
	DEFINE_EVENT(nfs_inode_event, name, \
			TP_PROTO( \
				const struct inode *inode \
			), \
			TP_ARGS(inode))
/* Instantiate a tracepoint @name of class nfs_inode_event_done */
#define DEFINE_NFS_INODE_EVENT_DONE(name) \
	DEFINE_EVENT(nfs_inode_event_done, name, \
			TP_PROTO( \
				const struct inode *inode, \
				int error \
			), \
			TP_ARGS(inode, error))
DEFINE_NFS_INODE_EVENT(nfs_set_inode_stale);
DEFINE_NFS_INODE_EVENT(nfs_refresh_inode_enter);
DEFINE_NFS_INODE_EVENT_DONE(nfs_refresh_inode_exit);
DEFINE_NFS_INODE_EVENT(nfs_revalidate_inode_enter);
DEFINE_NFS_INODE_EVENT_DONE(nfs_revalidate_inode_exit);
DEFINE_NFS_INODE_EVENT(nfs_invalidate_mapping_enter);
DEFINE_NFS_INODE_EVENT_DONE(nfs_invalidate_mapping_exit);
DEFINE_NFS_INODE_EVENT(nfs_getattr_enter);
DEFINE_NFS_INODE_EVENT_DONE(nfs_getattr_exit);
DEFINE_NFS_INODE_EVENT(nfs_setattr_enter);
DEFINE_NFS_INODE_EVENT_DONE(nfs_setattr_exit);
DEFINE_NFS_INODE_EVENT(nfs_writeback_inode_enter);
DEFINE_NFS_INODE_EVENT_DONE(nfs_writeback_inode_exit);
DEFINE_NFS_INODE_EVENT(nfs_fsync_enter);
DEFINE_NFS_INODE_EVENT_DONE(nfs_fsync_exit);
DEFINE_NFS_INODE_EVENT(nfs_access_enter);
DEFINE_NFS_INODE_EVENT_DONE(nfs_set_cache_invalid);
DEFINE_NFS_INODE_EVENT(nfs_readdir_force_readdirplus);
DEFINE_NFS_INODE_EVENT_DONE(nfs_readdir_cache_fill_done);
DEFINE_NFS_INODE_EVENT_DONE(nfs_readdir_uncached_done);

/*
 * Stand-alone event for the end of an access check.  Records the inode
 * identity, type, version, size, flags and cache_validity, plus the
 * requested @mask and the @permitted bits.
 *
 * @error is stored as a non-negative number (-error when error < 0, else 0)
 * and negated again when printed.
 */
TRACE_EVENT(nfs_access_exit,
		TP_PROTO(
			const struct inode *inode,
			unsigned int mask,
			unsigned int permitted,
			int error
		),

		TP_ARGS(inode, mask, permitted, error),

		TP_STRUCT__entry(
			__field(unsigned long, error)
			__field(dev_t, dev)
			__field(u32, fhandle)
			__field(unsigned char, type)
			__field(u64, fileid)
			__field(u64, version)
			__field(loff_t, size)
			__field(unsigned long, nfsi_flags)
			__field(unsigned long, cache_validity)
			__field(unsigned int, mask)
			__field(unsigned int, permitted)
		),

		TP_fast_assign(
			const struct nfs_inode *nfsi = NFS_I(inode);
			__entry->error = error < 0 ? -error : 0;
			__entry->dev = inode->i_sb->s_dev;
			__entry->fileid = nfsi->fileid;
			__entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
			__entry->type = nfs_umode_to_dtype(inode->i_mode);
			__entry->version = inode_peek_iversion_raw(inode);
			__entry->size = i_size_read(inode);
			__entry->nfsi_flags = nfsi->flags;
			__entry->cache_validity = nfsi->cache_validity;
			__entry->mask = mask;
			__entry->permitted = permitted;
		),

		TP_printk(
			"error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
			"type=%u (%s) version=%llu size=%lld "
			"cache_validity=0x%lx (%s) nfs_flags=0x%lx (%s) "
			"mask=0x%x permitted=0x%x",
			-__entry->error, show_nfs_status(__entry->error),
			MAJOR(__entry->dev), MINOR(__entry->dev),
			(unsigned long long)__entry->fileid,
			__entry->fhandle,
			__entry->type,
			show_fs_dirent_type(__entry->type),
			(unsigned long long)__entry->version,
			(long long)__entry->size,
			__entry->cache_validity,
			nfs_show_cache_validity(__entry->cache_validity),
			__entry->nfsi_flags,
			nfs_show_nfsi_flags(__entry->nfsi_flags),
			__entry->mask, __entry->permitted
		)
);

/*
 * Event class for size changes.  Records the inode identity and version
 * together with the size read from the inode at trace time (cur_size) and
 * the @new_size passed by the caller.
 */
DECLARE_EVENT_CLASS(nfs_update_size_class,
		TP_PROTO(
			const struct inode *inode,
			loff_t new_size
		),

		TP_ARGS(inode, new_size),

		TP_STRUCT__entry(
			__field(dev_t, dev)
			__field(u32, fhandle)
			__field(u64, fileid)
			__field(u64, version)
			__field(loff_t, cur_size)
			__field(loff_t, new_size)
		),

		TP_fast_assign(
			const struct nfs_inode *nfsi = NFS_I(inode);

			__entry->dev = inode->i_sb->s_dev;
			__entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
			__entry->fileid = nfsi->fileid;
			__entry->version = inode_peek_iversion_raw(inode);
			__entry->cur_size = i_size_read(inode);
			__entry->new_size = new_size;
		),

		TP_printk(
			"fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu cursize=%lld newsize=%lld",
			MAJOR(__entry->dev), MINOR(__entry->dev),
			(unsigned long long)__entry->fileid,
			__entry->fhandle, __entry->version,
			__entry->cur_size, __entry->new_size
		)
);

/* Instantiate tracepoint nfs_size_<name> of class nfs_update_size_class */
#define DEFINE_NFS_UPDATE_SIZE_EVENT(name) \
	DEFINE_EVENT(nfs_update_size_class, nfs_size_##name, \
			TP_PROTO( \
				const struct inode *inode, \
				loff_t new_size \
			), \
			TP_ARGS(inode, new_size))

DEFINE_NFS_UPDATE_SIZE_EVENT(truncate);
DEFINE_NFS_UPDATE_SIZE_EVENT(wcc);
DEFINE_NFS_UPDATE_SIZE_EVENT(update);
DEFINE_NFS_UPDATE_SIZE_EVENT(grow);

/*
 * Event class for operations on a byte range of an inode.  Records the
 * inode identity and version plus the two range bounds as passed in.
 */
DECLARE_EVENT_CLASS(nfs_inode_range_event,
		TP_PROTO(
			const struct inode *inode,
			loff_t range_start,
			loff_t range_end
		),

		TP_ARGS(inode, range_start, range_end),

		TP_STRUCT__entry(
			__field(dev_t, dev)
			__field(u32, fhandle)
			__field(u64, fileid)
			__field(u64, version)
			__field(loff_t, range_start)
			__field(loff_t, range_end)
		),

		TP_fast_assign(
			const struct nfs_inode *nfsi = NFS_I(inode);

			__entry->dev = inode->i_sb->s_dev;
			__entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
			__entry->fileid = nfsi->fileid;
			__entry->version = inode_peek_iversion_raw(inode);
			__entry->range_start = range_start;
			__entry->range_end = range_end;
		),

		TP_printk(
			"fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu "
			"range=[%lld, %lld]",
			MAJOR(__entry->dev), MINOR(__entry->dev),
			(unsigned long long)__entry->fileid,
			__entry->fhandle, __entry->version,
			__entry->range_start, __entry->range_end
		)
);

/* Instantiate a tracepoint @name of class nfs_inode_range_event */
#define DEFINE_NFS_INODE_RANGE_EVENT(name) \
	DEFINE_EVENT(nfs_inode_range_event, name, \
			TP_PROTO( \
				const struct inode *inode, \
				loff_t range_start, \
				loff_t range_end \
			), \
			TP_ARGS(inode, range_start, range_end))

DEFINE_NFS_INODE_RANGE_EVENT(nfs_readdir_invalidate_cache_range);

/*
 * Event class for readdir requests.  Records the identity and version of the
 * directory behind @file, the readdir @cookie, the cache page index and
 * dtsize.
 *
 * The verifier is copied from @verifier only when @cookie is non-zero;
 * for a zero cookie the stored verifier is all zeroes and @verifier is not
 * read.
 */
DECLARE_EVENT_CLASS(nfs_readdir_event,
		TP_PROTO(
			const struct file *file,
			const __be32 *verifier,
			u64 cookie,
			pgoff_t page_index,
			unsigned int dtsize
		),

		TP_ARGS(file, verifier, cookie, page_index, dtsize),

		TP_STRUCT__entry(
			__field(dev_t, dev)
			__field(u32, fhandle)
			__field(u64, fileid)
			__field(u64, version)
			__array(char, verifier, NFS4_VERIFIER_SIZE)
			__field(u64, cookie)
			__field(pgoff_t, index)
			__field(unsigned int, dtsize)
		),

		TP_fast_assign(
			const struct inode *dir = file_inode(file);
			const struct nfs_inode *nfsi = NFS_I(dir);

			__entry->dev = dir->i_sb->s_dev;
			__entry->fileid = nfsi->fileid;
			__entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
			__entry->version = inode_peek_iversion_raw(dir);
			if (cookie != 0)
				memcpy(__entry->verifier, verifier,
				       NFS4_VERIFIER_SIZE);
			else
				memset(__entry->verifier, 0,
				       NFS4_VERIFIER_SIZE);
			__entry->cookie = cookie;
			__entry->index = page_index;
			__entry->dtsize = dtsize;
		),

		TP_printk(
			"fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu "
			"cookie=%s:0x%llx cache_index=%lu dtsize=%u",
			MAJOR(__entry->dev), MINOR(__entry->dev),
			(unsigned long long)__entry->fileid, __entry->fhandle,
			__entry->version, show_nfs4_verifier(__entry->verifier),
			(unsigned long long)__entry->cookie, __entry->index,
			__entry->dtsize
		)
);

/* Instantiate a tracepoint @name of class nfs_readdir_event */
#define DEFINE_NFS_READDIR_EVENT(name) \
	DEFINE_EVENT(nfs_readdir_event, name, \
			TP_PROTO( \
				const struct file *file, \
				const __be32 *verifier, \
				u64 cookie, \
				pgoff_t page_index, \
				unsigned int dtsize \
				), \
			TP_ARGS(file, verifier, cookie, page_index, dtsize))

DEFINE_NFS_READDIR_EVENT(nfs_readdir_cache_fill);
DEFINE_NFS_READDIR_EVENT(nfs_readdir_uncached);

/*
 * Event class for the start of a lookup.  Records the lookup flags, the
 * parent directory's device and fileid, the dentry name, and the fileid of
 * the dentry's inode (0 when the dentry is negative).
 */
DECLARE_EVENT_CLASS(nfs_lookup_event,
		TP_PROTO(
			const struct inode *dir,
			const struct dentry *dentry,
			unsigned int flags
		),

		TP_ARGS(dir, dentry, flags),

		TP_STRUCT__entry(
			__field(unsigned long, flags)
			__field(dev_t, dev)
			__field(u64, dir)
			__field(u64, fileid)
			__string(name, dentry->d_name.name)
		),

		TP_fast_assign(
			__entry->dev = dir->i_sb->s_dev;
			__entry->dir = NFS_FILEID(dir);
			__entry->flags = flags;
			__entry->fileid = d_is_negative(dentry) ? 0 : NFS_FILEID(d_inode(dentry));
			__assign_str(name);
		),

		TP_printk(
			"flags=0x%lx (%s) name=%02x:%02x:%llu/%s fileid=%llu",
			__entry->flags,
			show_fs_lookup_flags(__entry->flags),
			MAJOR(__entry->dev), MINOR(__entry->dev),
			(unsigned long long)__entry->dir,
			__get_str(name),
			__entry->fileid
		)
);

/* Instantiate a tracepoint @name of class nfs_lookup_event */
#define DEFINE_NFS_LOOKUP_EVENT(name) \
	DEFINE_EVENT(nfs_lookup_event, name, \
			TP_PROTO( \
				const struct inode *dir, \
				const struct dentry *dentry, \
				unsigned int flags \
			), \
			TP_ARGS(dir, dentry, flags))

/*
 * Event class for the end of a lookup: same payload as nfs_lookup_event plus
 * a status.  @error is stored as a non-negative number (-error when
 * error < 0, else 0) and negated again when printed.
 */
DECLARE_EVENT_CLASS(nfs_lookup_event_done,
		TP_PROTO(
			const struct inode *dir,
			const struct dentry *dentry,
			unsigned int flags,
			int error
		),

		TP_ARGS(dir, dentry, flags, error),

		TP_STRUCT__entry(
			__field(unsigned long, error)
			__field(unsigned long, flags)
			__field(dev_t, dev)
			__field(u64, dir)
			__field(u64, fileid)
			__string(name, dentry->d_name.name)
		),

		TP_fast_assign(
			__entry->dev = dir->i_sb->s_dev;
			__entry->dir = NFS_FILEID(dir);
			__entry->error = error < 0 ? -error : 0;
			__entry->flags = flags;
			__entry->fileid = d_is_negative(dentry) ? 0 : NFS_FILEID(d_inode(dentry));
			__assign_str(name);
		),

		TP_printk(
			"error=%ld (%s) flags=0x%lx (%s) name=%02x:%02x:%llu/%s fileid=%llu",
			-__entry->error, show_nfs_status(__entry->error),
			__entry->flags,
			show_fs_lookup_flags(__entry->flags),
			MAJOR(__entry->dev), MINOR(__entry->dev),
			(unsigned long long)__entry->dir,
			__get_str(name),
			__entry->fileid
		)
);

/* Instantiate a tracepoint @name of class nfs_lookup_event_done */
#define DEFINE_NFS_LOOKUP_EVENT_DONE(name) \
	DEFINE_EVENT(nfs_lookup_event_done, name, \
			TP_PROTO( \
				const struct inode *dir, \
				const struct dentry *dentry, \
				unsigned int flags, \
				int error \
			), \
			TP_ARGS(dir, dentry, flags, error))

DEFINE_NFS_LOOKUP_EVENT(nfs_lookup_enter);
DEFINE_NFS_LOOKUP_EVENT_DONE(nfs_lookup_exit);
DEFINE_NFS_LOOKUP_EVENT(nfs_lookup_revalidate_enter);
DEFINE_NFS_LOOKUP_EVENT_DONE(nfs_lookup_revalidate_exit);
DEFINE_NFS_LOOKUP_EVENT(nfs_readdir_lookup);
DEFINE_NFS_LOOKUP_EVENT(nfs_readdir_lookup_revalidate_failed);
DEFINE_NFS_LOOKUP_EVENT_DONE(nfs_readdir_lookup_revalidate);

/*
 * Start of an atomic open.  Records the open flags, the mode taken from
 * ctx->mode, the parent directory identity and the name of ctx->dentry.
 */
TRACE_EVENT(nfs_atomic_open_enter,
		TP_PROTO(
			const struct inode *dir,
			const struct nfs_open_context *ctx,
			unsigned int flags
		),

		TP_ARGS(dir, ctx, flags),

		TP_STRUCT__entry(
			__field(unsigned long, flags)
			__field(unsigned long, fmode)
			__field(dev_t, dev)
			__field(u64, dir)
			__string(name, ctx->dentry->d_name.name)
		),

		TP_fast_assign(
			__entry->dev = dir->i_sb->s_dev;
			__entry->dir = NFS_FILEID(dir);
			__entry->flags = flags;
			__entry->fmode = (__force unsigned long)ctx->mode;
			__assign_str(name);
		),

		TP_printk(
			"flags=0x%lx (%s) fmode=%s name=%02x:%02x:%llu/%s",
			__entry->flags,
			show_fs_fcntl_open_flags(__entry->flags),
			show_fs_fmode_flags(__entry->fmode),
			MAJOR(__entry->dev), MINOR(__entry->dev),
			(unsigned long long)__entry->dir,
			__get_str(name)
		)
);

/*
 * End of an atomic open: same payload as nfs_atomic_open_enter plus a
 * status.
 *
 * NOTE(review): @error is stored as plain -error here, whereas
 * nfs_lookup_event_done stores 0 for any non-negative value.  This presumably
 * relies on callers only passing 0 or a negative errno; confirm.
 */
TRACE_EVENT(nfs_atomic_open_exit,
		TP_PROTO(
			const struct inode *dir,
			const struct nfs_open_context *ctx,
			unsigned int flags,
			int error
		),

		TP_ARGS(dir, ctx, flags, error),

		TP_STRUCT__entry(
			__field(unsigned long, error)
			__field(unsigned long, flags)
			__field(unsigned long, fmode)
			__field(dev_t, dev)
			__field(u64, dir)
			__string(name, ctx->dentry->d_name.name)
		),

		TP_fast_assign(
			__entry->error = -error;
			__entry->dev = dir->i_sb->s_dev;
			__entry->dir = NFS_FILEID(dir);
			__entry->flags = flags;
			__entry->fmode = (__force unsigned long)ctx->mode;
			__assign_str(name);
		),

		TP_printk(
			"error=%ld (%s) flags=0x%lx (%s) fmode=%s "
			"name=%02x:%02x:%llu/%s",
			-__entry->error, show_nfs_status(__entry->error),
			__entry->flags,
			show_fs_fcntl_open_flags(__entry->flags),
			show_fs_fmode_flags(__entry->fmode),
			MAJOR(__entry->dev), MINOR(__entry->dev),
			(unsigned long long)__entry->dir,
			__get_str(name)
		)
);

/*
 * Start of a create.  Records the open flags, the parent directory identity
 * and the dentry name.
 */
TRACE_EVENT(nfs_create_enter,
		TP_PROTO(
			const struct inode *dir,
			const struct dentry *dentry,
			unsigned int flags
		),

		TP_ARGS(dir, dentry, flags),

		TP_STRUCT__entry(
			__field(unsigned long, flags)
			__field(dev_t, dev)
			__field(u64, dir)
			__string(name, dentry->d_name.name)
		),

		TP_fast_assign(
			__entry->dev = dir->i_sb->s_dev;
			__entry->dir = NFS_FILEID(dir);
			__entry->flags = flags;
			__assign_str(name);
		),

		TP_printk(
			"flags=0x%lx (%s) name=%02x:%02x:%llu/%s",
			__entry->flags,
			show_fs_fcntl_open_flags(__entry->flags),
			MAJOR(__entry->dev), MINOR(__entry->dev),
			(unsigned long long)__entry->dir,
			__get_str(name)
		)
);

/*
 * End of a create: same payload as nfs_create_enter plus a status.
 *
 * NOTE(review): as in nfs_atomic_open_exit, @error is stored as plain -error
 * without clamping non-negative values to 0; confirm callers never pass a
 * positive value.
 */
TRACE_EVENT(nfs_create_exit,
		TP_PROTO(
			const struct inode *dir,
			const struct dentry *dentry,
			unsigned int flags,
			int error
		),

		TP_ARGS(dir, dentry, flags, error),

		TP_STRUCT__entry(
			__field(unsigned long, error)
			__field(unsigned long, flags)
			__field(dev_t, dev)
			__field(u64, dir)
			__string(name, dentry->d_name.name)
		),

		TP_fast_assign(
			__entry->error = -error;
			__entry->dev = dir->i_sb->s_dev;
			__entry->dir = NFS_FILEID(dir);
			__entry->flags = flags;
			__assign_str(name);
		),

		TP_printk(
			"error=%ld (%s) flags=0x%lx (%s) name=%02x:%02x:%llu/%s",
			-__entry->error, show_nfs_status(__entry->error),
			__entry->flags,
			show_fs_fcntl_open_flags(__entry->flags),
			MAJOR(__entry->dev), MINOR(__entry->dev),
			(unsigned long long)__entry->dir,
			__get_str(name)
		)
);
/*
 * Event class for the start of a directory operation on one entry.  Records
 * the parent directory's device and fileid and the dentry name.
 */
DECLARE_EVENT_CLASS(nfs_directory_event,
		TP_PROTO(
			const struct inode *dir,
			const struct dentry *dentry
		),

		TP_ARGS(dir, dentry),

		TP_STRUCT__entry(
			__field(dev_t, dev)
			__field(u64, dir)
			__string(name, dentry->d_name.name)
		),

		TP_fast_assign(
			__entry->dev = dir->i_sb->s_dev;
			__entry->dir = NFS_FILEID(dir);
			__assign_str(name);
		),

		TP_printk(
			"name=%02x:%02x:%llu/%s",
			MAJOR(__entry->dev), MINOR(__entry->dev),
			(unsigned long long)__entry->dir,
			__get_str(name)
		)
);

/* Instantiate a tracepoint @name of class nfs_directory_event */
#define DEFINE_NFS_DIRECTORY_EVENT(name) \
	DEFINE_EVENT(nfs_directory_event, name, \
			TP_PROTO( \
				const struct inode *dir, \
				const struct dentry *dentry \
			), \
			TP_ARGS(dir, dentry))

/*
 * Event class for the end of a directory operation: same payload as
 * nfs_directory_event plus a status.  @error is stored as a non-negative
 * number (-error when error < 0, else 0) and negated again when printed.
 */
DECLARE_EVENT_CLASS(nfs_directory_event_done,
		TP_PROTO(
			const struct inode *dir,
			const struct dentry *dentry,
			int error
		),

		TP_ARGS(dir, dentry, error),

		TP_STRUCT__entry(
			__field(unsigned long, error)
			__field(dev_t, dev)
			__field(u64, dir)
			__string(name, dentry->d_name.name)
		),

		TP_fast_assign(
			__entry->dev = dir->i_sb->s_dev;
			__entry->dir = NFS_FILEID(dir);
			__entry->error = error < 0 ? -error : 0;
			__assign_str(name);
		),

		TP_printk(
			"error=%ld (%s) name=%02x:%02x:%llu/%s",
			-__entry->error, show_nfs_status(__entry->error),
			MAJOR(__entry->dev), MINOR(__entry->dev),
			(unsigned long long)__entry->dir,
			__get_str(name)
		)
);

/* Instantiate a tracepoint @name of class nfs_directory_event_done */
#define DEFINE_NFS_DIRECTORY_EVENT_DONE(name) \
	DEFINE_EVENT(nfs_directory_event_done, name, \
			TP_PROTO( \
				const struct inode *dir, \
				const struct dentry *dentry, \
				int error \
			), \
			TP_ARGS(dir, dentry, error))

DEFINE_NFS_DIRECTORY_EVENT(nfs_mknod_enter);
DEFINE_NFS_DIRECTORY_EVENT_DONE(nfs_mknod_exit);
DEFINE_NFS_DIRECTORY_EVENT(nfs_mkdir_enter);
DEFINE_NFS_DIRECTORY_EVENT_DONE(nfs_mkdir_exit);
DEFINE_NFS_DIRECTORY_EVENT(nfs_rmdir_enter);
DEFINE_NFS_DIRECTORY_EVENT_DONE(nfs_rmdir_exit);
DEFINE_NFS_DIRECTORY_EVENT(nfs_remove_enter);
DEFINE_NFS_DIRECTORY_EVENT_DONE(nfs_remove_exit);
DEFINE_NFS_DIRECTORY_EVENT(nfs_unlink_enter);
DEFINE_NFS_DIRECTORY_EVENT_DONE(nfs_unlink_exit);
DEFINE_NFS_DIRECTORY_EVENT(nfs_symlink_enter);
DEFINE_NFS_DIRECTORY_EVENT_DONE(nfs_symlink_exit);

/*
 * Start of a hard-link operation.  Records the fileid of the inode being
 * linked, the fileid of the target directory and the new name.  The device
 * is taken from @inode and printed for both the file and the directory.
 */
TRACE_EVENT(nfs_link_enter,
		TP_PROTO(
			const struct inode *inode,
			const struct inode *dir,
			const struct dentry *dentry
		),

		TP_ARGS(inode, dir, dentry),

		TP_STRUCT__entry(
			__field(dev_t, dev)
			__field(u64, fileid)
			__field(u64, dir)
			__string(name, dentry->d_name.name)
		),

		TP_fast_assign(
			__entry->dev = inode->i_sb->s_dev;
			__entry->fileid = NFS_FILEID(inode);
			__entry->dir = NFS_FILEID(dir);
			__assign_str(name);
		),

		TP_printk(
			"fileid=%02x:%02x:%llu name=%02x:%02x:%llu/%s",
			MAJOR(__entry->dev), MINOR(__entry->dev),
			__entry->fileid,
			MAJOR(__entry->dev), MINOR(__entry->dev),
			(unsigned long long)__entry->dir,
			__get_str(name)
		)
);

/*
 * End of a hard-link operation: same payload as nfs_link_enter plus a
 * status.  @error is stored as a non-negative number (-error when
 * error < 0, else 0) and negated again when printed.
 */
TRACE_EVENT(nfs_link_exit,
		TP_PROTO(
			const struct inode *inode,
			const struct inode *dir,
			const struct dentry *dentry,
			int error
		),

		TP_ARGS(inode, dir, dentry, error),

		TP_STRUCT__entry(
			__field(unsigned long, error)
			__field(dev_t, dev)
			__field(u64, fileid)
			__field(u64, dir)
			__string(name, dentry->d_name.name)
		),

		TP_fast_assign(
			__entry->dev = inode->i_sb->s_dev;
			__entry->fileid = NFS_FILEID(inode);
			__entry->dir = NFS_FILEID(dir);
			__entry->error = error < 0 ? -error : 0;
			__assign_str(name);
		),

		TP_printk(
			"error=%ld (%s) fileid=%02x:%02x:%llu name=%02x:%02x:%llu/%s",
			-__entry->error, show_nfs_status(__entry->error),
			MAJOR(__entry->dev), MINOR(__entry->dev),
			__entry->fileid,
			MAJOR(__entry->dev), MINOR(__entry->dev),
			(unsigned long long)__entry->dir,
			__get_str(name)
		)
);

/*
 * Event class for the start of a rename.  Records the fileids of the old and
 * new parent directories and both names.  The device is taken from @old_dir
 * and printed for both paths.
 */
DECLARE_EVENT_CLASS(nfs_rename_event,
		TP_PROTO(
			const struct inode *old_dir,
			const struct dentry *old_dentry,
			const struct inode *new_dir,
			const struct dentry *new_dentry
		),

		TP_ARGS(old_dir, old_dentry, new_dir, new_dentry),

		TP_STRUCT__entry(
			__field(dev_t, dev)
			__field(u64, old_dir)
			__field(u64, new_dir)
			__string(old_name, old_dentry->d_name.name)
			__string(new_name, new_dentry->d_name.name)
		),

		TP_fast_assign(
			__entry->dev = old_dir->i_sb->s_dev;
			__entry->old_dir = NFS_FILEID(old_dir);
			__entry->new_dir = NFS_FILEID(new_dir);
			__assign_str(old_name);
			__assign_str(new_name);
		),

		TP_printk(
			"old_name=%02x:%02x:%llu/%s new_name=%02x:%02x:%llu/%s",
			MAJOR(__entry->dev), MINOR(__entry->dev),
			(unsigned long long)__entry->old_dir,
			__get_str(old_name),
			MAJOR(__entry->dev), MINOR(__entry->dev),
			(unsigned long long)__entry->new_dir,
			__get_str(new_name)
		)
);
/* Instantiate a tracepoint @name of class nfs_rename_event */
#define DEFINE_NFS_RENAME_EVENT(name) \
	DEFINE_EVENT(nfs_rename_event, name, \
			TP_PROTO( \
				const struct inode *old_dir, \
				const struct dentry *old_dentry, \
				const struct inode *new_dir, \
				const struct dentry *new_dentry \
			), \
			TP_ARGS(old_dir, old_dentry, new_dir, new_dentry))

/*
 * Event class for the end of a rename: same payload as nfs_rename_event plus
 * a status.
 *
 * NOTE(review): @error is stored as plain -error here, whereas
 * nfs_directory_event_done stores 0 for any non-negative value.  This
 * presumably relies on callers only passing 0 or a negative errno; confirm.
 */
DECLARE_EVENT_CLASS(nfs_rename_event_done,
		TP_PROTO(
			const struct inode *old_dir,
			const struct dentry *old_dentry,
			const struct inode *new_dir,
			const struct dentry *new_dentry,
			int error
		),

		TP_ARGS(old_dir, old_dentry, new_dir, new_dentry, error),

		TP_STRUCT__entry(
			__field(dev_t, dev)
			__field(unsigned long, error)
			__field(u64, old_dir)
			__string(old_name, old_dentry->d_name.name)
			__field(u64, new_dir)
			__string(new_name, new_dentry->d_name.name)
		),

		TP_fast_assign(
			__entry->dev = old_dir->i_sb->s_dev;
			__entry->error = -error;
			__entry->old_dir = NFS_FILEID(old_dir);
			__entry->new_dir = NFS_FILEID(new_dir);
			__assign_str(old_name);
			__assign_str(new_name);
		),

		TP_printk(
			"error=%ld (%s) old_name=%02x:%02x:%llu/%s "
			"new_name=%02x:%02x:%llu/%s",
			-__entry->error, show_nfs_status(__entry->error),
			MAJOR(__entry->dev), MINOR(__entry->dev),
			(unsigned long long)__entry->old_dir,
			__get_str(old_name),
			MAJOR(__entry->dev), MINOR(__entry->dev),
			(unsigned long long)__entry->new_dir,
			__get_str(new_name)
		)
);
/* Instantiate a tracepoint @name of class nfs_rename_event_done */
#define DEFINE_NFS_RENAME_EVENT_DONE(name) \
	DEFINE_EVENT(nfs_rename_event_done, name, \
			TP_PROTO( \
				const struct inode *old_dir, \
				const struct dentry *old_dentry, \
				const struct inode *new_dir, \
				const struct dentry *new_dentry, \
				int error \
			), \
			TP_ARGS(old_dir, old_dentry, new_dir, \
				new_dentry, error))

DEFINE_NFS_RENAME_EVENT(nfs_rename_enter);
DEFINE_NFS_RENAME_EVENT_DONE(nfs_rename_exit);
DEFINE_NFS_RENAME_EVENT_DONE(nfs_async_rename_done);

/*
 * Completion of the unlink described by @data.  The parent directory is
 * taken from data->dentry->d_parent.  The name comes from data->args.name
 * and is copied by length, then NUL-terminated by hand, which is why the
 * dynamic array is sized len + 1.
 *
 * NOTE(review): @error is stored as plain -error, without the clamp used by
 * nfs_directory_event_done; confirm callers never pass a positive value.
 */
TRACE_EVENT(nfs_sillyrename_unlink,
		TP_PROTO(
			const struct nfs_unlinkdata *data,
			int error
		),

		TP_ARGS(data, error),

		TP_STRUCT__entry(
			__field(dev_t, dev)
			__field(unsigned long, error)
			__field(u64, dir)
			__dynamic_array(char, name, data->args.name.len + 1)
		),

		TP_fast_assign(
			struct inode *dir = d_inode(data->dentry->d_parent);
			size_t len = data->args.name.len;
			__entry->dev = dir->i_sb->s_dev;
			__entry->dir = NFS_FILEID(dir);
			__entry->error = -error;
			memcpy(__get_str(name),
				data->args.name.name, len);
			__get_str(name)[len] = 0;
		),

		TP_printk(
			"error=%ld (%s) name=%02x:%02x:%llu/%s",
			-__entry->error, show_nfs_status(__entry->error),
			MAJOR(__entry->dev), MINOR(__entry->dev),
			(unsigned long long)__entry->dir,
			__get_str(name)
		)
);

DECLARE_EVENT_CLASS(nfs_folio_event,
		TP_PROTO(
			const struct inode *inode,
			loff_t offset,
			size_t count
		),

		TP_ARGS(inode, offset, count),

		TP_STRUCT__entry(
			__field(dev_t, dev)
			__field(u32, fhandle)
			__field(u64, fileid)
			__field(u64,
version) __field(loff_t, offset) __field(size_t, count) ), TP_fast_assign( const struct nfs_inode *nfsi = NFS_I(inode); __entry->dev = inode->i_sb->s_dev; __entry->fileid = nfsi->fileid; __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); __entry->version = inode_peek_iversion_raw(inode); __entry->offset = offset, __entry->count = count; ), TP_printk( "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu " "offset=%lld count=%zu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, __entry->version, __entry->offset, __entry->count ) ); #define DEFINE_NFS_FOLIO_EVENT(name) \ DEFINE_EVENT(nfs_folio_event, name, \ TP_PROTO( \ const struct inode *inode, \ loff_t offset, \ size_t count \ ), \ TP_ARGS(inode, offset, count)) DECLARE_EVENT_CLASS(nfs_folio_event_done, TP_PROTO( const struct inode *inode, loff_t offset, size_t count, int ret ), TP_ARGS(inode, offset, count, ret), TP_STRUCT__entry( __field(dev_t, dev) __field(u32, fhandle) __field(int, ret) __field(u64, fileid) __field(u64, version) __field(loff_t, offset) __field(size_t, count) ), TP_fast_assign( const struct nfs_inode *nfsi = NFS_I(inode); __entry->dev = inode->i_sb->s_dev; __entry->fileid = nfsi->fileid; __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); __entry->version = inode_peek_iversion_raw(inode); __entry->offset = offset, __entry->count = count, __entry->ret = ret; ), TP_printk( "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu " "offset=%lld count=%zu ret=%d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, __entry->version, __entry->offset, __entry->count, __entry->ret ) ); #define DEFINE_NFS_FOLIO_EVENT_DONE(name) \ DEFINE_EVENT(nfs_folio_event_done, name, \ TP_PROTO( \ const struct inode *inode, \ loff_t offset, \ size_t count, \ int ret \ ), \ TP_ARGS(inode, offset, count, ret)) DEFINE_NFS_FOLIO_EVENT(nfs_aop_readpage); DEFINE_NFS_FOLIO_EVENT_DONE(nfs_aop_readpage_done); 
DEFINE_NFS_FOLIO_EVENT(nfs_writeback_folio); DEFINE_NFS_FOLIO_EVENT_DONE(nfs_writeback_folio_done); DEFINE_NFS_FOLIO_EVENT(nfs_invalidate_folio); DEFINE_NFS_FOLIO_EVENT_DONE(nfs_launder_folio_done); TRACE_EVENT(nfs_aop_readahead, TP_PROTO( const struct inode *inode, loff_t pos, unsigned int nr_pages ), TP_ARGS(inode, pos, nr_pages), TP_STRUCT__entry( __field(dev_t, dev) __field(u32, fhandle) __field(u64, fileid) __field(u64, version) __field(loff_t, offset) __field(unsigned int, nr_pages) ), TP_fast_assign( const struct nfs_inode *nfsi = NFS_I(inode); __entry->dev = inode->i_sb->s_dev; __entry->fileid = nfsi->fileid; __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); __entry->version = inode_peek_iversion_raw(inode); __entry->offset = pos; __entry->nr_pages = nr_pages; ), TP_printk( "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu offset=%lld nr_pages=%u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, __entry->version, __entry->offset, __entry->nr_pages ) ); TRACE_EVENT(nfs_aop_readahead_done, TP_PROTO( const struct inode *inode, unsigned int nr_pages, int ret ), TP_ARGS(inode, nr_pages, ret), TP_STRUCT__entry( __field(dev_t, dev) __field(u32, fhandle) __field(int, ret) __field(u64, fileid) __field(u64, version) __field(loff_t, offset) __field(unsigned int, nr_pages) ), TP_fast_assign( const struct nfs_inode *nfsi = NFS_I(inode); __entry->dev = inode->i_sb->s_dev; __entry->fileid = nfsi->fileid; __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); __entry->version = inode_peek_iversion_raw(inode); __entry->nr_pages = nr_pages; __entry->ret = ret; ), TP_printk( "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu nr_pages=%u ret=%d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, __entry->version, __entry->nr_pages, __entry->ret ) ); TRACE_EVENT(nfs_initiate_read, TP_PROTO( const struct nfs_pgio_header *hdr ), TP_ARGS(hdr), TP_STRUCT__entry( __field(dev_t, dev) __field(u32, 
fhandle) __field(u64, fileid) __field(loff_t, offset) __field(u32, count) ), TP_fast_assign( const struct inode *inode = hdr->inode; const struct nfs_inode *nfsi = NFS_I(inode); const struct nfs_fh *fh = hdr->args.fh ? hdr->args.fh : &nfsi->fh; __entry->offset = hdr->args.offset; __entry->count = hdr->args.count; __entry->dev = inode->i_sb->s_dev; __entry->fileid = nfsi->fileid; __entry->fhandle = nfs_fhandle_hash(fh); ), TP_printk( "fileid=%02x:%02x:%llu fhandle=0x%08x " "offset=%lld count=%u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, (long long)__entry->offset, __entry->count ) ); TRACE_EVENT(nfs_readpage_done, TP_PROTO( const struct rpc_task *task, const struct nfs_pgio_header *hdr ), TP_ARGS(task, hdr), TP_STRUCT__entry( __field(dev_t, dev) __field(u32, fhandle) __field(u64, fileid) __field(loff_t, offset) __field(u32, arg_count) __field(u32, res_count) __field(bool, eof) __field(int, error) ), TP_fast_assign( const struct inode *inode = hdr->inode; const struct nfs_inode *nfsi = NFS_I(inode); const struct nfs_fh *fh = hdr->args.fh ? hdr->args.fh : &nfsi->fh; __entry->error = task->tk_status; __entry->offset = hdr->args.offset; __entry->arg_count = hdr->args.count; __entry->res_count = hdr->res.count; __entry->eof = hdr->res.eof; __entry->dev = inode->i_sb->s_dev; __entry->fileid = nfsi->fileid; __entry->fhandle = nfs_fhandle_hash(fh); ), TP_printk( "error=%d fileid=%02x:%02x:%llu fhandle=0x%08x " "offset=%lld count=%u res=%u%s", __entry->error, MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, (long long)__entry->offset, __entry->arg_count, __entry->res_count, __entry->eof ? 
" eof" : "" ) ); TRACE_EVENT(nfs_readpage_short, TP_PROTO( const struct rpc_task *task, const struct nfs_pgio_header *hdr ), TP_ARGS(task, hdr), TP_STRUCT__entry( __field(dev_t, dev) __field(u32, fhandle) __field(u64, fileid) __field(loff_t, offset) __field(u32, arg_count) __field(u32, res_count) __field(bool, eof) __field(int, error) ), TP_fast_assign( const struct inode *inode = hdr->inode; const struct nfs_inode *nfsi = NFS_I(inode); const struct nfs_fh *fh = hdr->args.fh ? hdr->args.fh : &nfsi->fh; __entry->error = task->tk_status; __entry->offset = hdr->args.offset; __entry->arg_count = hdr->args.count; __entry->res_count = hdr->res.count; __entry->eof = hdr->res.eof; __entry->dev = inode->i_sb->s_dev; __entry->fileid = nfsi->fileid; __entry->fhandle = nfs_fhandle_hash(fh); ), TP_printk( "error=%d fileid=%02x:%02x:%llu fhandle=0x%08x " "offset=%lld count=%u res=%u%s", __entry->error, MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, (long long)__entry->offset, __entry->arg_count, __entry->res_count, __entry->eof ? " eof" : "" ) ); TRACE_EVENT(nfs_pgio_error, TP_PROTO( const struct nfs_pgio_header *hdr, int error, loff_t pos ), TP_ARGS(hdr, error, pos), TP_STRUCT__entry( __field(dev_t, dev) __field(u32, fhandle) __field(u64, fileid) __field(loff_t, offset) __field(u32, arg_count) __field(u32, res_count) __field(loff_t, pos) __field(int, error) ), TP_fast_assign( const struct inode *inode = hdr->inode; const struct nfs_inode *nfsi = NFS_I(inode); const struct nfs_fh *fh = hdr->args.fh ? 
hdr->args.fh : &nfsi->fh; __entry->error = error; __entry->offset = hdr->args.offset; __entry->arg_count = hdr->args.count; __entry->res_count = hdr->res.count; __entry->dev = inode->i_sb->s_dev; __entry->fileid = nfsi->fileid; __entry->fhandle = nfs_fhandle_hash(fh); ), TP_printk("error=%d fileid=%02x:%02x:%llu fhandle=0x%08x " "offset=%lld count=%u res=%u pos=%llu", __entry->error, MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, (long long)__entry->offset, __entry->arg_count, __entry->res_count, __entry->pos ) ); TRACE_EVENT(nfs_initiate_write, TP_PROTO( const struct nfs_pgio_header *hdr ), TP_ARGS(hdr), TP_STRUCT__entry( __field(dev_t, dev) __field(u32, fhandle) __field(u64, fileid) __field(loff_t, offset) __field(u32, count) __field(unsigned long, stable) ), TP_fast_assign( const struct inode *inode = hdr->inode; const struct nfs_inode *nfsi = NFS_I(inode); const struct nfs_fh *fh = hdr->args.fh ? hdr->args.fh : &nfsi->fh; __entry->offset = hdr->args.offset; __entry->count = hdr->args.count; __entry->stable = hdr->args.stable; __entry->dev = inode->i_sb->s_dev; __entry->fileid = nfsi->fileid; __entry->fhandle = nfs_fhandle_hash(fh); ), TP_printk( "fileid=%02x:%02x:%llu fhandle=0x%08x " "offset=%lld count=%u stable=%s", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, (long long)__entry->offset, __entry->count, show_nfs_stable_how(__entry->stable) ) ); TRACE_EVENT(nfs_writeback_done, TP_PROTO( const struct rpc_task *task, const struct nfs_pgio_header *hdr ), TP_ARGS(task, hdr), TP_STRUCT__entry( __field(dev_t, dev) __field(u32, fhandle) __field(u64, fileid) __field(loff_t, offset) __field(u32, arg_count) __field(u32, res_count) __field(int, error) __field(unsigned long, stable) __array(char, verifier, NFS4_VERIFIER_SIZE) ), TP_fast_assign( const struct inode *inode = hdr->inode; const struct nfs_inode *nfsi = NFS_I(inode); const struct nfs_fh *fh = hdr->args.fh ? 
hdr->args.fh : &nfsi->fh; const struct nfs_writeverf *verf = hdr->res.verf; __entry->error = task->tk_status; __entry->offset = hdr->args.offset; __entry->arg_count = hdr->args.count; __entry->res_count = hdr->res.count; __entry->stable = verf->committed; memcpy(__entry->verifier, &verf->verifier, NFS4_VERIFIER_SIZE); __entry->dev = inode->i_sb->s_dev; __entry->fileid = nfsi->fileid; __entry->fhandle = nfs_fhandle_hash(fh); ), TP_printk( "error=%d fileid=%02x:%02x:%llu fhandle=0x%08x " "offset=%lld count=%u res=%u stable=%s " "verifier=%s", __entry->error, MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, (long long)__entry->offset, __entry->arg_count, __entry->res_count, show_nfs_stable_how(__entry->stable), show_nfs4_verifier(__entry->verifier) ) ); DECLARE_EVENT_CLASS(nfs_page_error_class, TP_PROTO( const struct inode *inode, const struct nfs_page *req, int error ), TP_ARGS(inode, req, error), TP_STRUCT__entry( __field(dev_t, dev) __field(u32, fhandle) __field(u64, fileid) __field(loff_t, offset) __field(unsigned int, count) __field(int, error) ), TP_fast_assign( const struct nfs_inode *nfsi = NFS_I(inode); __entry->dev = inode->i_sb->s_dev; __entry->fileid = nfsi->fileid; __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); __entry->offset = req_offset(req); __entry->count = req->wb_bytes; __entry->error = error; ), TP_printk( "error=%d fileid=%02x:%02x:%llu fhandle=0x%08x " "offset=%lld count=%u", __entry->error, MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, __entry->offset, __entry->count ) ); #define DEFINE_NFS_PAGEERR_EVENT(name) \ DEFINE_EVENT(nfs_page_error_class, name, \ TP_PROTO( \ const struct inode *inode, \ const struct nfs_page *req, \ int error \ ), \ TP_ARGS(inode, req, error)) DEFINE_NFS_PAGEERR_EVENT(nfs_write_error); DEFINE_NFS_PAGEERR_EVENT(nfs_comp_error); DEFINE_NFS_PAGEERR_EVENT(nfs_commit_error); TRACE_EVENT(nfs_initiate_commit, TP_PROTO( const 
struct nfs_commit_data *data ), TP_ARGS(data), TP_STRUCT__entry( __field(dev_t, dev) __field(u32, fhandle) __field(u64, fileid) __field(loff_t, offset) __field(u32, count) ), TP_fast_assign( const struct inode *inode = data->inode; const struct nfs_inode *nfsi = NFS_I(inode); const struct nfs_fh *fh = data->args.fh ? data->args.fh : &nfsi->fh; __entry->offset = data->args.offset; __entry->count = data->args.count; __entry->dev = inode->i_sb->s_dev; __entry->fileid = nfsi->fileid; __entry->fhandle = nfs_fhandle_hash(fh); ), TP_printk( "fileid=%02x:%02x:%llu fhandle=0x%08x " "offset=%lld count=%u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, (long long)__entry->offset, __entry->count ) ); TRACE_EVENT(nfs_commit_done, TP_PROTO( const struct rpc_task *task, const struct nfs_commit_data *data ), TP_ARGS(task, data), TP_STRUCT__entry( __field(dev_t, dev) __field(u32, fhandle) __field(u64, fileid) __field(loff_t, offset) __field(int, error) __field(unsigned long, stable) __array(char, verifier, NFS4_VERIFIER_SIZE) ), TP_fast_assign( const struct inode *inode = data->inode; const struct nfs_inode *nfsi = NFS_I(inode); const struct nfs_fh *fh = data->args.fh ? 
data->args.fh : &nfsi->fh; const struct nfs_writeverf *verf = data->res.verf; __entry->error = task->tk_status; __entry->offset = data->args.offset; __entry->stable = verf->committed; memcpy(__entry->verifier, &verf->verifier, NFS4_VERIFIER_SIZE); __entry->dev = inode->i_sb->s_dev; __entry->fileid = nfsi->fileid; __entry->fhandle = nfs_fhandle_hash(fh); ), TP_printk( "error=%d fileid=%02x:%02x:%llu fhandle=0x%08x " "offset=%lld stable=%s verifier=%s", __entry->error, MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, (long long)__entry->offset, show_nfs_stable_how(__entry->stable), show_nfs4_verifier(__entry->verifier) ) ); #define nfs_show_direct_req_flags(v) \ __print_flags(v, "|", \ { NFS_ODIRECT_DO_COMMIT, "DO_COMMIT" }, \ { NFS_ODIRECT_RESCHED_WRITES, "RESCHED_WRITES" }, \ { NFS_ODIRECT_SHOULD_DIRTY, "SHOULD DIRTY" }, \ { NFS_ODIRECT_DONE, "DONE" } ) DECLARE_EVENT_CLASS(nfs_direct_req_class, TP_PROTO( const struct nfs_direct_req *dreq ), TP_ARGS(dreq), TP_STRUCT__entry( __field(dev_t, dev) __field(u64, fileid) __field(u32, fhandle) __field(loff_t, offset) __field(ssize_t, count) __field(ssize_t, error) __field(int, flags) ), TP_fast_assign( const struct inode *inode = dreq->inode; const struct nfs_inode *nfsi = NFS_I(inode); const struct nfs_fh *fh = &nfsi->fh; __entry->dev = inode->i_sb->s_dev; __entry->fileid = nfsi->fileid; __entry->fhandle = nfs_fhandle_hash(fh); __entry->offset = dreq->io_start; __entry->count = dreq->count; __entry->error = dreq->error; __entry->flags = dreq->flags; ), TP_printk( "error=%zd fileid=%02x:%02x:%llu fhandle=0x%08x " "offset=%lld count=%zd flags=%s", __entry->error, MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, __entry->offset, __entry->count, nfs_show_direct_req_flags(__entry->flags) ) ); #define DEFINE_NFS_DIRECT_REQ_EVENT(name) \ DEFINE_EVENT(nfs_direct_req_class, name, \ TP_PROTO( \ const struct nfs_direct_req *dreq \ ), \ 
TP_ARGS(dreq)) DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_commit_complete); DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_resched_write); DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_write_complete); DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_write_completion); DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_write_schedule_iovec); DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_write_reschedule_io); TRACE_EVENT(nfs_fh_to_dentry, TP_PROTO( const struct super_block *sb, const struct nfs_fh *fh, u64 fileid, int error ), TP_ARGS(sb, fh, fileid, error), TP_STRUCT__entry( __field(int, error) __field(dev_t, dev) __field(u32, fhandle) __field(u64, fileid) ), TP_fast_assign( __entry->error = error; __entry->dev = sb->s_dev; __entry->fileid = fileid; __entry->fhandle = nfs_fhandle_hash(fh); ), TP_printk( "error=%d fileid=%02x:%02x:%llu fhandle=0x%08x ", __entry->error, MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle ) ); TRACE_EVENT(nfs_mount_assign, TP_PROTO( const char *option, const char *value ), TP_ARGS(option, value), TP_STRUCT__entry( __string(option, option) __string(value, value) ), TP_fast_assign( __assign_str(option); __assign_str(value); ), TP_printk("option %s=%s", __get_str(option), __get_str(value) ) ); TRACE_EVENT(nfs_mount_option, TP_PROTO( const struct fs_parameter *param ), TP_ARGS(param), TP_STRUCT__entry( __string(option, param->key) ), TP_fast_assign( __assign_str(option); ), TP_printk("option %s", __get_str(option)) ); TRACE_EVENT(nfs_mount_path, TP_PROTO( const char *path ), TP_ARGS(path), TP_STRUCT__entry( __string(path, path) ), TP_fast_assign( __assign_str(path); ), TP_printk("path='%s'", __get_str(path)) ); TRACE_EVENT(nfs_local_open_fh, TP_PROTO( const struct nfs_fh *fh, fmode_t fmode, int error ), TP_ARGS(fh, fmode, error), TP_STRUCT__entry( __field(int, error) __field(u32, fhandle) __field(unsigned int, fmode) ), TP_fast_assign( __entry->error = error; __entry->fhandle = nfs_fhandle_hash(fh); __entry->fmode = (__force unsigned 
int)fmode; ), TP_printk( "error=%d fhandle=0x%08x mode=%s", __entry->error, __entry->fhandle, show_fs_fmode_flags(__entry->fmode) ) ); DECLARE_EVENT_CLASS(nfs_xdr_event, TP_PROTO( const struct xdr_stream *xdr, int error ), TP_ARGS(xdr, error), TP_STRUCT__entry( __field(unsigned int, task_id) __field(unsigned int, client_id) __field(u32, xid) __field(int, version) __field(unsigned long, error) __string(program, xdr->rqst->rq_task->tk_client->cl_program->name) __string(procedure, xdr->rqst->rq_task->tk_msg.rpc_proc->p_name) ), TP_fast_assign( const struct rpc_rqst *rqstp = xdr->rqst; const struct rpc_task *task = rqstp->rq_task; __entry->task_id = task->tk_pid; __entry->client_id = task->tk_client->cl_clid; __entry->xid = be32_to_cpu(rqstp->rq_xid); __entry->version = task->tk_client->cl_vers; __entry->error = error; __assign_str(program); __assign_str(procedure); ), TP_printk(SUNRPC_TRACE_TASK_SPECIFIER " xid=0x%08x %sv%d %s error=%ld (%s)", __entry->task_id, __entry->client_id, __entry->xid, __get_str(program), __entry->version, __get_str(procedure), -__entry->error, show_nfs_status(__entry->error) ) ); #define DEFINE_NFS_XDR_EVENT(name) \ DEFINE_EVENT(nfs_xdr_event, name, \ TP_PROTO( \ const struct xdr_stream *xdr, \ int error \ ), \ TP_ARGS(xdr, error)) DEFINE_NFS_XDR_EVENT(nfs_xdr_status); DEFINE_NFS_XDR_EVENT(nfs_xdr_bad_filehandle); #endif /* _TRACE_NFS_H */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . #define TRACE_INCLUDE_FILE nfstrace /* This part must be outside protection */ #include <trace/define_trace.h>
14 37 37 35 37 35 34 10 7 3 33 33 35 8 8 2 7 2 8 5 5 6 3 2 7 8 8 29 29 29 27 28 27 28 27 27 26 12 9 8 5 4 4 3 4 2 18 15 6 6 15 15 15 15 19 19 17 2 19 14 8 14 33 32 8 1 2 4 2 3 1 2 5 4 3 3 4 4 4 4 4 4 4 4 4 4 4 31 31 1 29 16 16 15 2 27 26 26 3 24 3 26 8 8 8 8 8 8 11 7 4 4 3 3 3 3 3 3 2 2 2 2 3 6 2 4 11 11 11 9 2 11 11 11 12 3 11 11 2 11 11 5 5 4 7 7 3 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 
439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 
939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2017 - 2018 Covalent IO, Inc. 
http://covalent.io */ #include <linux/skmsg.h> #include <linux/skbuff.h> #include <linux/scatterlist.h> #include <net/sock.h> #include <net/tcp.h> #include <net/tls.h> #include <trace/events/sock.h> static bool sk_msg_try_coalesce_ok(struct sk_msg *msg, int elem_first_coalesce) { if (msg->sg.end > msg->sg.start && elem_first_coalesce < msg->sg.end) return true; if (msg->sg.end < msg->sg.start && (elem_first_coalesce > msg->sg.start || elem_first_coalesce < msg->sg.end)) return true; return false; } int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len, int elem_first_coalesce) { struct page_frag *pfrag = sk_page_frag(sk); u32 osize = msg->sg.size; int ret = 0; len -= msg->sg.size; while (len > 0) { struct scatterlist *sge; u32 orig_offset; int use, i; if (!sk_page_frag_refill(sk, pfrag)) { ret = -ENOMEM; goto msg_trim; } orig_offset = pfrag->offset; use = min_t(int, len, pfrag->size - orig_offset); if (!sk_wmem_schedule(sk, use)) { ret = -ENOMEM; goto msg_trim; } i = msg->sg.end; sk_msg_iter_var_prev(i); sge = &msg->sg.data[i]; if (sk_msg_try_coalesce_ok(msg, elem_first_coalesce) && sg_page(sge) == pfrag->page && sge->offset + sge->length == orig_offset) { sge->length += use; } else { if (sk_msg_full(msg)) { ret = -ENOSPC; break; } sge = &msg->sg.data[msg->sg.end]; sg_unmark_end(sge); sg_set_page(sge, pfrag->page, use, orig_offset); get_page(pfrag->page); sk_msg_iter_next(msg, end); } sk_mem_charge(sk, use); msg->sg.size += use; pfrag->offset += use; len -= use; } return ret; msg_trim: sk_msg_trim(sk, msg, osize); return ret; } EXPORT_SYMBOL_GPL(sk_msg_alloc); int sk_msg_clone(struct sock *sk, struct sk_msg *dst, struct sk_msg *src, u32 off, u32 len) { int i = src->sg.start; struct scatterlist *sge = sk_msg_elem(src, i); struct scatterlist *sgd = NULL; u32 sge_len, sge_off; while (off) { if (sge->length > off) break; off -= sge->length; sk_msg_iter_var_next(i); if (i == src->sg.end && off) return -ENOSPC; sge = sk_msg_elem(src, i); } while (len) { sge_len = 
sge->length - off; if (sge_len > len) sge_len = len; if (dst->sg.end) sgd = sk_msg_elem(dst, dst->sg.end - 1); if (sgd && (sg_page(sge) == sg_page(sgd)) && (sg_virt(sge) + off == sg_virt(sgd) + sgd->length)) { sgd->length += sge_len; dst->sg.size += sge_len; } else if (!sk_msg_full(dst)) { sge_off = sge->offset + off; sk_msg_page_add(dst, sg_page(sge), sge_len, sge_off); } else { return -ENOSPC; } off = 0; len -= sge_len; sk_mem_charge(sk, sge_len); sk_msg_iter_var_next(i); if (i == src->sg.end && len) return -ENOSPC; sge = sk_msg_elem(src, i); } return 0; } EXPORT_SYMBOL_GPL(sk_msg_clone); void sk_msg_return_zero(struct sock *sk, struct sk_msg *msg, int bytes) { int i = msg->sg.start; do { struct scatterlist *sge = sk_msg_elem(msg, i); if (bytes < sge->length) { sge->length -= bytes; sge->offset += bytes; sk_mem_uncharge(sk, bytes); break; } sk_mem_uncharge(sk, sge->length); bytes -= sge->length; sge->length = 0; sge->offset = 0; sk_msg_iter_var_next(i); } while (bytes && i != msg->sg.end); msg->sg.start = i; } EXPORT_SYMBOL_GPL(sk_msg_return_zero); void sk_msg_return(struct sock *sk, struct sk_msg *msg, int bytes) { int i = msg->sg.start; do { struct scatterlist *sge = &msg->sg.data[i]; int uncharge = (bytes < sge->length) ? bytes : sge->length; sk_mem_uncharge(sk, uncharge); bytes -= uncharge; sk_msg_iter_var_next(i); } while (i != msg->sg.end); } EXPORT_SYMBOL_GPL(sk_msg_return); static int sk_msg_free_elem(struct sock *sk, struct sk_msg *msg, u32 i, bool charge) { struct scatterlist *sge = sk_msg_elem(msg, i); u32 len = sge->length; /* When the skb owns the memory we free it from consume_skb path. 
*/ if (!msg->skb) { if (charge) sk_mem_uncharge(sk, len); put_page(sg_page(sge)); } memset(sge, 0, sizeof(*sge)); return len; } static int __sk_msg_free(struct sock *sk, struct sk_msg *msg, u32 i, bool charge) { struct scatterlist *sge = sk_msg_elem(msg, i); int freed = 0; while (msg->sg.size) { msg->sg.size -= sge->length; freed += sk_msg_free_elem(sk, msg, i, charge); sk_msg_iter_var_next(i); sk_msg_check_to_free(msg, i, msg->sg.size); sge = sk_msg_elem(msg, i); } consume_skb(msg->skb); sk_msg_init(msg); return freed; } int sk_msg_free_nocharge(struct sock *sk, struct sk_msg *msg) { return __sk_msg_free(sk, msg, msg->sg.start, false); } EXPORT_SYMBOL_GPL(sk_msg_free_nocharge); int sk_msg_free(struct sock *sk, struct sk_msg *msg) { return __sk_msg_free(sk, msg, msg->sg.start, true); } EXPORT_SYMBOL_GPL(sk_msg_free); static void __sk_msg_free_partial(struct sock *sk, struct sk_msg *msg, u32 bytes, bool charge) { struct scatterlist *sge; u32 i = msg->sg.start; while (bytes) { sge = sk_msg_elem(msg, i); if (!sge->length) break; if (bytes < sge->length) { if (charge) sk_mem_uncharge(sk, bytes); sge->length -= bytes; sge->offset += bytes; msg->sg.size -= bytes; break; } msg->sg.size -= sge->length; bytes -= sge->length; sk_msg_free_elem(sk, msg, i, charge); sk_msg_iter_var_next(i); sk_msg_check_to_free(msg, i, bytes); } msg->sg.start = i; } void sk_msg_free_partial(struct sock *sk, struct sk_msg *msg, u32 bytes) { __sk_msg_free_partial(sk, msg, bytes, true); } EXPORT_SYMBOL_GPL(sk_msg_free_partial); void sk_msg_free_partial_nocharge(struct sock *sk, struct sk_msg *msg, u32 bytes) { __sk_msg_free_partial(sk, msg, bytes, false); } void sk_msg_trim(struct sock *sk, struct sk_msg *msg, int len) { int trim = msg->sg.size - len; u32 i = msg->sg.end; if (trim <= 0) { WARN_ON(trim < 0); return; } sk_msg_iter_var_prev(i); msg->sg.size = len; while (msg->sg.data[i].length && trim >= msg->sg.data[i].length) { trim -= msg->sg.data[i].length; sk_msg_free_elem(sk, msg, i, true); 
sk_msg_iter_var_prev(i); if (!trim) goto out; } msg->sg.data[i].length -= trim; sk_mem_uncharge(sk, trim); /* Adjust copybreak if it falls into the trimmed part of last buf */ if (msg->sg.curr == i && msg->sg.copybreak > msg->sg.data[i].length) msg->sg.copybreak = msg->sg.data[i].length; out: sk_msg_iter_var_next(i); msg->sg.end = i; /* If we trim data a full sg elem before curr pointer update * copybreak and current so that any future copy operations * start at new copy location. * However trimmed data that has not yet been used in a copy op * does not require an update. */ if (!msg->sg.size) { msg->sg.curr = msg->sg.start; msg->sg.copybreak = 0; } else if (sk_msg_iter_dist(msg->sg.start, msg->sg.curr) >= sk_msg_iter_dist(msg->sg.start, msg->sg.end)) { sk_msg_iter_var_prev(i); msg->sg.curr = i; msg->sg.copybreak = msg->sg.data[i].length; } } EXPORT_SYMBOL_GPL(sk_msg_trim); int sk_msg_zerocopy_from_iter(struct sock *sk, struct iov_iter *from, struct sk_msg *msg, u32 bytes) { int i, maxpages, ret = 0, num_elems = sk_msg_elem_used(msg); const int to_max_pages = MAX_MSG_FRAGS; struct page *pages[MAX_MSG_FRAGS]; ssize_t orig, copied, use, offset; orig = msg->sg.size; while (bytes > 0) { i = 0; maxpages = to_max_pages - num_elems; if (maxpages == 0) { ret = -EFAULT; goto out; } copied = iov_iter_get_pages2(from, pages, bytes, maxpages, &offset); if (copied <= 0) { ret = -EFAULT; goto out; } bytes -= copied; msg->sg.size += copied; while (copied) { use = min_t(int, copied, PAGE_SIZE - offset); sg_set_page(&msg->sg.data[msg->sg.end], pages[i], use, offset); sg_unmark_end(&msg->sg.data[msg->sg.end]); sk_mem_charge(sk, use); offset = 0; copied -= use; sk_msg_iter_next(msg, end); num_elems++; i++; } /* When zerocopy is mixed with sk_msg_*copy* operations we * may have a copybreak set in this case clear and prefer * zerocopy remainder when possible. 
*/ msg->sg.copybreak = 0; msg->sg.curr = msg->sg.end; } out: /* Revert iov_iter updates, msg will need to use 'trim' later if it * also needs to be cleared. */ if (ret) iov_iter_revert(from, msg->sg.size - orig); return ret; } EXPORT_SYMBOL_GPL(sk_msg_zerocopy_from_iter); int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, struct sk_msg *msg, u32 bytes) { int ret = -ENOSPC, i = msg->sg.curr; u32 copy, buf_size, copied = 0; struct scatterlist *sge; void *to; do { sge = sk_msg_elem(msg, i); /* This is possible if a trim operation shrunk the buffer */ if (msg->sg.copybreak >= sge->length) { msg->sg.copybreak = 0; sk_msg_iter_var_next(i); if (i == msg->sg.end) break; sge = sk_msg_elem(msg, i); } buf_size = sge->length - msg->sg.copybreak; copy = (buf_size > bytes) ? bytes : buf_size; to = sg_virt(sge) + msg->sg.copybreak; msg->sg.copybreak += copy; if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) ret = copy_from_iter_nocache(to, copy, from); else ret = copy_from_iter(to, copy, from); if (ret != copy) { ret = -EFAULT; goto out; } bytes -= copy; copied += copy; if (!bytes) break; msg->sg.copybreak = 0; sk_msg_iter_var_next(i); } while (i != msg->sg.end); out: msg->sg.curr = i; return (ret < 0) ? ret : copied; } EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter); /* Receive sk_msg from psock->ingress_msg to @msg. */ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, int len, int flags) { struct iov_iter *iter = &msg->msg_iter; int peek = flags & MSG_PEEK; struct sk_msg *msg_rx; int i, copied = 0; msg_rx = sk_psock_peek_msg(psock); while (copied != len) { struct scatterlist *sge; if (unlikely(!msg_rx)) break; i = msg_rx->sg.start; do { struct page *page; int copy; sge = sk_msg_elem(msg_rx, i); copy = sge->length; page = sg_page(sge); if (copied + copy > len) copy = len - copied; if (copy) copy = copy_page_to_iter(page, sge->offset, copy, iter); if (!copy) { copied = copied ? 
copied : -EFAULT; goto out; } copied += copy; if (likely(!peek)) { sge->offset += copy; sge->length -= copy; if (!msg_rx->skb) { sk_mem_uncharge(sk, copy); atomic_sub(copy, &sk->sk_rmem_alloc); } msg_rx->sg.size -= copy; if (!sge->length) { sk_msg_iter_var_next(i); if (!msg_rx->skb) put_page(page); } } else { /* Lets not optimize peek case if copy_page_to_iter * didn't copy the entire length lets just break. */ if (copy != sge->length) goto out; sk_msg_iter_var_next(i); } if (copied == len) break; } while ((i != msg_rx->sg.end) && !sg_is_last(sge)); if (unlikely(peek)) { msg_rx = sk_psock_next_msg(psock, msg_rx); if (!msg_rx) break; continue; } msg_rx->sg.start = i; if (!sge->length && (i == msg_rx->sg.end || sg_is_last(sge))) { msg_rx = sk_psock_dequeue_msg(psock); kfree_sk_msg(msg_rx); } msg_rx = sk_psock_peek_msg(psock); } out: return copied; } EXPORT_SYMBOL_GPL(sk_msg_recvmsg); bool sk_msg_is_readable(struct sock *sk) { struct sk_psock *psock; bool empty = true; rcu_read_lock(); psock = sk_psock(sk); if (likely(psock)) empty = list_empty(&psock->ingress_msg); rcu_read_unlock(); return !empty; } EXPORT_SYMBOL_GPL(sk_msg_is_readable); static struct sk_msg *alloc_sk_msg(gfp_t gfp) { struct sk_msg *msg; msg = kzalloc(sizeof(*msg), gfp | __GFP_NOWARN); if (unlikely(!msg)) return NULL; sg_init_marker(msg->sg.data, NR_MSG_FRAG_IDS); return msg; } static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk, struct sk_buff *skb) { if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) return NULL; if (!sk_rmem_schedule(sk, skb, skb->truesize)) return NULL; return alloc_sk_msg(GFP_KERNEL); } static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb, u32 off, u32 len, struct sk_psock *psock, struct sock *sk, struct sk_msg *msg) { int num_sge, copied; num_sge = skb_to_sgvec(skb, msg->sg.data, off, len); if (num_sge < 0) { /* skb linearize may fail with ENOMEM, but lets simply try again * later if this happens. Under memory pressure we don't want to * drop the skb. 
We need to linearize the skb so that the mapping * in skb_to_sgvec can not error. */ if (skb_linearize(skb)) return -EAGAIN; num_sge = skb_to_sgvec(skb, msg->sg.data, off, len); if (unlikely(num_sge < 0)) return num_sge; } copied = len; msg->sg.start = 0; msg->sg.size = copied; msg->sg.end = num_sge; msg->skb = skb; sk_psock_queue_msg(psock, msg); sk_psock_data_ready(sk, psock); return copied; } static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb, u32 off, u32 len); static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb, u32 off, u32 len) { struct sock *sk = psock->sk; struct sk_msg *msg; int err; /* If we are receiving on the same sock skb->sk is already assigned, * skip memory accounting and owner transition seeing it already set * correctly. */ if (unlikely(skb->sk == sk)) return sk_psock_skb_ingress_self(psock, skb, off, len); msg = sk_psock_create_ingress_msg(sk, skb); if (!msg) return -EAGAIN; /* This will transition ownership of the data from the socket where * the BPF program was run initiating the redirect to the socket * we will eventually receive this data on. The data will be released * from skb_consume found in __tcp_bpf_recvmsg() after its been copied * into user buffers. */ skb_set_owner_r(skb, sk); err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg); if (err < 0) kfree(msg); return err; } /* Puts an skb on the ingress queue of the socket already assigned to the * skb. In this case we do not need to check memory limits or skb_set_owner_r * because the skb is already accounted for here. 
*/ static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb, u32 off, u32 len) { struct sk_msg *msg = alloc_sk_msg(GFP_ATOMIC); struct sock *sk = psock->sk; int err; if (unlikely(!msg)) return -EAGAIN; skb_set_owner_r(skb, sk); err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg); if (err < 0) kfree(msg); return err; } static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb, u32 off, u32 len, bool ingress) { int err = 0; if (!ingress) { if (!sock_writeable(psock->sk)) return -EAGAIN; return skb_send_sock(psock->sk, skb, off, len); } skb_get(skb); err = sk_psock_skb_ingress(psock, skb, off, len); if (err < 0) kfree_skb(skb); return err; } static void sk_psock_skb_state(struct sk_psock *psock, struct sk_psock_work_state *state, int len, int off) { spin_lock_bh(&psock->ingress_lock); if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { state->len = len; state->off = off; } spin_unlock_bh(&psock->ingress_lock); } static void sk_psock_backlog(struct work_struct *work) { struct delayed_work *dwork = to_delayed_work(work); struct sk_psock *psock = container_of(dwork, struct sk_psock, work); struct sk_psock_work_state *state = &psock->work_state; struct sk_buff *skb = NULL; u32 len = 0, off = 0; bool ingress; int ret; mutex_lock(&psock->work_mutex); if (unlikely(state->len)) { len = state->len; off = state->off; } while ((skb = skb_peek(&psock->ingress_skb))) { len = skb->len; off = 0; if (skb_bpf_strparser(skb)) { struct strp_msg *stm = strp_msg(skb); off = stm->offset; len = stm->full_len; } ingress = skb_bpf_ingress(skb); skb_bpf_redirect_clear(skb); do { ret = -EIO; if (!sock_flag(psock->sk, SOCK_DEAD)) ret = sk_psock_handle_skb(psock, skb, off, len, ingress); if (ret <= 0) { if (ret == -EAGAIN) { sk_psock_skb_state(psock, state, len, off); /* Delay slightly to prioritize any * other work that might be here. 
*/ if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) schedule_delayed_work(&psock->work, 1); goto end; } /* Hard errors break pipe and stop xmit. */ sk_psock_report_error(psock, ret ? -ret : EPIPE); sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); goto end; } off += ret; len -= ret; } while (len); skb = skb_dequeue(&psock->ingress_skb); kfree_skb(skb); } end: mutex_unlock(&psock->work_mutex); } struct sk_psock *sk_psock_init(struct sock *sk, int node) { struct sk_psock *psock; struct proto *prot; write_lock_bh(&sk->sk_callback_lock); if (sk_is_inet(sk) && inet_csk_has_ulp(sk)) { psock = ERR_PTR(-EINVAL); goto out; } if (sk->sk_user_data) { psock = ERR_PTR(-EBUSY); goto out; } psock = kzalloc_node(sizeof(*psock), GFP_ATOMIC | __GFP_NOWARN, node); if (!psock) { psock = ERR_PTR(-ENOMEM); goto out; } prot = READ_ONCE(sk->sk_prot); psock->sk = sk; psock->eval = __SK_NONE; psock->sk_proto = prot; psock->saved_unhash = prot->unhash; psock->saved_destroy = prot->destroy; psock->saved_close = prot->close; psock->saved_write_space = sk->sk_write_space; INIT_LIST_HEAD(&psock->link); spin_lock_init(&psock->link_lock); INIT_DELAYED_WORK(&psock->work, sk_psock_backlog); mutex_init(&psock->work_mutex); INIT_LIST_HEAD(&psock->ingress_msg); spin_lock_init(&psock->ingress_lock); skb_queue_head_init(&psock->ingress_skb); sk_psock_set_state(psock, SK_PSOCK_TX_ENABLED); refcount_set(&psock->refcnt, 1); __rcu_assign_sk_user_data_with_flags(sk, psock, SK_USER_DATA_NOCOPY | SK_USER_DATA_PSOCK); sock_hold(sk); out: write_unlock_bh(&sk->sk_callback_lock); return psock; } EXPORT_SYMBOL_GPL(sk_psock_init); struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock) { struct sk_psock_link *link; spin_lock_bh(&psock->link_lock); link = list_first_entry_or_null(&psock->link, struct sk_psock_link, list); if (link) list_del(&link->list); spin_unlock_bh(&psock->link_lock); return link; } static void __sk_psock_purge_ingress_msg(struct sk_psock *psock) { struct sk_msg *msg, *tmp; 
list_for_each_entry_safe(msg, tmp, &psock->ingress_msg, list) { list_del(&msg->list); if (!msg->skb) atomic_sub(msg->sg.size, &psock->sk->sk_rmem_alloc); sk_msg_free(psock->sk, msg); kfree(msg); } } static void __sk_psock_zap_ingress(struct sk_psock *psock) { struct sk_buff *skb; while ((skb = skb_dequeue(&psock->ingress_skb)) != NULL) { skb_bpf_redirect_clear(skb); sock_drop(psock->sk, skb); } __sk_psock_purge_ingress_msg(psock); } static void sk_psock_link_destroy(struct sk_psock *psock) { struct sk_psock_link *link, *tmp; list_for_each_entry_safe(link, tmp, &psock->link, list) { list_del(&link->list); sk_psock_free_link(link); } } void sk_psock_stop(struct sk_psock *psock) { spin_lock_bh(&psock->ingress_lock); sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); sk_psock_cork_free(psock); spin_unlock_bh(&psock->ingress_lock); } static void sk_psock_done_strp(struct sk_psock *psock); static void sk_psock_destroy(struct work_struct *work) { struct sk_psock *psock = container_of(to_rcu_work(work), struct sk_psock, rwork); /* No sk_callback_lock since already detached. 
*/ sk_psock_done_strp(psock); cancel_delayed_work_sync(&psock->work); __sk_psock_zap_ingress(psock); mutex_destroy(&psock->work_mutex); psock_progs_drop(&psock->progs); sk_psock_link_destroy(psock); sk_psock_cork_free(psock); if (psock->sk_redir) sock_put(psock->sk_redir); if (psock->sk_pair) sock_put(psock->sk_pair); sock_put(psock->sk); kfree(psock); } void sk_psock_drop(struct sock *sk, struct sk_psock *psock) { write_lock_bh(&sk->sk_callback_lock); sk_psock_restore_proto(sk, psock); rcu_assign_sk_user_data(sk, NULL); if (psock->progs.stream_parser) sk_psock_stop_strp(sk, psock); else if (psock->progs.stream_verdict || psock->progs.skb_verdict) sk_psock_stop_verdict(sk, psock); write_unlock_bh(&sk->sk_callback_lock); sk_psock_stop(psock); INIT_RCU_WORK(&psock->rwork, sk_psock_destroy); queue_rcu_work(system_wq, &psock->rwork); } EXPORT_SYMBOL_GPL(sk_psock_drop); static int sk_psock_map_verd(int verdict, bool redir) { switch (verdict) { case SK_PASS: return redir ? __SK_REDIRECT : __SK_PASS; case SK_DROP: default: break; } return __SK_DROP; } int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock, struct sk_msg *msg) { struct bpf_prog *prog; int ret; rcu_read_lock(); prog = READ_ONCE(psock->progs.msg_parser); if (unlikely(!prog)) { ret = __SK_PASS; goto out; } sk_msg_compute_data_pointers(msg); msg->sk = sk; ret = bpf_prog_run_pin_on_cpu(prog, msg); ret = sk_psock_map_verd(ret, msg->sk_redir); psock->apply_bytes = msg->apply_bytes; if (ret == __SK_REDIRECT) { if (psock->sk_redir) { sock_put(psock->sk_redir); psock->sk_redir = NULL; } if (!msg->sk_redir) { ret = __SK_DROP; goto out; } psock->redir_ingress = sk_msg_to_ingress(msg); psock->sk_redir = msg->sk_redir; sock_hold(psock->sk_redir); } out: rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(sk_psock_msg_verdict); static int sk_psock_skb_redirect(struct sk_psock *from, struct sk_buff *skb) { struct sk_psock *psock_other; struct sock *sk_other; sk_other = skb_bpf_redirect_fetch(skb); /* This error 
is a buggy BPF program, it returned a redirect * return code, but then didn't set a redirect interface. */ if (unlikely(!sk_other)) { skb_bpf_redirect_clear(skb); sock_drop(from->sk, skb); return -EIO; } psock_other = sk_psock(sk_other); /* This error indicates the socket is being torn down or had another * error that caused the pipe to break. We can't send a packet on * a socket that is in this state so we drop the skb. */ if (!psock_other || sock_flag(sk_other, SOCK_DEAD)) { skb_bpf_redirect_clear(skb); sock_drop(from->sk, skb); return -EIO; } spin_lock_bh(&psock_other->ingress_lock); if (!sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) { spin_unlock_bh(&psock_other->ingress_lock); skb_bpf_redirect_clear(skb); sock_drop(from->sk, skb); return -EIO; } skb_queue_tail(&psock_other->ingress_skb, skb); schedule_delayed_work(&psock_other->work, 0); spin_unlock_bh(&psock_other->ingress_lock); return 0; } static void sk_psock_tls_verdict_apply(struct sk_buff *skb, struct sk_psock *from, int verdict) { switch (verdict) { case __SK_REDIRECT: sk_psock_skb_redirect(from, skb); break; case __SK_PASS: case __SK_DROP: default: break; } } int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb) { struct bpf_prog *prog; int ret = __SK_PASS; rcu_read_lock(); prog = READ_ONCE(psock->progs.stream_verdict); if (likely(prog)) { skb->sk = psock->sk; skb_dst_drop(skb); skb_bpf_redirect_clear(skb); ret = bpf_prog_run_pin_on_cpu(prog, skb); ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); skb->sk = NULL; } sk_psock_tls_verdict_apply(skb, psock, ret); rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(sk_psock_tls_strp_read); static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb, int verdict) { struct sock *sk_other; int err = 0; u32 len, off; switch (verdict) { case __SK_PASS: err = -EIO; sk_other = psock->sk; if (sock_flag(sk_other, SOCK_DEAD) || !sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) goto out_free; 
skb_bpf_set_ingress(skb); /* If the queue is empty then we can submit directly * into the msg queue. If its not empty we have to * queue work otherwise we may get OOO data. Otherwise, * if sk_psock_skb_ingress errors will be handled by * retrying later from workqueue. */ if (skb_queue_empty(&psock->ingress_skb)) { len = skb->len; off = 0; if (skb_bpf_strparser(skb)) { struct strp_msg *stm = strp_msg(skb); off = stm->offset; len = stm->full_len; } err = sk_psock_skb_ingress_self(psock, skb, off, len); } if (err < 0) { spin_lock_bh(&psock->ingress_lock); if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { skb_queue_tail(&psock->ingress_skb, skb); schedule_delayed_work(&psock->work, 0); err = 0; } spin_unlock_bh(&psock->ingress_lock); if (err < 0) goto out_free; } break; case __SK_REDIRECT: tcp_eat_skb(psock->sk, skb); err = sk_psock_skb_redirect(psock, skb); break; case __SK_DROP: default: out_free: skb_bpf_redirect_clear(skb); tcp_eat_skb(psock->sk, skb); sock_drop(psock->sk, skb); } return err; } static void sk_psock_write_space(struct sock *sk) { struct sk_psock *psock; void (*write_space)(struct sock *sk) = NULL; rcu_read_lock(); psock = sk_psock(sk); if (likely(psock)) { if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) schedule_delayed_work(&psock->work, 0); write_space = psock->saved_write_space; } rcu_read_unlock(); if (write_space) write_space(sk); } #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb) { struct sk_psock *psock; struct bpf_prog *prog; int ret = __SK_DROP; struct sock *sk; rcu_read_lock(); sk = strp->sk; psock = sk_psock(sk); if (unlikely(!psock)) { sock_drop(sk, skb); goto out; } prog = READ_ONCE(psock->progs.stream_verdict); if (likely(prog)) { skb->sk = sk; skb_dst_drop(skb); skb_bpf_redirect_clear(skb); ret = bpf_prog_run_pin_on_cpu(prog, skb); skb_bpf_set_strparser(skb); ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); skb->sk = NULL; } 
sk_psock_verdict_apply(psock, skb, ret); out: rcu_read_unlock(); } static int sk_psock_strp_read_done(struct strparser *strp, int err) { return err; } static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb) { struct sk_psock *psock = container_of(strp, struct sk_psock, strp); struct bpf_prog *prog; int ret = skb->len; rcu_read_lock(); prog = READ_ONCE(psock->progs.stream_parser); if (likely(prog)) { skb->sk = psock->sk; ret = bpf_prog_run_pin_on_cpu(prog, skb); skb->sk = NULL; } rcu_read_unlock(); return ret; } /* Called with socket lock held. */ static void sk_psock_strp_data_ready(struct sock *sk) { struct sk_psock *psock; trace_sk_data_ready(sk); rcu_read_lock(); psock = sk_psock(sk); if (likely(psock)) { if (tls_sw_has_ctx_rx(sk)) { psock->saved_data_ready(sk); } else { read_lock_bh(&sk->sk_callback_lock); strp_data_ready(&psock->strp); read_unlock_bh(&sk->sk_callback_lock); } } rcu_read_unlock(); } int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) { int ret; static const struct strp_callbacks cb = { .rcv_msg = sk_psock_strp_read, .read_sock_done = sk_psock_strp_read_done, .parse_msg = sk_psock_strp_parse, }; ret = strp_init(&psock->strp, sk, &cb); if (!ret) sk_psock_set_state(psock, SK_PSOCK_RX_STRP_ENABLED); return ret; } void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) { if (psock->saved_data_ready) return; psock->saved_data_ready = sk->sk_data_ready; sk->sk_data_ready = sk_psock_strp_data_ready; sk->sk_write_space = sk_psock_write_space; } void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) { psock_set_prog(&psock->progs.stream_parser, NULL); if (!psock->saved_data_ready) return; sk->sk_data_ready = psock->saved_data_ready; psock->saved_data_ready = NULL; strp_stop(&psock->strp); } static void sk_psock_done_strp(struct sk_psock *psock) { /* Parser has been stopped */ if (sk_psock_test_state(psock, SK_PSOCK_RX_STRP_ENABLED)) strp_done(&psock->strp); } #else static void sk_psock_done_strp(struct 
sk_psock *psock) { } #endif /* CONFIG_BPF_STREAM_PARSER */ static int sk_psock_verdict_recv(struct sock *sk, struct sk_buff *skb) { struct sk_psock *psock; struct bpf_prog *prog; int ret = __SK_DROP; int len = skb->len; rcu_read_lock(); psock = sk_psock(sk); if (unlikely(!psock)) { len = 0; tcp_eat_skb(sk, skb); sock_drop(sk, skb); goto out; } prog = READ_ONCE(psock->progs.stream_verdict); if (!prog) prog = READ_ONCE(psock->progs.skb_verdict); if (likely(prog)) { skb_dst_drop(skb); skb_bpf_redirect_clear(skb); ret = bpf_prog_run_pin_on_cpu(prog, skb); ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); } ret = sk_psock_verdict_apply(psock, skb, ret); if (ret < 0) len = ret; out: rcu_read_unlock(); return len; } static void sk_psock_verdict_data_ready(struct sock *sk) { struct socket *sock = sk->sk_socket; const struct proto_ops *ops; int copied; trace_sk_data_ready(sk); if (unlikely(!sock)) return; ops = READ_ONCE(sock->ops); if (!ops || !ops->read_skb) return; copied = ops->read_skb(sk, sk_psock_verdict_recv); if (copied >= 0) { struct sk_psock *psock; rcu_read_lock(); psock = sk_psock(sk); if (psock) sk_psock_data_ready(sk, psock); rcu_read_unlock(); } } void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock) { if (psock->saved_data_ready) return; psock->saved_data_ready = sk->sk_data_ready; sk->sk_data_ready = sk_psock_verdict_data_ready; sk->sk_write_space = sk_psock_write_space; } void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock) { psock_set_prog(&psock->progs.stream_verdict, NULL); psock_set_prog(&psock->progs.skb_verdict, NULL); if (!psock->saved_data_ready) return; sk->sk_data_ready = psock->saved_data_ready; psock->saved_data_ready = NULL; }
8 213 1 1 178 8 209 99 62 66 18 28 83 377 377 8 33 218 231 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 
513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 
1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 /* SPDX-License-Identifier: GPL-2.0 */ /* * * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved. * */ // clang-format off #ifndef _LINUX_NTFS3_NTFS_FS_H #define _LINUX_NTFS3_NTFS_FS_H #include <linux/blkdev.h> #include <linux/buffer_head.h> #include <linux/fs.h> #include <linux/highmem.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/mutex.h> #include <linux/page-flags.h> #include <linux/pagemap.h> #include <linux/rbtree.h> #include <linux/rwsem.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/time64.h> #include <linux/types.h> #include <linux/uidgid.h> #include <asm/div64.h> #include <asm/page.h> #include "debug.h" #include "ntfs.h" struct dentry; struct fiemap_extent_info; struct user_namespace; struct page; struct writeback_control; enum utf16_endian; #define MINUS_ONE_T ((size_t)(-1)) /* Biggest MFT / smallest cluster */ #define MAXIMUM_BYTES_PER_MFT 4096 #define MAXIMUM_SHIFT_BYTES_PER_MFT 12 #define NTFS_BLOCKS_PER_MFT_RECORD (MAXIMUM_BYTES_PER_MFT / 512) #define MAXIMUM_BYTES_PER_INDEX 4096 #define MAXIMUM_SHIFT_BYTES_PER_INDEX 12 #define NTFS_BLOCKS_PER_INODE (MAXIMUM_BYTES_PER_INDEX / 512) /* NTFS specific error code when fixup failed. 
*/ #define E_NTFS_FIXUP 555 /* NTFS specific error code about resident->nonresident. */ #define E_NTFS_NONRESIDENT 556 /* NTFS specific error code about punch hole. */ #define E_NTFS_NOTALIGNED 557 /* NTFS specific error code when on-disk struct is corrupted. */ #define E_NTFS_CORRUPT 558 /* sbi->flags */ #define NTFS_FLAGS_NODISCARD 0x00000001 /* ntfs in shutdown state. */ #define NTFS_FLAGS_SHUTDOWN_BIT 0x00000002 /* == 4*/ /* Set when LogFile is replaying. */ #define NTFS_FLAGS_LOG_REPLAYING 0x00000008 /* Set when we changed first MFT's which copy must be updated in $MftMirr. */ #define NTFS_FLAGS_MFTMIRR 0x00001000 #define NTFS_FLAGS_NEED_REPLAY 0x04000000 /* ni->ni_flags */ /* * Data attribute is external compressed (LZX/Xpress) * 1 - WOF_COMPRESSION_XPRESS4K * 2 - WOF_COMPRESSION_XPRESS8K * 3 - WOF_COMPRESSION_XPRESS16K * 4 - WOF_COMPRESSION_LZX32K */ #define NI_FLAG_COMPRESSED_MASK 0x0000000f /* Data attribute is deduplicated. */ #define NI_FLAG_DEDUPLICATED 0x00000010 #define NI_FLAG_EA 0x00000020 #define NI_FLAG_DIR 0x00000040 #define NI_FLAG_RESIDENT 0x00000080 #define NI_FLAG_UPDATE_PARENT 0x00000100 // clang-format on struct ntfs_mount_options { char *nls_name; struct nls_table *nls; kuid_t fs_uid; kgid_t fs_gid; u16 fs_fmask_inv; u16 fs_dmask_inv; unsigned fmask : 1; /* fmask was set. */ unsigned dmask : 1; /*dmask was set. */ unsigned sys_immutable : 1; /* Immutable system files. */ unsigned discard : 1; /* Issue discard requests on deletions. */ unsigned sparse : 1; /* Create sparse files. */ unsigned showmeta : 1; /* Show meta files. */ unsigned nohidden : 1; /* Do not show hidden files. */ unsigned hide_dot_files : 1; /* Set hidden flag on dot files. */ unsigned windows_names : 1; /* Disallow names forbidden by Windows. */ unsigned force : 1; /* RW mount dirty volume. */ unsigned prealloc : 1; /* Preallocate space when file is growing. */ unsigned nocase : 1; /* case insensitive. */ }; /* Special value to unpack and deallocate. 
*/ #define RUN_DEALLOCATE ((struct runs_tree *)(size_t)1) /* TODO: Use rb tree instead of array. */ struct runs_tree { struct ntfs_run *runs; size_t count; /* Currently used size a ntfs_run storage. */ size_t allocated; /* Currently allocated ntfs_run storage size. */ }; struct ntfs_buffers { /* Biggest MFT / smallest cluster = 4096 / 512 = 8 */ /* Biggest index / smallest cluster = 4096 / 512 = 8 */ struct buffer_head *bh[PAGE_SIZE >> SECTOR_SHIFT]; u32 bytes; u32 nbufs; u32 off; }; enum ALLOCATE_OPT { ALLOCATE_DEF = 0, // Allocate all clusters. ALLOCATE_MFT = 1, // Allocate for MFT. ALLOCATE_ZERO = 2, // Zeroout new allocated clusters }; enum bitmap_mutex_classes { BITMAP_MUTEX_CLUSTERS = 0, BITMAP_MUTEX_MFT = 1, }; struct wnd_bitmap { struct super_block *sb; struct rw_semaphore rw_lock; struct runs_tree run; size_t nbits; size_t total_zeroes; // Total number of free bits. u16 *free_bits; // Free bits in each window. size_t nwnd; u32 bits_last; // Bits in last window. struct rb_root start_tree; // Extents, sorted by 'start'. struct rb_root count_tree; // Extents, sorted by 'count + start'. size_t count; // Extents count. /* * -1 Tree is activated but not updated (too many fragments). * 0 - Tree is not activated. * 1 - Tree is activated and updated. */ int uptodated; size_t extent_min; // Minimal extent used while building. size_t extent_max; // Upper estimate of biggest free block. /* Zone [bit, end) */ size_t zone_bit; size_t zone_end; bool inited; }; typedef int (*NTFS_CMP_FUNC)(const void *key1, size_t len1, const void *key2, size_t len2, const void *param); enum index_mutex_classed { INDEX_MUTEX_I30 = 0, INDEX_MUTEX_SII = 1, INDEX_MUTEX_SDH = 2, INDEX_MUTEX_SO = 3, INDEX_MUTEX_SQ = 4, INDEX_MUTEX_SR = 5, INDEX_MUTEX_TOTAL }; /* ntfs_index - Allocation unit inside directory. 
*/ struct ntfs_index { struct runs_tree bitmap_run; struct runs_tree alloc_run; /* read/write access to 'bitmap_run'/'alloc_run' while ntfs_readdir */ struct rw_semaphore run_lock; /*TODO: Remove 'cmp'. */ NTFS_CMP_FUNC cmp; u8 index_bits; // log2(root->index_block_size) u8 idx2vbn_bits; // log2(root->index_block_clst) u8 vbn2vbo_bits; // index_block_size < cluster? 9 : cluster_bits u8 type; // index_mutex_classed }; /* Minimum MFT zone. */ #define NTFS_MIN_MFT_ZONE 100 /* Step to increase the MFT. */ #define NTFS_MFT_INCREASE_STEP 1024 /* Ntfs file system in-core superblock data. */ struct ntfs_sb_info { struct super_block *sb; u32 discard_granularity; u64 discard_granularity_mask_inv; // ~(discard_granularity_mask_inv-1) u32 cluster_size; // bytes per cluster u32 cluster_mask; // == cluster_size - 1 u64 cluster_mask_inv; // ~(cluster_size - 1) u32 block_mask; // sb->s_blocksize - 1 u32 blocks_per_cluster; // cluster_size / sb->s_blocksize u32 record_size; u32 index_size; u8 cluster_bits; u8 record_bits; u64 maxbytes; // Maximum size for normal files. u64 maxbytes_sparse; // Maximum size for sparse file. unsigned long flags; // See NTFS_FLAGS_ CLST zone_max; // Maximum MFT zone length in clusters CLST bad_clusters; // The count of marked bad clusters. u16 max_bytes_per_attr; // Maximum attribute size in record. u16 attr_size_tr; // Attribute size threshold (320 bytes). /* Records in $Extend. */ CLST objid_no; CLST quota_no; CLST reparse_no; CLST usn_jrnl_no; struct ATTR_DEF_ENTRY *def_table; // Attribute definition table. u32 def_entries; u32 ea_max_size; struct MFT_REC *new_rec; u16 *upcase; struct { u64 lbo, lbo2; struct ntfs_inode *ni; struct wnd_bitmap bitmap; // $MFT::Bitmap /* * MFT records [11-24) used to expand MFT itself. * They always marked as used in $MFT::Bitmap * 'reserved_bitmap' contains real bitmap of these records. 
*/ ulong reserved_bitmap; // Bitmap of used records [11 - 24) size_t next_free; // The next record to allocate from size_t used; // MFT valid size in records. u32 recs_mirr; // Number of records in MFTMirr u8 next_reserved; u8 reserved_bitmap_inited; } mft; struct { struct wnd_bitmap bitmap; // $Bitmap::Data CLST next_free_lcn; } used; struct { u64 size; // In bytes. u64 blocks; // In blocks. u64 ser_num; struct ntfs_inode *ni; __le16 flags; // Cached current VOLUME_INFO::flags, VOLUME_FLAG_DIRTY. u8 major_ver; u8 minor_ver; char label[256]; bool real_dirty; // Real fs state. } volume; struct { struct ntfs_index index_sii; struct ntfs_index index_sdh; struct ntfs_inode *ni; u32 next_id; u64 next_off; __le32 def_security_id; } security; struct { struct ntfs_index index_r; struct ntfs_inode *ni; u64 max_size; // 16K } reparse; struct { struct ntfs_index index_o; struct ntfs_inode *ni; } objid; struct { struct mutex mtx_lznt; struct lznt *lznt; #ifdef CONFIG_NTFS3_LZX_XPRESS struct mutex mtx_xpress; struct xpress_decompressor *xpress; struct mutex mtx_lzx; struct lzx_decompressor *lzx; #endif } compress; struct ntfs_mount_options *options; struct ratelimit_state msg_ratelimit; struct proc_dir_entry *procdir; }; /* One MFT record(usually 1024 bytes), consists of attributes. */ struct mft_inode { struct rb_node node; struct ntfs_sb_info *sbi; struct MFT_REC *mrec; struct ntfs_buffers nb; CLST rno; bool dirty; }; /* Nested class for ntfs_inode::ni_lock. */ enum ntfs_inode_mutex_lock_class { NTFS_INODE_MUTEX_DIRTY = 1, NTFS_INODE_MUTEX_SECURITY, NTFS_INODE_MUTEX_OBJID, NTFS_INODE_MUTEX_REPARSE, NTFS_INODE_MUTEX_NORMAL, NTFS_INODE_MUTEX_PARENT, NTFS_INODE_MUTEX_PARENT2, }; /* * struct ntfs_inode * * Ntfs inode - extends linux inode. consists of one or more MFT inodes. */ struct ntfs_inode { struct mft_inode mi; // base record /* * Valid size: [0 - i_valid) - these range in file contains valid data. * Range [i_valid - inode->i_size) - contains 0. 
* Usually i_valid <= inode->i_size. */ u64 i_valid; struct timespec64 i_crtime; struct mutex ni_lock; /* File attributes from std. */ enum FILE_ATTRIBUTE std_fa; __le32 std_security_id; /* * Tree of mft_inode. * Not empty when primary MFT record (usually 1024 bytes) can't save all attributes * e.g. file becomes too fragmented or contains a lot of names. */ struct rb_root mi_tree; /* * This member is used in ntfs_readdir to ensure that all subrecords are loaded */ u8 mi_loaded; union { struct ntfs_index dir; struct { struct rw_semaphore run_lock; struct runs_tree run; #ifdef CONFIG_NTFS3_LZX_XPRESS struct folio *offs_folio; #endif } file; }; struct { struct runs_tree run; struct ATTR_LIST_ENTRY *le; // 1K aligned memory. size_t size; bool dirty; } attr_list; size_t ni_flags; // NI_FLAG_XXX struct inode vfs_inode; }; struct indx_node { struct ntfs_buffers nb; struct INDEX_BUFFER *index; }; struct ntfs_fnd { int level; struct indx_node *nodes[20]; struct NTFS_DE *de[20]; struct NTFS_DE *root_de; }; enum REPARSE_SIGN { REPARSE_NONE = 0, REPARSE_COMPRESSED = 1, REPARSE_DEDUPLICATED = 2, REPARSE_LINK = 3 }; /* Functions from attrib.c */ int attr_allocate_clusters(struct ntfs_sb_info *sbi, struct runs_tree *run, CLST vcn, CLST lcn, CLST len, CLST *pre_alloc, enum ALLOCATE_OPT opt, CLST *alen, const size_t fr, CLST *new_lcn, CLST *new_len); int attr_make_nonresident(struct ntfs_inode *ni, struct ATTRIB *attr, struct ATTR_LIST_ENTRY *le, struct mft_inode *mi, u64 new_size, struct runs_tree *run, struct ATTRIB **ins_attr, struct page *page); int attr_set_size(struct ntfs_inode *ni, enum ATTR_TYPE type, const __le16 *name, u8 name_len, struct runs_tree *run, u64 new_size, const u64 *new_valid, bool keep_prealloc, struct ATTRIB **ret); int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn, CLST *len, bool *new, bool zero); int attr_data_read_resident(struct ntfs_inode *ni, struct folio *folio); int attr_data_write_resident(struct ntfs_inode *ni, struct 
folio *folio); int attr_load_runs_vcn(struct ntfs_inode *ni, enum ATTR_TYPE type, const __le16 *name, u8 name_len, struct runs_tree *run, CLST vcn); int attr_load_runs_range(struct ntfs_inode *ni, enum ATTR_TYPE type, const __le16 *name, u8 name_len, struct runs_tree *run, u64 from, u64 to); int attr_wof_frame_info(struct ntfs_inode *ni, struct ATTRIB *attr, struct runs_tree *run, u64 frame, u64 frames, u8 frame_bits, u32 *ondisk_size, u64 *vbo_data); int attr_is_frame_compressed(struct ntfs_inode *ni, struct ATTRIB *attr, CLST frame, CLST *clst_data, struct runs_tree *run); int attr_allocate_frame(struct ntfs_inode *ni, CLST frame, size_t compr_size, u64 new_valid); int attr_collapse_range(struct ntfs_inode *ni, u64 vbo, u64 bytes); int attr_insert_range(struct ntfs_inode *ni, u64 vbo, u64 bytes); int attr_punch_hole(struct ntfs_inode *ni, u64 vbo, u64 bytes, u32 *frame_size); int attr_force_nonresident(struct ntfs_inode *ni); int attr_set_compress(struct ntfs_inode *ni, bool compr); /* Functions from attrlist.c */ void al_destroy(struct ntfs_inode *ni); bool al_verify(struct ntfs_inode *ni); int ntfs_load_attr_list(struct ntfs_inode *ni, struct ATTRIB *attr); struct ATTR_LIST_ENTRY *al_enumerate(struct ntfs_inode *ni, struct ATTR_LIST_ENTRY *le); struct ATTR_LIST_ENTRY *al_find_le(struct ntfs_inode *ni, struct ATTR_LIST_ENTRY *le, const struct ATTRIB *attr); struct ATTR_LIST_ENTRY *al_find_ex(struct ntfs_inode *ni, struct ATTR_LIST_ENTRY *le, enum ATTR_TYPE type, const __le16 *name, u8 name_len, const CLST *vcn); int al_add_le(struct ntfs_inode *ni, enum ATTR_TYPE type, const __le16 *name, u8 name_len, CLST svcn, __le16 id, const struct MFT_REF *ref, struct ATTR_LIST_ENTRY **new_le); bool al_remove_le(struct ntfs_inode *ni, struct ATTR_LIST_ENTRY *le); int al_update(struct ntfs_inode *ni, int sync); static inline size_t al_aligned(size_t size) { return size_add(size, 1023) & ~(size_t)1023; } /* Globals from bitfunc.c */ bool are_bits_clear(const void *map, size_t 
bit, size_t nbits); bool are_bits_set(const void *map, size_t bit, size_t nbits); size_t get_set_bits_ex(const void *map, size_t bit, size_t nbits); /* Globals from dir.c */ int ntfs_utf16_to_nls(struct ntfs_sb_info *sbi, const __le16 *name, u32 len, u8 *buf, int buf_len); int ntfs_nls_to_utf16(struct ntfs_sb_info *sbi, const u8 *name, u32 name_len, struct cpu_str *uni, u32 max_ulen, enum utf16_endian endian); struct inode *dir_search_u(struct inode *dir, const struct cpu_str *uni, struct ntfs_fnd *fnd); bool dir_is_empty(struct inode *dir); extern const struct file_operations ntfs_dir_operations; extern const struct file_operations ntfs_legacy_dir_operations; /* Globals from file.c */ int ntfs_fileattr_get(struct dentry *dentry, struct fileattr *fa); int ntfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); int ntfs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, u32 flags); int ntfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); int ntfs_file_open(struct inode *inode, struct file *file); int ntfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); long ntfs_ioctl(struct file *filp, u32 cmd, unsigned long arg); long ntfs_compat_ioctl(struct file *filp, u32 cmd, unsigned long arg); extern const struct inode_operations ntfs_special_inode_operations; extern const struct inode_operations ntfs_file_inode_operations; extern const struct file_operations ntfs_file_operations; extern const struct file_operations ntfs_legacy_file_operations; /* Globals from frecord.c */ void ni_remove_mi(struct ntfs_inode *ni, struct mft_inode *mi); struct ATTR_STD_INFO *ni_std(struct ntfs_inode *ni); struct ATTR_STD_INFO5 *ni_std5(struct ntfs_inode *ni); void ni_clear(struct ntfs_inode *ni); int ni_load_mi_ex(struct ntfs_inode *ni, CLST rno, struct mft_inode **mi); int ni_load_mi(struct ntfs_inode *ni, const struct ATTR_LIST_ENTRY *le, struct 
mft_inode **mi); struct ATTRIB *ni_find_attr(struct ntfs_inode *ni, struct ATTRIB *attr, struct ATTR_LIST_ENTRY **entry_o, enum ATTR_TYPE type, const __le16 *name, u8 name_len, const CLST *vcn, struct mft_inode **mi); struct ATTRIB *ni_enum_attr_ex(struct ntfs_inode *ni, struct ATTRIB *attr, struct ATTR_LIST_ENTRY **le, struct mft_inode **mi); struct ATTRIB *ni_load_attr(struct ntfs_inode *ni, enum ATTR_TYPE type, const __le16 *name, u8 name_len, CLST vcn, struct mft_inode **pmi); int ni_load_all_mi(struct ntfs_inode *ni); bool ni_add_subrecord(struct ntfs_inode *ni, CLST rno, struct mft_inode **mi); int ni_remove_attr(struct ntfs_inode *ni, enum ATTR_TYPE type, const __le16 *name, u8 name_len, bool base_only, const __le16 *id); int ni_create_attr_list(struct ntfs_inode *ni); int ni_expand_list(struct ntfs_inode *ni); int ni_insert_nonresident(struct ntfs_inode *ni, enum ATTR_TYPE type, const __le16 *name, u8 name_len, const struct runs_tree *run, CLST svcn, CLST len, __le16 flags, struct ATTRIB **new_attr, struct mft_inode **mi, struct ATTR_LIST_ENTRY **le); int ni_insert_resident(struct ntfs_inode *ni, u32 data_size, enum ATTR_TYPE type, const __le16 *name, u8 name_len, struct ATTRIB **new_attr, struct mft_inode **mi, struct ATTR_LIST_ENTRY **le); void ni_remove_attr_le(struct ntfs_inode *ni, struct ATTRIB *attr, struct mft_inode *mi, struct ATTR_LIST_ENTRY *le); int ni_delete_all(struct ntfs_inode *ni); struct ATTR_FILE_NAME *ni_fname_name(struct ntfs_inode *ni, const struct le_str *uni, const struct MFT_REF *home, struct mft_inode **mi, struct ATTR_LIST_ENTRY **entry); struct ATTR_FILE_NAME *ni_fname_type(struct ntfs_inode *ni, u8 name_type, struct mft_inode **mi, struct ATTR_LIST_ENTRY **entry); int ni_new_attr_flags(struct ntfs_inode *ni, enum FILE_ATTRIBUTE new_fa); enum REPARSE_SIGN ni_parse_reparse(struct ntfs_inode *ni, struct ATTRIB *attr, struct REPARSE_DATA_BUFFER *buffer); int ni_write_inode(struct inode *inode, int sync, const char *hint); #define 
_ni_write_inode(i, w) ni_write_inode(i, w, __func__) int ni_fiemap(struct ntfs_inode *ni, struct fiemap_extent_info *fieinfo, __u64 vbo, __u64 len); int ni_readpage_cmpr(struct ntfs_inode *ni, struct folio *folio); int ni_decompress_file(struct ntfs_inode *ni); int ni_read_frame(struct ntfs_inode *ni, u64 frame_vbo, struct page **pages, u32 pages_per_frame); int ni_write_frame(struct ntfs_inode *ni, struct page **pages, u32 pages_per_frame); int ni_remove_name(struct ntfs_inode *dir_ni, struct ntfs_inode *ni, struct NTFS_DE *de, struct NTFS_DE **de2, int *undo_step); bool ni_remove_name_undo(struct ntfs_inode *dir_ni, struct ntfs_inode *ni, struct NTFS_DE *de, struct NTFS_DE *de2, int undo_step); int ni_add_name(struct ntfs_inode *dir_ni, struct ntfs_inode *ni, struct NTFS_DE *de); int ni_rename(struct ntfs_inode *dir_ni, struct ntfs_inode *new_dir_ni, struct ntfs_inode *ni, struct NTFS_DE *de, struct NTFS_DE *new_de, bool *is_bad); bool ni_is_dirty(struct inode *inode); int ni_set_compress(struct inode *inode, bool compr); /* Globals from fslog.c */ bool check_index_header(const struct INDEX_HDR *hdr, size_t bytes); int log_replay(struct ntfs_inode *ni, bool *initialized); /* Globals from fsntfs.c */ struct buffer_head *ntfs_bread(struct super_block *sb, sector_t block); bool ntfs_fix_pre_write(struct NTFS_RECORD_HEADER *rhdr, size_t bytes); int ntfs_fix_post_read(struct NTFS_RECORD_HEADER *rhdr, size_t bytes, bool simple); int ntfs_extend_init(struct ntfs_sb_info *sbi); int ntfs_loadlog_and_replay(struct ntfs_inode *ni, struct ntfs_sb_info *sbi); int ntfs_look_for_free_space(struct ntfs_sb_info *sbi, CLST lcn, CLST len, CLST *new_lcn, CLST *new_len, enum ALLOCATE_OPT opt); bool ntfs_check_for_free_space(struct ntfs_sb_info *sbi, CLST clen, CLST mlen); int ntfs_look_free_mft(struct ntfs_sb_info *sbi, CLST *rno, bool mft, struct ntfs_inode *ni, struct mft_inode **mi); void ntfs_mark_rec_free(struct ntfs_sb_info *sbi, CLST rno, bool is_mft); int 
ntfs_clear_mft_tail(struct ntfs_sb_info *sbi, size_t from, size_t to); int ntfs_refresh_zone(struct ntfs_sb_info *sbi); void ntfs_update_mftmirr(struct ntfs_sb_info *sbi, int wait); void ntfs_bad_inode(struct inode *inode, const char *hint); #define _ntfs_bad_inode(i) ntfs_bad_inode(i, __func__) enum NTFS_DIRTY_FLAGS { NTFS_DIRTY_CLEAR = 0, NTFS_DIRTY_DIRTY = 1, NTFS_DIRTY_ERROR = 2, }; int ntfs_set_state(struct ntfs_sb_info *sbi, enum NTFS_DIRTY_FLAGS dirty); int ntfs_sb_read(struct super_block *sb, u64 lbo, size_t bytes, void *buffer); int ntfs_sb_write(struct super_block *sb, u64 lbo, size_t bytes, const void *buffer, int wait); int ntfs_sb_write_run(struct ntfs_sb_info *sbi, const struct runs_tree *run, u64 vbo, const void *buf, size_t bytes, int sync); struct buffer_head *ntfs_bread_run(struct ntfs_sb_info *sbi, const struct runs_tree *run, u64 vbo); int ntfs_read_run_nb(struct ntfs_sb_info *sbi, const struct runs_tree *run, u64 vbo, void *buf, u32 bytes, struct ntfs_buffers *nb); int ntfs_read_bh(struct ntfs_sb_info *sbi, const struct runs_tree *run, u64 vbo, struct NTFS_RECORD_HEADER *rhdr, u32 bytes, struct ntfs_buffers *nb); int ntfs_get_bh(struct ntfs_sb_info *sbi, const struct runs_tree *run, u64 vbo, u32 bytes, struct ntfs_buffers *nb); int ntfs_write_bh(struct ntfs_sb_info *sbi, struct NTFS_RECORD_HEADER *rhdr, struct ntfs_buffers *nb, int sync); int ntfs_bio_pages(struct ntfs_sb_info *sbi, const struct runs_tree *run, struct page **pages, u32 nr_pages, u64 vbo, u32 bytes, enum req_op op); int ntfs_bio_fill_1(struct ntfs_sb_info *sbi, const struct runs_tree *run); int ntfs_vbo_to_lbo(struct ntfs_sb_info *sbi, const struct runs_tree *run, u64 vbo, u64 *lbo, u64 *bytes); struct ntfs_inode *ntfs_new_inode(struct ntfs_sb_info *sbi, CLST nRec, enum RECORD_FLAG flag); extern const u8 s_default_security[0x50]; bool is_sd_valid(const struct SECURITY_DESCRIPTOR_RELATIVE *sd, u32 len); int ntfs_security_init(struct ntfs_sb_info *sbi); int 
ntfs_get_security_by_id(struct ntfs_sb_info *sbi, __le32 security_id, struct SECURITY_DESCRIPTOR_RELATIVE **sd, size_t *size); int ntfs_insert_security(struct ntfs_sb_info *sbi, const struct SECURITY_DESCRIPTOR_RELATIVE *sd, u32 size, __le32 *security_id, bool *inserted); int ntfs_reparse_init(struct ntfs_sb_info *sbi); int ntfs_objid_init(struct ntfs_sb_info *sbi); int ntfs_objid_remove(struct ntfs_sb_info *sbi, struct GUID *guid); int ntfs_insert_reparse(struct ntfs_sb_info *sbi, __le32 rtag, const struct MFT_REF *ref); int ntfs_remove_reparse(struct ntfs_sb_info *sbi, __le32 rtag, const struct MFT_REF *ref); void mark_as_free_ex(struct ntfs_sb_info *sbi, CLST lcn, CLST len, bool trim); int run_deallocate(struct ntfs_sb_info *sbi, const struct runs_tree *run, bool trim); bool valid_windows_name(struct ntfs_sb_info *sbi, const struct le_str *name); int ntfs_set_label(struct ntfs_sb_info *sbi, u8 *label, int len); /* Globals from index.c */ int indx_used_bit(struct ntfs_index *indx, struct ntfs_inode *ni, size_t *bit); void fnd_clear(struct ntfs_fnd *fnd); static inline struct ntfs_fnd *fnd_get(void) { return kzalloc(sizeof(struct ntfs_fnd), GFP_NOFS); } static inline void fnd_put(struct ntfs_fnd *fnd) { if (fnd) { fnd_clear(fnd); kfree(fnd); } } void indx_clear(struct ntfs_index *idx); int indx_init(struct ntfs_index *indx, struct ntfs_sb_info *sbi, const struct ATTRIB *attr, enum index_mutex_classed type); struct INDEX_ROOT *indx_get_root(struct ntfs_index *indx, struct ntfs_inode *ni, struct ATTRIB **attr, struct mft_inode **mi); int indx_read(struct ntfs_index *idx, struct ntfs_inode *ni, CLST vbn, struct indx_node **node); int indx_find(struct ntfs_index *indx, struct ntfs_inode *dir, const struct INDEX_ROOT *root, const void *Key, size_t KeyLen, const void *param, int *diff, struct NTFS_DE **entry, struct ntfs_fnd *fnd); int indx_find_sort(struct ntfs_index *indx, struct ntfs_inode *ni, const struct INDEX_ROOT *root, struct NTFS_DE **entry, struct ntfs_fnd 
*fnd); int indx_find_raw(struct ntfs_index *indx, struct ntfs_inode *ni, const struct INDEX_ROOT *root, struct NTFS_DE **entry, size_t *off, struct ntfs_fnd *fnd); int indx_insert_entry(struct ntfs_index *indx, struct ntfs_inode *ni, const struct NTFS_DE *new_de, const void *param, struct ntfs_fnd *fnd, bool undo); int indx_delete_entry(struct ntfs_index *indx, struct ntfs_inode *ni, const void *key, u32 key_len, const void *param); int indx_update_dup(struct ntfs_inode *ni, struct ntfs_sb_info *sbi, const struct ATTR_FILE_NAME *fname, const struct NTFS_DUP_INFO *dup, int sync); /* Globals from inode.c */ struct inode *ntfs_iget5(struct super_block *sb, const struct MFT_REF *ref, const struct cpu_str *name); int ntfs_set_size(struct inode *inode, u64 new_size); int ntfs_get_block(struct inode *inode, sector_t vbn, struct buffer_head *bh_result, int create); int ntfs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, u32 len, struct folio **foliop, void **fsdata); int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, u32 len, u32 copied, struct folio *folio, void *fsdata); int ntfs3_write_inode(struct inode *inode, struct writeback_control *wbc); int ntfs_sync_inode(struct inode *inode); int ntfs_flush_inodes(struct super_block *sb, struct inode *i1, struct inode *i2); int inode_read_data(struct inode *inode, void *data, size_t bytes); int ntfs_create_inode(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const struct cpu_str *uni, umode_t mode, dev_t dev, const char *symname, u32 size, struct ntfs_fnd *fnd); int ntfs_link_inode(struct inode *inode, struct dentry *dentry); int ntfs_unlink_inode(struct inode *dir, const struct dentry *dentry); void ntfs_evict_inode(struct inode *inode); extern const struct inode_operations ntfs_link_inode_operations; extern const struct address_space_operations ntfs_aops; extern const struct address_space_operations ntfs_aops_cmpr; /* Globals from name_i.c */ int 
fill_name_de(struct ntfs_sb_info *sbi, void *buf, const struct qstr *name, const struct cpu_str *uni); struct dentry *ntfs3_get_parent(struct dentry *child); extern const struct inode_operations ntfs_dir_inode_operations; extern const struct inode_operations ntfs_special_inode_operations; extern const struct dentry_operations ntfs_dentry_ops; /* Globals from record.c */ int mi_get(struct ntfs_sb_info *sbi, CLST rno, struct mft_inode **mi); void mi_put(struct mft_inode *mi); int mi_init(struct mft_inode *mi, struct ntfs_sb_info *sbi, CLST rno); int mi_read(struct mft_inode *mi, bool is_mft); struct ATTRIB *mi_enum_attr(struct ntfs_inode *ni, struct mft_inode *mi, struct ATTRIB *attr); struct ATTRIB *mi_find_attr(struct ntfs_inode *ni, struct mft_inode *mi, struct ATTRIB *attr, enum ATTR_TYPE type, const __le16 *name, u8 name_len, const __le16 *id); static inline struct ATTRIB *rec_find_attr_le(struct ntfs_inode *ni, struct mft_inode *rec, struct ATTR_LIST_ENTRY *le) { return mi_find_attr(ni, rec, NULL, le->type, le_name(le), le->name_len, &le->id); } int mi_write(struct mft_inode *mi, int wait); int mi_format_new(struct mft_inode *mi, struct ntfs_sb_info *sbi, CLST rno, __le16 flags, bool is_mft); struct ATTRIB *mi_insert_attr(struct ntfs_inode *ni, struct mft_inode *mi, enum ATTR_TYPE type, const __le16 *name, u8 name_len, u32 asize, u16 name_off); bool mi_remove_attr(struct ntfs_inode *ni, struct mft_inode *mi, struct ATTRIB *attr); bool mi_resize_attr(struct mft_inode *mi, struct ATTRIB *attr, int bytes); int mi_pack_runs(struct mft_inode *mi, struct ATTRIB *attr, struct runs_tree *run, CLST len); static inline bool mi_is_ref(const struct mft_inode *mi, const struct MFT_REF *ref) { if (le32_to_cpu(ref->low) != mi->rno) return false; if (ref->seq != mi->mrec->seq) return false; #ifdef CONFIG_NTFS3_64BIT_CLUSTER return le16_to_cpu(ref->high) == (mi->rno >> 32); #else return !ref->high; #endif } static inline void mi_get_ref(const struct mft_inode *mi, struct 
MFT_REF *ref) { ref->low = cpu_to_le32(mi->rno); #ifdef CONFIG_NTFS3_64BIT_CLUSTER ref->high = cpu_to_le16(mi->rno >> 32); #else ref->high = 0; #endif ref->seq = mi->mrec->seq; } /* Globals from run.c */ bool run_lookup_entry(const struct runs_tree *run, CLST vcn, CLST *lcn, CLST *len, size_t *index); void run_truncate(struct runs_tree *run, CLST vcn); void run_truncate_head(struct runs_tree *run, CLST vcn); void run_truncate_around(struct runs_tree *run, CLST vcn); bool run_add_entry(struct runs_tree *run, CLST vcn, CLST lcn, CLST len, bool is_mft); bool run_collapse_range(struct runs_tree *run, CLST vcn, CLST len); bool run_insert_range(struct runs_tree *run, CLST vcn, CLST len); bool run_get_entry(const struct runs_tree *run, size_t index, CLST *vcn, CLST *lcn, CLST *len); bool run_is_mapped_full(const struct runs_tree *run, CLST svcn, CLST evcn); int run_pack(const struct runs_tree *run, CLST svcn, CLST len, u8 *run_buf, u32 run_buf_size, CLST *packed_vcns); int run_unpack(struct runs_tree *run, struct ntfs_sb_info *sbi, CLST ino, CLST svcn, CLST evcn, CLST vcn, const u8 *run_buf, int run_buf_size); #ifdef NTFS3_CHECK_FREE_CLST int run_unpack_ex(struct runs_tree *run, struct ntfs_sb_info *sbi, CLST ino, CLST svcn, CLST evcn, CLST vcn, const u8 *run_buf, int run_buf_size); #else #define run_unpack_ex run_unpack #endif int run_get_highest_vcn(CLST vcn, const u8 *run_buf, u64 *highest_vcn); int run_clone(const struct runs_tree *run, struct runs_tree *new_run); /* Globals from super.c */ void *ntfs_set_shared(void *ptr, u32 bytes); void *ntfs_put_shared(void *ptr); void ntfs_unmap_meta(struct super_block *sb, CLST lcn, CLST len); int ntfs_discard(struct ntfs_sb_info *sbi, CLST Lcn, CLST Len); /* Globals from bitmap.c*/ int __init ntfs3_init_bitmap(void); void ntfs3_exit_bitmap(void); void wnd_close(struct wnd_bitmap *wnd); static inline size_t wnd_zeroes(const struct wnd_bitmap *wnd) { return wnd->total_zeroes; } int wnd_init(struct wnd_bitmap *wnd, struct 
super_block *sb, size_t nbits); int wnd_set_free(struct wnd_bitmap *wnd, size_t bit, size_t bits); int wnd_set_used(struct wnd_bitmap *wnd, size_t bit, size_t bits); int wnd_set_used_safe(struct wnd_bitmap *wnd, size_t bit, size_t bits, size_t *done); bool wnd_is_free(struct wnd_bitmap *wnd, size_t bit, size_t bits); bool wnd_is_used(struct wnd_bitmap *wnd, size_t bit, size_t bits); /* Possible values for 'flags' 'wnd_find'. */ #define BITMAP_FIND_MARK_AS_USED 0x01 #define BITMAP_FIND_FULL 0x02 size_t wnd_find(struct wnd_bitmap *wnd, size_t to_alloc, size_t hint, size_t flags, size_t *allocated); int wnd_extend(struct wnd_bitmap *wnd, size_t new_bits); void wnd_zone_set(struct wnd_bitmap *wnd, size_t Lcn, size_t Len); int ntfs_trim_fs(struct ntfs_sb_info *sbi, struct fstrim_range *range); void ntfs_bitmap_set_le(void *map, unsigned int start, int len); void ntfs_bitmap_clear_le(void *map, unsigned int start, int len); unsigned int ntfs_bitmap_weight_le(const void *bitmap, int bits); /* Globals from upcase.c */ int ntfs_cmp_names(const __le16 *s1, size_t l1, const __le16 *s2, size_t l2, const u16 *upcase, bool bothcase); int ntfs_cmp_names_cpu(const struct cpu_str *uni1, const struct le_str *uni2, const u16 *upcase, bool bothcase); unsigned long ntfs_names_hash(const u16 *name, size_t len, const u16 *upcase, unsigned long hash); /* globals from xattr.c */ #ifdef CONFIG_NTFS3_FS_POSIX_ACL struct posix_acl *ntfs_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, int type); int ntfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); int ntfs_init_acl(struct mnt_idmap *idmap, struct inode *inode, struct inode *dir); #else #define ntfs_get_acl NULL #define ntfs_set_acl NULL #endif int ntfs_acl_chmod(struct mnt_idmap *idmap, struct dentry *dentry); ssize_t ntfs_listxattr(struct dentry *dentry, char *buffer, size_t size); extern const struct xattr_handler *const ntfs_xattr_handlers[]; int ntfs_save_wsl_perm(struct inode *inode, 
__le16 *ea_size); void ntfs_get_wsl_perm(struct inode *inode); /* globals from lznt.c */ struct lznt *get_lznt_ctx(int level); size_t compress_lznt(const void *uncompressed, size_t uncompressed_size, void *compressed, size_t compressed_size, struct lznt *ctx); ssize_t decompress_lznt(const void *compressed, size_t compressed_size, void *uncompressed, size_t uncompressed_size); static inline bool is_ntfs3(struct ntfs_sb_info *sbi) { return sbi->volume.major_ver >= 3; } /* (sb->s_flags & SB_ACTIVE) */ static inline bool is_mounted(struct ntfs_sb_info *sbi) { return !!sbi->sb->s_root; } static inline bool ntfs_is_meta_file(struct ntfs_sb_info *sbi, CLST rno) { return rno < MFT_REC_FREE || rno == sbi->objid_no || rno == sbi->quota_no || rno == sbi->reparse_no || rno == sbi->usn_jrnl_no; } static inline size_t wnd_zone_bit(const struct wnd_bitmap *wnd) { return wnd->zone_bit; } static inline size_t wnd_zone_len(const struct wnd_bitmap *wnd) { return wnd->zone_end - wnd->zone_bit; } static inline void run_init(struct runs_tree *run) { run->runs = NULL; run->count = 0; run->allocated = 0; } static inline struct runs_tree *run_alloc(void) { return kzalloc(sizeof(struct runs_tree), GFP_NOFS); } static inline void run_close(struct runs_tree *run) { kvfree(run->runs); memset(run, 0, sizeof(*run)); } static inline void run_free(struct runs_tree *run) { if (run) { kvfree(run->runs); kfree(run); } } static inline bool run_is_empty(struct runs_tree *run) { return !run->count; } /* NTFS uses quad aligned bitmaps. */ static inline size_t ntfs3_bitmap_size(size_t bits) { return BITS_TO_U64(bits) * sizeof(u64); } #define _100ns2seconds 10000000 #define SecondsToStartOf1970 0x00000002B6109100 #define NTFS_TIME_GRAN 100 /* * kernel2nt - Converts in-memory kernel timestamp into nt time. 
*/ static inline __le64 kernel2nt(const struct timespec64 *ts) { // 10^7 units of 100 nanoseconds one second return cpu_to_le64(_100ns2seconds * (ts->tv_sec + SecondsToStartOf1970) + ts->tv_nsec / NTFS_TIME_GRAN); } /* * nt2kernel - Converts on-disk nt time into kernel timestamp. */ static inline void nt2kernel(const __le64 tm, struct timespec64 *ts) { u64 t = le64_to_cpu(tm) - _100ns2seconds * SecondsToStartOf1970; // WARNING: do_div changes its first argument(!) ts->tv_nsec = do_div(t, _100ns2seconds) * 100; ts->tv_sec = t; } static inline struct ntfs_sb_info *ntfs_sb(struct super_block *sb) { return sb->s_fs_info; } static inline int ntfs3_forced_shutdown(struct super_block *sb) { return test_bit(NTFS_FLAGS_SHUTDOWN_BIT, &ntfs_sb(sb)->flags); } /* * ntfs_up_cluster - Align up on cluster boundary. */ static inline u64 ntfs_up_cluster(const struct ntfs_sb_info *sbi, u64 size) { return (size + sbi->cluster_mask) & sbi->cluster_mask_inv; } /* * ntfs_up_block - Align up on cluster boundary. 
*/ static inline u64 ntfs_up_block(const struct super_block *sb, u64 size) { return (size + sb->s_blocksize - 1) & ~(u64)(sb->s_blocksize - 1); } static inline CLST bytes_to_cluster(const struct ntfs_sb_info *sbi, u64 size) { return (size + sbi->cluster_mask) >> sbi->cluster_bits; } static inline u64 bytes_to_block(const struct super_block *sb, u64 size) { return (size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; } static inline struct ntfs_inode *ntfs_i(struct inode *inode) { return container_of(inode, struct ntfs_inode, vfs_inode); } static inline bool is_compressed(const struct ntfs_inode *ni) { return (ni->std_fa & FILE_ATTRIBUTE_COMPRESSED) || (ni->ni_flags & NI_FLAG_COMPRESSED_MASK); } static inline int ni_ext_compress_bits(const struct ntfs_inode *ni) { return 0xb + (ni->ni_flags & NI_FLAG_COMPRESSED_MASK); } /* Bits - 0xc, 0xd, 0xe, 0xf, 0x10 */ static inline void ni_set_ext_compress_bits(struct ntfs_inode *ni, u8 bits) { ni->ni_flags |= (bits - 0xb) & NI_FLAG_COMPRESSED_MASK; } static inline bool is_dedup(const struct ntfs_inode *ni) { return ni->ni_flags & NI_FLAG_DEDUPLICATED; } static inline bool is_encrypted(const struct ntfs_inode *ni) { return ni->std_fa & FILE_ATTRIBUTE_ENCRYPTED; } static inline bool is_sparsed(const struct ntfs_inode *ni) { return ni->std_fa & FILE_ATTRIBUTE_SPARSE_FILE; } static inline int is_resident(struct ntfs_inode *ni) { return ni->ni_flags & NI_FLAG_RESIDENT; } static inline void le16_sub_cpu(__le16 *var, u16 val) { *var = cpu_to_le16(le16_to_cpu(*var) - val); } static inline void le32_sub_cpu(__le32 *var, u32 val) { *var = cpu_to_le32(le32_to_cpu(*var) - val); } static inline void nb_put(struct ntfs_buffers *nb) { u32 i, nbufs = nb->nbufs; if (!nbufs) return; for (i = 0; i < nbufs; i++) put_bh(nb->bh[i]); nb->nbufs = 0; } static inline void put_indx_node(struct indx_node *in) { if (!in) return; kfree(in->index); nb_put(&in->nb); kfree(in); } static inline void mi_clear(struct mft_inode *mi) { nb_put(&mi->nb); 
kfree(mi->mrec); mi->mrec = NULL; } static inline void ni_lock(struct ntfs_inode *ni) { mutex_lock_nested(&ni->ni_lock, NTFS_INODE_MUTEX_NORMAL); } static inline void ni_lock_dir(struct ntfs_inode *ni) { mutex_lock_nested(&ni->ni_lock, NTFS_INODE_MUTEX_PARENT); } static inline void ni_lock_dir2(struct ntfs_inode *ni) { mutex_lock_nested(&ni->ni_lock, NTFS_INODE_MUTEX_PARENT2); } static inline void ni_unlock(struct ntfs_inode *ni) { mutex_unlock(&ni->ni_lock); } static inline int ni_trylock(struct ntfs_inode *ni) { return mutex_trylock(&ni->ni_lock); } static inline int attr_load_runs_attr(struct ntfs_inode *ni, struct ATTRIB *attr, struct runs_tree *run, CLST vcn) { return attr_load_runs_vcn(ni, attr->type, attr_name(attr), attr->name_len, run, vcn); } static inline void le64_sub_cpu(__le64 *var, u64 val) { *var = cpu_to_le64(le64_to_cpu(*var) - val); } #if IS_ENABLED(CONFIG_NTFS_FS) bool is_legacy_ntfs(struct super_block *sb); #else static inline bool is_legacy_ntfs(struct super_block *sb) { return false; } #endif #endif /* _LINUX_NTFS3_NTFS_FS_H */
2 1 64 1 64 64 49 20 71 71 2 69 69 1 52 18 53 53 51 2 53 44 7 8 46 53 8 45 2 53 46 7 6 9 9 1 4 3 2 6 6 6 6 4 7 1 6 2 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	IPV4 GSO/GRO offload support
 *	Linux INET implementation
 *
 *	TCPv4 GSO/GRO support
 */

#include <linux/indirect_call_wrapper.h>
#include <linux/skbuff.h>
#include <net/gro.h>
#include <net/gso.h>
#include <net/tcp.h>
#include <net/protocol.h>

/*
 * Walk the segment list starting at @skb and tag the first segment for
 * which before(ts_seq, seq + mss) holds: SKBTX_SW_TSTAMP is set in its
 * tx_flags and its tskey becomes @ts_seq.  @seq is advanced by @mss for
 * every segment that is skipped.  Nothing is tagged if no segment matches.
 */
static void tcp_gso_tstamp(struct sk_buff *skb, unsigned int ts_seq,
			   unsigned int seq, unsigned int mss)
{
	while (skb) {
		if (before(ts_seq, seq + mss)) {
			skb_shinfo(skb)->tx_flags |= SKBTX_SW_TSTAMP;
			skb_shinfo(skb)->tskey = ts_seq;
			return;
		}

		skb = skb->next;
		seq += mss;
	}
}

/*
 * Rewrite one address/port pair of @seg to @newip/@newport.
 *
 * @oldip and @oldport point at the fields to be overwritten.  The TCP
 * checksum (th->check) is adjusted for both the address and the port
 * change, the IPv4 header checksum (iph->check) for the address change.
 * Returns early without touching anything when both values already match.
 */
static void __tcpv4_gso_segment_csum(struct sk_buff *seg,
				     __be32 *oldip, __be32 newip,
				     __be16 *oldport, __be16 newport)
{
	struct tcphdr *th;
	struct iphdr *iph;

	if (*oldip == newip && *oldport == newport)
		return;

	th = tcp_hdr(seg);
	iph = ip_hdr(seg);

	inet_proto_csum_replace4(&th->check, seg, *oldip, newip, true);
	inet_proto_csum_replace2(&th->check, seg, *oldport, newport, false);
	*oldport = newport;

	csum_replace4(&iph->check, *oldip, newip);
	*oldip = newip;
}

/*
 * Make every segment after the head of @segs carry the head's addresses
 * and ports.
 *
 * Only the head and the second segment are compared up front; if their
 * ports and addresses agree the list is returned unchanged.  Otherwise
 * each following segment has saddr/source and daddr/dest rewritten (with
 * checksum fix-ups) to the head's values.  Always returns @segs.
 *
 * NOTE(review): seg->next is dereferenced without a NULL check, so this
 * assumes the list holds at least two segments -- confirm with the caller.
 */
static struct sk_buff *__tcpv4_gso_segment_list_csum(struct sk_buff *segs)
{
	const struct tcphdr *th;
	const struct iphdr *iph;
	struct sk_buff *seg;
	struct tcphdr *th2;
	struct iphdr *iph2;

	seg = segs;
	th = tcp_hdr(seg);
	iph = ip_hdr(seg);
	th2 = tcp_hdr(seg->next);
	iph2 = ip_hdr(seg->next);

	/* 32-bit load starting at ->source; presumably covers source and
	 * dest port in one comparison -- verify against struct tcphdr.
	 */
	if (!(*(const u32 *)&th->source ^ *(const u32 *)&th2->source) &&
	    iph->daddr == iph2->daddr && iph->saddr == iph2->saddr)
		return segs;

	while ((seg = seg->next)) {
		th2 = tcp_hdr(seg);
		iph2 = ip_hdr(seg);

		__tcpv4_gso_segment_csum(seg,
					 &iph2->saddr, iph->saddr,
					 &th2->source, th->source);
		__tcpv4_gso_segment_csum(seg,
					 &iph2->daddr, iph->daddr,
					 &th2->dest, th->dest);
	}

	return segs;
}

/*
 * Segment a fraglist GSO skb with skb_segment_list() and then normalise
 * the per-segment addresses/ports.  An ERR_PTR from skb_segment_list() is
 * passed straight back to the caller.
 */
static struct sk_buff *__tcp4_gso_segment_list(struct sk_buff *skb,
					      netdev_features_t features)
{
	skb = skb_segment_list(skb, features, skb_mac_header_len(skb));
	if (IS_ERR(skb))
		return skb;

	return __tcpv4_gso_segment_list_csum(skb);
}

/*
 * IPv4 entry point for TCP GSO segmentation (installed as .gso_segment by
 * tcpv4_offload_init()).
 *
 * Returns ERR_PTR(-EINVAL) when the skb is not marked SKB_GSO_TCPV4 or the
 * basic TCP header cannot be pulled.  Fraglist skbs whose head payload
 * length equals gso_size take the list path; other fraglist skbs have
 * ip_summed reset to CHECKSUM_NONE and fall through.  Anything that is not
 * CHECKSUM_PARTIAL at that point gets its checksum re-seeded before being
 * handed to tcp_gso_segment().
 */
static struct sk_buff *tcp4_gso_segment(struct sk_buff *skb,
					netdev_features_t features)
{
	if (!(skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4))
		return ERR_PTR(-EINVAL);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		return ERR_PTR(-EINVAL);

	if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST) {
		struct tcphdr *th = tcp_hdr(skb);

		if (skb_pagelen(skb) - th->doff * 4 == skb_shinfo(skb)->gso_size)
			return __tcp4_gso_segment_list(skb, features);

		skb->ip_summed = CHECKSUM_NONE;
	}

	if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
		const struct iphdr *iph = ip_hdr(skb);
		struct tcphdr *th = tcp_hdr(skb);

		/* Set up checksum pseudo header, usually expect stack to
		 * have done this already.
		 */

		th->check = 0;
		skb->ip_summed = CHECKSUM_PARTIAL;
		__tcp_v4_send_check(skb, iph->saddr, iph->daddr);
	}

	return tcp_gso_segment(skb, features);
}

/*
 * Split a TCP GSO skb into segments and fix up each segment's TCP header.
 *
 * Return value:
 *   ERR_PTR(-EINVAL) - TCP header length below sizeof(struct tcphdr),
 *                      checksum start not at the transport header, header
 *                      not pullable, or payload length <= mss;
 *   NULL             - skb_gso_ok() accepted the skb for
 *                      features | NETIF_F_GSO_ROBUST; only gso_segs is
 *                      recomputed and nothing is segmented here;
 *   otherwise        - whatever skb_segment() returned (a segment list, or
 *                      its ERR_PTR), with seq, flags and checksum adjusted
 *                      per segment on success.
 */
struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
				netdev_features_t features)
{
	struct sk_buff *segs = ERR_PTR(-EINVAL);
	unsigned int sum_truesize = 0;
	struct tcphdr *th;
	unsigned int thlen;
	unsigned int seq;
	unsigned int oldlen;
	unsigned int mss;
	struct sk_buff *gso_skb = skb;
	__sum16 newcheck;
	bool ooo_okay, copy_destructor;
	__wsum delta;

	th = tcp_hdr(skb);
	thlen = th->doff * 4;
	if (thlen < sizeof(*th))
		goto out;

	if (unlikely(skb_checksum_start(skb) != skb_transport_header(skb)))
		goto out;

	if (!pskb_may_pull(skb, thlen))
		goto out;

	/* Complement of the original length, taken before the header is
	 * pulled; it is later combined with each segment's length to form
	 * the checksum delta.
	 */
	oldlen = ~skb->len;
	__skb_pull(skb, thlen);

	mss = skb_shinfo(skb)->gso_size;
	if (unlikely(skb->len <= mss))
		goto out;

	if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
		/* Packet is from an untrusted source, reset gso_segs. */

		skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);

		segs = NULL;
		goto out;
	}

	/* Remember whether the original skb is owned via tcp_wfree so the
	 * ownership can be spread over the segments below.
	 */
	copy_destructor = gso_skb->destructor == tcp_wfree;
	ooo_okay = gso_skb->ooo_okay;
	/* All segments but the first should have ooo_okay cleared */
	skb->ooo_okay = 0;

	segs = skb_segment(skb, features);
	if (IS_ERR(segs))
		goto out;

	/* Only first segment might have ooo_okay set */
	segs->ooo_okay = ooo_okay;

	/* GSO partial and frag_list segmentation only requires splitting
	 * the frame into an MSS multiple and possibly a remainder, both
	 * cases return a GSO skb. So update the mss now.
	 */
	if (skb_is_gso(segs))
		mss *= skb_shinfo(segs)->gso_segs;

	delta = (__force __wsum)htonl(oldlen + thlen + mss);

	skb = segs;
	th = tcp_hdr(skb);
	seq = ntohl(th->seq);

	if (unlikely(skb_shinfo(gso_skb)->tx_flags & SKBTX_SW_TSTAMP))
		tcp_gso_tstamp(segs, skb_shinfo(gso_skb)->tskey, seq, mss);

	/* Checksum shared by every full-sized segment (all but the last). */
	newcheck = ~csum_fold(csum_add(csum_unfold(th->check), delta));

	/* Handle every segment except the last one. */
	while (skb->next) {
		th->fin = th->psh = 0;
		th->check = newcheck;

		if (skb->ip_summed == CHECKSUM_PARTIAL)
			gso_reset_checksum(skb, ~th->check);
		else
			th->check = gso_make_checksum(skb, ~th->check);

		seq += mss;
		if (copy_destructor) {
			skb->destructor = gso_skb->destructor;
			skb->sk = gso_skb->sk;
			sum_truesize += skb->truesize;
		}
		skb = skb->next;
		th = tcp_hdr(skb);

		/* Prepare the next segment: new sequence number, CWR cleared. */
		th->seq = htonl(seq);
		th->cwr = 0;
	}

	/* Following permits TCP Small Queues to work well with GSO :
	 * The callback to TCP stack will be called at the time last frag
	 * is freed at TX completion, and not right now when gso_skb
	 * is freed by GSO engine
	 */
	if (copy_destructor) {
		/* Shadows the outer __wsum delta inside this block only. */
		int delta;

		swap(gso_skb->sk, skb->sk);
		swap(gso_skb->destructor, skb->destructor);
		sum_truesize += skb->truesize;
		delta = sum_truesize - gso_skb->truesize;
		/* In some pathological cases, delta can be negative.
		 * We need to either use refcount_add() or refcount_sub_and_test()
		 */
		if (likely(delta >= 0))
			refcount_add(delta, &skb->sk->sk_wmem_alloc);
		else
			WARN_ON_ONCE(refcount_sub_and_test(-delta, &skb->sk->sk_wmem_alloc));
	}

	/* The last segment may be shorter, so its delta is derived from its
	 * own transport length (linear part plus data_len).
	 */
	delta = (__force __wsum)htonl(oldlen +
				      (skb_tail_pointer(skb) -
				       skb_transport_header(skb)) +
				      skb->data_len);
	th->check = ~csum_fold(csum_add(csum_unfold(th->check), delta));
	if (skb->ip_summed == CHECKSUM_PARTIAL)
		gso_reset_checksum(skb, ~th->check);
	else
		th->check = gso_make_checksum(skb, ~th->check);
out:
	return segs;
}

/*
 * Find the held GRO packet on @head that @th belongs to.
 *
 * Entries whose same_flow flag is already clear are skipped.  For the
 * others the 32 bits starting at ->source are compared; on mismatch the
 * entry's same_flow is cleared and the search continues.  Returns the
 * first matching skb, or NULL if none matches.
 */
struct sk_buff *tcp_gro_lookup(struct list_head *head, struct tcphdr *th)
{
	struct tcphdr *th2;
	struct sk_buff *p;

	list_for_each_entry(p, head, list) {
		if (!NAPI_GRO_CB(p)->same_flow)
			continue;

		th2 = tcp_hdr(p);
		if (*(u32 *)&th->source ^ *(u32 *)&th2->source) {
			NAPI_GRO_CB(p)->same_flow = 0;
			continue;
		}

		return p;
	}

	return NULL;
}

/*
 * Obtain the TCP header of @skb at the current GRO offset and advance the
 * GRO offset past it (options included).
 *
 * Returns the header pointer, or NULL when the header cannot be obtained
 * or th->doff * 4 is smaller than sizeof(struct tcphdr).
 */
struct tcphdr *tcp_gro_pull_header(struct sk_buff *skb)
{
	unsigned int thlen, hlen, off;
	struct tcphdr *th;

	off = skb_gro_offset(skb);
	hlen = off + sizeof(*th);
	th = skb_gro_header(skb, hlen, off);
	if (unlikely(!th))
		return NULL;

	thlen = th->doff * 4;
	if (thlen < sizeof(*th))
		return NULL;

	/* Re-fetch through the slow path if the full header (with options)
	 * is not yet pullable.
	 */
	hlen = off + thlen;
	if (!skb_gro_may_pull(skb, hlen)) {
		th = skb_gro_header_slow(skb, hlen, off);
		if (unlikely(!th))
			return NULL;
	}

	skb_gro_pull(skb, thlen);

	return th;
}

/*
 * Try to merge @skb (TCP header @th) into a packet already held on @head.
 *
 * "flush" accumulates every reason the packet must not be merged: CWR set,
 * differing flags outside CWR/FIN/PSH, differing ack_seq, differing option
 * words, network-layer mismatch, size mismatch against the held packet's
 * gso_size, non-contiguous sequence number, or decryption-state mismatch.
 *
 * Returns the held packet when it has to be flushed (a flow match exists
 * and either @skb's same_flow is clear or a final flush condition holds),
 * otherwise NULL.  NAPI_GRO_CB(skb)->flush is OR-ed with the final flush
 * state.
 */
struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb,
				struct tcphdr *th)
{
	unsigned int thlen = th->doff * 4;
	struct sk_buff *pp = NULL;
	struct sk_buff *p;
	struct tcphdr *th2;
	unsigned int len;
	__be32 flags;
	unsigned int mss = 1;
	int flush = 1;
	int i;

	len = skb_gro_len(skb);
	flags = tcp_flag_word(th);

	p = tcp_gro_lookup(head, th);
	if (!p)
		goto out_check_final;

	th2 = tcp_hdr(p);
	flush = (__force int)(flags & TCP_FLAG_CWR);
	flush |= (__force int)((flags ^ tcp_flag_word(th2)) &
		  ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH));
	flush |= (__force int)(th->ack_seq ^ th2->ack_seq);
	/* Compare the TCP options word by word. */
	for (i = sizeof(*th); i < thlen; i += 4)
		flush |= *(u32 *)((u8 *)th + i) ^
			 *(u32 *)((u8 *)th2 + i);

	flush |= gro_receive_network_flush(th, th2, p);

	mss = skb_shinfo(p)->gso_size;

	/* If skb is a GRO packet, make sure its gso_size matches prior packet mss.
	 * If it is a single frame, do not aggregate it if its length
	 * is bigger than our mss.
	 */
	if (unlikely(skb_is_gso(skb)))
		flush |= (mss != skb_shinfo(skb)->gso_size);
	else
		flush |= (len - 1) >= mss;

	/* Non-zero unless @skb starts exactly where the held packet ends. */
	flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq);
	flush |= skb_cmp_decrypted(p, skb);

	if (unlikely(NAPI_GRO_CB(p)->is_flist)) {
		/* Fraglist merging is stricter: all flags, the checksum
		 * state and csum_level must match, and the held packet must
		 * hold fewer than 64 segments.
		 */
		flush |= (__force int)(flags ^ tcp_flag_word(th2));
		flush |= skb->ip_summed != p->ip_summed;
		flush |= skb->csum_level != p->csum_level;
		flush |= NAPI_GRO_CB(p)->count >= 64;

		if (flush || skb_gro_receive_list(p, skb))
			mss = 1;

		goto out_check_final;
	}

	if (flush || skb_gro_receive(p, skb)) {
		mss = 1;
		goto out_check_final;
	}

	/* Merged: carry FIN/PSH over to the held packet's header. */
	tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH);

out_check_final:
	/* Force a flush if last segment is smaller than mss. */
	if (unlikely(skb_is_gso(skb)))
		flush = len != NAPI_GRO_CB(skb)->count * skb_shinfo(skb)->gso_size;
	else
		flush = len < mss;

	flush |= (__force int)(flags & (TCP_FLAG_URG | TCP_FLAG_PSH |
					TCP_FLAG_RST | TCP_FLAG_SYN |
					TCP_FLAG_FIN));

	if (p && (!NAPI_GRO_CB(skb)->same_flow || flush))
		pp = p;

	NAPI_GRO_CB(skb)->flush |= (flush != 0);

	return pp;
}

/*
 * Finish a merged GRO packet: point csum_start/csum_offset at the TCP
 * checksum field, mark the skb CHECKSUM_PARTIAL, set gso_segs from the
 * GRO segment count and flag SKB_GSO_TCP_ECN when CWR is set.  For
 * encapsulated skbs the inner transport header is set to the transport
 * header first.
 */
void tcp_gro_complete(struct sk_buff *skb)
{
	struct tcphdr *th = tcp_hdr(skb);
	struct skb_shared_info *shinfo;

	if (skb->encapsulation)
		skb->inner_transport_header = skb->transport_header;

	skb->csum_start = (unsigned char *)th - skb->head;
	skb->csum_offset = offsetof(struct tcphdr, check);
	skb->ip_summed = CHECKSUM_PARTIAL;

	shinfo = skb_shinfo(skb);
	shinfo->gso_segs = NAPI_GRO_CB(skb)->count;

	if (th->cwr)
		shinfo->gso_type |= SKB_GSO_TCP_ECN;
}
EXPORT_SYMBOL(tcp_gro_complete);

/*
 * Decide whether @skb should use fraglist GRO (NAPI_GRO_CB(skb)->is_flist).
 *
 * Does nothing unless the device advertises NETIF_F_GRO_FRAGLIST.  If a
 * packet of the same flow is already held, its is_flist value is
 * inherited.  Otherwise an established-socket lookup is done and is_flist
 * is set when no socket is found; a found socket's reference is dropped
 * again right away.
 */
static void tcp4_check_fraglist_gro(struct list_head *head, struct sk_buff *skb,
				    struct tcphdr *th)
{
	const struct iphdr *iph;
	struct sk_buff *p;
	struct sock *sk;
	struct net *net;
	int iif, sdif;

	if (likely(!(skb->dev->features & NETIF_F_GRO_FRAGLIST)))
		return;

	p = tcp_gro_lookup(head, th);
	if (p) {
		NAPI_GRO_CB(skb)->is_flist = NAPI_GRO_CB(p)->is_flist;
		return;
	}

	inet_get_iif_sdif(skb, &iif, &sdif);
	iph = skb_gro_network_header(skb);
	net = dev_net(skb->dev);
	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
				       iph->saddr, th->source,
				       iph->daddr, ntohs(th->dest),
				       iif, sdif);
	NAPI_GRO_CB(skb)->is_flist = !sk;
	if (sk)
		sock_put(sk);
}

/*
 * IPv4 TCP .gro_receive callback.
 *
 * Validates the checksum (skipped when the skb is already marked for
 * flush), pulls the TCP header, settles the fraglist decision and hands
 * over to tcp_gro_receive().  On checksum or header failure the skb is
 * marked for flush and NULL is returned.
 */
INDIRECT_CALLABLE_SCOPE
struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb)
{
	struct tcphdr *th;

	/* Don't bother verifying checksum if we're going to flush anyway. */
	if (!NAPI_GRO_CB(skb)->flush &&
	    skb_gro_checksum_validate(skb, IPPROTO_TCP,
				      inet_gro_compute_pseudo))
		goto flush;

	th = tcp_gro_pull_header(skb);
	if (!th)
		goto flush;

	tcp4_check_fraglist_gro(head, skb, th);

	return tcp_gro_receive(head, skb, th);

flush:
	NAPI_GRO_CB(skb)->flush = 1;
	return NULL;
}

/*
 * IPv4 TCP .gro_complete callback; always returns 0.
 *
 * Fraglist packets only get their gso_type/gso_segs set and the
 * checksum-unnecessary level raised.  Otherwise th->check is set to
 * ~tcp_v4_check() over the TCP length (skb->len - @thoff) and the IPv4
 * addresses (presumably the pseudo-header seed that CHECKSUM_PARTIAL
 * expects -- confirm), gso_type gains SKB_GSO_TCPV4 plus
 * SKB_GSO_TCP_FIXEDID when ip_fixedid is set, and tcp_gro_complete()
 * finishes the job.
 */
INDIRECT_CALLABLE_SCOPE int tcp4_gro_complete(struct sk_buff *skb, int thoff)
{
	const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation];
	const struct iphdr *iph = (struct iphdr *)(skb->data + offset);
	struct tcphdr *th = tcp_hdr(skb);

	if (unlikely(NAPI_GRO_CB(skb)->is_flist)) {
		skb_shinfo(skb)->gso_type |= SKB_GSO_FRAGLIST | SKB_GSO_TCPV4;
		skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;

		__skb_incr_checksum_unnecessary(skb);

		return 0;
	}

	th->check = ~tcp_v4_check(skb->len - thoff, iph->saddr,
				  iph->daddr, 0);

	skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4 |
			(NAPI_GRO_CB(skb)->ip_fixedid * SKB_GSO_TCP_FIXEDID);

	tcp_gro_complete(skb);
	return 0;
}

/*
 * Fill in net_hotdata.tcpv4_offload with the TCPv4 GSO/GRO callbacks and
 * register it for IPPROTO_TCP.  Returns the result of inet_add_offload().
 */
int __init tcpv4_offload_init(void)
{
	net_hotdata.tcpv4_offload = (struct net_offload) {
		.callbacks = {
			.gso_segment	=	tcp4_gso_segment,
			.gro_receive	=	tcp4_gro_receive,
			.gro_complete	=	tcp4_gro_complete,
		},
	};
	return inet_add_offload(&net_hotdata.tcpv4_offload, IPPROTO_TCP);
}
4 4 60 5 1 54 53 52 49 49 4 4 4 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 // SPDX-License-Identifier: GPL-2.0 /* * fs/partitions/atari.c * * Code extracted from drivers/block/genhd.c * * Copyright (C) 1991-1998 Linus Torvalds * Re-organised Feb 1998 Russell King */ #include <linux/ctype.h> #include "check.h" #include "atari.h" /* ++guenther: this should be settable by the user ("make config")?. */ #define ICD_PARTS /* check if a partition entry looks valid -- Atari format is assumed if at least one of the primary entries is ok this way */ #define VALID_PARTITION(pi,hdsiz) \ (((pi)->flg & 1) && \ isalnum((pi)->id[0]) && isalnum((pi)->id[1]) && isalnum((pi)->id[2]) && \ be32_to_cpu((pi)->st) <= (hdsiz) && \ be32_to_cpu((pi)->st) + be32_to_cpu((pi)->siz) <= (hdsiz)) static inline int OK_id(char *s) { return memcmp (s, "GEM", 3) == 0 || memcmp (s, "BGM", 3) == 0 || memcmp (s, "LNX", 3) == 0 || memcmp (s, "SWP", 3) == 0 || memcmp (s, "RAW", 3) == 0 ; } int atari_partition(struct parsed_partitions *state) { Sector sect; struct rootsector *rs; struct partition_info *pi; u32 extensect; u32 hd_size; int slot; #ifdef ICD_PARTS int part_fmt = 0; /* 0:unknown, 1:AHDI, 2:ICD/Supra */ #endif /* * ATARI partition scheme supports 512 lba only. If this is not * the case, bail early to avoid miscalculating hd_size. 
*/ if (queue_logical_block_size(state->disk->queue) != 512) return 0; rs = read_part_sector(state, 0, &sect); if (!rs) return -1; /* Verify this is an Atari rootsector: */ hd_size = get_capacity(state->disk); if (!VALID_PARTITION(&rs->part[0], hd_size) && !VALID_PARTITION(&rs->part[1], hd_size) && !VALID_PARTITION(&rs->part[2], hd_size) && !VALID_PARTITION(&rs->part[3], hd_size)) { /* * if there's no valid primary partition, assume that no Atari * format partition table (there's no reliable magic or the like * :-() */ put_dev_sector(sect); return 0; } pi = &rs->part[0]; strlcat(state->pp_buf, " AHDI", PAGE_SIZE); for (slot = 1; pi < &rs->part[4] && slot < state->limit; slot++, pi++) { struct rootsector *xrs; Sector sect2; ulong partsect; if ( !(pi->flg & 1) ) continue; /* active partition */ if (memcmp (pi->id, "XGM", 3) != 0) { /* we don't care about other id's */ put_partition (state, slot, be32_to_cpu(pi->st), be32_to_cpu(pi->siz)); continue; } /* extension partition */ #ifdef ICD_PARTS part_fmt = 1; #endif strlcat(state->pp_buf, " XGM<", PAGE_SIZE); partsect = extensect = be32_to_cpu(pi->st); while (1) { xrs = read_part_sector(state, partsect, &sect2); if (!xrs) { printk (" block %ld read failed\n", partsect); put_dev_sector(sect); return -1; } /* ++roman: sanity check: bit 0 of flg field must be set */ if (!(xrs->part[0].flg & 1)) { printk( "\nFirst sub-partition in extended partition is not valid!\n" ); put_dev_sector(sect2); break; } put_partition(state, slot, partsect + be32_to_cpu(xrs->part[0].st), be32_to_cpu(xrs->part[0].siz)); if (!(xrs->part[1].flg & 1)) { /* end of linked partition list */ put_dev_sector(sect2); break; } if (memcmp( xrs->part[1].id, "XGM", 3 ) != 0) { printk("\nID of extended partition is not XGM!\n"); put_dev_sector(sect2); break; } partsect = be32_to_cpu(xrs->part[1].st) + extensect; put_dev_sector(sect2); if (++slot == state->limit) { printk( "\nMaximum number of partitions reached!\n" ); break; } } strlcat(state->pp_buf, " >", 
PAGE_SIZE); } #ifdef ICD_PARTS if ( part_fmt!=1 ) { /* no extended partitions -> test ICD-format */ pi = &rs->icdpart[0]; /* sanity check: no ICD format if first partition invalid */ if (OK_id(pi->id)) { strlcat(state->pp_buf, " ICD<", PAGE_SIZE); for (; pi < &rs->icdpart[8] && slot < state->limit; slot++, pi++) { /* accept only GEM,BGM,RAW,LNX,SWP partitions */ if (!((pi->flg & 1) && OK_id(pi->id))) continue; put_partition (state, slot, be32_to_cpu(pi->st), be32_to_cpu(pi->siz)); } strlcat(state->pp_buf, " >", PAGE_SIZE); } } #endif put_dev_sector(sect); strlcat(state->pp_buf, "\n", PAGE_SIZE); return 1; }
143 7 137 10 1 9 10 9 9 252 294 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 
520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 
1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright Red Hat Inc. 2017 * * This file is part of the SCTP kernel implementation * * These functions implement sctp stream message interleaving, mostly * including I-DATA and I-FORWARD-TSN chunks process. 
* * Please send any bug reports or fixes you make to the * email addresched(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * Xin Long <lucien.xin@gmail.com> */ #include <net/busy_poll.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> #include <net/sctp/ulpevent.h> #include <linux/sctp.h> static struct sctp_chunk *sctp_make_idatafrag_empty( const struct sctp_association *asoc, const struct sctp_sndrcvinfo *sinfo, int len, __u8 flags, gfp_t gfp) { struct sctp_chunk *retval; struct sctp_idatahdr dp; memset(&dp, 0, sizeof(dp)); dp.stream = htons(sinfo->sinfo_stream); if (sinfo->sinfo_flags & SCTP_UNORDERED) flags |= SCTP_DATA_UNORDERED; retval = sctp_make_idata(asoc, flags, sizeof(dp) + len, gfp); if (!retval) return NULL; retval->subh.idata_hdr = sctp_addto_chunk(retval, sizeof(dp), &dp); memcpy(&retval->sinfo, sinfo, sizeof(struct sctp_sndrcvinfo)); return retval; } static void sctp_chunk_assign_mid(struct sctp_chunk *chunk) { struct sctp_stream *stream; struct sctp_chunk *lchunk; __u32 cfsn = 0; __u16 sid; if (chunk->has_mid) return; sid = sctp_chunk_stream_no(chunk); stream = &chunk->asoc->stream; list_for_each_entry(lchunk, &chunk->msg->chunks, frag_list) { struct sctp_idatahdr *hdr; __u32 mid; lchunk->has_mid = 1; hdr = lchunk->subh.idata_hdr; if (lchunk->chunk_hdr->flags & SCTP_DATA_FIRST_FRAG) hdr->ppid = lchunk->sinfo.sinfo_ppid; else hdr->fsn = htonl(cfsn++); if (lchunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) { mid = lchunk->chunk_hdr->flags & SCTP_DATA_LAST_FRAG ? sctp_mid_uo_next(stream, out, sid) : sctp_mid_uo_peek(stream, out, sid); } else { mid = lchunk->chunk_hdr->flags & SCTP_DATA_LAST_FRAG ? 
sctp_mid_next(stream, out, sid) : sctp_mid_peek(stream, out, sid); } hdr->mid = htonl(mid); } } static bool sctp_validate_data(struct sctp_chunk *chunk) { struct sctp_stream *stream; __u16 sid, ssn; if (chunk->chunk_hdr->type != SCTP_CID_DATA) return false; if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) return true; stream = &chunk->asoc->stream; sid = sctp_chunk_stream_no(chunk); ssn = ntohs(chunk->subh.data_hdr->ssn); return !SSN_lt(ssn, sctp_ssn_peek(stream, in, sid)); } static bool sctp_validate_idata(struct sctp_chunk *chunk) { struct sctp_stream *stream; __u32 mid; __u16 sid; if (chunk->chunk_hdr->type != SCTP_CID_I_DATA) return false; if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) return true; stream = &chunk->asoc->stream; sid = sctp_chunk_stream_no(chunk); mid = ntohl(chunk->subh.idata_hdr->mid); return !MID_lt(mid, sctp_mid_peek(stream, in, sid)); } static void sctp_intl_store_reasm(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_ulpevent *cevent; struct sk_buff *pos, *loc; pos = skb_peek_tail(&ulpq->reasm); if (!pos) { __skb_queue_tail(&ulpq->reasm, sctp_event2skb(event)); return; } cevent = sctp_skb2event(pos); if (event->stream == cevent->stream && event->mid == cevent->mid && (cevent->msg_flags & SCTP_DATA_FIRST_FRAG || (!(event->msg_flags & SCTP_DATA_FIRST_FRAG) && event->fsn > cevent->fsn))) { __skb_queue_tail(&ulpq->reasm, sctp_event2skb(event)); return; } if ((event->stream == cevent->stream && MID_lt(cevent->mid, event->mid)) || event->stream > cevent->stream) { __skb_queue_tail(&ulpq->reasm, sctp_event2skb(event)); return; } loc = NULL; skb_queue_walk(&ulpq->reasm, pos) { cevent = sctp_skb2event(pos); if (event->stream < cevent->stream || (event->stream == cevent->stream && MID_lt(event->mid, cevent->mid))) { loc = pos; break; } if (event->stream == cevent->stream && event->mid == cevent->mid && !(cevent->msg_flags & SCTP_DATA_FIRST_FRAG) && (event->msg_flags & SCTP_DATA_FIRST_FRAG || event->fsn < cevent->fsn)) { loc = 
pos; break; } } if (!loc) __skb_queue_tail(&ulpq->reasm, sctp_event2skb(event)); else __skb_queue_before(&ulpq->reasm, loc, sctp_event2skb(event)); } static struct sctp_ulpevent *sctp_intl_retrieve_partial( struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sk_buff *first_frag = NULL; struct sk_buff *last_frag = NULL; struct sctp_ulpevent *retval; struct sctp_stream_in *sin; struct sk_buff *pos; __u32 next_fsn = 0; int is_last = 0; sin = sctp_stream_in(&ulpq->asoc->stream, event->stream); skb_queue_walk(&ulpq->reasm, pos) { struct sctp_ulpevent *cevent = sctp_skb2event(pos); if (cevent->stream < event->stream) continue; if (cevent->stream > event->stream || cevent->mid != sin->mid) break; switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { case SCTP_DATA_FIRST_FRAG: goto out; case SCTP_DATA_MIDDLE_FRAG: if (!first_frag) { if (cevent->fsn == sin->fsn) { first_frag = pos; last_frag = pos; next_fsn = cevent->fsn + 1; } } else if (cevent->fsn == next_fsn) { last_frag = pos; next_fsn++; } else { goto out; } break; case SCTP_DATA_LAST_FRAG: if (!first_frag) { if (cevent->fsn == sin->fsn) { first_frag = pos; last_frag = pos; next_fsn = 0; is_last = 1; } } else if (cevent->fsn == next_fsn) { last_frag = pos; next_fsn = 0; is_last = 1; } goto out; default: goto out; } } out: if (!first_frag) return NULL; retval = sctp_make_reassembled_event(ulpq->asoc->base.net, &ulpq->reasm, first_frag, last_frag); if (retval) { sin->fsn = next_fsn; if (is_last) { retval->msg_flags |= MSG_EOR; sin->pd_mode = 0; } } return retval; } static struct sctp_ulpevent *sctp_intl_retrieve_reassembled( struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_association *asoc = ulpq->asoc; struct sk_buff *pos, *first_frag = NULL; struct sctp_ulpevent *retval = NULL; struct sk_buff *pd_first = NULL; struct sk_buff *pd_last = NULL; struct sctp_stream_in *sin; __u32 next_fsn = 0; __u32 pd_point = 0; __u32 pd_len = 0; __u32 mid = 0; sin = sctp_stream_in(&ulpq->asoc->stream, 
event->stream); skb_queue_walk(&ulpq->reasm, pos) { struct sctp_ulpevent *cevent = sctp_skb2event(pos); if (cevent->stream < event->stream) continue; if (cevent->stream > event->stream) break; if (MID_lt(cevent->mid, event->mid)) continue; if (MID_lt(event->mid, cevent->mid)) break; switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { case SCTP_DATA_FIRST_FRAG: if (cevent->mid == sin->mid) { pd_first = pos; pd_last = pos; pd_len = pos->len; } first_frag = pos; next_fsn = 0; mid = cevent->mid; break; case SCTP_DATA_MIDDLE_FRAG: if (first_frag && cevent->mid == mid && cevent->fsn == next_fsn) { next_fsn++; if (pd_first) { pd_last = pos; pd_len += pos->len; } } else { first_frag = NULL; } break; case SCTP_DATA_LAST_FRAG: if (first_frag && cevent->mid == mid && cevent->fsn == next_fsn) goto found; else first_frag = NULL; break; } } if (!pd_first) goto out; pd_point = sctp_sk(asoc->base.sk)->pd_point; if (pd_point && pd_point <= pd_len) { retval = sctp_make_reassembled_event(asoc->base.net, &ulpq->reasm, pd_first, pd_last); if (retval) { sin->fsn = next_fsn; sin->pd_mode = 1; } } goto out; found: retval = sctp_make_reassembled_event(asoc->base.net, &ulpq->reasm, first_frag, pos); if (retval) retval->msg_flags |= MSG_EOR; out: return retval; } static struct sctp_ulpevent *sctp_intl_reasm(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_ulpevent *retval = NULL; struct sctp_stream_in *sin; if (SCTP_DATA_NOT_FRAG == (event->msg_flags & SCTP_DATA_FRAG_MASK)) { event->msg_flags |= MSG_EOR; return event; } sctp_intl_store_reasm(ulpq, event); sin = sctp_stream_in(&ulpq->asoc->stream, event->stream); if (sin->pd_mode && event->mid == sin->mid && event->fsn == sin->fsn) retval = sctp_intl_retrieve_partial(ulpq, event); if (!retval) retval = sctp_intl_retrieve_reassembled(ulpq, event); return retval; } static void sctp_intl_store_ordered(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_ulpevent *cevent; struct sk_buff *pos, *loc; pos = 
skb_peek_tail(&ulpq->lobby); if (!pos) { __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event)); return; } cevent = (struct sctp_ulpevent *)pos->cb; if (event->stream == cevent->stream && MID_lt(cevent->mid, event->mid)) { __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event)); return; } if (event->stream > cevent->stream) { __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event)); return; } loc = NULL; skb_queue_walk(&ulpq->lobby, pos) { cevent = (struct sctp_ulpevent *)pos->cb; if (cevent->stream > event->stream) { loc = pos; break; } if (cevent->stream == event->stream && MID_lt(event->mid, cevent->mid)) { loc = pos; break; } } if (!loc) __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event)); else __skb_queue_before(&ulpq->lobby, loc, sctp_event2skb(event)); } static void sctp_intl_retrieve_ordered(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sk_buff_head *event_list; struct sctp_stream *stream; struct sk_buff *pos, *tmp; __u16 sid = event->stream; stream = &ulpq->asoc->stream; event_list = (struct sk_buff_head *)sctp_event2skb(event)->prev; sctp_skb_for_each(pos, &ulpq->lobby, tmp) { struct sctp_ulpevent *cevent = (struct sctp_ulpevent *)pos->cb; if (cevent->stream > sid) break; if (cevent->stream < sid) continue; if (cevent->mid != sctp_mid_peek(stream, in, sid)) break; sctp_mid_next(stream, in, sid); __skb_unlink(pos, &ulpq->lobby); __skb_queue_tail(event_list, pos); } } static struct sctp_ulpevent *sctp_intl_order(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_stream *stream; __u16 sid; stream = &ulpq->asoc->stream; sid = event->stream; if (event->mid != sctp_mid_peek(stream, in, sid)) { sctp_intl_store_ordered(ulpq, event); return NULL; } sctp_mid_next(stream, in, sid); sctp_intl_retrieve_ordered(ulpq, event); return event; } static int sctp_enqueue_event(struct sctp_ulpq *ulpq, struct sk_buff_head *skb_list) { struct sock *sk = ulpq->asoc->base.sk; struct sctp_sock *sp = sctp_sk(sk); struct sctp_ulpevent *event; struct 
sk_buff *skb; skb = __skb_peek(skb_list); event = sctp_skb2event(skb); if (sk->sk_shutdown & RCV_SHUTDOWN && (sk->sk_shutdown & SEND_SHUTDOWN || !sctp_ulpevent_is_notification(event))) goto out_free; if (!sctp_ulpevent_is_notification(event)) { sk_mark_napi_id(sk, skb); sk_incoming_cpu_update(sk); } if (!sctp_ulpevent_is_enabled(event, ulpq->asoc->subscribe)) goto out_free; skb_queue_splice_tail_init(skb_list, &sk->sk_receive_queue); if (!sp->data_ready_signalled) { sp->data_ready_signalled = 1; sk->sk_data_ready(sk); } return 1; out_free: sctp_queue_purge_ulpevents(skb_list); return 0; } static void sctp_intl_store_reasm_uo(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_ulpevent *cevent; struct sk_buff *pos; pos = skb_peek_tail(&ulpq->reasm_uo); if (!pos) { __skb_queue_tail(&ulpq->reasm_uo, sctp_event2skb(event)); return; } cevent = sctp_skb2event(pos); if (event->stream == cevent->stream && event->mid == cevent->mid && (cevent->msg_flags & SCTP_DATA_FIRST_FRAG || (!(event->msg_flags & SCTP_DATA_FIRST_FRAG) && event->fsn > cevent->fsn))) { __skb_queue_tail(&ulpq->reasm_uo, sctp_event2skb(event)); return; } if ((event->stream == cevent->stream && MID_lt(cevent->mid, event->mid)) || event->stream > cevent->stream) { __skb_queue_tail(&ulpq->reasm_uo, sctp_event2skb(event)); return; } skb_queue_walk(&ulpq->reasm_uo, pos) { cevent = sctp_skb2event(pos); if (event->stream < cevent->stream || (event->stream == cevent->stream && MID_lt(event->mid, cevent->mid))) break; if (event->stream == cevent->stream && event->mid == cevent->mid && !(cevent->msg_flags & SCTP_DATA_FIRST_FRAG) && (event->msg_flags & SCTP_DATA_FIRST_FRAG || event->fsn < cevent->fsn)) break; } __skb_queue_before(&ulpq->reasm_uo, pos, sctp_event2skb(event)); } static struct sctp_ulpevent *sctp_intl_retrieve_partial_uo( struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sk_buff *first_frag = NULL; struct sk_buff *last_frag = NULL; struct sctp_ulpevent *retval; struct 
sctp_stream_in *sin; struct sk_buff *pos; __u32 next_fsn = 0; int is_last = 0; sin = sctp_stream_in(&ulpq->asoc->stream, event->stream); skb_queue_walk(&ulpq->reasm_uo, pos) { struct sctp_ulpevent *cevent = sctp_skb2event(pos); if (cevent->stream < event->stream) continue; if (cevent->stream > event->stream) break; if (MID_lt(cevent->mid, sin->mid_uo)) continue; if (MID_lt(sin->mid_uo, cevent->mid)) break; switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { case SCTP_DATA_FIRST_FRAG: goto out; case SCTP_DATA_MIDDLE_FRAG: if (!first_frag) { if (cevent->fsn == sin->fsn_uo) { first_frag = pos; last_frag = pos; next_fsn = cevent->fsn + 1; } } else if (cevent->fsn == next_fsn) { last_frag = pos; next_fsn++; } else { goto out; } break; case SCTP_DATA_LAST_FRAG: if (!first_frag) { if (cevent->fsn == sin->fsn_uo) { first_frag = pos; last_frag = pos; next_fsn = 0; is_last = 1; } } else if (cevent->fsn == next_fsn) { last_frag = pos; next_fsn = 0; is_last = 1; } goto out; default: goto out; } } out: if (!first_frag) return NULL; retval = sctp_make_reassembled_event(ulpq->asoc->base.net, &ulpq->reasm_uo, first_frag, last_frag); if (retval) { sin->fsn_uo = next_fsn; if (is_last) { retval->msg_flags |= MSG_EOR; sin->pd_mode_uo = 0; } } return retval; } /* Scan reasm_uo for the fragments of event's message (same stream and MID). A FIRST..LAST run with consecutive FSNs is reassembled and returned with MSG_EOR. Otherwise, if the stream was not already in unordered pd mode and the leading fragments (pd_first..pd_last) have skb lengths totalling at least the socket's non-zero pd_point, those are delivered and the stream enters unordered pd mode. */ static struct sctp_ulpevent *sctp_intl_retrieve_reassembled_uo( struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_association *asoc = ulpq->asoc; struct sk_buff *pos, *first_frag = NULL; struct sctp_ulpevent *retval = NULL; struct sk_buff *pd_first = NULL; struct sk_buff *pd_last = NULL; struct sctp_stream_in *sin; __u32 next_fsn = 0; __u32 pd_point = 0; __u32 pd_len = 0; __u32 mid = 0; sin = sctp_stream_in(&ulpq->asoc->stream, event->stream); skb_queue_walk(&ulpq->reasm_uo, pos) { struct sctp_ulpevent *cevent = sctp_skb2event(pos); if (cevent->stream < event->stream) continue; if (cevent->stream > event->stream) break; if (MID_lt(cevent->mid, event->mid)) continue; if (MID_lt(event->mid, cevent->mid)) break;
switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { case SCTP_DATA_FIRST_FRAG: if (!sin->pd_mode_uo) { sin->mid_uo = cevent->mid; pd_first = pos; pd_last = pos; pd_len = pos->len; } first_frag = pos; next_fsn = 0; mid = cevent->mid; break; case SCTP_DATA_MIDDLE_FRAG: if (first_frag && cevent->mid == mid && cevent->fsn == next_fsn) { next_fsn++; if (pd_first) { pd_last = pos; pd_len += pos->len; } } else { first_frag = NULL; } break; case SCTP_DATA_LAST_FRAG: if (first_frag && cevent->mid == mid && cevent->fsn == next_fsn) goto found; else first_frag = NULL; break; } } if (!pd_first) goto out; pd_point = sctp_sk(asoc->base.sk)->pd_point; if (pd_point && pd_point <= pd_len) { retval = sctp_make_reassembled_event(asoc->base.net, &ulpq->reasm_uo, pd_first, pd_last); if (retval) { sin->fsn_uo = next_fsn; sin->pd_mode_uo = 1; } } goto out; found: retval = sctp_make_reassembled_event(asoc->base.net, &ulpq->reasm_uo, first_frag, pos); if (retval) retval->msg_flags |= MSG_EOR; out: return retval; } /* Reassembly entry point for unordered data: an unfragmented event is returned at once with MSG_EOR set; a fragment is stored in reasm_uo, then a partial-delivery continuation is tried (only when the stream is in unordered pd mode and the fragment carries the expected MID and FSN) before falling back to whole-message retrieval. */ static struct sctp_ulpevent *sctp_intl_reasm_uo(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_ulpevent *retval = NULL; struct sctp_stream_in *sin; if (SCTP_DATA_NOT_FRAG == (event->msg_flags & SCTP_DATA_FRAG_MASK)) { event->msg_flags |= MSG_EOR; return event; } sctp_intl_store_reasm_uo(ulpq, event); sin = sctp_stream_in(&ulpq->asoc->stream, event->stream); if (sin->pd_mode_uo && event->mid == sin->mid_uo && event->fsn == sin->fsn_uo) retval = sctp_intl_retrieve_partial_uo(ulpq, event); if (!retval) retval = sctp_intl_retrieve_reassembled_uo(ulpq, event); return retval; } /* Start an unordered partial delivery: take from reasm_uo the first FIRST_FRAG on a stream that is not in unordered pd mode, together with the MIDDLE fragments of the same message that follow it with consecutive FSNs, and mark that stream as being in pd mode. Returns NULL if no such FIRST_FRAG exists. */ static struct sctp_ulpevent *sctp_intl_retrieve_first_uo(struct sctp_ulpq *ulpq) { struct sctp_stream_in *csin, *sin = NULL; struct sk_buff *first_frag = NULL; struct sk_buff *last_frag = NULL; struct sctp_ulpevent *retval; struct sk_buff *pos; __u32 next_fsn = 0; __u16 sid = 0; skb_queue_walk(&ulpq->reasm_uo, pos) { struct sctp_ulpevent *cevent = sctp_skb2event(pos); csin =
sctp_stream_in(&ulpq->asoc->stream, cevent->stream); if (csin->pd_mode_uo) continue; switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { case SCTP_DATA_FIRST_FRAG: if (first_frag) goto out; first_frag = pos; last_frag = pos; next_fsn = 0; sin = csin; sid = cevent->stream; sin->mid_uo = cevent->mid; break; case SCTP_DATA_MIDDLE_FRAG: if (!first_frag) break; if (cevent->stream == sid && cevent->mid == sin->mid_uo && cevent->fsn == next_fsn) { next_fsn++; last_frag = pos; } else { goto out; } break; case SCTP_DATA_LAST_FRAG: if (first_frag) goto out; break; default: break; } } if (!first_frag) return NULL; out: retval = sctp_make_reassembled_event(ulpq->asoc->base.net, &ulpq->reasm_uo, first_frag, last_frag); if (retval) { sin->fsn_uo = next_fsn; sin->pd_mode_uo = 1; } return retval; } /* Turn a received I-DATA chunk into an ulpevent and push it through reassembly (plus ordering, for ordered data that reassembled into a complete message). Returns -ENOMEM if the event cannot be allocated; otherwise 1 if an event carrying MSG_EOR was passed to sctp_enqueue_event() and 0 if not. The FIRST_FRAG carries the ppid where later fragments carry the fsn. */ static int sctp_ulpevent_idata(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, gfp_t gfp) { struct sctp_ulpevent *event; struct sk_buff_head temp; int event_eor = 0; event = sctp_ulpevent_make_rcvmsg(chunk->asoc, chunk, gfp); if (!event) return -ENOMEM; event->mid = ntohl(chunk->subh.idata_hdr->mid); if (event->msg_flags & SCTP_DATA_FIRST_FRAG) event->ppid = chunk->subh.idata_hdr->ppid; else event->fsn = ntohl(chunk->subh.idata_hdr->fsn); if (!(event->msg_flags & SCTP_DATA_UNORDERED)) { event = sctp_intl_reasm(ulpq, event); if (event) { skb_queue_head_init(&temp); __skb_queue_tail(&temp, sctp_event2skb(event)); if (event->msg_flags & MSG_EOR) event = sctp_intl_order(ulpq, event); } } else { event = sctp_intl_reasm_uo(ulpq, event); if (event) { skb_queue_head_init(&temp); __skb_queue_tail(&temp, sctp_event2skb(event)); } } if (event) { event_eor = (event->msg_flags & MSG_EOR) ?
1 : 0; sctp_enqueue_event(ulpq, &temp); } return event_eor; } /* Start an ordered partial delivery: take from ulpq->reasm the first FIRST_FRAG whose MID equals its stream's sin->mid (on a stream not in pd mode), together with the MIDDLE fragments of that message that follow with consecutive FSNs, and put the stream into pd mode. Returns NULL if no such FIRST_FRAG exists. */ static struct sctp_ulpevent *sctp_intl_retrieve_first(struct sctp_ulpq *ulpq) { struct sctp_stream_in *csin, *sin = NULL; struct sk_buff *first_frag = NULL; struct sk_buff *last_frag = NULL; struct sctp_ulpevent *retval; struct sk_buff *pos; __u32 next_fsn = 0; __u16 sid = 0; skb_queue_walk(&ulpq->reasm, pos) { struct sctp_ulpevent *cevent = sctp_skb2event(pos); csin = sctp_stream_in(&ulpq->asoc->stream, cevent->stream); if (csin->pd_mode) continue; switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { case SCTP_DATA_FIRST_FRAG: if (first_frag) goto out; if (cevent->mid == csin->mid) { first_frag = pos; last_frag = pos; next_fsn = 0; sin = csin; sid = cevent->stream; } break; case SCTP_DATA_MIDDLE_FRAG: if (!first_frag) break; if (cevent->stream == sid && cevent->mid == sin->mid && cevent->fsn == next_fsn) { next_fsn++; last_frag = pos; } else { goto out; } break; case SCTP_DATA_LAST_FRAG: if (first_frag) goto out; break; default: break; } } if (!first_frag) return NULL; out: retval = sctp_make_reassembled_event(ulpq->asoc->base.net, &ulpq->reasm, first_frag, last_frag); if (retval) { sin->fsn = next_fsn; sin->pd_mode = 1; } return retval; } /* Start partial delivery: repeatedly pull a first-fragment run from reasm, then from reasm_uo, enqueueing each one, until the respective retrieve helper returns NULL. The gfp argument is not used in this body. */ static void sctp_intl_start_pd(struct sctp_ulpq *ulpq, gfp_t gfp) { struct sctp_ulpevent *event; struct sk_buff_head temp; if (!skb_queue_empty(&ulpq->reasm)) { do { event = sctp_intl_retrieve_first(ulpq); if (event) { skb_queue_head_init(&temp); __skb_queue_tail(&temp, sctp_event2skb(event)); sctp_enqueue_event(ulpq, &temp); } } while (event); } if (!skb_queue_empty(&ulpq->reasm_uo)) { do { event = sctp_intl_retrieve_first_uo(ulpq); if (event) { skb_queue_head_init(&temp); __skb_queue_tail(&temp, sctp_event2skb(event)); sctp_enqueue_event(ulpq, &temp); } } while (event); } } /* Try to make room for chunk. Only when the socket receive queue is empty, events are reneged from lobby, then reasm, then reasm_uo until at least `needed` (chunk length minus sizeof(struct sctp_idata_chunk)) has been freed. If enough was freed the chunk is processed, and partial delivery is started when sctp_ulpevent_idata() returns <= 0. */ static void sctp_renege_events(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, gfp_t gfp) { struct sctp_association *asoc = ulpq->asoc; __u32 freed = 0; __u16 needed; needed =
ntohs(chunk->chunk_hdr->length) - sizeof(struct sctp_idata_chunk); if (skb_queue_empty(&asoc->base.sk->sk_receive_queue)) { freed = sctp_ulpq_renege_list(ulpq, &ulpq->lobby, needed); if (freed < needed) freed += sctp_ulpq_renege_list(ulpq, &ulpq->reasm, needed); if (freed < needed) freed += sctp_ulpq_renege_list(ulpq, &ulpq->reasm_uo, needed); } if (freed >= needed && sctp_ulpevent_idata(ulpq, chunk, gfp) <= 0) sctp_intl_start_pd(ulpq, gfp); } /* Queue a SCTP_PARTIAL_DELIVERY_ABORTED notification for (sid, mid, flags) directly on the socket receive queue and signal data-ready if not yet signalled. Does nothing when SCTP_PARTIAL_DELIVERY_EVENT is not enabled in the association's subscriptions or the notification cannot be allocated. */ static void sctp_intl_stream_abort_pd(struct sctp_ulpq *ulpq, __u16 sid, __u32 mid, __u16 flags, gfp_t gfp) { struct sock *sk = ulpq->asoc->base.sk; struct sctp_ulpevent *ev = NULL; if (!sctp_ulpevent_type_enabled(ulpq->asoc->subscribe, SCTP_PARTIAL_DELIVERY_EVENT)) return; ev = sctp_ulpevent_make_pdapi(ulpq->asoc, SCTP_PARTIAL_DELIVERY_ABORTED, sid, mid, flags, gfp); if (ev) { struct sctp_sock *sp = sctp_sk(sk); __skb_queue_tail(&sk->sk_receive_queue, sctp_event2skb(ev)); if (!sp->data_ready_signalled) { sp->data_ready_signalled = 1; sk->sk_data_ready(sk); } } } /* Reap lobby events of stream sid (the callers in this file invoke it right after sctp_mid_skip()). First collects the events whose MID is behind sctp_mid_peek() for the stream; if there were none, takes the event at the stop position when it carries exactly the peeked MID (advancing the MID). Whatever was collected, plus the lobby events that then follow in sequence, is enqueued to the socket. */ static void sctp_intl_reap_ordered(struct sctp_ulpq *ulpq, __u16 sid) { struct sctp_stream *stream = &ulpq->asoc->stream; struct sctp_ulpevent *cevent, *event = NULL; struct sk_buff_head *lobby = &ulpq->lobby; struct sk_buff *pos, *tmp; struct sk_buff_head temp; __u16 csid; __u32 cmid; skb_queue_head_init(&temp); sctp_skb_for_each(pos, lobby, tmp) { cevent = (struct sctp_ulpevent *)pos->cb; csid = cevent->stream; cmid = cevent->mid; if (csid > sid) break; if (csid < sid) continue; if (!MID_lt(cmid, sctp_mid_peek(stream, in, csid))) break; __skb_unlink(pos, lobby); if (!event) event = sctp_skb2event(pos); __skb_queue_tail(&temp, pos); } if (!event && pos != (struct sk_buff *)lobby) { cevent = (struct sctp_ulpevent *)pos->cb; csid = cevent->stream; cmid = cevent->mid; if (csid == sid && cmid == sctp_mid_peek(stream, in, csid)) { sctp_mid_next(stream, in, csid); __skb_unlink(pos, lobby); __skb_queue_tail(&temp, pos); event = sctp_skb2event(pos); } } if (event) {
sctp_intl_retrieve_ordered(ulpq, event); sctp_enqueue_event(ulpq, &temp); } } /* Abort partial delivery on every inbound stream: clear the pd flags, emit a PD-aborted notification per affected stream (flags 0x1 for the unordered side, 0 for the ordered side), and for the ordered side also skip the stream's MID to sin->mid and reap the lobby. Finally flush the whole ulpq. */ static void sctp_intl_abort_pd(struct sctp_ulpq *ulpq, gfp_t gfp) { struct sctp_stream *stream = &ulpq->asoc->stream; __u16 sid; for (sid = 0; sid < stream->incnt; sid++) { struct sctp_stream_in *sin = SCTP_SI(stream, sid); __u32 mid; if (sin->pd_mode_uo) { sin->pd_mode_uo = 0; mid = sin->mid_uo; sctp_intl_stream_abort_pd(ulpq, sid, mid, 0x1, gfp); } if (sin->pd_mode) { sin->pd_mode = 0; mid = sin->mid; sctp_intl_stream_abort_pd(ulpq, sid, mid, 0, gfp); sctp_mid_skip(stream, in, sid, mid); sctp_intl_reap_ordered(ulpq, sid); } } /* intl abort pd happens only when all data needs to be cleaned */ sctp_ulpq_flush(ulpq); } /* Return the index of the first skiplist entry matching both stream and flags, or nskips if there is none (i.e. the slot where a new entry would go). */ static inline int sctp_get_skip_pos(struct sctp_ifwdtsn_skip *skiplist, int nskips, __be16 stream, __u8 flags) { int i; for (i = 0; i < nskips; i++) if (skiplist[i].stream == stream && skiplist[i].flags == flags) return i; return i; } #define SCTP_FTSN_U_BIT 0x1 /* Build an I-FORWARD-TSN for a prsctp-capable peer: free abandoned chunks with TSN at or below ctsn, advance adv_peer_ack_point over abandoned chunks whose TSN is at most one past it while recording one skip entry per (stream, U-bit) pair with the latest MID, stopping at 10 entries, and queue the control chunk if the ack point ended up above ctsn. */ static void sctp_generate_iftsn(struct sctp_outq *q, __u32 ctsn) { struct sctp_ifwdtsn_skip ftsn_skip_arr[10]; struct sctp_association *asoc = q->asoc; struct sctp_chunk *ftsn_chunk = NULL; struct list_head *lchunk, *temp; int nskips = 0, skip_pos; struct sctp_chunk *chunk; __u32 tsn; if (!asoc->peer.prsctp_capable) return; if (TSN_lt(asoc->adv_peer_ack_point, ctsn)) asoc->adv_peer_ack_point = ctsn; list_for_each_safe(lchunk, temp, &q->abandoned) { chunk = list_entry(lchunk, struct sctp_chunk, transmitted_list); tsn = ntohl(chunk->subh.data_hdr->tsn); if (TSN_lte(tsn, ctsn)) { list_del_init(lchunk); sctp_chunk_free(chunk); } else if (TSN_lte(tsn, asoc->adv_peer_ack_point + 1)) { __be16 sid = chunk->subh.idata_hdr->stream; __be32 mid = chunk->subh.idata_hdr->mid; __u8 flags = 0; if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) flags |= SCTP_FTSN_U_BIT; asoc->adv_peer_ack_point = tsn; skip_pos = sctp_get_skip_pos(&ftsn_skip_arr[0], nskips, sid, flags); ftsn_skip_arr[skip_pos].stream = sid;
ftsn_skip_arr[skip_pos].reserved = 0; ftsn_skip_arr[skip_pos].flags = flags; ftsn_skip_arr[skip_pos].mid = mid; if (skip_pos == nskips) nskips++; if (nskips == 10) break; } else { break; } } if (asoc->adv_peer_ack_point > ctsn) ftsn_chunk = sctp_make_ifwdtsn(asoc, asoc->adv_peer_ack_point, nskips, &ftsn_skip_arr[0]); if (ftsn_chunk) { list_add_tail(&ftsn_chunk->list, &q->control_chunk_list); SCTP_INC_STATS(asoc->base.net, SCTP_MIB_OUTCTRLCHUNKS); } } /* Iterate pos over the sctp_ifwdtsn_skip entries that fit completely within `end` bytes following the I-FORWARD-TSN header; sctp_walk_ifwdtsn() derives `end` from the chunk header length. */ #define _sctp_walk_ifwdtsn(pos, chunk, end) \ for (pos = (void *)(chunk->subh.ifwdtsn_hdr + 1); \ (void *)pos <= (void *)(chunk->subh.ifwdtsn_hdr + 1) + (end) - \ sizeof(struct sctp_ifwdtsn_skip); pos++) #define sctp_walk_ifwdtsn(pos, ch) \ _sctp_walk_ifwdtsn((pos), (ch), ntohs((ch)->chunk_hdr->length) - \ sizeof(struct sctp_ifwdtsn_chunk)) /* A FORWARD-TSN chunk is accepted if its type is SCTP_CID_FWD_TSN and every skip entry names a stream below the association's inbound stream count. */ static bool sctp_validate_fwdtsn(struct sctp_chunk *chunk) { struct sctp_fwdtsn_skip *skip; __u16 incnt; if (chunk->chunk_hdr->type != SCTP_CID_FWD_TSN) return false; incnt = chunk->asoc->stream.incnt; sctp_walk_fwdtsn(skip, chunk) if (ntohs(skip->stream) >= incnt) return false; return true; } /* An I-FORWARD-TSN chunk is accepted if its type is SCTP_CID_I_FWD_TSN and every skip entry names a stream below the association's inbound stream count. */ static bool sctp_validate_iftsn(struct sctp_chunk *chunk) { struct sctp_ifwdtsn_skip *skip; __u16 incnt; if (chunk->chunk_hdr->type != SCTP_CID_I_FWD_TSN) return false; incnt = chunk->asoc->stream.incnt; sctp_walk_ifwdtsn(skip, chunk) if (ntohs(skip->stream) >= incnt) return false; return true; } static void sctp_report_fwdtsn(struct sctp_ulpq *ulpq, __u32 ftsn) { /* Move the Cumulative TSN Ack ahead. */ sctp_tsnmap_skip(&ulpq->asoc->peer.tsn_map, ftsn); /* purge the fragmentation queue */ sctp_ulpq_reasm_flushtsn(ulpq, ftsn); /* Abort any in progress partial delivery.
*/ sctp_ulpq_abort_pd(ulpq, GFP_ATOMIC); } /* Drop from both reassembly queues (reasm and reasm_uo) every event whose TSN is at or below ftsn. */ static void sctp_intl_reasm_flushtsn(struct sctp_ulpq *ulpq, __u32 ftsn) { struct sk_buff *pos, *tmp; skb_queue_walk_safe(&ulpq->reasm, pos, tmp) { struct sctp_ulpevent *event = sctp_skb2event(pos); __u32 tsn = event->tsn; if (TSN_lte(tsn, ftsn)) { __skb_unlink(pos, &ulpq->reasm); sctp_ulpevent_free(event); } } skb_queue_walk_safe(&ulpq->reasm_uo, pos, tmp) { struct sctp_ulpevent *event = sctp_skb2event(pos); __u32 tsn = event->tsn; if (TSN_lte(tsn, ftsn)) { __skb_unlink(pos, &ulpq->reasm_uo); sctp_ulpevent_free(event); } } } static void sctp_report_iftsn(struct sctp_ulpq *ulpq, __u32 ftsn) { /* Move the Cumulative TSN Ack ahead. */ sctp_tsnmap_skip(&ulpq->asoc->peer.tsn_map, ftsn); /* purge the fragmentation queue */ sctp_intl_reasm_flushtsn(ulpq, ftsn); /* abort only when it's for all data */ if (ftsn == sctp_tsnmap_get_max_tsn_seen(&ulpq->asoc->peer.tsn_map)) sctp_intl_abort_pd(ulpq, GFP_ATOMIC); } static void sctp_handle_fwdtsn(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk) { struct sctp_fwdtsn_skip *skip; /* Walk through all the skipped SSNs */ sctp_walk_fwdtsn(skip, chunk) sctp_ulpq_skip(ulpq, ntohs(skip->stream), ntohs(skip->ssn)); } /* Apply one I-FORWARD-TSN skip entry. With the U bit set, only an unordered partial delivery whose MID is behind mid is aborted. Otherwise entries behind the value of sctp_mid_peek() are ignored; for the rest any ordered partial delivery is aborted, the stream's MID is skipped to mid and the lobby is reaped. */ static void sctp_intl_skip(struct sctp_ulpq *ulpq, __u16 sid, __u32 mid, __u8 flags) { struct sctp_stream_in *sin = sctp_stream_in(&ulpq->asoc->stream, sid); struct sctp_stream *stream = &ulpq->asoc->stream; if (flags & SCTP_FTSN_U_BIT) { if (sin->pd_mode_uo && MID_lt(sin->mid_uo, mid)) { sin->pd_mode_uo = 0; sctp_intl_stream_abort_pd(ulpq, sid, mid, 0x1, GFP_ATOMIC); } return; } if (MID_lt(mid, sctp_mid_peek(stream, in, sid))) return; if (sin->pd_mode) { sin->pd_mode = 0; sctp_intl_stream_abort_pd(ulpq, sid, mid, 0x0, GFP_ATOMIC); } sctp_mid_skip(stream, in, sid, mid); sctp_intl_reap_ordered(ulpq, sid); } static void sctp_handle_iftsn(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk) { struct sctp_ifwdtsn_skip *skip; /* Walk through all the skipped MIDs and abort stream pd if
possible */ sctp_walk_ifwdtsn(skip, chunk) sctp_intl_skip(ulpq, ntohs(skip->stream), ntohl(skip->mid), skip->flags); } /* enqueue_event adapter for the non-interleaved table: wrap the single event in a temporary list and pass it to sctp_ulpq_tail_event(). */ static int do_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sk_buff_head temp; skb_queue_head_init(&temp); __skb_queue_tail(&temp, sctp_event2skb(event)); return sctp_ulpq_tail_event(ulpq, &temp); } /* Operations table selected when the peer is not interleave capable: DATA and FORWARD-TSN handling. */ static struct sctp_stream_interleave sctp_stream_interleave_0 = { .data_chunk_len = sizeof(struct sctp_data_chunk), .ftsn_chunk_len = sizeof(struct sctp_fwdtsn_chunk), /* DATA process functions */ .make_datafrag = sctp_make_datafrag_empty, .assign_number = sctp_chunk_assign_ssn, .validate_data = sctp_validate_data, .ulpevent_data = sctp_ulpq_tail_data, .enqueue_event = do_ulpq_tail_event, .renege_events = sctp_ulpq_renege, .start_pd = sctp_ulpq_partial_delivery, .abort_pd = sctp_ulpq_abort_pd, /* FORWARD-TSN process functions */ .generate_ftsn = sctp_generate_fwdtsn, .validate_ftsn = sctp_validate_fwdtsn, .report_ftsn = sctp_report_fwdtsn, .handle_ftsn = sctp_handle_fwdtsn, }; /* enqueue_event adapter for the interleaved table: wrap the single event in a temporary list and pass it to sctp_enqueue_event(). */ static int do_sctp_enqueue_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sk_buff_head temp; skb_queue_head_init(&temp); __skb_queue_tail(&temp, sctp_event2skb(event)); return sctp_enqueue_event(ulpq, &temp); } /* Operations table selected when the peer is interleave capable: I-DATA and I-FORWARD-TSN handling. */ static struct sctp_stream_interleave sctp_stream_interleave_1 = { .data_chunk_len = sizeof(struct sctp_idata_chunk), .ftsn_chunk_len = sizeof(struct sctp_ifwdtsn_chunk), /* I-DATA process functions */ .make_datafrag = sctp_make_idatafrag_empty, .assign_number = sctp_chunk_assign_mid, .validate_data = sctp_validate_idata, .ulpevent_data = sctp_ulpevent_idata, .enqueue_event = do_sctp_enqueue_event, .renege_events = sctp_renege_events, .start_pd = sctp_intl_start_pd, .abort_pd = sctp_intl_abort_pd, /* I-FORWARD-TSN process functions */ .generate_ftsn = sctp_generate_iftsn, .validate_ftsn = sctp_validate_iftsn, .report_ftsn = sctp_report_iftsn, .handle_ftsn = sctp_handle_iftsn, }; /* Point stream->si at the operations table matching asoc->peer.intl_capable of the association that embeds this stream. */ void sctp_stream_interleave_init(struct
sctp_stream *stream) { struct sctp_association *asoc; asoc = container_of(stream, struct sctp_association, stream); stream->si = asoc->peer.intl_capable ? &sctp_stream_interleave_1 : &sctp_stream_interleave_0; }
744 741 744 741 743 742 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 
522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 // SPDX-License-Identifier: GPL-2.0-only /* * Common framework for low-level network console, dump, and debugger code * * Sep 8 2003 Matt Mackall <mpm@selenic.com> * * based on the netconsole code from: * * Copyright (C) 2001 Ingo Molnar <mingo@redhat.com> * Copyright (C) 2002 Red Hat, Inc. 
*/ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/moduleparam.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/string.h> #include <linux/if_arp.h> #include <linux/inetdevice.h> #include <linux/inet.h> #include <linux/interrupt.h> #include <linux/netpoll.h> #include <linux/sched.h> #include <linux/delay.h> #include <linux/rcupdate.h> #include <linux/workqueue.h> #include <linux/slab.h> #include <linux/export.h> #include <linux/if_vlan.h> #include <net/tcp.h> #include <net/udp.h> #include <net/addrconf.h> #include <net/ndisc.h> #include <net/ip6_checksum.h> #include <linux/unaligned.h> #include <trace/events/napi.h> #include <linux/kconfig.h> /* * We maintain a small pool of fully-sized skbs, to make sure the * message gets out even in extreme OOM situations. */ #define MAX_UDP_CHUNK 1460 #define MAX_SKBS 32 #define USEC_PER_POLL 50 #define MAX_SKB_SIZE \ (sizeof(struct ethhdr) + \ sizeof(struct iphdr) + \ sizeof(struct udphdr) + \ MAX_UDP_CHUNK) static void zap_completion_queue(void); static unsigned int carrier_timeout = 4; module_param(carrier_timeout, uint, 0644); #define np_info(np, fmt, ...) \ pr_info("%s: " fmt, np->name, ##__VA_ARGS__) #define np_err(np, fmt, ...) \ pr_err("%s: " fmt, np->name, ##__VA_ARGS__) #define np_notice(np, fmt, ...) \ pr_notice("%s: " fmt, np->name, ##__VA_ARGS__) /* Transmit skb on txq through netdev_start_xmit(). If the skb carries a VLAN tag that the device features cannot offload, the tag is first pushed into the packet data; when that push leaves no skb the function returns NETDEV_TX_OK without transmitting. */ static netdev_tx_t netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq) { netdev_tx_t status = NETDEV_TX_OK; netdev_features_t features; features = netif_skb_features(skb); if (skb_vlan_tag_present(skb) && !vlan_hw_offload_capable(features, skb->vlan_proto)) { skb = __vlan_hwaccel_push_inside(skb); if (unlikely(!skb)) { /* This is actually a packet drop, but we * don't want the code that calls this * function to try and operate on a NULL skb.
*/ goto out; } } status = netdev_start_xmit(skb, dev, txq, false); out: return status; } /* Delayed-work handler that drains npinfo->txq. Each skb is sent with local IRQs off under HARD_TX_LOCK; skbs for devices that are absent or not running are freed. If the tx queue is frozen/stopped or the transmit does not complete, the skb goes back to the head of txq and the work is rescheduled HZ/10 later. */ static void queue_process(struct work_struct *work) { struct netpoll_info *npinfo = container_of(work, struct netpoll_info, tx_work.work); struct sk_buff *skb; unsigned long flags; while ((skb = skb_dequeue(&npinfo->txq))) { struct net_device *dev = skb->dev; struct netdev_queue *txq; unsigned int q_index; if (!netif_device_present(dev) || !netif_running(dev)) { kfree_skb(skb); continue; } local_irq_save(flags); /* check if skb->queue_mapping is still valid */ q_index = skb_get_queue_mapping(skb); if (unlikely(q_index >= dev->real_num_tx_queues)) { q_index = q_index % dev->real_num_tx_queues; skb_set_queue_mapping(skb, q_index); } txq = netdev_get_tx_queue(dev, q_index); HARD_TX_LOCK(dev, txq, smp_processor_id()); if (netif_xmit_frozen_or_stopped(txq) || !dev_xmit_complete(netpoll_start_xmit(skb, dev, txq))) { skb_queue_head(&npinfo->txq, skb); HARD_TX_UNLOCK(dev, txq); local_irq_restore(flags); schedule_delayed_work(&npinfo->tx_work, HZ/10); return; } HARD_TX_UNLOCK(dev, txq); local_irq_restore(flags); } } /* Return 1 if the current CPU is recorded as xmit_lock_owner of any of dev's num_tx_queues tx queues, else 0. */ static int netif_local_xmit_active(struct net_device *dev) { int i; for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); if (READ_ONCE(txq->xmit_lock_owner) == smp_processor_id()) return 1; } return 0; } /* Call napi->poll() once with a budget of 0 while holding the NAPI_STATE_NPSVC bit; warns once if the poll reports any work done. */ static void poll_one_napi(struct napi_struct *napi) { int work; /* If we set this bit but see that it has already been set, * that indicates that napi has been disabled and we need * to abort this operation */ if (test_and_set_bit(NAPI_STATE_NPSVC, &napi->state)) return; /* We explicitly pass the polling call a budget of 0 to * indicate that we are clearing the Tx path only.
*/ work = napi->poll(napi, 0); WARN_ONCE(work, "%pS exceeded budget in poll\n", napi->poll); trace_napi_poll(napi, work, 0); clear_bit(NAPI_STATE_NPSVC, &napi->state); } /* Poll every napi instance on dev->napi_list whose poll_owner this CPU manages to claim (cmpxchg from -1); ownership is released again after the poll. */ static void poll_napi(struct net_device *dev) { struct napi_struct *napi; int cpu = smp_processor_id(); list_for_each_entry_rcu(napi, &dev->napi_list, dev_list) { if (cmpxchg(&napi->poll_owner, -1, cpu) == -1) { poll_one_napi(napi); smp_store_release(&napi->poll_owner, -1); } } } /* Poll dev once on behalf of netpoll: run the driver's ndo_poll_controller (if any) and its napi instances, then free this CPU's completion queue. Silently does nothing when dev has no npinfo, dev_lock cannot be taken without blocking, the device is not running, or this CPU is already inside the device's xmit path. */ void netpoll_poll_dev(struct net_device *dev) { struct netpoll_info *ni = rcu_dereference_bh(dev->npinfo); const struct net_device_ops *ops; /* Don't do any rx activity if the dev_lock semaphore is held * the dev_open/close paths use this to block netpoll activity * while changing device state */ if (!ni || down_trylock(&ni->dev_lock)) return; /* Some drivers will take the same locks in poll and xmit, * we can't poll if local CPU is already in xmit. */ if (!netif_running(dev) || netif_local_xmit_active(dev)) { up(&ni->dev_lock); return; } ops = dev->netdev_ops; if (ops->ndo_poll_controller) ops->ndo_poll_controller(dev); poll_napi(dev); up(&ni->dev_lock); zap_completion_queue(); } EXPORT_SYMBOL(netpoll_poll_dev); /* Take npinfo->dev_lock (may sleep) if dev has an npinfo; while it is held netpoll_poll_dev() backs off. Uses rtnl_dereference(), so presumably called with RTNL held - confirm against callers. */ void netpoll_poll_disable(struct net_device *dev) { struct netpoll_info *ni; might_sleep(); ni = rtnl_dereference(dev->npinfo); if (ni) down(&ni->dev_lock); } /* Release npinfo->dev_lock if dev has an npinfo. */ void netpoll_poll_enable(struct net_device *dev) { struct netpoll_info *ni; ni = rtnl_dereference(dev->npinfo); if (ni) up(&ni->dev_lock); } /* Top np->skb_pool up to MAX_SKBS buffers of MAX_SKB_SIZE using GFP_ATOMIC allocations under the pool lock; stops early if an allocation fails. */ static void refill_skbs(struct netpoll *np) { struct sk_buff_head *skb_pool; struct sk_buff *skb; unsigned long flags; skb_pool = &np->skb_pool; spin_lock_irqsave(&skb_pool->lock, flags); while (skb_pool->qlen < MAX_SKBS) { skb = alloc_skb(MAX_SKB_SIZE, GFP_ATOMIC); if (!skb) break; __skb_queue_tail(skb_pool, skb); } spin_unlock_irqrestore(&skb_pool->lock, flags); } /* Detach this CPU's softnet_data completion_queue (with IRQs off) and dispose of every skb that was on it. */ static void zap_completion_queue(void) { unsigned long flags; struct softnet_data *sd = &get_cpu_var(softnet_data); if (sd->completion_queue) { struct sk_buff
*clist; local_irq_save(flags); clist = sd->completion_queue; sd->completion_queue = NULL; local_irq_restore(flags); while (clist != NULL) { struct sk_buff *skb = clist; clist = clist->next; if (!skb_irq_freeable(skb)) { refcount_set(&skb->users, 1); dev_kfree_skb_any(skb); /* put this one back */ } else { __kfree_skb(skb); } } } put_cpu_var(softnet_data); } /* Obtain an skb with `reserve` bytes of headroom: after freeing the completion queue and refilling the pool, try alloc_skb(len, GFP_ATOMIC) and fall back to np->skb_pool, polling the device between retries. Returns NULL after 10 failed attempts. */ static struct sk_buff *find_skb(struct netpoll *np, int len, int reserve) { int count = 0; struct sk_buff *skb; zap_completion_queue(); refill_skbs(np); repeat: skb = alloc_skb(len, GFP_ATOMIC); if (!skb) skb = skb_dequeue(&np->skb_pool); if (!skb) { if (++count < 10) { netpoll_poll_dev(np->dev); goto repeat; } return NULL; } refcount_set(&skb->users, 1); skb_reserve(skb, reserve); return skb; } /* Return 1 if the current CPU is the poll_owner of any napi instance of dev, else 0. */ static int netpoll_owner_active(struct net_device *dev) { struct napi_struct *napi; list_for_each_entry_rcu(napi, &dev->napi_list, dev_list) { if (READ_ONCE(napi->poll_owner) == smp_processor_id()) return 1; } return 0; } /* Send skb on np->dev: the skb is dropped (NET_XMIT_DROP) if the device has no npinfo, is not running or not present. Otherwise, when txq is empty and this CPU is not polling the device, transmission is retried with polling in between for about one jiffy; an skb that still has not gone out is appended to npinfo->txq for the tx work item. */ /* call with IRQ disabled */ static netdev_tx_t __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) { netdev_tx_t status = NETDEV_TX_BUSY; struct net_device *dev; unsigned long tries; /* It is up to the caller to keep npinfo alive.
*/ struct netpoll_info *npinfo; lockdep_assert_irqs_disabled(); dev = np->dev; npinfo = rcu_dereference_bh(dev->npinfo); if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) { dev_kfree_skb_irq(skb); return NET_XMIT_DROP; } /* don't get messages out of order, and no recursion */ if (skb_queue_len(&npinfo->txq) == 0 && !netpoll_owner_active(dev)) { struct netdev_queue *txq; txq = netdev_core_pick_tx(dev, skb, NULL); /* try until next clock tick */ for (tries = jiffies_to_usecs(1)/USEC_PER_POLL; tries > 0; --tries) { if (HARD_TX_TRYLOCK(dev, txq)) { if (!netif_xmit_stopped(txq)) status = netpoll_start_xmit(skb, dev, txq); HARD_TX_UNLOCK(dev, txq); if (dev_xmit_complete(status)) break; } /* tickle device maybe there is some cleanup */ netpoll_poll_dev(np->dev); udelay(USEC_PER_POLL); } WARN_ONCE(!irqs_disabled(), "netpoll_send_skb_on_dev(): %s enabled interrupts in poll (%pS)\n", dev->name, dev->netdev_ops->ndo_start_xmit); } if (!dev_xmit_complete(status)) { skb_queue_tail(&npinfo->txq, skb); schedule_delayed_work(&npinfo->tx_work,0); } return NETDEV_TX_OK; } /* Wrapper that disables local IRQs around __netpoll_send_skb(). A NULL np frees the skb and yields NET_XMIT_DROP. */ netdev_tx_t netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) { unsigned long flags; netdev_tx_t ret; if (unlikely(!np)) { dev_kfree_skb_irq(skb); ret = NET_XMIT_DROP; } else { local_irq_save(flags); ret = __netpoll_send_skb(np, skb); local_irq_restore(flags); } return ret; } EXPORT_SYMBOL(netpoll_send_skb); /* Build an Ethernet + IPv4 or IPv6 + UDP frame carrying len bytes of msg, using np's configured addresses, ports and remote MAC, and send it with netpoll_send_skb(). Headers are pushed in front of the payload from the innermost (UDP) outwards. Returns -ENOMEM if no skb can be obtained, otherwise the netpoll_send_skb() result cast to int. */ int netpoll_send_udp(struct netpoll *np, const char *msg, int len) { int total_len, ip_len, udp_len; struct sk_buff *skb; struct udphdr *udph; struct iphdr *iph; struct ethhdr *eth; static atomic_t ip_ident; struct ipv6hdr *ip6h; if (!IS_ENABLED(CONFIG_PREEMPT_RT)) WARN_ON_ONCE(!irqs_disabled()); udp_len = len + sizeof(*udph); if (np->ipv6) ip_len = udp_len + sizeof(*ip6h); else ip_len = udp_len + sizeof(*iph); total_len = ip_len + LL_RESERVED_SPACE(np->dev); skb = find_skb(np, total_len + np->dev->needed_tailroom, total_len - len); if (!skb) return -ENOMEM; skb_copy_to_linear_data(skb,
msg, len); skb_put(skb, len); skb_push(skb, sizeof(*udph)); skb_reset_transport_header(skb); udph = udp_hdr(skb); udph->source = htons(np->local_port); udph->dest = htons(np->remote_port); udph->len = htons(udp_len); if (np->ipv6) { udph->check = 0; udph->check = csum_ipv6_magic(&np->local_ip.in6, &np->remote_ip.in6, udp_len, IPPROTO_UDP, csum_partial(udph, udp_len, 0)); if (udph->check == 0) udph->check = CSUM_MANGLED_0; skb_push(skb, sizeof(*ip6h)); skb_reset_network_header(skb); ip6h = ipv6_hdr(skb); /* ip6h->version = 6; ip6h->priority = 0; */ *(unsigned char *)ip6h = 0x60; ip6h->flow_lbl[0] = 0; ip6h->flow_lbl[1] = 0; ip6h->flow_lbl[2] = 0; ip6h->payload_len = htons(sizeof(struct udphdr) + len); ip6h->nexthdr = IPPROTO_UDP; ip6h->hop_limit = 32; ip6h->saddr = np->local_ip.in6; ip6h->daddr = np->remote_ip.in6; eth = skb_push(skb, ETH_HLEN); skb_reset_mac_header(skb); skb->protocol = eth->h_proto = htons(ETH_P_IPV6); } else { udph->check = 0; udph->check = csum_tcpudp_magic(np->local_ip.ip, np->remote_ip.ip, udp_len, IPPROTO_UDP, csum_partial(udph, udp_len, 0)); if (udph->check == 0) udph->check = CSUM_MANGLED_0; skb_push(skb, sizeof(*iph)); skb_reset_network_header(skb); iph = ip_hdr(skb); /* iph->version = 4; iph->ihl = 5; */ *(unsigned char *)iph = 0x45; iph->tos = 0; put_unaligned(htons(ip_len), &(iph->tot_len)); iph->id = htons(atomic_inc_return(&ip_ident)); iph->frag_off = 0; iph->ttl = 64; iph->protocol = IPPROTO_UDP; iph->check = 0; put_unaligned(np->local_ip.ip, &(iph->saddr)); put_unaligned(np->remote_ip.ip, &(iph->daddr)); iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); eth = skb_push(skb, ETH_HLEN); skb_reset_mac_header(skb); skb->protocol = eth->h_proto = htons(ETH_P_IP); } ether_addr_copy(eth->h_source, np->dev->dev_addr); ether_addr_copy(eth->h_dest, np->remote_mac); skb->dev = np->dev; return (int)netpoll_send_skb(np, skb); } EXPORT_SYMBOL(netpoll_send_udp); /* Log np's local and remote ports and addresses (IPv4 or IPv6 according to np->ipv6), the interface name and the remote ethernet address. */ void netpoll_print_options(struct netpoll *np) { np_info(np, "local port
%d\n", np->local_port); if (np->ipv6) np_info(np, "local IPv6 address %pI6c\n", &np->local_ip.in6); else np_info(np, "local IPv4 address %pI4\n", &np->local_ip.ip); np_info(np, "interface '%s'\n", np->dev_name); np_info(np, "remote port %d\n", np->remote_port); if (np->ipv6) np_info(np, "remote IPv6 address %pI6c\n", &np->remote_ip.in6); else np_info(np, "remote IPv4 address %pI4\n", &np->remote_ip.ip); np_info(np, "remote ethernet address %pM\n", np->remote_mac); } EXPORT_SYMBOL(netpoll_print_options); static int netpoll_parse_ip_addr(const char *str, union inet_addr *addr) { const char *end; if (!strchr(str, ':') && in4_pton(str, -1, (void *)addr, -1, &end) > 0) { if (!*end) return 0; } if (in6_pton(str, -1, addr->in6.s6_addr, -1, &end) > 0) { #if IS_ENABLED(CONFIG_IPV6) if (!*end) return 1; #else return -1; #endif } return -1; } static void skb_pool_flush(struct netpoll *np) { struct sk_buff_head *skb_pool; skb_pool = &np->skb_pool; skb_queue_purge_reason(skb_pool, SKB_CONSUMED); } int netpoll_parse_options(struct netpoll *np, char *opt) { char *cur=opt, *delim; int ipv6; bool ipversion_set = false; if (*cur != '@') { if ((delim = strchr(cur, '@')) == NULL) goto parse_failed; *delim = 0; if (kstrtou16(cur, 10, &np->local_port)) goto parse_failed; cur = delim; } cur++; if (*cur != '/') { ipversion_set = true; if ((delim = strchr(cur, '/')) == NULL) goto parse_failed; *delim = 0; ipv6 = netpoll_parse_ip_addr(cur, &np->local_ip); if (ipv6 < 0) goto parse_failed; else np->ipv6 = (bool)ipv6; cur = delim; } cur++; if (*cur != ',') { /* parse out dev name */ if ((delim = strchr(cur, ',')) == NULL) goto parse_failed; *delim = 0; strscpy(np->dev_name, cur, sizeof(np->dev_name)); cur = delim; } cur++; if (*cur != '@') { /* dst port */ if ((delim = strchr(cur, '@')) == NULL) goto parse_failed; *delim = 0; if (*cur == ' ' || *cur == '\t') np_info(np, "warning: whitespace is not allowed\n"); if (kstrtou16(cur, 10, &np->remote_port)) goto parse_failed; cur = delim; } cur++; 
/* dst ip */ if ((delim = strchr(cur, '/')) == NULL) goto parse_failed; *delim = 0; ipv6 = netpoll_parse_ip_addr(cur, &np->remote_ip); if (ipv6 < 0) goto parse_failed; else if (ipversion_set && np->ipv6 != (bool)ipv6) goto parse_failed; else np->ipv6 = (bool)ipv6; cur = delim + 1; if (*cur != 0) { /* MAC address */ if (!mac_pton(cur, np->remote_mac)) goto parse_failed; } netpoll_print_options(np); return 0; parse_failed: np_info(np, "couldn't parse config at '%s'!\n", cur); return -1; } EXPORT_SYMBOL(netpoll_parse_options); int __netpoll_setup(struct netpoll *np, struct net_device *ndev) { struct netpoll_info *npinfo; const struct net_device_ops *ops; int err; skb_queue_head_init(&np->skb_pool); if (ndev->priv_flags & IFF_DISABLE_NETPOLL) { np_err(np, "%s doesn't support polling, aborting\n", ndev->name); err = -ENOTSUPP; goto out; } npinfo = rtnl_dereference(ndev->npinfo); if (!npinfo) { npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL); if (!npinfo) { err = -ENOMEM; goto out; } sema_init(&npinfo->dev_lock, 1); skb_queue_head_init(&npinfo->txq); INIT_DELAYED_WORK(&npinfo->tx_work, queue_process); refcount_set(&npinfo->refcnt, 1); ops = ndev->netdev_ops; if (ops->ndo_netpoll_setup) { err = ops->ndo_netpoll_setup(ndev); if (err) goto free_npinfo; } } else { refcount_inc(&npinfo->refcnt); } np->dev = ndev; strscpy(np->dev_name, ndev->name, IFNAMSIZ); npinfo->netpoll = np; /* fill up the skb queue */ refill_skbs(np); /* last thing to do is link it to the net device structure */ rcu_assign_pointer(ndev->npinfo, npinfo); return 0; free_npinfo: kfree(npinfo); out: return err; } EXPORT_SYMBOL_GPL(__netpoll_setup); int netpoll_setup(struct netpoll *np) { struct net_device *ndev = NULL; bool ip_overwritten = false; struct in_device *in_dev; int err; rtnl_lock(); if (np->dev_name[0]) { struct net *net = current->nsproxy->net_ns; ndev = __dev_get_by_name(net, np->dev_name); } if (!ndev) { np_err(np, "%s doesn't exist, aborting\n", np->dev_name); err = -ENODEV; goto unlock; } 
netdev_hold(ndev, &np->dev_tracker, GFP_KERNEL); if (netdev_master_upper_dev_get(ndev)) { np_err(np, "%s is a slave device, aborting\n", np->dev_name); err = -EBUSY; goto put; } if (!netif_running(ndev)) { unsigned long atmost; np_info(np, "device %s not up yet, forcing it\n", np->dev_name); err = dev_open(ndev, NULL); if (err) { np_err(np, "failed to open %s\n", ndev->name); goto put; } rtnl_unlock(); atmost = jiffies + carrier_timeout * HZ; while (!netif_carrier_ok(ndev)) { if (time_after(jiffies, atmost)) { np_notice(np, "timeout waiting for carrier\n"); break; } msleep(1); } rtnl_lock(); } if (!np->local_ip.ip) { if (!np->ipv6) { const struct in_ifaddr *ifa; in_dev = __in_dev_get_rtnl(ndev); if (!in_dev) goto put_noaddr; ifa = rtnl_dereference(in_dev->ifa_list); if (!ifa) { put_noaddr: np_err(np, "no IP address for %s, aborting\n", np->dev_name); err = -EDESTADDRREQ; goto put; } np->local_ip.ip = ifa->ifa_local; ip_overwritten = true; np_info(np, "local IP %pI4\n", &np->local_ip.ip); } else { #if IS_ENABLED(CONFIG_IPV6) struct inet6_dev *idev; err = -EDESTADDRREQ; idev = __in6_dev_get(ndev); if (idev) { struct inet6_ifaddr *ifp; read_lock_bh(&idev->lock); list_for_each_entry(ifp, &idev->addr_list, if_list) { if (!!(ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL) != !!(ipv6_addr_type(&np->remote_ip.in6) & IPV6_ADDR_LINKLOCAL)) continue; np->local_ip.in6 = ifp->addr; ip_overwritten = true; err = 0; break; } read_unlock_bh(&idev->lock); } if (err) { np_err(np, "no IPv6 address for %s, aborting\n", np->dev_name); goto put; } else np_info(np, "local IPv6 %pI6c\n", &np->local_ip.in6); #else np_err(np, "IPv6 is not supported %s, aborting\n", np->dev_name); err = -EINVAL; goto put; #endif } } err = __netpoll_setup(np, ndev); if (err) goto flush; rtnl_unlock(); return 0; flush: skb_pool_flush(np); put: DEBUG_NET_WARN_ON_ONCE(np->dev); if (ip_overwritten) memset(&np->local_ip, 0, sizeof(np->local_ip)); netdev_put(ndev, &np->dev_tracker); unlock: rtnl_unlock(); return 
err; } EXPORT_SYMBOL(netpoll_setup); static void rcu_cleanup_netpoll_info(struct rcu_head *rcu_head) { struct netpoll_info *npinfo = container_of(rcu_head, struct netpoll_info, rcu); skb_queue_purge(&npinfo->txq); /* we can't call cancel_delayed_work_sync here, as we are in softirq */ cancel_delayed_work(&npinfo->tx_work); /* clean after last, unfinished work */ __skb_queue_purge(&npinfo->txq); /* now cancel it again */ cancel_delayed_work(&npinfo->tx_work); kfree(npinfo); } void __netpoll_cleanup(struct netpoll *np) { struct netpoll_info *npinfo; npinfo = rtnl_dereference(np->dev->npinfo); if (!npinfo) return; if (refcount_dec_and_test(&npinfo->refcnt)) { const struct net_device_ops *ops; ops = np->dev->netdev_ops; if (ops->ndo_netpoll_cleanup) ops->ndo_netpoll_cleanup(np->dev); RCU_INIT_POINTER(np->dev->npinfo, NULL); call_rcu(&npinfo->rcu, rcu_cleanup_netpoll_info); } else RCU_INIT_POINTER(np->dev->npinfo, NULL); skb_pool_flush(np); } EXPORT_SYMBOL_GPL(__netpoll_cleanup); void __netpoll_free(struct netpoll *np) { ASSERT_RTNL(); /* Wait for transmitting packets to finish before freeing. */ synchronize_rcu(); __netpoll_cleanup(np); kfree(np); } EXPORT_SYMBOL_GPL(__netpoll_free); void do_netpoll_cleanup(struct netpoll *np) { __netpoll_cleanup(np); netdev_put(np->dev, &np->dev_tracker); np->dev = NULL; } EXPORT_SYMBOL(do_netpoll_cleanup); void netpoll_cleanup(struct netpoll *np) { rtnl_lock(); if (!np->dev) goto out; do_netpoll_cleanup(np); out: rtnl_unlock(); } EXPORT_SYMBOL(netpoll_cleanup);
131 131 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 
526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 
1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 // SPDX-License-Identifier: GPL-2.0 /* * Copyright 2019, 2020 Amazon.com, Inc. or its affiliates. All rights reserved. * * User extended attribute client side cache functions. * * Author: Frank van der Linden <fllinden@amazon.com> */ #include <linux/errno.h> #include <linux/nfs_fs.h> #include <linux/hashtable.h> #include <linux/refcount.h> #include <uapi/linux/xattr.h> #include "nfs4_fs.h" #include "internal.h" /* * User extended attributes client side caching is implemented by having * a cache structure attached to NFS inodes. This structure is allocated * when needed, and freed when the cache is zapped. * * The cache structure contains as hash table of entries, and a pointer * to a special-cased entry for the listxattr cache. * * Accessing and allocating / freeing the caches is done via reference * counting. The cache entries use a similar refcounting scheme. * * This makes freeing a cache, both from the shrinker and from the * zap cache path, easy. It also means that, in current use cases, * the large majority of inodes will not waste any memory, as they * will never have any user extended attributes assigned to them. * * Attribute entries are hashed in to a simple hash table. They are * also part of an LRU. * * There are three shrinkers. * * Two shrinkers deal with the cache entries themselves: one for * large entries (> PAGE_SIZE), and one for smaller entries. The * shrinker for the larger entries works more aggressively than * those for the smaller entries. * * The other shrinker frees the cache structures themselves. */ /* * 64 buckets is a good default. There is likely no reasonable * workload that uses more than even 64 user extended attributes. 
* You can certainly add a lot more - but you get what you ask for * in those circumstances. */ #define NFS4_XATTR_HASH_SIZE 64 #define NFSDBG_FACILITY NFSDBG_XATTRCACHE struct nfs4_xattr_cache; struct nfs4_xattr_entry; struct nfs4_xattr_bucket { spinlock_t lock; struct hlist_head hlist; struct nfs4_xattr_cache *cache; bool draining; }; struct nfs4_xattr_cache { struct kref ref; struct nfs4_xattr_bucket buckets[NFS4_XATTR_HASH_SIZE]; struct list_head lru; struct list_head dispose; atomic_long_t nent; spinlock_t listxattr_lock; struct inode *inode; struct nfs4_xattr_entry *listxattr; }; struct nfs4_xattr_entry { struct kref ref; struct hlist_node hnode; struct list_head lru; struct list_head dispose; char *xattr_name; void *xattr_value; size_t xattr_size; struct nfs4_xattr_bucket *bucket; uint32_t flags; }; #define NFS4_XATTR_ENTRY_EXTVAL 0x0001 /* * LRU list of NFS inodes that have xattr caches. */ static struct list_lru nfs4_xattr_cache_lru; static struct list_lru nfs4_xattr_entry_lru; static struct list_lru nfs4_xattr_large_entry_lru; static struct kmem_cache *nfs4_xattr_cache_cachep; /* * Hashing helper functions. */ static void nfs4_xattr_hash_init(struct nfs4_xattr_cache *cache) { unsigned int i; for (i = 0; i < NFS4_XATTR_HASH_SIZE; i++) { INIT_HLIST_HEAD(&cache->buckets[i].hlist); spin_lock_init(&cache->buckets[i].lock); cache->buckets[i].cache = cache; cache->buckets[i].draining = false; } } /* * Locking order: * 1. inode i_lock or bucket lock * 2. list_lru lock (taken by list_lru_* functions) */ /* * Wrapper functions to add a cache entry to the right LRU. */ static bool nfs4_xattr_entry_lru_add(struct nfs4_xattr_entry *entry) { struct list_lru *lru; lru = (entry->flags & NFS4_XATTR_ENTRY_EXTVAL) ? &nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru; return list_lru_add_obj(lru, &entry->lru); } static bool nfs4_xattr_entry_lru_del(struct nfs4_xattr_entry *entry) { struct list_lru *lru; lru = (entry->flags & NFS4_XATTR_ENTRY_EXTVAL) ? 
&nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru; return list_lru_del_obj(lru, &entry->lru); } /* * This function allocates cache entries. They are the normal * extended attribute name/value pairs, but may also be a listxattr * cache. Those allocations use the same entry so that they can be * treated as one by the memory shrinker. * * xattr cache entries are allocated together with names. If the * value fits in to one page with the entry structure and the name, * it will also be part of the same allocation (kmalloc). This is * expected to be the vast majority of cases. Larger allocations * have a value pointer that is allocated separately by kvmalloc. * * Parameters: * * @name: Name of the extended attribute. NULL for listxattr cache * entry. * @value: Value of attribute, or listxattr cache. NULL if the * value is to be copied from pages instead. * @pages: Pages to copy the value from, if not NULL. Passed in to * make it easier to copy the value after an RPC, even if * the value will not be passed up to application (e.g. * for a 'query' getxattr with NULL buffer). * @len: Length of the value. Can be 0 for zero-length attributes. * @value and @pages will be NULL if @len is 0. 
*/ static struct nfs4_xattr_entry * nfs4_xattr_alloc_entry(const char *name, const void *value, struct page **pages, size_t len) { struct nfs4_xattr_entry *entry; void *valp; char *namep; size_t alloclen, slen; char *buf; uint32_t flags; BUILD_BUG_ON(sizeof(struct nfs4_xattr_entry) + XATTR_NAME_MAX + 1 > PAGE_SIZE); alloclen = sizeof(struct nfs4_xattr_entry); if (name != NULL) { slen = strlen(name) + 1; alloclen += slen; } else slen = 0; if (alloclen + len <= PAGE_SIZE) { alloclen += len; flags = 0; } else { flags = NFS4_XATTR_ENTRY_EXTVAL; } buf = kmalloc(alloclen, GFP_KERNEL); if (buf == NULL) return NULL; entry = (struct nfs4_xattr_entry *)buf; if (name != NULL) { namep = buf + sizeof(struct nfs4_xattr_entry); memcpy(namep, name, slen); } else { namep = NULL; } if (flags & NFS4_XATTR_ENTRY_EXTVAL) { valp = kvmalloc(len, GFP_KERNEL); if (valp == NULL) { kfree(buf); return NULL; } } else if (len != 0) { valp = buf + sizeof(struct nfs4_xattr_entry) + slen; } else valp = NULL; if (valp != NULL) { if (value != NULL) memcpy(valp, value, len); else _copy_from_pages(valp, pages, 0, len); } entry->flags = flags; entry->xattr_value = valp; kref_init(&entry->ref); entry->xattr_name = namep; entry->xattr_size = len; entry->bucket = NULL; INIT_LIST_HEAD(&entry->lru); INIT_LIST_HEAD(&entry->dispose); INIT_HLIST_NODE(&entry->hnode); return entry; } static void nfs4_xattr_free_entry(struct nfs4_xattr_entry *entry) { if (entry->flags & NFS4_XATTR_ENTRY_EXTVAL) kvfree(entry->xattr_value); kfree(entry); } static void nfs4_xattr_free_entry_cb(struct kref *kref) { struct nfs4_xattr_entry *entry; entry = container_of(kref, struct nfs4_xattr_entry, ref); if (WARN_ON(!list_empty(&entry->lru))) return; nfs4_xattr_free_entry(entry); } static void nfs4_xattr_free_cache_cb(struct kref *kref) { struct nfs4_xattr_cache *cache; int i; cache = container_of(kref, struct nfs4_xattr_cache, ref); for (i = 0; i < NFS4_XATTR_HASH_SIZE; i++) { if (WARN_ON(!hlist_empty(&cache->buckets[i].hlist))) 
return; cache->buckets[i].draining = false; } cache->listxattr = NULL; kmem_cache_free(nfs4_xattr_cache_cachep, cache); } static struct nfs4_xattr_cache * nfs4_xattr_alloc_cache(void) { struct nfs4_xattr_cache *cache; cache = kmem_cache_alloc(nfs4_xattr_cache_cachep, GFP_KERNEL); if (cache == NULL) return NULL; kref_init(&cache->ref); atomic_long_set(&cache->nent, 0); return cache; } /* * Set the listxattr cache, which is a special-cased cache entry. * The special value ERR_PTR(-ESTALE) is used to indicate that * the cache is being drained - this prevents a new listxattr * cache from being added to what is now a stale cache. */ static int nfs4_xattr_set_listcache(struct nfs4_xattr_cache *cache, struct nfs4_xattr_entry *new) { struct nfs4_xattr_entry *old; int ret = 1; spin_lock(&cache->listxattr_lock); old = cache->listxattr; if (old == ERR_PTR(-ESTALE)) { ret = 0; goto out; } cache->listxattr = new; if (new != NULL && new != ERR_PTR(-ESTALE)) nfs4_xattr_entry_lru_add(new); if (old != NULL) { nfs4_xattr_entry_lru_del(old); kref_put(&old->ref, nfs4_xattr_free_entry_cb); } out: spin_unlock(&cache->listxattr_lock); return ret; } /* * Unlink a cache from its parent inode, clearing out an invalid * cache. Must be called with i_lock held. */ static struct nfs4_xattr_cache * nfs4_xattr_cache_unlink(struct inode *inode) { struct nfs_inode *nfsi; struct nfs4_xattr_cache *oldcache; nfsi = NFS_I(inode); oldcache = nfsi->xattr_cache; if (oldcache != NULL) { list_lru_del_obj(&nfs4_xattr_cache_lru, &oldcache->lru); oldcache->inode = NULL; } nfsi->xattr_cache = NULL; nfsi->cache_validity &= ~NFS_INO_INVALID_XATTR; return oldcache; } /* * Discard a cache. Called by get_cache() if there was an old, * invalid cache. Can also be called from a shrinker callback. * * The cache is dead, it has already been unlinked from its inode, * and no longer appears on the cache LRU list. * * Mark all buckets as draining, so that no new entries are added. 
This * could still happen in the unlikely, but possible case that another * thread had grabbed a reference before it was unlinked from the inode, * and is still holding it for an add operation. * * Remove all entries from the LRU lists, so that there is no longer * any way to 'find' this cache. Then, remove the entries from the hash * table. * * At that point, the cache will remain empty and can be freed when the final * reference drops, which is very likely the kref_put at the end of * this function, or the one called immediately afterwards in the * shrinker callback. */ static void nfs4_xattr_discard_cache(struct nfs4_xattr_cache *cache) { unsigned int i; struct nfs4_xattr_entry *entry; struct nfs4_xattr_bucket *bucket; struct hlist_node *n; nfs4_xattr_set_listcache(cache, ERR_PTR(-ESTALE)); for (i = 0; i < NFS4_XATTR_HASH_SIZE; i++) { bucket = &cache->buckets[i]; spin_lock(&bucket->lock); bucket->draining = true; hlist_for_each_entry_safe(entry, n, &bucket->hlist, hnode) { nfs4_xattr_entry_lru_del(entry); hlist_del_init(&entry->hnode); kref_put(&entry->ref, nfs4_xattr_free_entry_cb); } spin_unlock(&bucket->lock); } atomic_long_set(&cache->nent, 0); kref_put(&cache->ref, nfs4_xattr_free_cache_cb); } /* * Get a referenced copy of the cache structure. Avoid doing allocs * while holding i_lock. Which means that we do some optimistic allocation, * and might have to free the result in rare cases. * * This function only checks the NFS_INO_INVALID_XATTR cache validity bit * and acts accordingly, replacing the cache when needed. For the read case * (!add), this means that the caller must make sure that the cache * is valid before caling this function. getxattr and listxattr call * revalidate_inode to do this. The attribute cache timeout (for the * non-delegated case) is expected to be dealt with in the revalidate * call. 
*/ static struct nfs4_xattr_cache * nfs4_xattr_get_cache(struct inode *inode, int add) { struct nfs_inode *nfsi; struct nfs4_xattr_cache *cache, *oldcache, *newcache; nfsi = NFS_I(inode); cache = oldcache = NULL; spin_lock(&inode->i_lock); if (nfsi->cache_validity & NFS_INO_INVALID_XATTR) oldcache = nfs4_xattr_cache_unlink(inode); else cache = nfsi->xattr_cache; if (cache != NULL) kref_get(&cache->ref); spin_unlock(&inode->i_lock); if (add && cache == NULL) { newcache = NULL; cache = nfs4_xattr_alloc_cache(); if (cache == NULL) goto out; spin_lock(&inode->i_lock); if (nfsi->cache_validity & NFS_INO_INVALID_XATTR) { /* * The cache was invalidated again. Give up, * since what we want to enter is now likely * outdated anyway. */ spin_unlock(&inode->i_lock); kref_put(&cache->ref, nfs4_xattr_free_cache_cb); cache = NULL; goto out; } /* * Check if someone beat us to it. */ if (nfsi->xattr_cache != NULL) { newcache = nfsi->xattr_cache; kref_get(&newcache->ref); } else { kref_get(&cache->ref); nfsi->xattr_cache = cache; cache->inode = inode; list_lru_add_obj(&nfs4_xattr_cache_lru, &cache->lru); } spin_unlock(&inode->i_lock); /* * If there was a race, throw away the cache we just * allocated, and use the new one allocated by someone * else. */ if (newcache != NULL) { kref_put(&cache->ref, nfs4_xattr_free_cache_cb); cache = newcache; } } out: /* * Discard the now orphaned old cache. 
*/ if (oldcache != NULL) nfs4_xattr_discard_cache(oldcache); return cache; } static inline struct nfs4_xattr_bucket * nfs4_xattr_hash_bucket(struct nfs4_xattr_cache *cache, const char *name) { return &cache->buckets[jhash(name, strlen(name), 0) & (ARRAY_SIZE(cache->buckets) - 1)]; } static struct nfs4_xattr_entry * nfs4_xattr_get_entry(struct nfs4_xattr_bucket *bucket, const char *name) { struct nfs4_xattr_entry *entry; entry = NULL; hlist_for_each_entry(entry, &bucket->hlist, hnode) { if (!strcmp(entry->xattr_name, name)) break; } return entry; } static int nfs4_xattr_hash_add(struct nfs4_xattr_cache *cache, struct nfs4_xattr_entry *entry) { struct nfs4_xattr_bucket *bucket; struct nfs4_xattr_entry *oldentry = NULL; int ret = 1; bucket = nfs4_xattr_hash_bucket(cache, entry->xattr_name); entry->bucket = bucket; spin_lock(&bucket->lock); if (bucket->draining) { ret = 0; goto out; } oldentry = nfs4_xattr_get_entry(bucket, entry->xattr_name); if (oldentry != NULL) { hlist_del_init(&oldentry->hnode); nfs4_xattr_entry_lru_del(oldentry); } else { atomic_long_inc(&cache->nent); } hlist_add_head(&entry->hnode, &bucket->hlist); nfs4_xattr_entry_lru_add(entry); out: spin_unlock(&bucket->lock); if (oldentry != NULL) kref_put(&oldentry->ref, nfs4_xattr_free_entry_cb); return ret; } static void nfs4_xattr_hash_remove(struct nfs4_xattr_cache *cache, const char *name) { struct nfs4_xattr_bucket *bucket; struct nfs4_xattr_entry *entry; bucket = nfs4_xattr_hash_bucket(cache, name); spin_lock(&bucket->lock); entry = nfs4_xattr_get_entry(bucket, name); if (entry != NULL) { hlist_del_init(&entry->hnode); nfs4_xattr_entry_lru_del(entry); atomic_long_dec(&cache->nent); } spin_unlock(&bucket->lock); if (entry != NULL) kref_put(&entry->ref, nfs4_xattr_free_entry_cb); } static struct nfs4_xattr_entry * nfs4_xattr_hash_find(struct nfs4_xattr_cache *cache, const char *name) { struct nfs4_xattr_bucket *bucket; struct nfs4_xattr_entry *entry; bucket = nfs4_xattr_hash_bucket(cache, name); 
spin_lock(&bucket->lock); entry = nfs4_xattr_get_entry(bucket, name); if (entry != NULL) kref_get(&entry->ref); spin_unlock(&bucket->lock); return entry; } /* * Entry point to retrieve an entry from the cache. */ ssize_t nfs4_xattr_cache_get(struct inode *inode, const char *name, char *buf, ssize_t buflen) { struct nfs4_xattr_cache *cache; struct nfs4_xattr_entry *entry; ssize_t ret; cache = nfs4_xattr_get_cache(inode, 0); if (cache == NULL) return -ENOENT; ret = 0; entry = nfs4_xattr_hash_find(cache, name); if (entry != NULL) { dprintk("%s: cache hit '%s', len %lu\n", __func__, entry->xattr_name, (unsigned long)entry->xattr_size); if (buflen == 0) { /* Length probe only */ ret = entry->xattr_size; } else if (buflen < entry->xattr_size) ret = -ERANGE; else { memcpy(buf, entry->xattr_value, entry->xattr_size); ret = entry->xattr_size; } kref_put(&entry->ref, nfs4_xattr_free_entry_cb); } else { dprintk("%s: cache miss '%s'\n", __func__, name); ret = -ENOENT; } kref_put(&cache->ref, nfs4_xattr_free_cache_cb); return ret; } /* * Retrieve a cached list of xattrs from the cache. */ ssize_t nfs4_xattr_cache_list(struct inode *inode, char *buf, ssize_t buflen) { struct nfs4_xattr_cache *cache; struct nfs4_xattr_entry *entry; ssize_t ret; cache = nfs4_xattr_get_cache(inode, 0); if (cache == NULL) return -ENOENT; spin_lock(&cache->listxattr_lock); entry = cache->listxattr; if (entry != NULL && entry != ERR_PTR(-ESTALE)) { if (buflen == 0) { /* Length probe only */ ret = entry->xattr_size; } else if (entry->xattr_size > buflen) ret = -ERANGE; else { memcpy(buf, entry->xattr_value, entry->xattr_size); ret = entry->xattr_size; } } else { ret = -ENOENT; } spin_unlock(&cache->listxattr_lock); kref_put(&cache->ref, nfs4_xattr_free_cache_cb); return ret; } /* * Add an xattr to the cache. * * This also invalidates the xattr list cache. 
*/ void nfs4_xattr_cache_add(struct inode *inode, const char *name, const char *buf, struct page **pages, ssize_t buflen) { struct nfs4_xattr_cache *cache; struct nfs4_xattr_entry *entry; dprintk("%s: add '%s' len %lu\n", __func__, name, (unsigned long)buflen); cache = nfs4_xattr_get_cache(inode, 1); if (cache == NULL) return; entry = nfs4_xattr_alloc_entry(name, buf, pages, buflen); if (entry == NULL) goto out; (void)nfs4_xattr_set_listcache(cache, NULL); if (!nfs4_xattr_hash_add(cache, entry)) kref_put(&entry->ref, nfs4_xattr_free_entry_cb); out: kref_put(&cache->ref, nfs4_xattr_free_cache_cb); } /* * Remove an xattr from the cache. * * This also invalidates the xattr list cache. */ void nfs4_xattr_cache_remove(struct inode *inode, const char *name) { struct nfs4_xattr_cache *cache; dprintk("%s: remove '%s'\n", __func__, name); cache = nfs4_xattr_get_cache(inode, 0); if (cache == NULL) return; (void)nfs4_xattr_set_listcache(cache, NULL); nfs4_xattr_hash_remove(cache, name); kref_put(&cache->ref, nfs4_xattr_free_cache_cb); } /* * Cache listxattr output, replacing any possible old one. */ void nfs4_xattr_cache_set_list(struct inode *inode, const char *buf, ssize_t buflen) { struct nfs4_xattr_cache *cache; struct nfs4_xattr_entry *entry; cache = nfs4_xattr_get_cache(inode, 1); if (cache == NULL) return; entry = nfs4_xattr_alloc_entry(NULL, buf, NULL, buflen); if (entry == NULL) goto out; /* * This is just there to be able to get to bucket->cache, * which is obviously the same for all buckets, so just * use bucket 0. */ entry->bucket = &cache->buckets[0]; if (!nfs4_xattr_set_listcache(cache, entry)) kref_put(&entry->ref, nfs4_xattr_free_entry_cb); out: kref_put(&cache->ref, nfs4_xattr_free_cache_cb); } /* * Zap the entire cache. Called when an inode is evicted. 
*/
void nfs4_xattr_cache_zap(struct inode *inode)
{
	struct nfs4_xattr_cache *oldcache;

	/* Detach the cache from the inode while i_lock is held ... */
	spin_lock(&inode->i_lock);
	oldcache = nfs4_xattr_cache_unlink(inode);
	spin_unlock(&inode->i_lock);

	/* ... and discard it only after the lock has been dropped. */
	if (oldcache)
		nfs4_xattr_discard_cache(oldcache);
}

/*
 * The entry LRU is shrunk more aggressively than the cache LRU,
 * by setting @seeks to 1.
 *
 * NOTE(review): in nfs4_xattr_cache_init() below only the large-entry
 * shrinker is registered with seeks == 1; the regular entry shrinker
 * uses DEFAULT_SEEKS.
 *
 * Cache structures are freed only when they've become empty, after
 * pruning all but one entry.
 */

/* Forward declarations of the shrinker callbacks defined further down. */
static unsigned long nfs4_xattr_cache_count(struct shrinker *shrink,
					    struct shrink_control *sc);
static unsigned long nfs4_xattr_entry_count(struct shrinker *shrink,
					    struct shrink_control *sc);
static unsigned long nfs4_xattr_cache_scan(struct shrinker *shrink,
					   struct shrink_control *sc);
static unsigned long nfs4_xattr_entry_scan(struct shrinker *shrink,
					   struct shrink_control *sc);

/* One shrinker per LRU; allocated in nfs4_xattr_cache_init(). */
static struct shrinker *nfs4_xattr_cache_shrinker;
static struct shrinker *nfs4_xattr_entry_shrinker;
static struct shrinker *nfs4_xattr_large_entry_shrinker;

/*
 * list_lru walk callback for the cache LRU.
 *
 * @item: the lru list_head embedded in a struct nfs4_xattr_cache
 * @lru:  the list the item currently sits on
 * @arg:  list_head of the caller's dispose list
 *
 * Returns LRU_SKIP when the cache still holds more than one entry or
 * when the inode lock cannot be taken without blocking; otherwise the
 * cache is unhooked from its inode, moved to the dispose list and
 * LRU_REMOVED is returned.
 */
static enum lru_status
cache_lru_isolate(struct list_head *item,
	struct list_lru_one *lru, void *arg)
{
	struct list_head *dispose = arg;
	struct inode *inode;
	struct nfs4_xattr_cache *cache = container_of(item,
	    struct nfs4_xattr_cache, lru);

	if (atomic_long_read(&cache->nent) > 1)
		return LRU_SKIP;

	/*
	 * If a cache structure is on the LRU list, we know that
	 * its inode is valid. Try to lock it to break the link.
	 * Since we're inverting the lock order here, only try.
	 */
	inode = cache->inode;

	if (!spin_trylock(&inode->i_lock))
		return LRU_SKIP;

	/* Extra reference; dropped again by nfs4_xattr_cache_scan(). */
	kref_get(&cache->ref);

	/* Break the inode <-> cache link in both directions. */
	cache->inode = NULL;
	NFS_I(inode)->xattr_cache = NULL;
	NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_XATTR;
	list_lru_isolate(lru, &cache->lru);

	spin_unlock(&inode->i_lock);

	list_add_tail(&cache->dispose, dispose);
	return LRU_REMOVED;
}

/*
 * scan_objects callback of the cache shrinker: isolate caches from
 * nfs4_xattr_cache_lru, then discard each isolated cache and drop the
 * reference that cache_lru_isolate() took on it.
 *
 * Returns the value reported by list_lru_shrink_walk().
 */
static unsigned long
nfs4_xattr_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
{
	LIST_HEAD(dispose);
	unsigned long freed;
	struct nfs4_xattr_cache *cache;

	freed = list_lru_shrink_walk(&nfs4_xattr_cache_lru, sc,
	    cache_lru_isolate, &dispose);
	while (!list_empty(&dispose)) {
		cache = list_first_entry(&dispose, struct nfs4_xattr_cache,
		    dispose);
		list_del_init(&cache->dispose);
		nfs4_xattr_discard_cache(cache);
		kref_put(&cache->ref, nfs4_xattr_free_cache_cb);
	}

	return freed;
}

/*
 * count_objects callback of the cache shrinker: the number of items on
 * nfs4_xattr_cache_lru, passed through vfs_pressure_ratio().
 */
static unsigned long
nfs4_xattr_cache_count(struct shrinker *shrink, struct shrink_control *sc)
{
	unsigned long count;

	count = list_lru_shrink_count(&nfs4_xattr_cache_lru, sc);
	return vfs_pressure_ratio(count);
}

/*
 * list_lru walk callback shared by both entry LRUs.
 *
 * @item: the lru list_head embedded in a struct nfs4_xattr_entry
 * @lru:  the list the item currently sits on
 * @arg:  list_head of the caller's dispose list
 *
 * Returns LRU_SKIP when the protecting lock cannot be taken without
 * blocking, LRU_REMOVED once the entry sits on the dispose list.
 */
static enum lru_status
entry_lru_isolate(struct list_head *item,
	struct list_lru_one *lru, void *arg)
{
	struct list_head *dispose = arg;
	struct nfs4_xattr_bucket *bucket;
	struct nfs4_xattr_cache *cache;
	struct nfs4_xattr_entry *entry = container_of(item,
	    struct nfs4_xattr_entry, lru);

	bucket = entry->bucket;
	cache = bucket->cache;

	/*
	 * Unhook the entry from its parent (either a cache bucket
	 * or a cache structure if it's a listxattr buf), so that
	 * it's no longer found. Then add it to the isolate list,
	 * to be freed later.
	 *
	 * In both cases, we're reverting lock order, so use
	 * trylock and skip the entry if we can't get the lock.
	 */
	if (entry->xattr_name != NULL) {
		/* Regular cache entry */
		if (!spin_trylock(&bucket->lock))
			return LRU_SKIP;

		kref_get(&entry->ref);

		hlist_del_init(&entry->hnode);
		atomic_long_dec(&cache->nent);
		list_lru_isolate(lru, &entry->lru);

		spin_unlock(&bucket->lock);
	} else {
		/* Listxattr cache entry */
		if (!spin_trylock(&cache->listxattr_lock))
			return LRU_SKIP;

		kref_get(&entry->ref);

		cache->listxattr = NULL;
		list_lru_isolate(lru, &entry->lru);

		spin_unlock(&cache->listxattr_lock);
	}

	list_add_tail(&entry->dispose, dispose);
	return LRU_REMOVED;
}

/*
 * scan_objects callback shared by the entry and large-entry shrinkers.
 * The LRU to walk is chosen by comparing @shrink against
 * nfs4_xattr_large_entry_shrinker.
 *
 * Returns the value reported by list_lru_shrink_walk().
 */
static unsigned long
nfs4_xattr_entry_scan(struct shrinker *shrink, struct shrink_control *sc)
{
	LIST_HEAD(dispose);
	unsigned long freed;
	struct nfs4_xattr_entry *entry;
	struct list_lru *lru;

	lru = (shrink == nfs4_xattr_large_entry_shrinker) ?
	    &nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru;

	freed = list_lru_shrink_walk(lru, sc, entry_lru_isolate, &dispose);

	while (!list_empty(&dispose)) {
		entry = list_first_entry(&dispose, struct nfs4_xattr_entry,
		    dispose);
		list_del_init(&entry->dispose);

		/*
		 * Drop two references: the one that we just grabbed
		 * in entry_lru_isolate, and the one that was set
		 * when the entry was first allocated.
		 */
		kref_put(&entry->ref, nfs4_xattr_free_entry_cb);
		kref_put(&entry->ref, nfs4_xattr_free_entry_cb);
	}

	return freed;
}

/*
 * count_objects callback shared by the entry and large-entry shrinkers;
 * selects the LRU the same way nfs4_xattr_entry_scan() does.
 */
static unsigned long
nfs4_xattr_entry_count(struct shrinker *shrink, struct shrink_control *sc)
{
	unsigned long count;
	struct list_lru *lru;

	lru = (shrink == nfs4_xattr_large_entry_shrinker) ?
	    &nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru;

	count = list_lru_shrink_count(lru, sc);
	return vfs_pressure_ratio(count);
}

/*
 * Constructor handed to kmem_cache_create() for nfs4_xattr_cache
 * objects: initialises the lock, the entry counter, the hash buckets
 * and the list heads of a cache structure.
 */
static void nfs4_xattr_cache_init_once(void *p)
{
	struct nfs4_xattr_cache *cache = p;

	spin_lock_init(&cache->listxattr_lock);
	atomic_long_set(&cache->nent, 0);
	nfs4_xattr_hash_init(cache);
	cache->listxattr = NULL;
	INIT_LIST_HEAD(&cache->lru);
	INIT_LIST_HEAD(&cache->dispose);
}

/* Signatures of the two shrinker callbacks taken by the helper below. */
typedef unsigned long (*count_objects_cb)(struct shrinker *s,
					  struct shrink_control *sc);
typedef unsigned long (*scan_objects_cb)(struct shrinker *s,
					 struct shrink_control *sc);

/*
 * Allocate a memcg-aware shrinker named @name, initialise @lru against
 * it, install the @count/@scan callbacks with the given @batch and
 * @seeks, and register it.
 *
 * Returns 0 on success, -ENOMEM if the shrinker cannot be allocated, or
 * the error from list_lru_init_memcg() (the shrinker is freed again in
 * that case).
 */
static int __init nfs4_xattr_shrinker_init(struct shrinker **shrinker,
					    struct list_lru *lru, const char *name,
					    count_objects_cb count,
					    scan_objects_cb scan, long batch, int seeks)
{
	int ret;

	*shrinker = shrinker_alloc(SHRINKER_MEMCG_AWARE, name);
	if (!*shrinker)
		return -ENOMEM;

	ret = list_lru_init_memcg(lru, *shrinker);
	if (ret) {
		shrinker_free(*shrinker);
		return ret;
	}

	(*shrinker)->count_objects = count;
	(*shrinker)->scan_objects = scan;
	(*shrinker)->batch = batch;
	(*shrinker)->seeks = seeks;

	shrinker_register(*shrinker);

	return ret;
}

/* Undo nfs4_xattr_shrinker_init(): free the shrinker, then its LRU. */
static void nfs4_xattr_shrinker_destroy(struct shrinker *shrinker,
					struct list_lru *lru)
{
	shrinker_free(shrinker);
	list_lru_destroy(lru);
}

/*
 * Create the slab cache for nfs4_xattr_cache objects and set up the
 * three shrinkers (cache, entry, large entry).
 *
 * Returns 0 on success or a negative errno; on failure everything that
 * was set up before the failing step is torn down again.
 */
int __init nfs4_xattr_cache_init(void)
{
	int ret = 0;

	nfs4_xattr_cache_cachep = kmem_cache_create("nfs4_xattr_cache_cache",
	    sizeof(struct nfs4_xattr_cache), 0,
	    (SLAB_RECLAIM_ACCOUNT),
	    nfs4_xattr_cache_init_once);
	if (nfs4_xattr_cache_cachep == NULL)
		return -ENOMEM;

	ret = nfs4_xattr_shrinker_init(&nfs4_xattr_cache_shrinker,
				       &nfs4_xattr_cache_lru, "nfs-xattr_cache",
				       nfs4_xattr_cache_count,
				       nfs4_xattr_cache_scan, 0, DEFAULT_SEEKS);
	if (ret)
		goto out1;

	ret = nfs4_xattr_shrinker_init(&nfs4_xattr_entry_shrinker,
				       &nfs4_xattr_entry_lru, "nfs-xattr_entry",
				       nfs4_xattr_entry_count,
				       nfs4_xattr_entry_scan, 512, DEFAULT_SEEKS);
	if (ret)
		goto out2;

	ret = nfs4_xattr_shrinker_init(&nfs4_xattr_large_entry_shrinker,
				       &nfs4_xattr_large_entry_lru,
				       "nfs-xattr_large_entry",
				       nfs4_xattr_entry_count,
				       nfs4_xattr_entry_scan, 512, 1);
	if (!ret)
		return 0;

	/* Third shrinker failed: unwind the second, then fall through. */
	nfs4_xattr_shrinker_destroy(nfs4_xattr_entry_shrinker,
				    &nfs4_xattr_entry_lru);
out2:
	nfs4_xattr_shrinker_destroy(nfs4_xattr_cache_shrinker,
				    &nfs4_xattr_cache_lru);
out1:
	kmem_cache_destroy(nfs4_xattr_cache_cachep);

	return ret;
}

/* Tear down the shrinkers in reverse order of creation, then the slab. */
void nfs4_xattr_cache_exit(void)
{
	nfs4_xattr_shrinker_destroy(nfs4_xattr_large_entry_shrinker,
				    &nfs4_xattr_large_entry_lru);
	nfs4_xattr_shrinker_destroy(nfs4_xattr_entry_shrinker,
				    &nfs4_xattr_entry_lru);
	nfs4_xattr_shrinker_destroy(nfs4_xattr_cache_shrinker,
				    &nfs4_xattr_cache_lru);
	kmem_cache_destroy(nfs4_xattr_cache_cachep);
}
6 1 2 4 1 2 2 2 1 6 6 6 6 4 2 6 6 3 1 2 2 1 1 1 1 2 2 6 6 6 15 1 1 1 1 1 7 3 5 4 1 9 3 1 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 
504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 
1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 // SPDX-License-Identifier: GPL-2.0-only /* * move_extents.c * * Copyright (C) 2011 Oracle. All rights reserved. */ #include <linux/fs.h> #include <linux/types.h> #include <linux/mount.h> #include <linux/swap.h> #include <cluster/masklog.h> #include "ocfs2.h" #include "ocfs2_ioctl.h" #include "alloc.h" #include "localalloc.h" #include "aops.h" #include "dlmglue.h" #include "extent_map.h" #include "inode.h" #include "journal.h" #include "suballoc.h" #include "uptodate.h" #include "super.h" #include "dir.h" #include "buffer_head_io.h" #include "sysfile.h" #include "refcounttree.h" #include "move_extents.h" struct ocfs2_move_extents_context { struct inode *inode; struct file *file; int auto_defrag; int partial; int credits; u32 new_phys_cpos; u32 clusters_moved; u64 refcount_loc; struct ocfs2_move_extents *range; struct ocfs2_extent_tree et; struct ocfs2_alloc_context *meta_ac; struct ocfs2_alloc_context *data_ac; struct ocfs2_cached_dealloc_ctxt dealloc; }; static int __ocfs2_move_extent(handle_t *handle, struct ocfs2_move_extents_context *context, u32 cpos, u32 len, u32 p_cpos, u32 new_p_cpos, int ext_flags) { int ret = 0, index; struct inode *inode = context->inode; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_extent_rec *rec, replace_rec; struct ocfs2_path *path = NULL; struct ocfs2_extent_list *el; u64 ino = ocfs2_metadata_cache_owner(context->et.et_ci); u64 old_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cpos); ret = ocfs2_duplicate_clusters_by_page(handle, inode, cpos, p_cpos, new_p_cpos, len); if (ret) { mlog_errno(ret); goto out; } memset(&replace_rec, 0, sizeof(replace_rec)); replace_rec.e_cpos = 
cpu_to_le32(cpos); replace_rec.e_leaf_clusters = cpu_to_le16(len); replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(inode->i_sb, new_p_cpos)); path = ocfs2_new_path_from_et(&context->et); if (!path) { ret = -ENOMEM; mlog_errno(ret); goto out; } ret = ocfs2_find_path(INODE_CACHE(inode), path, cpos); if (ret) { mlog_errno(ret); goto out; } el = path_leaf_el(path); index = ocfs2_search_extent_list(el, cpos); if (index == -1) { ret = ocfs2_error(inode->i_sb, "Inode %llu has an extent at cpos %u which can no longer be found\n", (unsigned long long)ino, cpos); goto out; } rec = &el->l_recs[index]; BUG_ON(ext_flags != rec->e_flags); /* * after moving/defraging to new location, the extent is not going * to be refcounted anymore. */ replace_rec.e_flags = ext_flags & ~OCFS2_EXT_REFCOUNTED; ret = ocfs2_split_extent(handle, &context->et, path, index, &replace_rec, context->meta_ac, &context->dealloc); if (ret) { mlog_errno(ret); goto out; } context->new_phys_cpos = new_p_cpos; /* * need I to append truncate log for old clusters? */ if (old_blkno) { if (ext_flags & OCFS2_EXT_REFCOUNTED) ret = ocfs2_decrease_refcount(inode, handle, ocfs2_blocks_to_clusters(osb->sb, old_blkno), len, context->meta_ac, &context->dealloc, 1); else ret = ocfs2_truncate_log_append(osb, handle, old_blkno, len); } ocfs2_update_inode_fsync_trans(handle, inode, 0); out: ocfs2_free_path(path); return ret; } /* * lock allocator, and reserve appropriate number of bits for * meta blocks. 
*/
static int ocfs2_lock_meta_allocator_move_extents(struct inode *inode,
					struct ocfs2_extent_tree *et,
					u32 clusters_to_move,
					u32 extents_to_split,
					struct ocfs2_alloc_context **meta_ac,
					int extra_blocks,
					int *credits)
{
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	/* worst-case number of extent records the move may need */
	unsigned int recs_wanted = clusters_to_move + 2 * extents_to_split;
	int free_recs;
	int ret;

	free_recs = ocfs2_num_free_extents(et);
	if (free_recs < 0) {
		ret = free_recs;
		mlog_errno(ret);
		goto bail;
	}

	/*
	 * Ask for the blocks needed to extend the tree when it has no free
	 * record at all, or (sparse-alloc volumes) fewer than we may need.
	 */
	if (free_recs == 0)
		extra_blocks += ocfs2_extend_meta_needed(et->et_root_el);
	else if (ocfs2_sparse_alloc(osb) && free_recs < recs_wanted)
		extra_blocks += ocfs2_extend_meta_needed(et->et_root_el);

	ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, meta_ac);
	if (ret) {
		mlog_errno(ret);
		goto bail;
	}

	*credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el);

	mlog(0, "reserve metadata_blocks: %d, data_clusters: %u, credits: %d\n",
	     extra_blocks, clusters_to_move, *credits);

bail:
	/* On any failure hand back no allocation context to the caller. */
	if (ret && *meta_ac) {
		ocfs2_free_alloc_context(*meta_ac);
		*meta_ac = NULL;
	}

	return ret;
}

/*
 * Using one journal handle to guarantee the data consistency in case
 * crash happens anywhere.
 *
 *  XXX: defrag can end up with finishing partial extent as requested,
 * due to not enough contiguous clusters can be found in allocator.
*/ static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context, u32 cpos, u32 phys_cpos, u32 *len, int ext_flags) { int ret, credits = 0, extra_blocks = 0, partial = context->partial; handle_t *handle; struct inode *inode = context->inode; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct inode *tl_inode = osb->osb_tl_inode; struct ocfs2_refcount_tree *ref_tree = NULL; u32 new_phys_cpos, new_len; u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); int need_free = 0; if ((ext_flags & OCFS2_EXT_REFCOUNTED) && *len) { BUG_ON(!ocfs2_is_refcount_inode(inode)); BUG_ON(!context->refcount_loc); ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1, &ref_tree, NULL); if (ret) { mlog_errno(ret); return ret; } ret = ocfs2_prepare_refcount_change_for_del(inode, context->refcount_loc, phys_blkno, *len, &credits, &extra_blocks); if (ret) { mlog_errno(ret); goto out; } } ret = ocfs2_lock_meta_allocator_move_extents(inode, &context->et, *len, 1, &context->meta_ac, extra_blocks, &credits); if (ret) { mlog_errno(ret); goto out; } /* * should be using allocation reservation strategy there? * * if (context->data_ac) * context->data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv; */ inode_lock(tl_inode); if (ocfs2_truncate_log_needs_flush(osb)) { ret = __ocfs2_flush_truncate_log(osb); if (ret < 0) { mlog_errno(ret); goto out_unlock_mutex; } } /* * Make sure ocfs2_reserve_cluster is called after * __ocfs2_flush_truncate_log, otherwise, dead lock may happen. * * If ocfs2_reserve_cluster is called * before __ocfs2_flush_truncate_log, dead lock on global bitmap * may happen. 
* */ ret = ocfs2_reserve_clusters(osb, *len, &context->data_ac); if (ret) { mlog_errno(ret); goto out_unlock_mutex; } handle = ocfs2_start_trans(osb, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); mlog_errno(ret); goto out_unlock_mutex; } ret = __ocfs2_claim_clusters(handle, context->data_ac, 1, *len, &new_phys_cpos, &new_len); if (ret) { mlog_errno(ret); goto out_commit; } /* * allowing partial extent moving is kind of 'pros and cons', it makes * whole defragmentation less likely to fail, on the contrary, the bad * thing is it may make the fs even more fragmented after moving, let * userspace make a good decision here. */ if (new_len != *len) { mlog(0, "len_claimed: %u, len: %u\n", new_len, *len); if (!partial) { context->range->me_flags &= ~OCFS2_MOVE_EXT_FL_COMPLETE; ret = -ENOSPC; need_free = 1; goto out_commit; } } mlog(0, "cpos: %u, phys_cpos: %u, new_phys_cpos: %u\n", cpos, phys_cpos, new_phys_cpos); ret = __ocfs2_move_extent(handle, context, cpos, new_len, phys_cpos, new_phys_cpos, ext_flags); if (ret) mlog_errno(ret); if (partial && (new_len != *len)) *len = new_len; /* * Here we should write the new page out first if we are * in write-back mode. 
*/ ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, *len); if (ret) mlog_errno(ret); out_commit: if (need_free && context->data_ac) { struct ocfs2_alloc_context *data_ac = context->data_ac; if (context->data_ac->ac_which == OCFS2_AC_USE_LOCAL) ocfs2_free_local_alloc_bits(osb, handle, data_ac, new_phys_cpos, new_len); else ocfs2_free_clusters(handle, data_ac->ac_inode, data_ac->ac_bh, ocfs2_clusters_to_blocks(osb->sb, new_phys_cpos), new_len); } ocfs2_commit_trans(osb, handle); out_unlock_mutex: inode_unlock(tl_inode); if (context->data_ac) { ocfs2_free_alloc_context(context->data_ac); context->data_ac = NULL; } if (context->meta_ac) { ocfs2_free_alloc_context(context->meta_ac); context->meta_ac = NULL; } out: if (ref_tree) ocfs2_unlock_refcount_tree(osb, ref_tree, 1); return ret; } /* * find the victim alloc group, where #blkno fits. */ static int ocfs2_find_victim_alloc_group(struct inode *inode, u64 vict_blkno, int type, int slot, int *vict_bit, struct buffer_head **ret_bh) { int ret, i, bits_per_unit = 0; u64 blkno; char namebuf[40]; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct buffer_head *ac_bh = NULL, *gd_bh = NULL; struct ocfs2_chain_list *cl; struct ocfs2_chain_rec *rec; struct ocfs2_dinode *ac_dinode; struct ocfs2_group_desc *bg; ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, slot); ret = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf, strlen(namebuf), &blkno); if (ret) { ret = -ENOENT; goto out; } ret = ocfs2_read_blocks_sync(osb, blkno, 1, &ac_bh); if (ret) { mlog_errno(ret); goto out; } ac_dinode = (struct ocfs2_dinode *)ac_bh->b_data; cl = &(ac_dinode->id2.i_chain); rec = &(cl->cl_recs[0]); if (type == GLOBAL_BITMAP_SYSTEM_INODE) bits_per_unit = osb->s_clustersize_bits - inode->i_sb->s_blocksize_bits; /* * 'vict_blkno' was out of the valid range. 
*/ if ((vict_blkno < le64_to_cpu(rec->c_blkno)) || (vict_blkno >= ((u64)le32_to_cpu(ac_dinode->id1.bitmap1.i_total) << bits_per_unit))) { ret = -EINVAL; goto out; } for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) { rec = &(cl->cl_recs[i]); if (!rec) continue; bg = NULL; do { if (!bg) blkno = le64_to_cpu(rec->c_blkno); else blkno = le64_to_cpu(bg->bg_next_group); if (gd_bh) { brelse(gd_bh); gd_bh = NULL; } ret = ocfs2_read_blocks_sync(osb, blkno, 1, &gd_bh); if (ret) { mlog_errno(ret); goto out; } bg = (struct ocfs2_group_desc *)gd_bh->b_data; if (vict_blkno < (le64_to_cpu(bg->bg_blkno) + (le16_to_cpu(bg->bg_bits) << bits_per_unit))) { *ret_bh = gd_bh; *vict_bit = (vict_blkno - blkno) >> bits_per_unit; mlog(0, "find the victim group: #%llu, " "total_bits: %u, vict_bit: %u\n", blkno, le16_to_cpu(bg->bg_bits), *vict_bit); goto out; } } while (le64_to_cpu(bg->bg_next_group)); } ret = -EINVAL; out: brelse(ac_bh); /* * caller has to release the gd_bh properly. */ return ret; } /* * XXX: helper to validate and adjust moving goal. */ static int ocfs2_validate_and_adjust_move_goal(struct inode *inode, struct ocfs2_move_extents *range) { int ret, goal_bit = 0; struct buffer_head *gd_bh = NULL; struct ocfs2_group_desc *bg; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); int c_to_b = 1 << (osb->s_clustersize_bits - inode->i_sb->s_blocksize_bits); /* * make goal become cluster aligned. */ range->me_goal = ocfs2_block_to_cluster_start(inode->i_sb, range->me_goal); /* * validate goal sits within global_bitmap, and return the victim * group desc */ ret = ocfs2_find_victim_alloc_group(inode, range->me_goal, GLOBAL_BITMAP_SYSTEM_INODE, OCFS2_INVALID_SLOT, &goal_bit, &gd_bh); if (ret) goto out; bg = (struct ocfs2_group_desc *)gd_bh->b_data; /* * moving goal is not allowed to start with a group desc blok(#0 blk) * let's compromise to the latter cluster. */ if (range->me_goal == le64_to_cpu(bg->bg_blkno)) range->me_goal += c_to_b; /* * movement is not gonna cross two groups. 
*/ if ((le16_to_cpu(bg->bg_bits) - goal_bit) * osb->s_clustersize < range->me_len) { ret = -EINVAL; goto out; } /* * more exact validations/adjustments will be performed later during * moving operation for each extent range. */ mlog(0, "extents get ready to be moved to #%llu block\n", range->me_goal); out: brelse(gd_bh); return ret; } static void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh, int *goal_bit, u32 move_len, u32 max_hop, u32 *phys_cpos) { int i, used, last_free_bits = 0, base_bit = *goal_bit; struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; u32 base_cpos = ocfs2_blocks_to_clusters(inode->i_sb, le64_to_cpu(gd->bg_blkno)); for (i = base_bit; i < le16_to_cpu(gd->bg_bits); i++) { used = ocfs2_test_bit(i, (unsigned long *)gd->bg_bitmap); if (used) { /* * we even tried searching the free chunk by jumping * a 'max_hop' distance, but still failed. */ if ((i - base_bit) > max_hop) { *phys_cpos = 0; break; } if (last_free_bits) last_free_bits = 0; continue; } else last_free_bits++; if (last_free_bits == move_len) { i -= move_len; *goal_bit = i; *phys_cpos = base_cpos + i; break; } } mlog(0, "found phys_cpos: %u to fit the wanted moving.\n", *phys_cpos); } static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, u32 cpos, u32 phys_cpos, u32 *new_phys_cpos, u32 len, int ext_flags) { int ret, credits = 0, extra_blocks = 0, goal_bit = 0; handle_t *handle; struct inode *inode = context->inode; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct inode *tl_inode = osb->osb_tl_inode; struct inode *gb_inode = NULL; struct buffer_head *gb_bh = NULL; struct buffer_head *gd_bh = NULL; struct ocfs2_group_desc *gd; struct ocfs2_refcount_tree *ref_tree = NULL; u32 move_max_hop = ocfs2_blocks_to_clusters(inode->i_sb, context->range->me_threshold); u64 phys_blkno, new_phys_blkno; phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); if ((ext_flags & OCFS2_EXT_REFCOUNTED) && len) { 
BUG_ON(!ocfs2_is_refcount_inode(inode)); BUG_ON(!context->refcount_loc); ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1, &ref_tree, NULL); if (ret) { mlog_errno(ret); return ret; } ret = ocfs2_prepare_refcount_change_for_del(inode, context->refcount_loc, phys_blkno, len, &credits, &extra_blocks); if (ret) { mlog_errno(ret); goto out; } } ret = ocfs2_lock_meta_allocator_move_extents(inode, &context->et, len, 1, &context->meta_ac, extra_blocks, &credits); if (ret) { mlog_errno(ret); goto out; } /* * need to count 2 extra credits for global_bitmap inode and * group descriptor. */ credits += OCFS2_INODE_UPDATE_CREDITS + 1; /* * ocfs2_move_extent() didn't reserve any clusters in lock_allocators() * logic, while we still need to lock the global_bitmap. */ gb_inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE, OCFS2_INVALID_SLOT); if (!gb_inode) { mlog(ML_ERROR, "unable to get global_bitmap inode\n"); ret = -EIO; goto out; } inode_lock(gb_inode); ret = ocfs2_inode_lock(gb_inode, &gb_bh, 1); if (ret) { mlog_errno(ret); goto out_unlock_gb_mutex; } inode_lock(tl_inode); handle = ocfs2_start_trans(osb, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); mlog_errno(ret); goto out_unlock_tl_inode; } new_phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *new_phys_cpos); ret = ocfs2_find_victim_alloc_group(inode, new_phys_blkno, GLOBAL_BITMAP_SYSTEM_INODE, OCFS2_INVALID_SLOT, &goal_bit, &gd_bh); if (ret) { mlog_errno(ret); goto out_commit; } /* * probe the victim cluster group to find a proper * region to fit wanted movement, it even will perform * a best-effort attempt by compromising to a threshold * around the goal. 
*/ ocfs2_probe_alloc_group(inode, gd_bh, &goal_bit, len, move_max_hop, new_phys_cpos); if (!*new_phys_cpos) { ret = -ENOSPC; goto out_commit; } ret = __ocfs2_move_extent(handle, context, cpos, len, phys_cpos, *new_phys_cpos, ext_flags); if (ret) { mlog_errno(ret); goto out_commit; } gd = (struct ocfs2_group_desc *)gd_bh->b_data; ret = ocfs2_alloc_dinode_update_counts(gb_inode, handle, gb_bh, len, le16_to_cpu(gd->bg_chain)); if (ret) { mlog_errno(ret); goto out_commit; } ret = ocfs2_block_group_set_bits(handle, gb_inode, gd, gd_bh, goal_bit, len, 0, 0); if (ret) { ocfs2_rollback_alloc_dinode_counts(gb_inode, gb_bh, len, le16_to_cpu(gd->bg_chain)); mlog_errno(ret); } /* * Here we should write the new page out first if we are * in write-back mode. */ ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, len); if (ret) mlog_errno(ret); out_commit: ocfs2_commit_trans(osb, handle); brelse(gd_bh); out_unlock_tl_inode: inode_unlock(tl_inode); ocfs2_inode_unlock(gb_inode, 1); out_unlock_gb_mutex: inode_unlock(gb_inode); brelse(gb_bh); iput(gb_inode); out: if (context->meta_ac) { ocfs2_free_alloc_context(context->meta_ac); context->meta_ac = NULL; } if (ref_tree) ocfs2_unlock_refcount_tree(osb, ref_tree, 1); return ret; } /* * Helper to calculate the defraging length in one run according to threshold. */ static void ocfs2_calc_extent_defrag_len(u32 *alloc_size, u32 *len_defraged, u32 threshold, int *skip) { if ((*alloc_size + *len_defraged) < threshold) { /* * proceed defragmentation until we meet the thresh */ *len_defraged += *alloc_size; } else if (*len_defraged == 0) { /* * XXX: skip a large extent. */ *skip = 1; } else { /* * split this extent to coalesce with former pieces as * to reach the threshold. * * we're done here with one cycle of defragmentation * in a size of 'thresh', resetting 'len_defraged' * forces a new defragmentation. 
*/ *alloc_size = threshold - *len_defraged; *len_defraged = 0; } } static int __ocfs2_move_extents_range(struct buffer_head *di_bh, struct ocfs2_move_extents_context *context) { int ret = 0, flags, do_defrag, skip = 0; u32 cpos, phys_cpos, move_start, len_to_move, alloc_size; u32 len_defraged = 0, defrag_thresh = 0, new_phys_cpos = 0; struct inode *inode = context->inode; struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; struct ocfs2_move_extents *range = context->range; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if ((i_size_read(inode) == 0) || (range->me_len == 0)) return 0; if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) return 0; context->refcount_loc = le64_to_cpu(di->i_refcount_loc); ocfs2_init_dinode_extent_tree(&context->et, INODE_CACHE(inode), di_bh); ocfs2_init_dealloc_ctxt(&context->dealloc); /* * TO-DO XXX: * * - xattr extents. */ do_defrag = context->auto_defrag; /* * extents moving happens in unit of clusters, for the sake * of simplicity, we may ignore two clusters where 'byte_start' * and 'byte_start + len' were within. 
*/ move_start = ocfs2_clusters_for_bytes(osb->sb, range->me_start); len_to_move = (range->me_start + range->me_len) >> osb->s_clustersize_bits; if (len_to_move >= move_start) len_to_move -= move_start; else len_to_move = 0; if (do_defrag) { defrag_thresh = range->me_threshold >> osb->s_clustersize_bits; if (defrag_thresh <= 1) goto done; } else new_phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, range->me_goal); mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u, " "thresh: %u\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, (unsigned long long)range->me_start, (unsigned long long)range->me_len, move_start, len_to_move, defrag_thresh); cpos = move_start; while (len_to_move) { ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &alloc_size, &flags); if (ret) { mlog_errno(ret); goto out; } if (alloc_size > len_to_move) alloc_size = len_to_move; /* * XXX: how to deal with a hole: * * - skip the hole of course * - force a new defragmentation */ if (!phys_cpos) { if (do_defrag) len_defraged = 0; goto next; } if (do_defrag) { ocfs2_calc_extent_defrag_len(&alloc_size, &len_defraged, defrag_thresh, &skip); /* * skip large extents */ if (skip) { skip = 0; goto next; } mlog(0, "#Defrag: cpos: %u, phys_cpos: %u, " "alloc_size: %u, len_defraged: %u\n", cpos, phys_cpos, alloc_size, len_defraged); ret = ocfs2_defrag_extent(context, cpos, phys_cpos, &alloc_size, flags); } else { ret = ocfs2_move_extent(context, cpos, phys_cpos, &new_phys_cpos, alloc_size, flags); new_phys_cpos += alloc_size; } if (ret < 0) { mlog_errno(ret); goto out; } context->clusters_moved += alloc_size; next: cpos += alloc_size; len_to_move -= alloc_size; } done: range->me_flags |= OCFS2_MOVE_EXT_FL_COMPLETE; out: range->me_moved_len = ocfs2_clusters_to_bytes(osb->sb, context->clusters_moved); range->me_new_offset = ocfs2_clusters_to_bytes(osb->sb, context->new_phys_cpos); ocfs2_schedule_truncate_log_flush(osb, 1); ocfs2_run_deallocs(osb, &context->dealloc); return ret; } static int 
ocfs2_move_extents(struct ocfs2_move_extents_context *context) { int status; handle_t *handle; struct inode *inode = context->inode; struct ocfs2_dinode *di; struct buffer_head *di_bh = NULL; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) return -EROFS; inode_lock(inode); /* * This prevents concurrent writes from other nodes */ status = ocfs2_rw_lock(inode, 1); if (status) { mlog_errno(status); goto out; } status = ocfs2_inode_lock(inode, &di_bh, 1); if (status) { mlog_errno(status); goto out_rw_unlock; } /* * remember ip_xattr_sem also needs to be held if necessary */ down_write(&OCFS2_I(inode)->ip_alloc_sem); status = __ocfs2_move_extents_range(di_bh, context); up_write(&OCFS2_I(inode)->ip_alloc_sem); if (status) { mlog_errno(status); goto out_inode_unlock; } /* * We update ctime for these changes */ handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); if (IS_ERR(handle)) { status = PTR_ERR(handle); mlog_errno(status); goto out_inode_unlock; } status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status) { mlog_errno(status); goto out_commit; } di = (struct ocfs2_dinode *)di_bh->b_data; inode_set_ctime_current(inode); di->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode)); di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode)); ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_journal_dirty(handle, di_bh); out_commit: ocfs2_commit_trans(osb, handle); out_inode_unlock: brelse(di_bh); ocfs2_inode_unlock(inode, 1); out_rw_unlock: ocfs2_rw_unlock(inode, 1); out: inode_unlock(inode); return status; } int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp) { int status; struct inode *inode = file_inode(filp); struct ocfs2_move_extents range; struct ocfs2_move_extents_context *context; if (!argp) return -EINVAL; status = mnt_want_write_file(filp); if (status) return status; if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & 
FMODE_WRITE)) { status = -EPERM; goto out_drop; } if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) { status = -EPERM; goto out_drop; } context = kzalloc(sizeof(struct ocfs2_move_extents_context), GFP_NOFS); if (!context) { status = -ENOMEM; mlog_errno(status); goto out_drop; } context->inode = inode; context->file = filp; if (copy_from_user(&range, argp, sizeof(range))) { status = -EFAULT; goto out_free; } if (range.me_start > i_size_read(inode)) { status = -EINVAL; goto out_free; } if (range.me_start + range.me_len > i_size_read(inode)) range.me_len = i_size_read(inode) - range.me_start; context->range = &range; /* * ok, the default threshold for the defragmentation * is 1M, since our maximum clustersize was 1M also. * any thought? */ if (!range.me_threshold) range.me_threshold = 1024 * 1024; if (range.me_threshold > i_size_read(inode)) range.me_threshold = i_size_read(inode); if (range.me_flags & OCFS2_MOVE_EXT_FL_AUTO_DEFRAG) { context->auto_defrag = 1; if (range.me_flags & OCFS2_MOVE_EXT_FL_PART_DEFRAG) context->partial = 1; } else { /* * first best-effort attempt to validate and adjust the goal * (physical address in block), while it can't guarantee later * operation can succeed all the time since global_bitmap may * change a bit over time. */ status = ocfs2_validate_and_adjust_move_goal(inode, &range); if (status) goto out_copy; } status = ocfs2_move_extents(context); if (status) mlog_errno(status); out_copy: /* * movement/defragmentation may end up being partially completed, * that's the reason why we need to return userspace the finished * length and new_offset even if failure happens somewhere. */ if (copy_to_user(argp, &range, sizeof(range))) status = -EFAULT; out_free: kfree(context); out_drop: mnt_drop_write_file(filp); return status; }
74 73 73 74 74 74 21 20 23 74 88 25 2 23 2 23 25 25 25 23 23 21 21 21 21 21 4 4 4 4 4 4 67 66 66 21 21 19 70 4 4 73 73 74 74 23 2 23 23 23 75 2 76 75 2 76 74 2 76 76 76 76 74 9 9 24 9 25 27 27 5 17 9 13 13 1 4 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 
475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 
975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 
1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 
1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 
2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 
2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. */ #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/completion.h> #include <linux/bug.h> #include <linux/list.h> #include <crypto/hash.h> #include "messages.h" #include "ctree.h" #include "discard.h" #include "disk-io.h" #include "send.h" #include "transaction.h" #include "sysfs.h" #include "volumes.h" #include "space-info.h" #include "block-group.h" #include "qgroup.h" #include "misc.h" #include "fs.h" #include "accessors.h" /* * Structure name Path * -------------------------------------------------------------------------- * btrfs_supported_static_feature_attrs /sys/fs/btrfs/features * btrfs_supported_feature_attrs /sys/fs/btrfs/features and * /sys/fs/btrfs/<uuid>/features * btrfs_attrs /sys/fs/btrfs/<uuid> * devid_attrs /sys/fs/btrfs/<uuid>/devinfo/<devid> * allocation_attrs /sys/fs/btrfs/<uuid>/allocation * qgroup_attrs /sys/fs/btrfs/<uuid>/qgroups/<level>_<qgroupid> * space_info_attrs /sys/fs/btrfs/<uuid>/allocation/<bg-type> * raid_attrs /sys/fs/btrfs/<uuid>/allocation/<bg-type>/<bg-profile> * discard_attrs /sys/fs/btrfs/<uuid>/discard * * When built with BTRFS_CONFIG_DEBUG: * * btrfs_debug_feature_attrs /sys/fs/btrfs/debug * 
btrfs_debug_mount_attrs /sys/fs/btrfs/<uuid>/debug */

/*
 * A kobj_attribute extended with the feature set and the bit inside that set
 * which the sysfs file reports on.
 */
struct btrfs_feature_attr {
	struct kobj_attribute kobj_attr;
	enum btrfs_feature_set feature_set;
	u64 feature_bit;
};

/* For raid type sysfs entries */
struct raid_kobject {
	u64 flags;
	struct kobject kobj;
};

/* Initializer for a struct kobj_attribute: file name, mode and callbacks. */
#define __INIT_KOBJ_ATTR(_name, _mode, _show, _store)			\
{									\
	.attr	= { .name = __stringify(_name), .mode = _mode },	\
	.show	= _show,						\
	.store	= _store,						\
}

/* Define btrfs_attr_<prefix>_<name> as a write-only (0200) attribute. */
#define BTRFS_ATTR_W(_prefix, _name, _store)				\
	static struct kobj_attribute btrfs_attr_##_prefix##_##_name =	\
			__INIT_KOBJ_ATTR(_name, 0200, NULL, _store)

/* Define btrfs_attr_<prefix>_<name> as a read-write (0644) attribute. */
#define BTRFS_ATTR_RW(_prefix, _name, _show, _store)			\
	static struct kobj_attribute btrfs_attr_##_prefix##_##_name =	\
			__INIT_KOBJ_ATTR(_name, 0644, _show, _store)

/* Define btrfs_attr_<prefix>_<name> as a read-only (0444) attribute. */
#define BTRFS_ATTR(_prefix, _name, _show)				\
	static struct kobj_attribute btrfs_attr_##_prefix##_##_name =	\
			__INIT_KOBJ_ATTR(_name, 0444, _show, NULL)

/* Pointer to the struct attribute of an attribute defined by BTRFS_ATTR*(). */
#define BTRFS_ATTR_PTR(_prefix, _name)					\
	(&btrfs_attr_##_prefix##_##_name.attr)

/*
 * Define btrfs_attr_features_<name>, a feature attribute bound to
 * <_feature_prefix>_<_feature_bit> in the given feature set.  The default
 * mode is read-only (S_IRUGO); both the show and store callbacks are wired up.
 */
#define BTRFS_FEAT_ATTR(_name, _feature_set, _feature_prefix, _feature_bit)  \
static struct btrfs_feature_attr btrfs_attr_features_##_name = {	     \
	.kobj_attr = __INIT_KOBJ_ATTR(_name, S_IRUGO,			     \
				      btrfs_feature_attr_show,		     \
				      btrfs_feature_attr_store),	     \
	.feature_set	= _feature_set,					     \
	.feature_bit	= _feature_prefix ##_## _feature_bit,		     \
}

/* Pointer to the struct attribute of a feature defined by BTRFS_FEAT_ATTR(). */
#define BTRFS_FEAT_ATTR_PTR(_name)					     \
	(&btrfs_attr_features_##_name.kobj_attr.attr)

/* Per-feature-set shorthands for BTRFS_FEAT_ATTR(). */
#define BTRFS_FEAT_ATTR_COMPAT(name, feature) \
	BTRFS_FEAT_ATTR(name, FEAT_COMPAT, BTRFS_FEATURE_COMPAT, feature)
#define BTRFS_FEAT_ATTR_COMPAT_RO(name, feature) \
	BTRFS_FEAT_ATTR(name, FEAT_COMPAT_RO, BTRFS_FEATURE_COMPAT_RO, feature)
#define BTRFS_FEAT_ATTR_INCOMPAT(name, feature) \
	BTRFS_FEAT_ATTR(name, FEAT_INCOMPAT, BTRFS_FEATURE_INCOMPAT, feature)

/* Helpers defined later in the file. */
static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj);
static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj);
static struct kobject *get_btrfs_kobj(struct kobject *kobj);

/* Map an embedded kobj_attribute back to its btrfs_feature_attr. */
static struct btrfs_feature_attr *to_btrfs_feature_attr(struct kobj_attribute *a)
{
	return container_of(a, struct btrfs_feature_attr, kobj_attr);
}

/* Map an embedded struct attribute back to its kobj_attribute. */
static struct kobj_attribute *attr_to_btrfs_attr(struct attribute *attr)
{
	return container_of(attr, struct kobj_attribute, attr);
}

/* Map an embedded struct attribute back to its btrfs_feature_attr. */
static struct btrfs_feature_attr *attr_to_btrfs_feature_attr(
		struct attribute *attr)
{
	return to_btrfs_feature_attr(attr_to_btrfs_attr(attr));
}

/*
 * Read the flags of one feature set from fs_info->super_copy.  Any value of
 * @set other than FEAT_COMPAT and FEAT_COMPAT_RO reads the incompat flags.
 */
static u64 get_features(struct btrfs_fs_info *fs_info,
			enum btrfs_feature_set set)
{
	struct btrfs_super_block *disk_super = fs_info->super_copy;
	if (set == FEAT_COMPAT)
		return btrfs_super_compat_flags(disk_super);
	else if (set == FEAT_COMPAT_RO)
		return btrfs_super_compat_ro_flags(disk_super);
	else
		return btrfs_super_incompat_flags(disk_super);
}

/*
 * Write the flags of one feature set into fs_info->super_copy.  Any value of
 * @set other than FEAT_COMPAT and FEAT_COMPAT_RO writes the incompat flags.
 */
static void set_features(struct btrfs_fs_info *fs_info,
			 enum btrfs_feature_set set, u64 features)
{
	struct btrfs_super_block *disk_super = fs_info->super_copy;
	if (set == FEAT_COMPAT)
		btrfs_set_super_compat_flags(disk_super, features);
	else if (set == FEAT_COMPAT_RO)
		btrfs_set_super_compat_ro_flags(disk_super, features);
	else
		btrfs_set_super_incompat_flags(disk_super, features);
}

/*
 * Report whether a feature bit is listed in the SAFE_SET / SAFE_CLEAR masks
 * of its feature set.
 *
 * Returns a bitmask: 1 if the bit is in the SAFE_SET mask, 2 if it is in the
 * SAFE_CLEAR mask (3 for both).  An unknown feature set logs a warning and
 * returns 0.
 */
static int can_modify_feature(struct btrfs_feature_attr *fa)
{
	int val = 0;
	u64 set, clear;
	switch (fa->feature_set) {
	case FEAT_COMPAT:
		set = BTRFS_FEATURE_COMPAT_SAFE_SET;
		clear = BTRFS_FEATURE_COMPAT_SAFE_CLEAR;
		break;
	case FEAT_COMPAT_RO:
		set = BTRFS_FEATURE_COMPAT_RO_SAFE_SET;
		clear = BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR;
		break;
	case FEAT_INCOMPAT:
		set = BTRFS_FEATURE_INCOMPAT_SAFE_SET;
		clear = BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR;
		break;
	default:
		pr_warn("btrfs: sysfs: unknown feature set %d\n",
				fa->feature_set);
		return 0;
	}

	if (set & fa->feature_bit)
		val |= 1;
	if (clear & fa->feature_bit)
		val |= 2;

	return val;
}

/*
 * Show callback shared by all feature attributes.
 *
 * When to_fs_info() yields a filesystem, print 1 if the feature bit is set in
 * its super block copy and 0 otherwise.  Without a filesystem, print the
 * can_modify_feature() mask instead.
 * NOTE(review): the no-filesystem case is presumably the global
 * /sys/fs/btrfs/features directory - confirm against to_fs_info().
 */
static ssize_t btrfs_feature_attr_show(struct kobject *kobj,
				       struct kobj_attribute *a, char *buf)
{
	int val = 0;
	struct btrfs_fs_info *fs_info = to_fs_info(kobj);
	struct btrfs_feature_attr *fa = to_btrfs_feature_attr(a);
	if (fs_info) {
		u64 features = get_features(fs_info, fa->feature_set);
		if (features & fa->feature_bit)
			val = 1;
	} else
		val = can_modify_feature(fa);

	return sysfs_emit(buf, "%d\n", val);
}

/*
 * Store callback shared by all feature attributes: set (non-zero input) or
 * clear (zero input) one feature bit in the super block copy.
 *
 * Returns @count on success or when the bit already has the requested state,
 * -EPERM when there is no filesystem for @kobj or the change is not listed in
 * the SAFE_SET / SAFE_CLEAR mask, -EROFS on a read-only super block, or the
 * kstrtoul() error for unparsable input.
 */
static ssize_t btrfs_feature_attr_store(struct kobject *kobj,
					struct kobj_attribute *a,
					const char *buf, size_t count)
{
	struct btrfs_fs_info *fs_info;
	struct btrfs_feature_attr *fa = to_btrfs_feature_attr(a);
	u64 features, set, clear;
	unsigned long val;
	int ret;

	fs_info = to_fs_info(kobj);
	if (!fs_info)
		return -EPERM;

	if (sb_rdonly(fs_info->sb))
		return -EROFS;

	/* Base 0: accepts decimal, octal (0...) and hex (0x...) input. */
	ret = kstrtoul(skip_spaces(buf), 0, &val);
	if (ret)
		return ret;

	/* Pick the masks of bits that may be changed on a mounted fs. */
	if (fa->feature_set == FEAT_COMPAT) {
		set = BTRFS_FEATURE_COMPAT_SAFE_SET;
		clear = BTRFS_FEATURE_COMPAT_SAFE_CLEAR;
	} else if (fa->feature_set == FEAT_COMPAT_RO) {
		set = BTRFS_FEATURE_COMPAT_RO_SAFE_SET;
		clear = BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR;
	} else {
		set = BTRFS_FEATURE_INCOMPAT_SAFE_SET;
		clear = BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR;
	}

	features = get_features(fs_info, fa->feature_set);

	/* Nothing to do */
	if ((val && (features & fa->feature_bit)) ||
	    (!val && !(features & fa->feature_bit)))
		return count;

	if ((val && !(set & fa->feature_bit)) ||
	    (!val && !(clear & fa->feature_bit))) {
		btrfs_info(fs_info,
			"%sabling feature %s on mounted fs is not supported.",
			val ? "En" : "Dis", fa->kobj_attr.attr.name);
		return -EPERM;
	}

	btrfs_info(fs_info, "%s %s feature flag",
		   val ? "Setting" : "Clearing", fa->kobj_attr.attr.name);

	/* Re-read the flags under super_lock so the update is atomic. */
	spin_lock(&fs_info->super_lock);
	features = get_features(fs_info, fa->feature_set);
	if (val)
		features |= fa->feature_bit;
	else
		features &= ~fa->feature_bit;
	set_features(fs_info, fa->feature_set, features);
	spin_unlock(&fs_info->super_lock);

	/*
	 * We don't want to do full transaction commit from inside sysfs
	 */
	set_bit(BTRFS_FS_NEED_TRANS_COMMIT, &fs_info->flags);
	wake_up_process(fs_info->transaction_kthread);

	return count;
}

/*
 * is_visible callback of the "features" attribute group.
 *
 * Without a filesystem for @kobj the attribute keeps its default mode.  With
 * one, a feature that can_modify_feature() reports as changeable additionally
 * becomes owner-writable, and a feature that is neither changeable nor set in
 * the super block copy is hidden (mode 0).
 */
static umode_t btrfs_feature_visible(struct kobject *kobj,
				     struct attribute *attr, int unused)
{
	struct btrfs_fs_info *fs_info = to_fs_info(kobj);
	umode_t mode = attr->mode;

	if (fs_info) {
		struct btrfs_feature_attr *fa;
		u64 features;

		fa = attr_to_btrfs_feature_attr(attr);
		features = get_features(fs_info, fa->feature_set);

		if (can_modify_feature(fa))
			mode |= S_IWUSR;
		else if (!(features & fa->feature_bit))
			mode = 0;
	}

	return mode;
}

/* One attribute object per feature bit exported through sysfs. */
BTRFS_FEAT_ATTR_INCOMPAT(default_subvol, DEFAULT_SUBVOL);
BTRFS_FEAT_ATTR_INCOMPAT(mixed_groups, MIXED_GROUPS);
BTRFS_FEAT_ATTR_INCOMPAT(compress_lzo, COMPRESS_LZO);
BTRFS_FEAT_ATTR_INCOMPAT(compress_zstd, COMPRESS_ZSTD);
BTRFS_FEAT_ATTR_INCOMPAT(extended_iref, EXTENDED_IREF);
BTRFS_FEAT_ATTR_INCOMPAT(raid56, RAID56);
BTRFS_FEAT_ATTR_INCOMPAT(skinny_metadata, SKINNY_METADATA);
BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES);
BTRFS_FEAT_ATTR_INCOMPAT(metadata_uuid, METADATA_UUID);
BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE);
BTRFS_FEAT_ATTR_COMPAT_RO(block_group_tree, BLOCK_GROUP_TREE);
BTRFS_FEAT_ATTR_INCOMPAT(raid1c34, RAID1C34);
BTRFS_FEAT_ATTR_INCOMPAT(simple_quota, SIMPLE_QUOTA);
#ifdef CONFIG_BLK_DEV_ZONED
BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED);
#endif
#ifdef CONFIG_BTRFS_EXPERIMENTAL
/* Remove once support for extent tree v2 is feature complete */
BTRFS_FEAT_ATTR_INCOMPAT(extent_tree_v2, EXTENT_TREE_V2);
/* Remove once support for raid stripe tree is feature complete.
*/
BTRFS_FEAT_ATTR_INCOMPAT(raid_stripe_tree, RAID_STRIPE_TREE);
#endif
#ifdef CONFIG_FS_VERITY
BTRFS_FEAT_ATTR_COMPAT_RO(verity, VERITY);
#endif

/*
 * Features which depend on feature bits and may differ between each fs.
 *
 * /sys/fs/btrfs/features      - all available features implemented by this version
 * /sys/fs/btrfs/UUID/features - features of the fs which are enabled or
 *                               can be changed on a mounted filesystem.
 */
static struct attribute *btrfs_supported_feature_attrs[] = {
	BTRFS_FEAT_ATTR_PTR(default_subvol),
	BTRFS_FEAT_ATTR_PTR(mixed_groups),
	BTRFS_FEAT_ATTR_PTR(compress_lzo),
	BTRFS_FEAT_ATTR_PTR(compress_zstd),
	BTRFS_FEAT_ATTR_PTR(extended_iref),
	BTRFS_FEAT_ATTR_PTR(raid56),
	BTRFS_FEAT_ATTR_PTR(skinny_metadata),
	BTRFS_FEAT_ATTR_PTR(no_holes),
	BTRFS_FEAT_ATTR_PTR(metadata_uuid),
	BTRFS_FEAT_ATTR_PTR(free_space_tree),
	BTRFS_FEAT_ATTR_PTR(raid1c34),
	BTRFS_FEAT_ATTR_PTR(block_group_tree),
	BTRFS_FEAT_ATTR_PTR(simple_quota),
#ifdef CONFIG_BLK_DEV_ZONED
	BTRFS_FEAT_ATTR_PTR(zoned),
#endif
#ifdef CONFIG_BTRFS_EXPERIMENTAL
	BTRFS_FEAT_ATTR_PTR(extent_tree_v2),
	BTRFS_FEAT_ATTR_PTR(raid_stripe_tree),
#endif
#ifdef CONFIG_FS_VERITY
	BTRFS_FEAT_ATTR_PTR(verity),
#endif
	NULL
};

/* "features" group; per-attribute visibility is decided by btrfs_feature_visible(). */
static const struct attribute_group btrfs_feature_attr_group = {
	.name = "features",
	.is_visible = btrfs_feature_visible,
	.attrs = btrfs_supported_feature_attrs,
};

/* Static feature file "rmdir_subvol": always reads "0". */
static ssize_t rmdir_subvol_show(struct kobject *kobj,
				 struct kobj_attribute *ka, char *buf)
{
	return sysfs_emit(buf, "0\n");
}
BTRFS_ATTR(static_feature, rmdir_subvol, rmdir_subvol_show);

/*
 * Static feature file "supported_checksums": space separated names of the
 * checksum types 0 .. btrfs_get_num_csums() - 1, terminated by a newline.
 */
static ssize_t supported_checksums_show(struct kobject *kobj,
					struct kobj_attribute *a, char *buf)
{
	ssize_t ret = 0;
	int i;

	for (i = 0; i < btrfs_get_num_csums(); i++) {
		/*
		 * This "trick" only works as long as 'enum btrfs_csum_type' has
		 * no holes in it
		 */
		ret += sysfs_emit_at(buf, ret, "%s%s", (i == 0 ? "" : " "),
				     btrfs_super_csum_name(i));

	}

	ret += sysfs_emit_at(buf, ret, "\n");
	return ret;
}
BTRFS_ATTR(static_feature, supported_checksums, supported_checksums_show);

/* Static feature file "send_stream_version": prints BTRFS_SEND_STREAM_VERSION. */
static ssize_t send_stream_version_show(struct kobject *kobj,
					struct kobj_attribute *ka, char *buf)
{
	return sysfs_emit(buf, "%d\n", BTRFS_SEND_STREAM_VERSION);
}
BTRFS_ATTR(static_feature, send_stream_version, send_stream_version_show);

/* Names printed by the "supported_rescue_options" file, in output order. */
static const char *rescue_opts[] = {
	"usebackuproot",
	"nologreplay",
	"ignorebadroots",
	"ignoredatacsums",
	"ignoremetacsums",
	"ignoresuperflags",
	"all",
};

/*
 * Static feature file "supported_rescue_options": the rescue_opts[] entries
 * separated by single spaces, terminated by a newline.
 */
static ssize_t supported_rescue_options_show(struct kobject *kobj,
					     struct kobj_attribute *a,
					     char *buf)
{
	ssize_t ret = 0;
	int i;

	for (i = 0; i < ARRAY_SIZE(rescue_opts); i++)
		ret += sysfs_emit_at(buf, ret, "%s%s", (i ? " " : ""), rescue_opts[i]);
	ret += sysfs_emit_at(buf, ret, "\n");
	return ret;
}
BTRFS_ATTR(static_feature, supported_rescue_options,
	   supported_rescue_options_show);

/*
 * Static feature file "supported_sectorsizes": prints PAGE_SIZE, preceded by
 * 4096 when PAGE_SIZE is larger than 4K.
 */
static ssize_t supported_sectorsizes_show(struct kobject *kobj,
					  struct kobj_attribute *a,
					  char *buf)
{
	ssize_t ret = 0;

	/* An artificial limit to only support 4K and PAGE_SIZE */
	if (PAGE_SIZE > SZ_4K)
		ret += sysfs_emit_at(buf, ret, "%u ", SZ_4K);
	ret += sysfs_emit_at(buf, ret, "%lu\n", PAGE_SIZE);

	return ret;
}
BTRFS_ATTR(static_feature, supported_sectorsizes,
	   supported_sectorsizes_show);

/* Static feature file "acl": 1 if CONFIG_BTRFS_FS_POSIX_ACL is enabled, else 0. */
static ssize_t acl_show(struct kobject *kobj, struct kobj_attribute *a, char *buf)
{
	return sysfs_emit(buf, "%d\n", IS_ENABLED(CONFIG_BTRFS_FS_POSIX_ACL));
}
BTRFS_ATTR(static_feature, acl, acl_show);

/* Static feature file "temp_fsid": always reads "0". */
static ssize_t temp_fsid_supported_show(struct kobject *kobj,
					struct kobj_attribute *a, char *buf)
{
	return sysfs_emit(buf, "0\n");
}
BTRFS_ATTR(static_feature, temp_fsid, temp_fsid_supported_show);

/*
 * Features which only depend on kernel version.
 *
 * These are listed in /sys/fs/btrfs/features along with
 * btrfs_supported_feature_attrs.
*/ static struct attribute *btrfs_supported_static_feature_attrs[] = { BTRFS_ATTR_PTR(static_feature, acl), BTRFS_ATTR_PTR(static_feature, rmdir_subvol), BTRFS_ATTR_PTR(static_feature, supported_checksums), BTRFS_ATTR_PTR(static_feature, send_stream_version), BTRFS_ATTR_PTR(static_feature, supported_rescue_options), BTRFS_ATTR_PTR(static_feature, supported_sectorsizes), BTRFS_ATTR_PTR(static_feature, temp_fsid), NULL }; static const struct attribute_group btrfs_static_feature_attr_group = { .name = "features", .attrs = btrfs_supported_static_feature_attrs, }; /* * Discard statistics and tunables */ #define discard_to_fs_info(_kobj) to_fs_info(get_btrfs_kobj(_kobj)) static ssize_t btrfs_discardable_bytes_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); return sysfs_emit(buf, "%lld\n", atomic64_read(&fs_info->discard_ctl.discardable_bytes)); } BTRFS_ATTR(discard, discardable_bytes, btrfs_discardable_bytes_show); static ssize_t btrfs_discardable_extents_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); return sysfs_emit(buf, "%d\n", atomic_read(&fs_info->discard_ctl.discardable_extents)); } BTRFS_ATTR(discard, discardable_extents, btrfs_discardable_extents_show); static ssize_t btrfs_discard_bitmap_bytes_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); return sysfs_emit(buf, "%llu\n", fs_info->discard_ctl.discard_bitmap_bytes); } BTRFS_ATTR(discard, discard_bitmap_bytes, btrfs_discard_bitmap_bytes_show); static ssize_t btrfs_discard_bytes_saved_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); return sysfs_emit(buf, "%lld\n", atomic64_read(&fs_info->discard_ctl.discard_bytes_saved)); } BTRFS_ATTR(discard, discard_bytes_saved, btrfs_discard_bytes_saved_show); static ssize_t 
btrfs_discard_extent_bytes_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); return sysfs_emit(buf, "%llu\n", fs_info->discard_ctl.discard_extent_bytes); } BTRFS_ATTR(discard, discard_extent_bytes, btrfs_discard_extent_bytes_show); static ssize_t btrfs_discard_iops_limit_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); return sysfs_emit(buf, "%u\n", READ_ONCE(fs_info->discard_ctl.iops_limit)); } static ssize_t btrfs_discard_iops_limit_store(struct kobject *kobj, struct kobj_attribute *a, const char *buf, size_t len) { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl; u32 iops_limit; int ret; ret = kstrtou32(buf, 10, &iops_limit); if (ret) return -EINVAL; WRITE_ONCE(discard_ctl->iops_limit, iops_limit); btrfs_discard_calc_delay(discard_ctl); btrfs_discard_schedule_work(discard_ctl, true); return len; } BTRFS_ATTR_RW(discard, iops_limit, btrfs_discard_iops_limit_show, btrfs_discard_iops_limit_store); static ssize_t btrfs_discard_kbps_limit_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); return sysfs_emit(buf, "%u\n", READ_ONCE(fs_info->discard_ctl.kbps_limit)); } static ssize_t btrfs_discard_kbps_limit_store(struct kobject *kobj, struct kobj_attribute *a, const char *buf, size_t len) { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl; u32 kbps_limit; int ret; ret = kstrtou32(buf, 10, &kbps_limit); if (ret) return -EINVAL; WRITE_ONCE(discard_ctl->kbps_limit, kbps_limit); btrfs_discard_schedule_work(discard_ctl, true); return len; } BTRFS_ATTR_RW(discard, kbps_limit, btrfs_discard_kbps_limit_show, btrfs_discard_kbps_limit_store); static ssize_t btrfs_discard_max_discard_size_show(struct kobject *kobj, struct 
kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); return sysfs_emit(buf, "%llu\n", READ_ONCE(fs_info->discard_ctl.max_discard_size)); } static ssize_t btrfs_discard_max_discard_size_store(struct kobject *kobj, struct kobj_attribute *a, const char *buf, size_t len) { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl; u64 max_discard_size; int ret; ret = kstrtou64(buf, 10, &max_discard_size); if (ret) return -EINVAL; WRITE_ONCE(discard_ctl->max_discard_size, max_discard_size); return len; } BTRFS_ATTR_RW(discard, max_discard_size, btrfs_discard_max_discard_size_show, btrfs_discard_max_discard_size_store); /* * Per-filesystem stats for discard (when mounted with discard=async). * * Path: /sys/fs/btrfs/<uuid>/discard/ */ static const struct attribute *discard_attrs[] = { BTRFS_ATTR_PTR(discard, discardable_bytes), BTRFS_ATTR_PTR(discard, discardable_extents), BTRFS_ATTR_PTR(discard, discard_bitmap_bytes), BTRFS_ATTR_PTR(discard, discard_bytes_saved), BTRFS_ATTR_PTR(discard, discard_extent_bytes), BTRFS_ATTR_PTR(discard, iops_limit), BTRFS_ATTR_PTR(discard, kbps_limit), BTRFS_ATTR_PTR(discard, max_discard_size), NULL, }; #ifdef CONFIG_BTRFS_DEBUG /* * Per-filesystem runtime debugging exported via sysfs. * * Path: /sys/fs/btrfs/UUID/debug/ */ static const struct attribute *btrfs_debug_mount_attrs[] = { NULL, }; /* * Runtime debugging exported via sysfs, applies to all mounted filesystems. 
*
 * Path: /sys/fs/btrfs/debug
 */
static struct attribute *btrfs_debug_feature_attrs[] = {
	NULL
};

static const struct attribute_group btrfs_debug_feature_attr_group = {
	.name = "debug",
	.attrs = btrfs_debug_feature_attrs,
};

#endif

/*
 * Print the u64 at @value_ptr as a decimal line.  When @lock is not NULL the
 * value is read while holding that spinlock.
 */
static ssize_t btrfs_show_u64(u64 *value_ptr, spinlock_t *lock, char *buf)
{
	u64 val;
	if (lock)
		spin_lock(lock);
	val = *value_ptr;
	if (lock)
		spin_unlock(lock);
	return sysfs_emit(buf, "%llu\n", val);
}

/* global_rsv_size: size of the global block reserve, read under its lock. */
static ssize_t global_rsv_size_show(struct kobject *kobj,
				    struct kobj_attribute *ka, char *buf)
{
	/* The filesystem is looked up through the parent kobject. */
	struct btrfs_fs_info *fs_info = to_fs_info(kobj->parent);
	struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
	return btrfs_show_u64(&block_rsv->size, &block_rsv->lock, buf);
}
BTRFS_ATTR(allocation, global_rsv_size, global_rsv_size_show);

/* global_rsv_reserved: reserved bytes of the global block reserve, read under its lock. */
static ssize_t global_rsv_reserved_show(struct kobject *kobj,
					struct kobj_attribute *a, char *buf)
{
	/* The filesystem is looked up through the parent kobject. */
	struct btrfs_fs_info *fs_info = to_fs_info(kobj->parent);
	struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
	return btrfs_show_u64(&block_rsv->reserved, &block_rsv->lock, buf);
}
BTRFS_ATTR(allocation, global_rsv_reserved, global_rsv_reserved_show);

/* Map an embedded kobject back to its btrfs_space_info / raid_kobject. */
#define to_space_info(_kobj) container_of(_kobj, struct btrfs_space_info, kobj)
#define to_raid_kobj(_kobj) container_of(_kobj, struct raid_kobject, kobj)

/* Declared early because both raid attributes below share this callback. */
static ssize_t raid_bytes_show(struct kobject *kobj,
			       struct kobj_attribute *attr, char *buf);
BTRFS_ATTR(raid, total_bytes, raid_bytes_show);
BTRFS_ATTR(raid, used_bytes, raid_bytes_show);

/*
 * Show callback for both raid attributes.  Sums, over every block group on
 * the space info list selected by this raid kobject's flags, either the group
 * length (total_bytes) or the used bytes (any other attribute, i.e.
 * used_bytes).  The list is walked with groups_sem held for reading.
 */
static ssize_t raid_bytes_show(struct kobject *kobj,
			       struct kobj_attribute *attr, char *buf)

{
	/* The space info is the parent of the raid kobject. */
	struct btrfs_space_info *sinfo = to_space_info(kobj->parent);
	struct btrfs_block_group *block_group;
	int index = btrfs_bg_flags_to_raid_index(to_raid_kobj(kobj)->flags);
	u64 val = 0;

	down_read(&sinfo->groups_sem);
	list_for_each_entry(block_group, &sinfo->block_groups[index], list) {
		/* The attribute identity selects which counter is summed. */
		if (&attr->attr == BTRFS_ATTR_PTR(raid, total_bytes))
			val += block_group->length;
		else
			val += block_group->used;
	}
	up_read(&sinfo->groups_sem);
	return sysfs_emit(buf, "%llu\n", val);
}

/*
 * Allocation information about block group profiles.
 *
 * Path: /sys/fs/btrfs/<uuid>/allocation/<bg-type>/<bg-profile>/
 */
static struct attribute *raid_attrs[] = {
	BTRFS_ATTR_PTR(raid, total_bytes),
	BTRFS_ATTR_PTR(raid, used_bytes),
	NULL
};
ATTRIBUTE_GROUPS(raid);

/* kobject release callback: frees the raid_kobject that embeds @kobj. */
static void release_raid_kobj(struct kobject *kobj)
{
	kfree(to_raid_kobj(kobj));
}

static const struct kobj_type btrfs_raid_ktype = {
	.sysfs_ops = &kobj_sysfs_ops,
	.release = release_raid_kobj,
	.default_groups = raid_groups,
};

/*
 * Generate btrfs_space_info_show_<field>(), which prints the u64 member
 * <field> of the space info while holding sinfo->lock, together with the
 * matching read-only attribute btrfs_attr_space_info_<field>.
 */
#define SPACE_INFO_ATTR(field)						\
static ssize_t btrfs_space_info_show_##field(struct kobject *kobj,	\
					     struct kobj_attribute *a,	\
					     char *buf)			\
{									\
	struct btrfs_space_info *sinfo = to_space_info(kobj);		\
	return btrfs_show_u64(&sinfo->field, &sinfo->lock, buf);	\
}									\
BTRFS_ATTR(space_info, field, btrfs_space_info_show_##field)

/* chunk_size (read side): current chunk size, read with READ_ONCE(). */
static ssize_t btrfs_chunk_size_show(struct kobject *kobj,
				     struct kobj_attribute *a, char *buf)
{
	struct btrfs_space_info *sinfo = to_space_info(kobj);

	return sysfs_emit(buf, "%llu\n", READ_ONCE(sinfo->chunk_size));
}

/*
 * Store new chunk size in space info. Can be called on a read-only filesystem.
 *
 * If the new chunk size value is larger than 10% of free space it is reduced
 * to match that limit. Alignment must be to 256M and the system chunk size
 * cannot be set.
 *
 * Returns @len on success, -EPERM without CAP_SYS_ADMIN or for the system
 * space info, and -EINVAL when there are no fs_devices, the filesystem is
 * zoned, the input is malformed or zero, or the clamped and aligned value
 * ends up below 256M.
 */
static ssize_t btrfs_chunk_size_store(struct kobject *kobj,
				      struct kobj_attribute *a,
				      const char *buf, size_t len)
{
	struct btrfs_space_info *space_info = to_space_info(kobj);
	struct btrfs_fs_info *fs_info = to_fs_info(get_btrfs_kobj(kobj));
	char *retptr;
	u64 val;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (!fs_info->fs_devices)
		return -EINVAL;

	if (btrfs_is_zoned(fs_info))
		return -EINVAL;

	/* System block type must not be changed. */
	if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM)
		return -EPERM;

	val = memparse(buf, &retptr);
	/* There could be trailing '\n', also catch any typos after the value */
	retptr = skip_spaces(retptr);
	if (*retptr != 0 || val == 0)
		return -EINVAL;

	val = min(val, BTRFS_MAX_DATA_CHUNK_SIZE);

	/* Limit stripe size to 10% of available space. */
	val = min(mult_perc(fs_info->fs_devices->total_rw_bytes, 10), val);

	/* Must be multiple of 256M. */
	val &= ~((u64)SZ_256M - 1);

	/* Must be at least 256M. */
	if (val < SZ_256M)
		return -EINVAL;

	btrfs_update_space_info_chunk_size(space_info, val);

	return len;
}

/*
 * size_classes: count the block groups of this space info per size class
 * (none/small/medium/large), one "<class> <count>" line each.  Block groups
 * for which btrfs_block_group_should_use_size_class() is false are skipped.
 * groups_sem is taken for reading separately for each raid index.
 */
static ssize_t btrfs_size_classes_show(struct kobject *kobj,
				       struct kobj_attribute *a, char *buf)
{
	struct btrfs_space_info *sinfo = to_space_info(kobj);
	struct btrfs_block_group *bg;
	u32 none = 0;
	u32 small = 0;
	u32 medium = 0;
	u32 large = 0;

	for (int i = 0; i < BTRFS_NR_RAID_TYPES; ++i) {
		down_read(&sinfo->groups_sem);
		list_for_each_entry(bg, &sinfo->block_groups[i], list) {
			if (!btrfs_block_group_should_use_size_class(bg))
				continue;
			switch (bg->size_class) {
			case BTRFS_BG_SZ_NONE:
				none++;
				break;
			case BTRFS_BG_SZ_SMALL:
				small++;
				break;
			case BTRFS_BG_SZ_MEDIUM:
				medium++;
				break;
			case BTRFS_BG_SZ_LARGE:
				large++;
				break;
			}
		}
		up_read(&sinfo->groups_sem);
	}
	return sysfs_emit(buf, "none %u\n"
			       "small %u\n"
			       "medium %u\n"
			       "large %u\n",
			       none, small, medium, large);
}

#ifdef CONFIG_BTRFS_DEBUG
/*
 * Request chunk allocation with current chunk size.
*/ static ssize_t btrfs_force_chunk_alloc_store(struct kobject *kobj, struct kobj_attribute *a, const char *buf, size_t len) { struct btrfs_space_info *space_info = to_space_info(kobj); struct btrfs_fs_info *fs_info = to_fs_info(get_btrfs_kobj(kobj)); struct btrfs_trans_handle *trans; bool val; int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (sb_rdonly(fs_info->sb)) return -EROFS; ret = kstrtobool(buf, &val); if (ret) return ret; if (!val) return -EINVAL; /* * This is unsafe to be called from sysfs context and may cause * unexpected problems. */ trans = btrfs_start_transaction(fs_info->tree_root, 0); if (IS_ERR(trans)) return PTR_ERR(trans); ret = btrfs_force_chunk_alloc(trans, space_info->flags); btrfs_end_transaction(trans); if (ret == 1) return len; return -ENOSPC; } BTRFS_ATTR_W(space_info, force_chunk_alloc, btrfs_force_chunk_alloc_store); #endif SPACE_INFO_ATTR(flags); SPACE_INFO_ATTR(total_bytes); SPACE_INFO_ATTR(bytes_used); SPACE_INFO_ATTR(bytes_pinned); SPACE_INFO_ATTR(bytes_reserved); SPACE_INFO_ATTR(bytes_may_use); SPACE_INFO_ATTR(bytes_readonly); SPACE_INFO_ATTR(bytes_zone_unusable); SPACE_INFO_ATTR(disk_used); SPACE_INFO_ATTR(disk_total); SPACE_INFO_ATTR(reclaim_count); SPACE_INFO_ATTR(reclaim_bytes); SPACE_INFO_ATTR(reclaim_errors); BTRFS_ATTR_RW(space_info, chunk_size, btrfs_chunk_size_show, btrfs_chunk_size_store); BTRFS_ATTR(space_info, size_classes, btrfs_size_classes_show); static ssize_t btrfs_sinfo_bg_reclaim_threshold_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_space_info *space_info = to_space_info(kobj); ssize_t ret; spin_lock(&space_info->lock); ret = sysfs_emit(buf, "%d\n", btrfs_calc_reclaim_threshold(space_info)); spin_unlock(&space_info->lock); return ret; } static ssize_t btrfs_sinfo_bg_reclaim_threshold_store(struct kobject *kobj, struct kobj_attribute *a, const char *buf, size_t len) { struct btrfs_space_info *space_info = to_space_info(kobj); int thresh; int ret; if 
(READ_ONCE(space_info->dynamic_reclaim)) return -EINVAL; ret = kstrtoint(buf, 10, &thresh); if (ret) return ret; if (thresh < 0 || thresh > 100) return -EINVAL; WRITE_ONCE(space_info->bg_reclaim_threshold, thresh); return len; } BTRFS_ATTR_RW(space_info, bg_reclaim_threshold, btrfs_sinfo_bg_reclaim_threshold_show, btrfs_sinfo_bg_reclaim_threshold_store); static ssize_t btrfs_sinfo_dynamic_reclaim_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_space_info *space_info = to_space_info(kobj); return sysfs_emit(buf, "%d\n", READ_ONCE(space_info->dynamic_reclaim)); } static ssize_t btrfs_sinfo_dynamic_reclaim_store(struct kobject *kobj, struct kobj_attribute *a, const char *buf, size_t len) { struct btrfs_space_info *space_info = to_space_info(kobj); int dynamic_reclaim; int ret; ret = kstrtoint(buf, 10, &dynamic_reclaim); if (ret) return ret; if (dynamic_reclaim < 0) return -EINVAL; WRITE_ONCE(space_info->dynamic_reclaim, dynamic_reclaim != 0); return len; } BTRFS_ATTR_RW(space_info, dynamic_reclaim, btrfs_sinfo_dynamic_reclaim_show, btrfs_sinfo_dynamic_reclaim_store); static ssize_t btrfs_sinfo_periodic_reclaim_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_space_info *space_info = to_space_info(kobj); return sysfs_emit(buf, "%d\n", READ_ONCE(space_info->periodic_reclaim)); } static ssize_t btrfs_sinfo_periodic_reclaim_store(struct kobject *kobj, struct kobj_attribute *a, const char *buf, size_t len) { struct btrfs_space_info *space_info = to_space_info(kobj); int periodic_reclaim; int ret; ret = kstrtoint(buf, 10, &periodic_reclaim); if (ret) return ret; if (periodic_reclaim < 0) return -EINVAL; WRITE_ONCE(space_info->periodic_reclaim, periodic_reclaim != 0); return len; } BTRFS_ATTR_RW(space_info, periodic_reclaim, btrfs_sinfo_periodic_reclaim_show, btrfs_sinfo_periodic_reclaim_store); /* * Allocation information about block group types. 
* * Path: /sys/fs/btrfs/<uuid>/allocation/<bg-type>/ */ static struct attribute *space_info_attrs[] = { BTRFS_ATTR_PTR(space_info, flags), BTRFS_ATTR_PTR(space_info, total_bytes), BTRFS_ATTR_PTR(space_info, bytes_used), BTRFS_ATTR_PTR(space_info, bytes_pinned), BTRFS_ATTR_PTR(space_info, bytes_reserved), BTRFS_ATTR_PTR(space_info, bytes_may_use), BTRFS_ATTR_PTR(space_info, bytes_readonly), BTRFS_ATTR_PTR(space_info, bytes_zone_unusable), BTRFS_ATTR_PTR(space_info, disk_used), BTRFS_ATTR_PTR(space_info, disk_total), BTRFS_ATTR_PTR(space_info, bg_reclaim_threshold), BTRFS_ATTR_PTR(space_info, dynamic_reclaim), BTRFS_ATTR_PTR(space_info, chunk_size), BTRFS_ATTR_PTR(space_info, size_classes), BTRFS_ATTR_PTR(space_info, reclaim_count), BTRFS_ATTR_PTR(space_info, reclaim_bytes), BTRFS_ATTR_PTR(space_info, reclaim_errors), BTRFS_ATTR_PTR(space_info, periodic_reclaim), #ifdef CONFIG_BTRFS_DEBUG BTRFS_ATTR_PTR(space_info, force_chunk_alloc), #endif NULL, }; ATTRIBUTE_GROUPS(space_info); static void space_info_release(struct kobject *kobj) { struct btrfs_space_info *sinfo = to_space_info(kobj); kfree(sinfo); } static const struct kobj_type space_info_ktype = { .sysfs_ops = &kobj_sysfs_ops, .release = space_info_release, .default_groups = space_info_groups, }; /* * Allocation information about block groups. * * Path: /sys/fs/btrfs/<uuid>/allocation/ */ static const struct attribute *allocation_attrs[] = { BTRFS_ATTR_PTR(allocation, global_rsv_reserved), BTRFS_ATTR_PTR(allocation, global_rsv_size), NULL, }; static ssize_t btrfs_label_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = to_fs_info(kobj); char *label = fs_info->super_copy->label; ssize_t ret; spin_lock(&fs_info->super_lock); ret = sysfs_emit(buf, label[0] ? 
"%s\n" : "%s", label); spin_unlock(&fs_info->super_lock); return ret; } static ssize_t btrfs_label_store(struct kobject *kobj, struct kobj_attribute *a, const char *buf, size_t len) { struct btrfs_fs_info *fs_info = to_fs_info(kobj); size_t p_len; if (!fs_info) return -EPERM; if (sb_rdonly(fs_info->sb)) return -EROFS; /* * p_len is the len until the first occurrence of either * '\n' or '\0' */ p_len = strcspn(buf, "\n"); if (p_len >= BTRFS_LABEL_SIZE) return -EINVAL; spin_lock(&fs_info->super_lock); memset(fs_info->super_copy->label, 0, BTRFS_LABEL_SIZE); memcpy(fs_info->super_copy->label, buf, p_len); spin_unlock(&fs_info->super_lock); /* * We don't want to do full transaction commit from inside sysfs */ set_bit(BTRFS_FS_NEED_TRANS_COMMIT, &fs_info->flags); wake_up_process(fs_info->transaction_kthread); return len; } BTRFS_ATTR_RW(, label, btrfs_label_show, btrfs_label_store); static ssize_t btrfs_nodesize_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = to_fs_info(kobj); return sysfs_emit(buf, "%u\n", fs_info->nodesize); } BTRFS_ATTR(, nodesize, btrfs_nodesize_show); static ssize_t btrfs_sectorsize_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = to_fs_info(kobj); return sysfs_emit(buf, "%u\n", fs_info->sectorsize); } BTRFS_ATTR(, sectorsize, btrfs_sectorsize_show); static ssize_t btrfs_commit_stats_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = to_fs_info(kobj); return sysfs_emit(buf, "commits %llu\n" "last_commit_ms %llu\n" "max_commit_ms %llu\n" "total_commit_ms %llu\n", fs_info->commit_stats.commit_count, div_u64(fs_info->commit_stats.last_commit_dur, NSEC_PER_MSEC), div_u64(fs_info->commit_stats.max_commit_dur, NSEC_PER_MSEC), div_u64(fs_info->commit_stats.total_commit_dur, NSEC_PER_MSEC)); } static ssize_t btrfs_commit_stats_store(struct kobject *kobj, struct kobj_attribute *a, const char *buf, size_t len) { 
struct btrfs_fs_info *fs_info = to_fs_info(kobj); unsigned long val; int ret; if (!fs_info) return -EPERM; if (!capable(CAP_SYS_RESOURCE)) return -EPERM; ret = kstrtoul(buf, 10, &val); if (ret) return ret; if (val) return -EINVAL; WRITE_ONCE(fs_info->commit_stats.max_commit_dur, 0); return len; } BTRFS_ATTR_RW(, commit_stats, btrfs_commit_stats_show, btrfs_commit_stats_store); static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = to_fs_info(kobj); return sysfs_emit(buf, "%u\n", fs_info->sectorsize); } BTRFS_ATTR(, clone_alignment, btrfs_clone_alignment_show); static ssize_t quota_override_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = to_fs_info(kobj); int quota_override; quota_override = test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags); return sysfs_emit(buf, "%d\n", quota_override); } static ssize_t quota_override_store(struct kobject *kobj, struct kobj_attribute *a, const char *buf, size_t len) { struct btrfs_fs_info *fs_info = to_fs_info(kobj); unsigned long knob; int err; if (!fs_info) return -EPERM; if (!capable(CAP_SYS_RESOURCE)) return -EPERM; err = kstrtoul(buf, 10, &knob); if (err) return err; if (knob > 1) return -EINVAL; if (knob) set_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags); else clear_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags); return len; } BTRFS_ATTR_RW(, quota_override, quota_override_show, quota_override_store); static ssize_t btrfs_metadata_uuid_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = to_fs_info(kobj); return sysfs_emit(buf, "%pU\n", fs_info->fs_devices->metadata_uuid); } BTRFS_ATTR(, metadata_uuid, btrfs_metadata_uuid_show); static ssize_t btrfs_checksum_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = to_fs_info(kobj); u16 csum_type = btrfs_super_csum_type(fs_info->super_copy); return sysfs_emit(buf, 
"%s (%s)\n", btrfs_super_csum_name(csum_type), crypto_shash_driver_name(fs_info->csum_shash)); } BTRFS_ATTR(, checksum, btrfs_checksum_show); static ssize_t btrfs_exclusive_operation_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = to_fs_info(kobj); const char *str; switch (READ_ONCE(fs_info->exclusive_operation)) { case BTRFS_EXCLOP_NONE: str = "none\n"; break; case BTRFS_EXCLOP_BALANCE: str = "balance\n"; break; case BTRFS_EXCLOP_BALANCE_PAUSED: str = "balance paused\n"; break; case BTRFS_EXCLOP_DEV_ADD: str = "device add\n"; break; case BTRFS_EXCLOP_DEV_REMOVE: str = "device remove\n"; break; case BTRFS_EXCLOP_DEV_REPLACE: str = "device replace\n"; break; case BTRFS_EXCLOP_RESIZE: str = "resize\n"; break; case BTRFS_EXCLOP_SWAP_ACTIVATE: str = "swap activate\n"; break; default: str = "UNKNOWN\n"; break; } return sysfs_emit(buf, "%s", str); } BTRFS_ATTR(, exclusive_operation, btrfs_exclusive_operation_show); static ssize_t btrfs_generation_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = to_fs_info(kobj); return sysfs_emit(buf, "%llu\n", btrfs_get_fs_generation(fs_info)); } BTRFS_ATTR(, generation, btrfs_generation_show); static ssize_t btrfs_temp_fsid_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = to_fs_info(kobj); return sysfs_emit(buf, "%d\n", fs_info->fs_devices->temp_fsid); } BTRFS_ATTR(, temp_fsid, btrfs_temp_fsid_show); static const char *btrfs_read_policy_name[] = { "pid", #ifdef CONFIG_BTRFS_EXPERIMENTAL "round-robin", "devid", #endif }; #ifdef CONFIG_BTRFS_EXPERIMENTAL /* Global module configuration parameters. */ static char *read_policy; char *btrfs_get_mod_read_policy(void) { return read_policy; } /* Set perms to 0, disable /sys/module/btrfs/parameter/read_policy interface. 
*/ module_param(read_policy, charp, 0); MODULE_PARM_DESC(read_policy, "Global read policy: pid (default), round-robin[:<min_contig_read>], devid[:<devid>]"); #endif int btrfs_read_policy_to_enum(const char *str, s64 *value_ret) { char param[32] = { 0 }; char __maybe_unused *value_str; if (!str || strlen(str) == 0) return 0; strncpy(param, str, sizeof(param) - 1); #ifdef CONFIG_BTRFS_EXPERIMENTAL /* Separate value from input in policy:value format. */ value_str = strchr(param, ':'); if (value_str) { int ret; *value_str = 0; value_str++; if (!value_ret) return -EINVAL; ret = kstrtos64(value_str, 10, value_ret); if (ret) return -EINVAL; if (*value_ret < 0) return -ERANGE; } #endif return sysfs_match_string(btrfs_read_policy_name, param); } #ifdef CONFIG_BTRFS_EXPERIMENTAL int __init btrfs_read_policy_init(void) { s64 value; if (btrfs_read_policy_to_enum(read_policy, &value) == -EINVAL) { btrfs_err(NULL, "invalid read policy or value %s", read_policy); return -EINVAL; } return 0; } #endif static ssize_t btrfs_read_policy_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj); const enum btrfs_read_policy policy = READ_ONCE(fs_devices->read_policy); ssize_t ret = 0; int i; for (i = 0; i < BTRFS_NR_READ_POLICY; i++) { if (ret != 0) ret += sysfs_emit_at(buf, ret, " "); if (i == policy) ret += sysfs_emit_at(buf, ret, "["); ret += sysfs_emit_at(buf, ret, "%s", btrfs_read_policy_name[i]); #ifdef CONFIG_BTRFS_EXPERIMENTAL if (i == BTRFS_READ_POLICY_RR) ret += sysfs_emit_at(buf, ret, ":%u", READ_ONCE(fs_devices->rr_min_contig_read)); if (i == BTRFS_READ_POLICY_DEVID) ret += sysfs_emit_at(buf, ret, ":%llu", READ_ONCE(fs_devices->read_devid)); #endif if (i == policy) ret += sysfs_emit_at(buf, ret, "]"); } ret += sysfs_emit_at(buf, ret, "\n"); return ret; } static ssize_t btrfs_read_policy_store(struct kobject *kobj, struct kobj_attribute *a, const char *buf, size_t len) { struct btrfs_fs_devices *fs_devices = 
to_fs_devs(kobj); int index; s64 value = -1; index = btrfs_read_policy_to_enum(buf, &value); if (index < 0) return -EINVAL; #ifdef CONFIG_BTRFS_EXPERIMENTAL /* If moving from RR then disable collecting fs stats. */ if (fs_devices->read_policy == BTRFS_READ_POLICY_RR && index != BTRFS_READ_POLICY_RR) fs_devices->collect_fs_stats = false; if (index == BTRFS_READ_POLICY_RR) { if (value != -1) { const u32 sectorsize = fs_devices->fs_info->sectorsize; if (!IS_ALIGNED(value, sectorsize)) { u64 temp_value = round_up(value, sectorsize); btrfs_debug(fs_devices->fs_info, "read_policy: min contig read %lld should be multiple of sectorsize %u, rounded to %llu", value, sectorsize, temp_value); value = temp_value; } } else { value = BTRFS_DEFAULT_RR_MIN_CONTIG_READ; } if (index != READ_ONCE(fs_devices->read_policy) || value != READ_ONCE(fs_devices->rr_min_contig_read)) { WRITE_ONCE(fs_devices->read_policy, index); WRITE_ONCE(fs_devices->rr_min_contig_read, value); btrfs_info(fs_devices->fs_info, "read policy set to '%s:%lld'", btrfs_read_policy_name[index], value); } fs_devices->collect_fs_stats = true; return len; } if (index == BTRFS_READ_POLICY_DEVID) { if (value != -1) { BTRFS_DEV_LOOKUP_ARGS(args); /* Validate input devid. */ args.devid = value; if (btrfs_find_device(fs_devices, &args) == NULL) return -EINVAL; } else { /* Set default devid to the devid of the latest device. 
*/ value = fs_devices->latest_dev->devid; } if (index != READ_ONCE(fs_devices->read_policy) || value != READ_ONCE(fs_devices->read_devid)) { WRITE_ONCE(fs_devices->read_policy, index); WRITE_ONCE(fs_devices->read_devid, value); btrfs_info(fs_devices->fs_info, "read policy set to '%s:%llu'", btrfs_read_policy_name[index], value); } return len; } #endif if (index != READ_ONCE(fs_devices->read_policy)) { WRITE_ONCE(fs_devices->read_policy, index); btrfs_info(fs_devices->fs_info, "read policy set to '%s'", btrfs_read_policy_name[index]); } return len; } BTRFS_ATTR_RW(, read_policy, btrfs_read_policy_show, btrfs_read_policy_store); static ssize_t btrfs_bg_reclaim_threshold_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = to_fs_info(kobj); return sysfs_emit(buf, "%d\n", READ_ONCE(fs_info->bg_reclaim_threshold)); } static ssize_t btrfs_bg_reclaim_threshold_store(struct kobject *kobj, struct kobj_attribute *a, const char *buf, size_t len) { struct btrfs_fs_info *fs_info = to_fs_info(kobj); int thresh; int ret; ret = kstrtoint(buf, 10, &thresh); if (ret) return ret; #ifdef CONFIG_BTRFS_DEBUG if (thresh != 0 && (thresh > 100)) return -EINVAL; #else if (thresh != 0 && (thresh <= 50 || thresh > 100)) return -EINVAL; #endif WRITE_ONCE(fs_info->bg_reclaim_threshold, thresh); return len; } BTRFS_ATTR_RW(, bg_reclaim_threshold, btrfs_bg_reclaim_threshold_show, btrfs_bg_reclaim_threshold_store); #ifdef CONFIG_BTRFS_EXPERIMENTAL static ssize_t btrfs_offload_csum_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj); switch (READ_ONCE(fs_devices->offload_csum_mode)) { case BTRFS_OFFLOAD_CSUM_AUTO: return sysfs_emit(buf, "auto\n"); case BTRFS_OFFLOAD_CSUM_FORCE_ON: return sysfs_emit(buf, "1\n"); case BTRFS_OFFLOAD_CSUM_FORCE_OFF: return sysfs_emit(buf, "0\n"); default: WARN_ON(1); return -EINVAL; } } static ssize_t btrfs_offload_csum_store(struct kobject *kobj, struct 
kobj_attribute *a, const char *buf, size_t len) { struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj); int ret; bool val; ret = kstrtobool(buf, &val); if (ret == 0) WRITE_ONCE(fs_devices->offload_csum_mode, val ? BTRFS_OFFLOAD_CSUM_FORCE_ON : BTRFS_OFFLOAD_CSUM_FORCE_OFF); else if (ret == -EINVAL && sysfs_streq(buf, "auto")) WRITE_ONCE(fs_devices->offload_csum_mode, BTRFS_OFFLOAD_CSUM_AUTO); else return -EINVAL; return len; } BTRFS_ATTR_RW(, offload_csum, btrfs_offload_csum_show, btrfs_offload_csum_store); #endif /* * Per-filesystem information and stats. * * Path: /sys/fs/btrfs/<uuid>/ */ static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, label), BTRFS_ATTR_PTR(, nodesize), BTRFS_ATTR_PTR(, sectorsize), BTRFS_ATTR_PTR(, clone_alignment), BTRFS_ATTR_PTR(, quota_override), BTRFS_ATTR_PTR(, metadata_uuid), BTRFS_ATTR_PTR(, checksum), BTRFS_ATTR_PTR(, exclusive_operation), BTRFS_ATTR_PTR(, generation), BTRFS_ATTR_PTR(, read_policy), BTRFS_ATTR_PTR(, bg_reclaim_threshold), BTRFS_ATTR_PTR(, commit_stats), BTRFS_ATTR_PTR(, temp_fsid), #ifdef CONFIG_BTRFS_EXPERIMENTAL BTRFS_ATTR_PTR(, offload_csum), #endif NULL, }; static void btrfs_release_fsid_kobj(struct kobject *kobj) { struct btrfs_fs_devices *fs_devs = to_fs_devs(kobj); memset(&fs_devs->fsid_kobj, 0, sizeof(struct kobject)); complete(&fs_devs->kobj_unregister); } static const struct kobj_type btrfs_ktype = { .sysfs_ops = &kobj_sysfs_ops, .release = btrfs_release_fsid_kobj, }; static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj) { if (kobj->ktype != &btrfs_ktype) return NULL; return container_of(kobj, struct btrfs_fs_devices, fsid_kobj); } static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj) { if (kobj->ktype != &btrfs_ktype) return NULL; return to_fs_devs(kobj)->fs_info; } static struct kobject *get_btrfs_kobj(struct kobject *kobj) { while (kobj) { if (kobj->ktype == &btrfs_ktype) return kobj; kobj = kobj->parent; } return NULL; } #define NUM_FEATURE_BITS 64 
#define BTRFS_FEATURE_NAME_MAX 13 static char btrfs_unknown_feature_names[FEAT_MAX][NUM_FEATURE_BITS][BTRFS_FEATURE_NAME_MAX]; static struct btrfs_feature_attr btrfs_feature_attrs[FEAT_MAX][NUM_FEATURE_BITS]; static_assert(ARRAY_SIZE(btrfs_unknown_feature_names) == ARRAY_SIZE(btrfs_feature_attrs)); static_assert(ARRAY_SIZE(btrfs_unknown_feature_names[0]) == ARRAY_SIZE(btrfs_feature_attrs[0])); static const u64 supported_feature_masks[FEAT_MAX] = { [FEAT_COMPAT] = BTRFS_FEATURE_COMPAT_SUPP, [FEAT_COMPAT_RO] = BTRFS_FEATURE_COMPAT_RO_SUPP, [FEAT_INCOMPAT] = BTRFS_FEATURE_INCOMPAT_SUPP, }; static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add) { int set; for (set = 0; set < FEAT_MAX; set++) { int i; struct attribute *attrs[2]; struct attribute_group agroup = { .name = "features", .attrs = attrs, }; u64 features = get_features(fs_info, set); features &= ~supported_feature_masks[set]; if (!features) continue; attrs[1] = NULL; for (i = 0; i < NUM_FEATURE_BITS; i++) { struct btrfs_feature_attr *fa; if (!(features & (1ULL << i))) continue; fa = &btrfs_feature_attrs[set][i]; attrs[0] = &fa->kobj_attr.attr; if (add) { int ret; ret = sysfs_merge_group(&fs_info->fs_devices->fsid_kobj, &agroup); if (ret) return ret; } else sysfs_unmerge_group(&fs_info->fs_devices->fsid_kobj, &agroup); } } return 0; } static void __btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs) { if (fs_devs->devinfo_kobj) { kobject_del(fs_devs->devinfo_kobj); kobject_put(fs_devs->devinfo_kobj); fs_devs->devinfo_kobj = NULL; } if (fs_devs->devices_kobj) { kobject_del(fs_devs->devices_kobj); kobject_put(fs_devs->devices_kobj); fs_devs->devices_kobj = NULL; } if (fs_devs->fsid_kobj.state_initialized) { kobject_del(&fs_devs->fsid_kobj); kobject_put(&fs_devs->fsid_kobj); wait_for_completion(&fs_devs->kobj_unregister); } } /* when fs_devs is NULL it will remove all fsid kobject */ void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs) { struct list_head *fs_uuids = 
btrfs_get_fs_uuids(); if (fs_devs) { __btrfs_sysfs_remove_fsid(fs_devs); return; } list_for_each_entry(fs_devs, fs_uuids, fs_list) { __btrfs_sysfs_remove_fsid(fs_devs); } } static void btrfs_sysfs_remove_fs_devices(struct btrfs_fs_devices *fs_devices) { struct btrfs_device *device; struct btrfs_fs_devices *seed; list_for_each_entry(device, &fs_devices->devices, dev_list) btrfs_sysfs_remove_device(device); list_for_each_entry(seed, &fs_devices->seed_list, seed_list) { list_for_each_entry(device, &seed->devices, dev_list) btrfs_sysfs_remove_device(device); } } void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info) { struct kobject *fsid_kobj = &fs_info->fs_devices->fsid_kobj; sysfs_remove_link(fsid_kobj, "bdi"); if (fs_info->space_info_kobj) { sysfs_remove_files(fs_info->space_info_kobj, allocation_attrs); kobject_del(fs_info->space_info_kobj); kobject_put(fs_info->space_info_kobj); } if (fs_info->discard_kobj) { sysfs_remove_files(fs_info->discard_kobj, discard_attrs); kobject_del(fs_info->discard_kobj); kobject_put(fs_info->discard_kobj); } #ifdef CONFIG_BTRFS_DEBUG if (fs_info->debug_kobj) { sysfs_remove_files(fs_info->debug_kobj, btrfs_debug_mount_attrs); kobject_del(fs_info->debug_kobj); kobject_put(fs_info->debug_kobj); } #endif addrm_unknown_feature_attrs(fs_info, false); sysfs_remove_group(fsid_kobj, &btrfs_feature_attr_group); sysfs_remove_files(fsid_kobj, btrfs_attrs); btrfs_sysfs_remove_fs_devices(fs_info->fs_devices); } static const char * const btrfs_feature_set_names[FEAT_MAX] = { [FEAT_COMPAT] = "compat", [FEAT_COMPAT_RO] = "compat_ro", [FEAT_INCOMPAT] = "incompat", }; const char *btrfs_feature_set_name(enum btrfs_feature_set set) { return btrfs_feature_set_names[set]; } char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags) { size_t bufsize = 4096; /* safe max, 64 names * 64 bytes */ int len = 0; int i; char *str; str = kmalloc(bufsize, GFP_KERNEL); if (!str) return str; for (i = 0; i < ARRAY_SIZE(btrfs_feature_attrs[set]); i++) 
{ const char *name; if (!(flags & (1ULL << i))) continue; name = btrfs_feature_attrs[set][i].kobj_attr.attr.name; len += scnprintf(str + len, bufsize - len, "%s%s", len ? "," : "", name); } return str; } static void init_feature_attrs(void) { struct btrfs_feature_attr *fa; int set, i; memset(btrfs_feature_attrs, 0, sizeof(btrfs_feature_attrs)); memset(btrfs_unknown_feature_names, 0, sizeof(btrfs_unknown_feature_names)); for (i = 0; btrfs_supported_feature_attrs[i]; i++) { struct btrfs_feature_attr *sfa; struct attribute *a = btrfs_supported_feature_attrs[i]; int bit; sfa = attr_to_btrfs_feature_attr(a); bit = ilog2(sfa->feature_bit); fa = &btrfs_feature_attrs[sfa->feature_set][bit]; fa->kobj_attr.attr.name = sfa->kobj_attr.attr.name; } for (set = 0; set < FEAT_MAX; set++) { for (i = 0; i < ARRAY_SIZE(btrfs_feature_attrs[set]); i++) { char *name = btrfs_unknown_feature_names[set][i]; fa = &btrfs_feature_attrs[set][i]; if (fa->kobj_attr.attr.name) continue; snprintf(name, BTRFS_FEATURE_NAME_MAX, "%s:%u", btrfs_feature_set_names[set], i); fa->kobj_attr.attr.name = name; fa->kobj_attr.attr.mode = S_IRUGO; fa->feature_set = set; fa->feature_bit = 1ULL << i; } } } /* * Create a sysfs entry for a given block group type at path * /sys/fs/btrfs/UUID/allocation/data/TYPE */ void btrfs_sysfs_add_block_group_type(struct btrfs_block_group *cache) { struct btrfs_fs_info *fs_info = cache->fs_info; struct btrfs_space_info *space_info = cache->space_info; struct raid_kobject *rkobj; const int index = btrfs_bg_flags_to_raid_index(cache->flags); unsigned int nofs_flag; int ret; /* * Setup a NOFS context because kobject_add(), deep in its call chain, * does GFP_KERNEL allocations, and we are often called in a context * where if reclaim is triggered we can deadlock (we are either holding * a transaction handle or some lock required for a transaction * commit). 
*/ nofs_flag = memalloc_nofs_save(); rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS); if (!rkobj) { memalloc_nofs_restore(nofs_flag); btrfs_warn(cache->fs_info, "couldn't alloc memory for raid level kobject"); return; } rkobj->flags = cache->flags; kobject_init(&rkobj->kobj, &btrfs_raid_ktype); /* * We call this either on mount, or if we've created a block group for a * new index type while running (i.e. when restriping). The running * case is tricky because we could race with other threads, so we need * to have this check to make sure we didn't already init the kobject. * * We don't have to protect on the free side because it only happens on * unmount. */ spin_lock(&space_info->lock); if (space_info->block_group_kobjs[index]) { spin_unlock(&space_info->lock); kobject_put(&rkobj->kobj); return; } else { space_info->block_group_kobjs[index] = &rkobj->kobj; } spin_unlock(&space_info->lock); ret = kobject_add(&rkobj->kobj, &space_info->kobj, "%s", btrfs_bg_type_to_raid_name(rkobj->flags)); memalloc_nofs_restore(nofs_flag); if (ret) { spin_lock(&space_info->lock); space_info->block_group_kobjs[index] = NULL; spin_unlock(&space_info->lock); kobject_put(&rkobj->kobj); btrfs_warn(fs_info, "failed to add kobject for block cache, ignoring"); return; } } /* * Remove sysfs directories for all block group types of a given space info and * the space info as well */ void btrfs_sysfs_remove_space_info(struct btrfs_space_info *space_info) { int i; for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { struct kobject *kobj; kobj = space_info->block_group_kobjs[i]; space_info->block_group_kobjs[i] = NULL; if (kobj) { kobject_del(kobj); kobject_put(kobj); } } kobject_del(&space_info->kobj); kobject_put(&space_info->kobj); } static const char *alloc_name(u64 flags) { switch (flags) { case BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA: return "mixed"; case BTRFS_BLOCK_GROUP_METADATA: return "metadata"; case BTRFS_BLOCK_GROUP_DATA: return "data"; case BTRFS_BLOCK_GROUP_SYSTEM: return "system"; 
default: WARN_ON(1); return "invalid-combination"; } } /* * Create a sysfs entry for a space info type at path * /sys/fs/btrfs/UUID/allocation/TYPE */ int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info) { int ret; ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype, fs_info->space_info_kobj, "%s", alloc_name(space_info->flags)); if (ret) { kobject_put(&space_info->kobj); return ret; } return 0; } void btrfs_sysfs_remove_device(struct btrfs_device *device) { struct kobject *devices_kobj; /* * Seed fs_devices devices_kobj aren't used, fetch kobject from the * fs_info::fs_devices. */ devices_kobj = device->fs_info->fs_devices->devices_kobj; ASSERT(devices_kobj); if (device->bdev) sysfs_remove_link(devices_kobj, bdev_kobj(device->bdev)->name); if (device->devid_kobj.state_initialized) { kobject_del(&device->devid_kobj); kobject_put(&device->devid_kobj); wait_for_completion(&device->kobj_unregister); } } static ssize_t btrfs_devinfo_in_fs_metadata_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { int val; struct btrfs_device *device = container_of(kobj, struct btrfs_device, devid_kobj); val = !!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); return sysfs_emit(buf, "%d\n", val); } BTRFS_ATTR(devid, in_fs_metadata, btrfs_devinfo_in_fs_metadata_show); static ssize_t btrfs_devinfo_missing_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { int val; struct btrfs_device *device = container_of(kobj, struct btrfs_device, devid_kobj); val = !!test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); return sysfs_emit(buf, "%d\n", val); } BTRFS_ATTR(devid, missing, btrfs_devinfo_missing_show); static ssize_t btrfs_devinfo_replace_target_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { int val; struct btrfs_device *device = container_of(kobj, struct btrfs_device, devid_kobj); val = !!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); return 
sysfs_emit(buf, "%d\n", val); } BTRFS_ATTR(devid, replace_target, btrfs_devinfo_replace_target_show); static ssize_t btrfs_devinfo_scrub_speed_max_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_device *device = container_of(kobj, struct btrfs_device, devid_kobj); return sysfs_emit(buf, "%llu\n", READ_ONCE(device->scrub_speed_max)); } static ssize_t btrfs_devinfo_scrub_speed_max_store(struct kobject *kobj, struct kobj_attribute *a, const char *buf, size_t len) { struct btrfs_device *device = container_of(kobj, struct btrfs_device, devid_kobj); char *endptr; unsigned long long limit; limit = memparse(buf, &endptr); /* There could be trailing '\n', also catch any typos after the value. */ endptr = skip_spaces(endptr); if (*endptr != 0) return -EINVAL; WRITE_ONCE(device->scrub_speed_max, limit); return len; } BTRFS_ATTR_RW(devid, scrub_speed_max, btrfs_devinfo_scrub_speed_max_show, btrfs_devinfo_scrub_speed_max_store); static ssize_t btrfs_devinfo_writeable_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { int val; struct btrfs_device *device = container_of(kobj, struct btrfs_device, devid_kobj); val = !!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); return sysfs_emit(buf, "%d\n", val); } BTRFS_ATTR(devid, writeable, btrfs_devinfo_writeable_show); static ssize_t btrfs_devinfo_fsid_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_device *device = container_of(kobj, struct btrfs_device, devid_kobj); return sysfs_emit(buf, "%pU\n", device->fs_devices->fsid); } BTRFS_ATTR(devid, fsid, btrfs_devinfo_fsid_show); static ssize_t btrfs_devinfo_error_stats_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_device *device = container_of(kobj, struct btrfs_device, devid_kobj); if (!device->dev_stats_valid) return sysfs_emit(buf, "invalid\n"); /* * Print all at once so we get a snapshot of all values from the same * time. 
Keep them in sync and in order of definition of * btrfs_dev_stat_values. */ return sysfs_emit(buf, "write_errs %d\n" "read_errs %d\n" "flush_errs %d\n" "corruption_errs %d\n" "generation_errs %d\n", btrfs_dev_stat_read(device, BTRFS_DEV_STAT_WRITE_ERRS), btrfs_dev_stat_read(device, BTRFS_DEV_STAT_READ_ERRS), btrfs_dev_stat_read(device, BTRFS_DEV_STAT_FLUSH_ERRS), btrfs_dev_stat_read(device, BTRFS_DEV_STAT_CORRUPTION_ERRS), btrfs_dev_stat_read(device, BTRFS_DEV_STAT_GENERATION_ERRS)); } BTRFS_ATTR(devid, error_stats, btrfs_devinfo_error_stats_show); /* * Information about one device. * * Path: /sys/fs/btrfs/<uuid>/devinfo/<devid>/ */ static struct attribute *devid_attrs[] = { BTRFS_ATTR_PTR(devid, error_stats), BTRFS_ATTR_PTR(devid, fsid), BTRFS_ATTR_PTR(devid, in_fs_metadata), BTRFS_ATTR_PTR(devid, missing), BTRFS_ATTR_PTR(devid, replace_target), BTRFS_ATTR_PTR(devid, scrub_speed_max), BTRFS_ATTR_PTR(devid, writeable), NULL }; ATTRIBUTE_GROUPS(devid); static void btrfs_release_devid_kobj(struct kobject *kobj) { struct btrfs_device *device = container_of(kobj, struct btrfs_device, devid_kobj); memset(&device->devid_kobj, 0, sizeof(struct kobject)); complete(&device->kobj_unregister); } static const struct kobj_type devid_ktype = { .sysfs_ops = &kobj_sysfs_ops, .default_groups = devid_groups, .release = btrfs_release_devid_kobj, }; int btrfs_sysfs_add_device(struct btrfs_device *device) { int ret; unsigned int nofs_flag; struct kobject *devices_kobj; struct kobject *devinfo_kobj; /* * Make sure we use the fs_info::fs_devices to fetch the kobjects even * for the seed fs_devices */ devices_kobj = device->fs_info->fs_devices->devices_kobj; devinfo_kobj = device->fs_info->fs_devices->devinfo_kobj; ASSERT(devices_kobj); ASSERT(devinfo_kobj); nofs_flag = memalloc_nofs_save(); if (device->bdev) { struct kobject *disk_kobj = bdev_kobj(device->bdev); ret = sysfs_create_link(devices_kobj, disk_kobj, disk_kobj->name); if (ret) { btrfs_warn(device->fs_info, "creating sysfs 
device link for devid %llu failed: %d", device->devid, ret); goto out; } } init_completion(&device->kobj_unregister); ret = kobject_init_and_add(&device->devid_kobj, &devid_ktype, devinfo_kobj, "%llu", device->devid); if (ret) { kobject_put(&device->devid_kobj); btrfs_warn(device->fs_info, "devinfo init for devid %llu failed: %d", device->devid, ret); } out: memalloc_nofs_restore(nofs_flag); return ret; } static int btrfs_sysfs_add_fs_devices(struct btrfs_fs_devices *fs_devices) { int ret; struct btrfs_device *device; struct btrfs_fs_devices *seed; list_for_each_entry(device, &fs_devices->devices, dev_list) { ret = btrfs_sysfs_add_device(device); if (ret) goto fail; } list_for_each_entry(seed, &fs_devices->seed_list, seed_list) { list_for_each_entry(device, &seed->devices, dev_list) { ret = btrfs_sysfs_add_device(device); if (ret) goto fail; } } return 0; fail: btrfs_sysfs_remove_fs_devices(fs_devices); return ret; } void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action) { int ret; ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action); if (ret) pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n", action, kobject_name(&disk_to_dev(bdev->bd_disk)->kobj), &disk_to_dev(bdev->bd_disk)->kobj); } void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices) { char fsid_buf[BTRFS_UUID_UNPARSED_SIZE]; /* * Sprouting changes fsid of the mounted filesystem, rename the fsid * directory */ snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", fs_devices->fsid); if (kobject_rename(&fs_devices->fsid_kobj, fsid_buf)) btrfs_warn(fs_devices->fs_info, "sysfs: failed to create fsid for sprout"); } void btrfs_sysfs_update_devid(struct btrfs_device *device) { char tmp[24]; snprintf(tmp, sizeof(tmp), "%llu", device->devid); if (kobject_rename(&device->devid_kobj, tmp)) btrfs_warn(device->fs_devices->fs_info, "sysfs: failed to update devid for %llu", device->devid); } /* /sys/fs/btrfs/ entry */ static struct kset *btrfs_kset; /* 
* Creates: * /sys/fs/btrfs/UUID * * Can be called by the device discovery thread. */ int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs) { int error; init_completion(&fs_devs->kobj_unregister); fs_devs->fsid_kobj.kset = btrfs_kset; error = kobject_init_and_add(&fs_devs->fsid_kobj, &btrfs_ktype, NULL, "%pU", fs_devs->fsid); if (error) { kobject_put(&fs_devs->fsid_kobj); return error; } fs_devs->devices_kobj = kobject_create_and_add("devices", &fs_devs->fsid_kobj); if (!fs_devs->devices_kobj) { btrfs_err(fs_devs->fs_info, "failed to init sysfs device interface"); btrfs_sysfs_remove_fsid(fs_devs); return -ENOMEM; } fs_devs->devinfo_kobj = kobject_create_and_add("devinfo", &fs_devs->fsid_kobj); if (!fs_devs->devinfo_kobj) { btrfs_err(fs_devs->fs_info, "failed to init sysfs devinfo kobject"); btrfs_sysfs_remove_fsid(fs_devs); return -ENOMEM; } return 0; } int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info) { int error; struct btrfs_fs_devices *fs_devs = fs_info->fs_devices; struct kobject *fsid_kobj = &fs_devs->fsid_kobj; error = btrfs_sysfs_add_fs_devices(fs_devs); if (error) return error; error = sysfs_create_files(fsid_kobj, btrfs_attrs); if (error) { btrfs_sysfs_remove_fs_devices(fs_devs); return error; } error = sysfs_create_group(fsid_kobj, &btrfs_feature_attr_group); if (error) goto failure; #ifdef CONFIG_BTRFS_DEBUG fs_info->debug_kobj = kobject_create_and_add("debug", fsid_kobj); if (!fs_info->debug_kobj) { error = -ENOMEM; goto failure; } error = sysfs_create_files(fs_info->debug_kobj, btrfs_debug_mount_attrs); if (error) goto failure; #endif /* Discard directory */ fs_info->discard_kobj = kobject_create_and_add("discard", fsid_kobj); if (!fs_info->discard_kobj) { error = -ENOMEM; goto failure; } error = sysfs_create_files(fs_info->discard_kobj, discard_attrs); if (error) goto failure; error = addrm_unknown_feature_attrs(fs_info, true); if (error) goto failure; error = sysfs_create_link(fsid_kobj, &fs_info->sb->s_bdi->dev->kobj, "bdi"); if 
(error) goto failure; fs_info->space_info_kobj = kobject_create_and_add("allocation", fsid_kobj); if (!fs_info->space_info_kobj) { error = -ENOMEM; goto failure; } error = sysfs_create_files(fs_info->space_info_kobj, allocation_attrs); if (error) goto failure; return 0; failure: btrfs_sysfs_remove_mounted(fs_info); return error; } static ssize_t qgroup_enabled_show(struct kobject *qgroups_kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent); bool enabled; spin_lock(&fs_info->qgroup_lock); enabled = fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON; spin_unlock(&fs_info->qgroup_lock); return sysfs_emit(buf, "%d\n", enabled); } BTRFS_ATTR(qgroups, enabled, qgroup_enabled_show); static ssize_t qgroup_mode_show(struct kobject *qgroups_kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent); ssize_t ret = 0; spin_lock(&fs_info->qgroup_lock); ASSERT(btrfs_qgroup_enabled(fs_info)); switch (btrfs_qgroup_mode(fs_info)) { case BTRFS_QGROUP_MODE_FULL: ret = sysfs_emit(buf, "qgroup\n"); break; case BTRFS_QGROUP_MODE_SIMPLE: ret = sysfs_emit(buf, "squota\n"); break; default: btrfs_warn(fs_info, "unexpected qgroup mode %d\n", btrfs_qgroup_mode(fs_info)); break; } spin_unlock(&fs_info->qgroup_lock); return ret; } BTRFS_ATTR(qgroups, mode, qgroup_mode_show); static ssize_t qgroup_inconsistent_show(struct kobject *qgroups_kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent); bool inconsistent; spin_lock(&fs_info->qgroup_lock); inconsistent = (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT); spin_unlock(&fs_info->qgroup_lock); return sysfs_emit(buf, "%d\n", inconsistent); } BTRFS_ATTR(qgroups, inconsistent, qgroup_inconsistent_show); static ssize_t qgroup_drop_subtree_thres_show(struct kobject *qgroups_kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = 
to_fs_info(qgroups_kobj->parent); u8 result; spin_lock(&fs_info->qgroup_lock); result = fs_info->qgroup_drop_subtree_thres; spin_unlock(&fs_info->qgroup_lock); return sysfs_emit(buf, "%d\n", result); } static ssize_t qgroup_drop_subtree_thres_store(struct kobject *qgroups_kobj, struct kobj_attribute *a, const char *buf, size_t len) { struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent); u8 new_thres; int ret; ret = kstrtou8(buf, 10, &new_thres); if (ret) return -EINVAL; if (new_thres > BTRFS_MAX_LEVEL) return -EINVAL; spin_lock(&fs_info->qgroup_lock); fs_info->qgroup_drop_subtree_thres = new_thres; spin_unlock(&fs_info->qgroup_lock); return len; } BTRFS_ATTR_RW(qgroups, drop_subtree_threshold, qgroup_drop_subtree_thres_show, qgroup_drop_subtree_thres_store); /* * Qgroups global info * * Path: /sys/fs/btrfs/<uuid>/qgroups/ */ static struct attribute *qgroups_attrs[] = { BTRFS_ATTR_PTR(qgroups, enabled), BTRFS_ATTR_PTR(qgroups, inconsistent), BTRFS_ATTR_PTR(qgroups, drop_subtree_threshold), BTRFS_ATTR_PTR(qgroups, mode), NULL }; ATTRIBUTE_GROUPS(qgroups); static void qgroups_release(struct kobject *kobj) { kfree(kobj); } static const struct kobj_type qgroups_ktype = { .sysfs_ops = &kobj_sysfs_ops, .default_groups = qgroups_groups, .release = qgroups_release, }; static inline struct btrfs_fs_info *qgroup_kobj_to_fs_info(struct kobject *kobj) { return to_fs_info(kobj->parent->parent); } #define QGROUP_ATTR(_member, _show_name) \ static ssize_t btrfs_qgroup_show_##_member(struct kobject *qgroup_kobj, \ struct kobj_attribute *a, \ char *buf) \ { \ struct btrfs_fs_info *fs_info = qgroup_kobj_to_fs_info(qgroup_kobj); \ struct btrfs_qgroup *qgroup = container_of(qgroup_kobj, \ struct btrfs_qgroup, kobj); \ return btrfs_show_u64(&qgroup->_member, &fs_info->qgroup_lock, buf); \ } \ BTRFS_ATTR(qgroup, _show_name, btrfs_qgroup_show_##_member) #define QGROUP_RSV_ATTR(_name, _type) \ static ssize_t btrfs_qgroup_rsv_show_##_name(struct kobject *qgroup_kobj, \ struct 
kobj_attribute *a, \ char *buf) \ { \ struct btrfs_fs_info *fs_info = qgroup_kobj_to_fs_info(qgroup_kobj); \ struct btrfs_qgroup *qgroup = container_of(qgroup_kobj, \ struct btrfs_qgroup, kobj); \ return btrfs_show_u64(&qgroup->rsv.values[_type], \ &fs_info->qgroup_lock, buf); \ } \ BTRFS_ATTR(qgroup, rsv_##_name, btrfs_qgroup_rsv_show_##_name) QGROUP_ATTR(rfer, referenced); QGROUP_ATTR(excl, exclusive); QGROUP_ATTR(max_rfer, max_referenced); QGROUP_ATTR(max_excl, max_exclusive); QGROUP_ATTR(lim_flags, limit_flags); QGROUP_RSV_ATTR(data, BTRFS_QGROUP_RSV_DATA); QGROUP_RSV_ATTR(meta_pertrans, BTRFS_QGROUP_RSV_META_PERTRANS); QGROUP_RSV_ATTR(meta_prealloc, BTRFS_QGROUP_RSV_META_PREALLOC); /* * Qgroup information. * * Path: /sys/fs/btrfs/<uuid>/qgroups/<level>_<qgroupid>/ */ static struct attribute *qgroup_attrs[] = { BTRFS_ATTR_PTR(qgroup, referenced), BTRFS_ATTR_PTR(qgroup, exclusive), BTRFS_ATTR_PTR(qgroup, max_referenced), BTRFS_ATTR_PTR(qgroup, max_exclusive), BTRFS_ATTR_PTR(qgroup, limit_flags), BTRFS_ATTR_PTR(qgroup, rsv_data), BTRFS_ATTR_PTR(qgroup, rsv_meta_pertrans), BTRFS_ATTR_PTR(qgroup, rsv_meta_prealloc), NULL }; ATTRIBUTE_GROUPS(qgroup); static void qgroup_release(struct kobject *kobj) { struct btrfs_qgroup *qgroup = container_of(kobj, struct btrfs_qgroup, kobj); memset(&qgroup->kobj, 0, sizeof(*kobj)); } static const struct kobj_type qgroup_ktype = { .sysfs_ops = &kobj_sysfs_ops, .release = qgroup_release, .default_groups = qgroup_groups, }; int btrfs_sysfs_add_one_qgroup(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup) { struct kobject *qgroups_kobj = fs_info->qgroups_kobj; int ret; if (btrfs_is_testing(fs_info)) return 0; if (qgroup->kobj.state_initialized) return 0; if (!qgroups_kobj) return -EINVAL; ret = kobject_init_and_add(&qgroup->kobj, &qgroup_ktype, qgroups_kobj, "%hu_%llu", btrfs_qgroup_level(qgroup->qgroupid), btrfs_qgroup_subvolid(qgroup->qgroupid)); if (ret < 0) kobject_put(&qgroup->kobj); return ret; } void 
btrfs_sysfs_del_qgroups(struct btrfs_fs_info *fs_info) { struct btrfs_qgroup *qgroup; struct btrfs_qgroup *next; if (btrfs_is_testing(fs_info)) return; rbtree_postorder_for_each_entry_safe(qgroup, next, &fs_info->qgroup_tree, node) btrfs_sysfs_del_one_qgroup(fs_info, qgroup); if (fs_info->qgroups_kobj) { kobject_del(fs_info->qgroups_kobj); kobject_put(fs_info->qgroups_kobj); fs_info->qgroups_kobj = NULL; } } /* Called when qgroups get initialized, thus there is no need for locking */ int btrfs_sysfs_add_qgroups(struct btrfs_fs_info *fs_info) { struct kobject *fsid_kobj = &fs_info->fs_devices->fsid_kobj; struct btrfs_qgroup *qgroup; struct btrfs_qgroup *next; int ret = 0; if (btrfs_is_testing(fs_info)) return 0; ASSERT(fsid_kobj); if (fs_info->qgroups_kobj) return 0; fs_info->qgroups_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL); if (!fs_info->qgroups_kobj) return -ENOMEM; ret = kobject_init_and_add(fs_info->qgroups_kobj, &qgroups_ktype, fsid_kobj, "qgroups"); if (ret < 0) goto out; rbtree_postorder_for_each_entry_safe(qgroup, next, &fs_info->qgroup_tree, node) { ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); if (ret < 0) goto out; } out: if (ret < 0) btrfs_sysfs_del_qgroups(fs_info); return ret; } void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup) { if (btrfs_is_testing(fs_info)) return; if (qgroup->kobj.state_initialized) { kobject_del(&qgroup->kobj); kobject_put(&qgroup->kobj); } } /* * Change per-fs features in /sys/fs/btrfs/UUID/features to match current * values in superblock. 
Call after any changes to incompat/compat_ro flags */ void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info) { struct kobject *fsid_kobj; int ret; if (!fs_info) return; fsid_kobj = &fs_info->fs_devices->fsid_kobj; if (!fsid_kobj->state_initialized) return; ret = sysfs_update_group(fsid_kobj, &btrfs_feature_attr_group); if (ret < 0) btrfs_warn(fs_info, "failed to update /sys/fs/btrfs/%pU/features: %d", fs_info->fs_devices->fsid, ret); } int __init btrfs_init_sysfs(void) { int ret; btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj); if (!btrfs_kset) return -ENOMEM; init_feature_attrs(); ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); if (ret) goto out2; ret = sysfs_merge_group(&btrfs_kset->kobj, &btrfs_static_feature_attr_group); if (ret) goto out_remove_group; #ifdef CONFIG_BTRFS_DEBUG ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_debug_feature_attr_group); if (ret) { sysfs_unmerge_group(&btrfs_kset->kobj, &btrfs_static_feature_attr_group); goto out_remove_group; } #endif return 0; out_remove_group: sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); out2: kset_unregister(btrfs_kset); return ret; } void __cold btrfs_exit_sysfs(void) { sysfs_unmerge_group(&btrfs_kset->kobj, &btrfs_static_feature_attr_group); sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); #ifdef CONFIG_BTRFS_DEBUG sysfs_remove_group(&btrfs_kset->kobj, &btrfs_debug_feature_attr_group); #endif kset_unregister(btrfs_kset); }
180 181 181 181 179 181 179 181 181 40 268 269 30 242 31 489 493 286 293 295 287 327 323 324 323 185 185 13 13 11 12 185 183 185 185 5 179 185 184 184 184 185 183 185 184 87 86 87 88 88 88 88 87 87 88 88 88 88 87 88 86 69 69 69 69 69 69 176 177 137 49 175 176 1 69 69 69 60 1 8 69 69 69 69 69 69 69 69 69 67 67 181 1 180 1 1 1 185 4 181 181 181 181 181 181 51 132 180 189 1 135 185 178 43 9 135 55 129 3 1 49 131 42 41 58 181 179 52 131 181 136 50 132 180 4 54 56 56 56 2 2 2 31 32 4 152 152 152 4 152 8 129 129 129 135 2 1 127 1 1 246 245 64 201 4 201 200 4 197 195 196 196 197 195 197 201 200 141 61 201 142 61 201 201 201 200 200 201 198 200 200 141 61 200 200 201 200 200 1 201 4 197 197 201 200 1 200 1 182 18 201 75 141 201 181 19 180 19 200 199 4 201 201 201 262 101 170 102 201 200 64 64 64 55 55 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 
326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 
826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 
1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 
1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 
2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 
2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 
2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2002-2005, Instant802 Networks, Inc. * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015 - 2017 Intel Deutschland GmbH * Copyright (C) 2018-2023 Intel Corporation */ #include <linux/module.h> #include <linux/init.h> #include <linux/etherdevice.h> #include <linux/netdevice.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/skbuff.h> #include <linux/if_arp.h> #include <linux/timer.h> #include <linux/rtnetlink.h> #include <net/codel.h> #include <net/mac80211.h> #include "ieee80211_i.h" #include "driver-ops.h" #include "rate.h" #include "sta_info.h" #include "debugfs_sta.h" #include "mesh.h" #include "wme.h" /** * DOC: STA information lifetime rules * * STA info structures (&struct sta_info) are managed in a hash table * for faster lookup and a list for iteration. They are managed using * RCU, i.e. access to the list and hash table is protected by RCU. 
* * Upon allocating a STA info structure with sta_info_alloc(), the caller * owns that structure. It must then insert it into the hash table using * either sta_info_insert() or sta_info_insert_rcu(); only in the latter * case (which acquires an RCU read section but must not be called from * within one) will the pointer still be valid after the call. Note that * the caller may not do much with the STA info before inserting it; in * particular, it may not start any mesh peer link management or add * encryption keys. * * When the insertion fails (sta_info_insert() returns non-zero), the * structure will have been freed by sta_info_insert()! * * Station entries are added by mac80211 when you establish a link with a * peer. This means different things for the different types of interfaces * we support. For a regular station this means we add the AP sta when we * receive an association response from the AP. For IBSS this occurs when * we get to know about a peer on the same IBSS. For WDS we add the sta for * the peer immediately upon device open. When using AP mode we add stations * for each respective station upon request from userspace through nl80211. * * In order to remove a STA info structure, various sta_info_destroy_*() * calls are available. * * There is no concept of ownership on a STA entry; each structure is * owned by the global hash table/list until it is removed. All users of * the structure need to be RCU protected so that the structure won't be * freed before they are done using it. 
*/

/*
 * Container for a link STA that is not the station's embedded deflink.
 * sta_remove_link() recovers it from the link_sta_info pointer with
 * container_of() on @info and releases it through @rcu_head (kfree_rcu()).
 */
struct sta_link_alloc {
	struct link_sta_info info;
	struct ieee80211_link_sta sta;
	struct rcu_head rcu_head;
};

/*
 * Hash table layout used with local->sta_hash: entries are struct sta_info,
 * linked through hash_node and keyed by the ETH_ALEN bytes of the addr field.
 */
static const struct rhashtable_params sta_rht_params = {
	.nelem_hint = 3, /* start small */
	.automatic_shrinking = true,
	.head_offset = offsetof(struct sta_info, hash_node),
	.key_offset = offsetof(struct sta_info, addr),
	.key_len = ETH_ALEN,
	.max_size = CONFIG_MAC80211_STA_HASH_MAX_SIZE,
};

/*
 * Hash table layout used with local->link_sta_hash: entries are
 * struct link_sta_info, linked through link_hash_node and keyed by the
 * ETH_ALEN bytes of the per-link addr field.
 */
static const struct rhashtable_params link_sta_rht_params = {
	.nelem_hint = 3, /* start small */
	.automatic_shrinking = true,
	.head_offset = offsetof(struct link_sta_info, link_hash_node),
	.key_offset = offsetof(struct link_sta_info, addr),
	.key_len = ETH_ALEN,
	.max_size = CONFIG_MAC80211_STA_HASH_MAX_SIZE,
};

/* Remove @sta from local->sta_hash; returns the rhltable_remove() result. */
static int sta_info_hash_del(struct ieee80211_local *local,
			     struct sta_info *sta)
{
	return rhltable_remove(&local->sta_hash, &sta->hash_node,
			       sta_rht_params);
}

/*
 * Add @link_sta to local->link_sta_hash. Asserts that the wiphy lock is
 * held; returns the rhltable_insert() result.
 */
static int link_sta_info_hash_add(struct ieee80211_local *local,
				  struct link_sta_info *link_sta)
{
	lockdep_assert_wiphy(local->hw.wiphy);

	return rhltable_insert(&local->link_sta_hash,
			       &link_sta->link_hash_node, link_sta_rht_params);
}

/*
 * Remove @link_sta from local->link_sta_hash. Asserts that the wiphy lock
 * is held; returns the rhltable_remove() result.
 */
static int link_sta_info_hash_del(struct ieee80211_local *local,
				  struct link_sta_info *link_sta)
{
	lockdep_assert_wiphy(local->hw.wiphy);

	return rhltable_remove(&local->link_sta_hash,
			       &link_sta->link_hash_node, link_sta_rht_params);
}

/* Purge every TXQ that exists for @sta; empty slots in txq[] are skipped. */
void ieee80211_purge_sta_txqs(struct sta_info *sta)
{
	struct ieee80211_local *local = sta->sdata->local;
	int i;

	for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) {
		struct txq_info *txqi;

		if (!sta->sta.txq[i])
			continue;

		txqi = to_txq_info(sta->sta.txq[i]);

		ieee80211_txq_purge(local, txqi);
	}
}

/*
 * Release everything a station holds short of freeing the structure itself:
 * power-save accounting, TXQs, PS-buffered and filtered frames, mesh state,
 * the PS delivery work item and the TX aggregation state.
 *
 * NOTE: when one of the PS flags is set but the interface is neither
 * AP/AP_VLAN nor mesh, the function returns before any of the purging
 * below is done.
 */
static void __cleanup_single_sta(struct sta_info *sta)
{
	int ac, i;
	struct tid_ampdu_tx *tid_tx;
	struct ieee80211_sub_if_data *sdata = sta->sdata;
	struct ieee80211_local *local = sdata->local;
	struct ps_data *ps;

	if (test_sta_flag(sta, WLAN_STA_PS_STA) ||
	    test_sta_flag(sta, WLAN_STA_PS_DRIVER) ||
	    test_sta_flag(sta, WLAN_STA_PS_DELIVER)) {
		/* pick the ps_data whose num_sta_ps counter is dropped below */
		if (sta->sdata->vif.type == NL80211_IFTYPE_AP ||
		    sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
			ps = &sdata->bss->ps;
		else if (ieee80211_vif_is_mesh(&sdata->vif))
			ps = &sdata->u.mesh.ps;
		else
			return;

		clear_sta_flag(sta, WLAN_STA_PS_STA);
		clear_sta_flag(sta, WLAN_STA_PS_DRIVER);
		clear_sta_flag(sta, WLAN_STA_PS_DELIVER);

		atomic_dec(&ps->num_sta_ps);
	}

	ieee80211_purge_sta_txqs(sta);

	for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) {
		/* only ps_tx_buf frames are subtracted from total_ps_buffered */
		local->total_ps_buffered -= skb_queue_len(&sta->ps_tx_buf[ac]);
		ieee80211_purge_tx_queue(&local->hw, &sta->ps_tx_buf[ac]);
		ieee80211_purge_tx_queue(&local->hw, &sta->tx_filtered[ac]);
	}

	if (ieee80211_vif_is_mesh(&sdata->vif))
		mesh_sta_cleanup(sta);

	cancel_work_sync(&sta->drv_deliver_wk);

	/*
	 * Destroy aggregation state here. It would be nice to wait for the
	 * driver to finish aggregation stop and then clean up, but for now
	 * drivers have to handle aggregation stop being requested, followed
	 * directly by station destruction.
	 */
	for (i = 0; i < IEEE80211_NUM_TIDS; i++) {
		kfree(sta->ampdu_mlme.tid_start_tx[i]);
		tid_tx = rcu_dereference_raw(sta->ampdu_mlme.tid_tx[i]);
		if (!tid_tx)
			continue;
		ieee80211_purge_tx_queue(&local->hw, &tid_tx->pending);
		kfree(tid_tx);
	}
}

/* Run __cleanup_single_sta() on @sta and then free it with sta_info_free(). */
static void cleanup_single_sta(struct sta_info *sta)
{
	struct ieee80211_sub_if_data *sdata = sta->sdata;
	struct ieee80211_local *local = sdata->local;

	__cleanup_single_sta(sta);
	sta_info_free(local, sta);
}

/* Look up @addr in local->sta_hash and return the resulting rhlist head. */
struct rhlist_head *sta_info_hash_lookup(struct ieee80211_local *local,
					 const u8 *addr)
{
	return rhltable_lookup(&local->sta_hash, addr, sta_rht_params);
}

/*
 * Find the station with address @addr that belongs to exactly @sdata.
 * Returns NULL when there is none.
 *
 * protected by RCU: the read section taken here is dropped before
 * returning, so the result is only usable under the caller's own
 * protection (see the comment in the loop).
 */
struct sta_info *sta_info_get(struct ieee80211_sub_if_data *sdata,
			      const u8 *addr)
{
	struct ieee80211_local *local = sdata->local;
	struct rhlist_head *tmp;
	struct sta_info *sta;

	rcu_read_lock();
	for_each_sta_info(local, addr, sta, tmp) {
		if (sta->sdata == sdata) {
			rcu_read_unlock();
			/* this is safe as the caller must already hold
			 * another rcu read section or the mutex
			 */
			return sta;
		}
	}
	rcu_read_unlock();
	return NULL;
}

/*
 * Get sta info either from the specified interface
 * or from one of its vlans
 *
 * A station matches when its sdata is @sdata, or when its sdata has a
 * non-NULL bss pointer equal to @sdata->bss. Returns NULL when nothing
 * matches.
 */
struct sta_info *sta_info_get_bss(struct ieee80211_sub_if_data *sdata,
				  const u8 *addr)
{
	struct ieee80211_local *local = sdata->local;
	struct rhlist_head *tmp;
	struct sta_info *sta;

	rcu_read_lock();
	for_each_sta_info(local, addr, sta, tmp) {
		if (sta->sdata == sdata ||
		    (sta->sdata->bss && sta->sdata->bss == sdata->bss)) {
			rcu_read_unlock();
			/* this is safe as the caller must already hold
			 * another rcu read section or the mutex
			 */
			return sta;
		}
	}
	rcu_read_unlock();
	return NULL;
}

/* Look up @addr in local->link_sta_hash and return the rhlist head. */
struct rhlist_head *link_sta_info_hash_lookup(struct ieee80211_local *local,
					      const u8 *addr)
{
	return rhltable_lookup(&local->link_sta_hash, addr,
			       link_sta_rht_params);
}

/*
 * Per-link variant of sta_info_get_bss(): find the link STA with link
 * address @addr whose owning station matches @sdata (same sdata, or same
 * non-NULL bss). Returns NULL when nothing matches.
 */
struct link_sta_info *
link_sta_info_get_bss(struct ieee80211_sub_if_data *sdata, const u8 *addr)
{
	struct ieee80211_local *local = sdata->local;
	struct rhlist_head *tmp;
	struct link_sta_info *link_sta;

	rcu_read_lock();
	for_each_link_sta_info(local, addr, link_sta, tmp) {
		struct sta_info *sta = link_sta->sta;

		if (sta->sdata == sdata ||
		    (sta->sdata->bss && sta->sdata->bss == sdata->bss)) {
			rcu_read_unlock();
			/* this is safe as the caller must already hold
			 * another rcu read section or the mutex
			 */
			return link_sta;
		}
	}
	rcu_read_unlock();
	return NULL;
}

/*
 * Find a station by the address of one of its links.
 *
 * @hw: hardware the station belongs to
 * @addr: link address of the station to look for
 * @localaddr: local link address to match, or NULL to accept the first
 *	link STA found for @addr
 * @link_id: if non-NULL, receives the link ID of the matching link STA
 *
 * Returns the public station structure, or NULL if nothing matched.
 *
 * NOTE(review): no RCU read lock is taken here although rcu_dereference()
 * is used, so callers are presumably expected to hold one - confirm.
 */
struct ieee80211_sta *
ieee80211_find_sta_by_link_addrs(struct ieee80211_hw *hw,
				 const u8 *addr,
				 const u8 *localaddr,
				 unsigned int *link_id)
{
	struct ieee80211_local *local = hw_to_local(hw);
	struct link_sta_info *link_sta;
	struct rhlist_head *tmp;

	for_each_link_sta_info(local, addr, link_sta, tmp) {
		struct sta_info *sta = link_sta->sta;
		struct ieee80211_link_data *link;
		u8 _link_id = link_sta->link_id;

		if (!localaddr) {
			if (link_id)
				*link_id = _link_id;
			return &sta->sta;
		}

		/* skip link STAs whose interface has no such link */
		link = rcu_dereference(sta->sdata->link[_link_id]);
		if (!link)
			continue;

		if (memcmp(link->conf->addr, localaddr, ETH_ALEN))
			continue;

		if (link_id)
			*link_id = _link_id;
		return &sta->sta;
	}

	return NULL;
}
EXPORT_SYMBOL_GPL(ieee80211_find_sta_by_link_addrs); struct sta_info *sta_info_get_by_addrs(struct ieee80211_local *local, const u8 *sta_addr, const u8 *vif_addr) { struct rhlist_head *tmp; struct sta_info *sta; for_each_sta_info(local, sta_addr, sta, tmp) { if (ether_addr_equal(vif_addr, sta->sdata->vif.addr)) return sta; } return NULL; } struct sta_info *sta_info_get_by_idx(struct ieee80211_sub_if_data *sdata, int idx) { struct ieee80211_local *local = sdata->local; struct sta_info *sta; int i = 0; list_for_each_entry_rcu(sta, &local->sta_list, list, lockdep_is_held(&local->hw.wiphy->mtx)) { if (sdata != sta->sdata) continue; if (i < idx) { ++i; continue; } return sta; } return NULL; } static void sta_info_free_link(struct link_sta_info *link_sta) { free_percpu(link_sta->pcpu_rx_stats); } static void sta_remove_link(struct sta_info *sta, unsigned int link_id, bool unhash) { struct sta_link_alloc *alloc = NULL; struct link_sta_info *link_sta; lockdep_assert_wiphy(sta->local->hw.wiphy); link_sta = rcu_access_pointer(sta->link[link_id]); if (WARN_ON(!link_sta)) return; if (unhash) link_sta_info_hash_del(sta->local, link_sta); if (test_sta_flag(sta, WLAN_STA_INSERTED)) ieee80211_link_sta_debugfs_remove(link_sta); if (link_sta != &sta->deflink) alloc = container_of(link_sta, typeof(*alloc), info); sta->sta.valid_links &= ~BIT(link_id); RCU_INIT_POINTER(sta->link[link_id], NULL); RCU_INIT_POINTER(sta->sta.link[link_id], NULL); if (alloc) { sta_info_free_link(&alloc->info); kfree_rcu(alloc, rcu_head); } ieee80211_sta_recalc_aggregates(&sta->sta); } /** * sta_info_free - free STA * * @local: pointer to the global information * @sta: STA info to free * * This function must undo everything done by sta_info_alloc() * that may happen before sta_info_insert(). It may only be * called when sta_info_insert() has not been attempted (and * if that fails, the station is freed anyway.) 
*/
void sta_info_free(struct ieee80211_local *local, struct sta_info *sta)
{
	int i;

	/* detach every link that is still set; hash removal is not requested */
	for (i = 0; i < ARRAY_SIZE(sta->link); i++) {
		struct link_sta_info *link_sta;

		link_sta = rcu_access_pointer(sta->link[i]);
		if (!link_sta)
			continue;

		sta_remove_link(sta, i, false);
	}

	/*
	 * If we had used sta_info_pre_move_state() then we might not
	 * have gone through the state transitions down again, so do
	 * it here now (and warn if it's inserted).
	 *
	 * This will clear state such as fast TX/RX that may have been
	 * allocated during state transitions.
	 */
	while (sta->sta_state > IEEE80211_STA_NONE) {
		int ret;

		WARN_ON_ONCE(test_sta_flag(sta, WLAN_STA_INSERTED));

		ret = sta_info_move_state(sta, sta->sta_state - 1);
		if (WARN_ONCE(ret, "sta_info_move_state() returned %d\n", ret))
			break;
	}

	if (sta->rate_ctrl)
		rate_control_free_sta(sta);

	sta_dbg(sta->sdata, "Destroyed STA %pM\n", sta->sta.addr);

	/* txq[0] heads the single allocation made in __sta_info_alloc() */
	kfree(to_txq_info(sta->sta.txq[0]));
	kfree(rcu_dereference_raw(sta->sta.rates));
#ifdef CONFIG_MAC80211_MESH
	kfree(sta->mesh);
#endif

	sta_info_free_link(&sta->deflink);
	kfree(sta);
}

/* Add @sta to local->sta_hash; returns the rhltable_insert() result. */
static int sta_info_hash_add(struct ieee80211_local *local,
			     struct sta_info *sta)
{
	return rhltable_insert(&local->sta_hash, &sta->hash_node,
			       sta_rht_params);
}

/*
 * Work handler for sta->drv_deliver_wk: with bottom halves disabled, run
 * the PS delivery routine selected by the station's current flags.
 * Does nothing once the station is marked dead.
 */
static void sta_deliver_ps_frames(struct work_struct *wk)
{
	struct sta_info *sta;

	sta = container_of(wk, struct sta_info, drv_deliver_wk);

	if (sta->dead)
		return;

	local_bh_disable();
	if (!test_sta_flag(sta, WLAN_STA_PS_STA))
		ieee80211_sta_ps_deliver_wakeup(sta);
	else if (test_and_clear_sta_flag(sta, WLAN_STA_PSPOLL))
		ieee80211_sta_ps_deliver_poll_response(sta);
	else if (test_and_clear_sta_flag(sta, WLAN_STA_UAPSD))
		ieee80211_sta_ps_deliver_uapsd(sta);
	local_bh_enable();
}

/*
 * Attach rate control state to @sta unless the hardware has the
 * HAS_RATE_CONTROL flag. Returns 0 on success or when nothing is needed,
 * -ENOMEM when the per-station rate control data cannot be allocated.
 */
static int sta_prepare_rate_control(struct ieee80211_local *local,
				    struct sta_info *sta, gfp_t gfp)
{
	if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL))
		return 0;

	sta->rate_ctrl = local->rate_ctrl;
	sta->rate_ctrl_priv = rate_control_alloc_sta(sta->rate_ctrl,
						     sta, gfp);
	if (!sta->rate_ctrl_priv)
		return -ENOMEM;

	return 0;
}

/*
 * Initialise the statistics part of a link STA. Per-CPU RX statistics are
 * only allocated when the hardware has the USES_RSS flag. Returns 0, or
 * -ENOMEM when that allocation fails.
 */
static int sta_info_alloc_link(struct ieee80211_local *local,
			       struct link_sta_info *link_info,
			       gfp_t gfp)
{
	struct ieee80211_hw *hw = &local->hw;
	int i;

	if (ieee80211_hw_check(hw, USES_RSS)) {
		link_info->pcpu_rx_stats =
			alloc_percpu_gfp(struct ieee80211_sta_rx_stats, gfp);
		if (!link_info->pcpu_rx_stats)
			return -ENOMEM;
	}

	link_info->rx_stats.last_rx = jiffies;
	u64_stats_init(&link_info->rx_stats.syncp);

	ewma_signal_init(&link_info->rx_stats_avg.signal);
	ewma_avg_signal_init(&link_info->status_stats.avg_ack_signal);
	for (i = 0; i < ARRAY_SIZE(link_info->rx_stats_avg.chain_signal); i++)
		ewma_signal_init(&link_info->rx_stats_avg.chain_signal[i]);

	link_info->rx_omi_bw_rx = IEEE80211_STA_RX_BW_MAX;
	link_info->rx_omi_bw_tx = IEEE80211_STA_RX_BW_MAX;
	link_info->rx_omi_bw_staging = IEEE80211_STA_RX_BW_MAX;

	/*
	 * Cause (a) warning(s) if IEEE80211_STA_RX_BW_MAX != 320
	 * or if new values are added to the enum.
	 */
	switch (link_info->cur_max_bandwidth) {
	case IEEE80211_STA_RX_BW_20:
	case IEEE80211_STA_RX_BW_40:
	case IEEE80211_STA_RX_BW_80:
	case IEEE80211_STA_RX_BW_160:
	case IEEE80211_STA_RX_BW_MAX:
		/* intentionally nothing */
		break;
	}

	return 0;
}

/*
 * Wire a link into @sta under @link_id: cross-link the private
 * (@link_info) and public (@link_sta) structures, publish both through
 * the station's link[] arrays and set the SMPS / A-MSDU length defaults.
 */
static void sta_info_add_link(struct sta_info *sta,
			      unsigned int link_id,
			      struct link_sta_info *link_info,
			      struct ieee80211_link_sta *link_sta)
{
	link_info->sta = sta;
	link_info->link_id = link_id;
	link_info->pub = link_sta;
	link_info->pub->sta = &sta->sta;
	link_sta->link_id = link_id;
	rcu_assign_pointer(sta->link[link_id], link_info);
	rcu_assign_pointer(sta->sta.link[link_id], link_sta);

	link_sta->smps_mode = IEEE80211_SMPS_OFF;
	link_sta->agg.max_rc_amsdu_len = IEEE80211_MAX_MPDU_LEN_HT_BA;
}

/*
 * Allocate and initialise a station entry (not yet inserted).
 *
 * @sdata: interface the station belongs to
 * @addr: station address, copied to sta->addr and sta->sta.addr
 * @link_id: link ID for the embedded deflink; a negative value registers
 *	the deflink as link 0 and leaves valid_links at zero
 * @link_addr: address copied into the deflink
 * @gfp: allocation flags used for every allocation made here
 *
 * Returns the new station, or NULL if any allocation failed (everything
 * allocated up to that point is released again).
 */
static struct sta_info *
__sta_info_alloc(struct ieee80211_sub_if_data *sdata,
		 const u8 *addr, int link_id, const u8 *link_addr,
		 gfp_t gfp)
{
	struct ieee80211_local *local = sdata->local;
	struct ieee80211_hw *hw = &local->hw;
	struct sta_info *sta;
	void *txq_data;
	int size;
	int i;

	/* the driver's private per-station area trails the structure */
	sta = kzalloc(sizeof(*sta) + hw->sta_data_size, gfp);
	if (!sta)
		return NULL;

	sta->local = local;
	sta->sdata = sdata;

	if (sta_info_alloc_link(local, &sta->deflink, gfp))
		goto free;

	if (link_id >= 0) {
		sta_info_add_link(sta, link_id, &sta->deflink,
				  &sta->sta.deflink);
		sta->sta.valid_links = BIT(link_id);
	} else {
		sta_info_add_link(sta, 0, &sta->deflink, &sta->sta.deflink);
	}

	sta->sta.cur = &sta->sta.deflink.agg;

	spin_lock_init(&sta->lock);
	spin_lock_init(&sta->ps_lock);
	INIT_WORK(&sta->drv_deliver_wk, sta_deliver_ps_frames);
	wiphy_work_init(&sta->ampdu_mlme.work, ieee80211_ba_session_work);
#ifdef CONFIG_MAC80211_MESH
	if (ieee80211_vif_is_mesh(&sdata->vif)) {
		sta->mesh = kzalloc(sizeof(*sta->mesh), gfp);
		if (!sta->mesh)
			goto free;
		sta->mesh->plink_sta = sta;
		spin_lock_init(&sta->mesh->plink_lock);
		if (!sdata->u.mesh.user_mpm)
			timer_setup(&sta->mesh->plink_timer, mesh_plink_timer,
				    0);
		sta->mesh->nonpeer_pm = NL80211_MESH_POWER_ACTIVE;
	}
#endif

	memcpy(sta->addr, addr, ETH_ALEN);
	memcpy(sta->sta.addr, addr, ETH_ALEN);
	memcpy(sta->deflink.addr, link_addr, ETH_ALEN);
	memcpy(sta->sta.deflink.addr, link_addr, ETH_ALEN);
	sta->sta.max_rx_aggregation_subframes =
		local->hw.max_rx_aggregation_subframes;

	/* TODO link specific alloc and assignments for MLO Link STA */

	/* Extended Key ID needs to install keys for keyid 0 and 1 Rx-only.
	 * The Tx path starts to use a key as soon as the key slot ptk_idx
	 * references to is not NULL. To not use the initial Rx-only key
	 * prematurely for Tx initialize ptk_idx to an impossible PTK keyid
	 * which always will refer to a NULL key.
	 */
	BUILD_BUG_ON(ARRAY_SIZE(sta->ptk) <= INVALID_PTK_KEYIDX);
	sta->ptk_idx = INVALID_PTK_KEYIDX;

	ieee80211_init_frag_cache(&sta->frags);

	sta->sta_state = IEEE80211_STA_NONE;

	if (sdata->vif.type == NL80211_IFTYPE_MESH_POINT)
		sta->amsdu_mesh_control = -1;

	/* Mark TID as unreserved */
	sta->reserved_tid = IEEE80211_TID_UNRESERVED;

	sta->last_connected = ktime_get_seconds();

	/*
	 * All TXQs come from one array allocation; each element is a
	 * txq_info followed by the pointer-aligned driver TXQ data.
	 */
	size = sizeof(struct txq_info) +
	       ALIGN(hw->txq_data_size, sizeof(void *));

	txq_data = kcalloc(ARRAY_SIZE(sta->sta.txq), size, gfp);
	if (!txq_data)
		goto free;

	for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) {
		struct txq_info *txq = txq_data + i * size;

		/* might not do anything for the (bufferable) MMPDU TXQ */
		ieee80211_txq_init(sdata, sta, txq, i);
	}

	if (sta_prepare_rate_control(local, sta, gfp))
		goto free_txq;

	sta->airtime_weight = IEEE80211_DEFAULT_AIRTIME_WEIGHT;

	for (i = 0; i < IEEE80211_NUM_ACS; i++) {
		skb_queue_head_init(&sta->ps_tx_buf[i]);
		skb_queue_head_init(&sta->tx_filtered[i]);
		sta->airtime[i].deficit = sta->airtime_weight;
		atomic_set(&sta->airtime[i].aql_tx_pending, 0);
		sta->airtime[i].aql_limit_low = local->aql_txq_limit_low[i];
		sta->airtime[i].aql_limit_high = local->aql_txq_limit_high[i];
	}

	for (i = 0; i < IEEE80211_NUM_TIDS; i++)
		sta->last_seq_ctrl[i] = cpu_to_le16(USHRT_MAX);

	/* seed the deflink's supported rates with each band's mandatory rates */
	for (i = 0; i < NUM_NL80211_BANDS; i++) {
		u32 mandatory = 0;
		int r;

		if (!hw->wiphy->bands[i])
			continue;

		switch (i) {
		case NL80211_BAND_2GHZ:
		case NL80211_BAND_LC:
			/*
			 * We use both here, even if we cannot really know for
			 * sure the station will support both, but the only use
			 * for this is when we don't know anything yet and send
			 * management frames, and then we'll pick the lowest
			 * possible rate anyway.
			 * If we don't include _G here, we cannot find a rate
			 * in P2P, and thus trigger the WARN_ONCE() in rate.c
			 */
			mandatory = IEEE80211_RATE_MANDATORY_B |
				    IEEE80211_RATE_MANDATORY_G;
			break;
		case NL80211_BAND_5GHZ:
			mandatory = IEEE80211_RATE_MANDATORY_A;
			break;
		case NL80211_BAND_60GHZ:
			WARN_ON(1);
			mandatory = 0;
			break;
		}

		for (r = 0; r < hw->wiphy->bands[i]->n_bitrates; r++) {
			struct ieee80211_rate *rate;

			rate = &hw->wiphy->bands[i]->bitrates[r];

			if (!(rate->flags & mandatory))
				continue;
			sta->sta.deflink.supp_rates[i] |= BIT(r);
		}
	}

	sta->cparams.ce_threshold = CODEL_DISABLED_THRESHOLD;
	sta->cparams.target = MS2TIME(20);
	sta->cparams.interval = MS2TIME(100);
	sta->cparams.ecn = true;
	sta->cparams.ce_threshold_selector = 0;
	sta->cparams.ce_threshold_mask = 0;

	sta_dbg(sdata, "Allocated STA %pM\n", sta->sta.addr);

	return sta;

free_txq:
	kfree(to_txq_info(sta->sta.txq[0]));
free:
	sta_info_free_link(&sta->deflink);
#ifdef CONFIG_MAC80211_MESH
	kfree(sta->mesh);
#endif
	kfree(sta);
	return NULL;
}

/*
 * Allocate a station without a link ID: @addr serves as both the station
 * address and the deflink address. Returns NULL on allocation failure.
 */
struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata,
				const u8 *addr, gfp_t gfp)
{
	return __sta_info_alloc(sdata, addr, -1, addr, gfp);
}

/*
 * Allocate a station whose embedded deflink is registered as link
 * @link_id with address @link_addr; @mld_addr becomes the station address.
 * Returns NULL on allocation failure.
 */
struct sta_info *sta_info_alloc_with_link(struct ieee80211_sub_if_data *sdata,
					  const u8 *mld_addr,
					  unsigned int link_id,
					  const u8 *link_addr,
					  gfp_t gfp)
{
	return __sta_info_alloc(sdata, mld_addr, link_id, link_addr, gfp);
}

/*
 * Sanity checks done before a station is inserted. Returns 0 when the
 * insertion may proceed, otherwise -ENETDOWN (interface not running),
 * -EINVAL (address invalid or equal to the interface's own address) or
 * -ENOTUNIQ (duplicate address on hardware that needs unique addresses).
 */
static int sta_info_insert_check(struct sta_info *sta)
{
	struct ieee80211_sub_if_data *sdata = sta->sdata;

	lockdep_assert_wiphy(sdata->local->hw.wiphy);

	/*
	 * Can't be a WARN_ON because it can be triggered through a race:
	 * something inserts a STA (on one CPU) without holding the RTNL
	 * and another CPU turns off the net device.
	 */
	if (unlikely(!ieee80211_sdata_running(sdata)))
		return -ENETDOWN;

	if (WARN_ON(ether_addr_equal(sta->sta.addr, sdata->vif.addr) ||
		    !is_valid_ether_addr(sta->sta.addr)))
		return -EINVAL;

	/* The RCU read lock is required by rhashtable due to
	 * asynchronous resize/rehash.
We also require the mutex
	 * for correctness.
	 */
	rcu_read_lock();
	if (ieee80211_hw_check(&sdata->local->hw, NEEDS_UNIQUE_STA_ADDR) &&
	    ieee80211_find_sta_by_ifaddr(&sdata->local->hw, sta->addr, NULL)) {
		rcu_read_unlock();
		return -ENOTUNIQ;
	}
	rcu_read_unlock();

	return 0;
}

/*
 * Walk the driver through every state from NOTEXIST up to the station's
 * current sta_state. On failure the transitions already made are undone
 * again in reverse order.
 *
 * Returns 0 on success, otherwise the driver's error code. For IBSS
 * (ADHOC) interfaces a failure is logged and 0 is returned so that the
 * caller keeps the station; the driver transitions are still unwound.
 */
static int sta_info_insert_drv_state(struct ieee80211_local *local,
				     struct ieee80211_sub_if_data *sdata,
				     struct sta_info *sta)
{
	enum ieee80211_sta_state state;
	int err = 0;

	for (state = IEEE80211_STA_NOTEXIST; state < sta->sta_state; state++) {
		err = drv_sta_state(local, sdata, sta, state, state + 1);
		if (err)
			break;
	}

	if (!err) {
		/*
		 * Drivers using legacy sta_add/sta_remove callbacks only
		 * get uploaded set to true after sta_add is called.
		 */
		if (!local->ops->sta_add)
			sta->uploaded = true;
		return 0;
	}

	if (sdata->vif.type == NL80211_IFTYPE_ADHOC) {
		sdata_info(sdata,
			   "failed to move IBSS STA %pM to state %d (%d) - keeping it anyway\n",
			   sta->sta.addr, state + 1, err);
		err = 0;
	}

	/* unwind on error */
	for (; state > IEEE80211_STA_NOTEXIST; state--)
		WARN_ON(drv_sta_state(local, sdata, sta, state, state - 1));

	return err;
}

/*
 * Recompute bss_conf.allow_p2p_go_ps for @sdata: it starts from
 * sdata->vif.p2p and is cleared if any associated station on this
 * interface lacks support_p2p_ps. A BSS_CHANGED_P2P_PS notification is
 * sent only when the value actually changes.
 */
static void
ieee80211_recalc_p2p_go_ps_allowed(struct ieee80211_sub_if_data *sdata)
{
	struct ieee80211_local *local = sdata->local;
	bool allow_p2p_go_ps = sdata->vif.p2p;
	struct sta_info *sta;

	rcu_read_lock();
	list_for_each_entry_rcu(sta, &local->sta_list, list) {
		if (sdata != sta->sdata ||
		    !test_sta_flag(sta, WLAN_STA_ASSOC))
			continue;
		if (!sta->sta.support_p2p_ps) {
			allow_p2p_go_ps = false;
			break;
		}
	}
	rcu_read_unlock();

	if (allow_p2p_go_ps != sdata->vif.bss_conf.allow_p2p_go_ps) {
		sdata->vif.bss_conf.allow_p2p_go_ps = allow_p2p_go_ps;
		ieee80211_link_info_change_notify(sdata, &sdata->deflink,
						  BSS_CHANGED_P2P_PS);
	}
}

/*
 * Second half of station insertion: make the station visible in the hash
 * table(s) and sta_list, bring the driver up to the station's state, add
 * the debugfs entries and announce the station to cfg80211.
 *
 * Returns 0 on success or a negative error code; on error the station has
 * been cleaned up and freed. In both cases the RCU read lock is held on
 * return (see __acquires).
 */
static int sta_info_insert_finish(struct sta_info *sta) __acquires(RCU)
{
	struct ieee80211_local *local = sta->local;
	struct ieee80211_sub_if_data *sdata = sta->sdata;
	struct station_info *sinfo = NULL;
	int err = 0;

	lockdep_assert_wiphy(local->hw.wiphy);

	/* check if STA exists already */
	if (sta_info_get_bss(sdata, sta->sta.addr)) {
		err = -EEXIST;
		goto out_cleanup;
	}

	sinfo = kzalloc(sizeof(struct station_info), GFP_KERNEL);
	if (!sinfo) {
		err = -ENOMEM;
		goto out_cleanup;
	}

	local->num_sta++;
	local->sta_generation++;
	smp_mb();

	/* simplify things and don't accept BA sessions yet */
	set_sta_flag(sta, WLAN_STA_BLOCK_BA);

	/* make the station visible */
	err = sta_info_hash_add(local, sta);
	if (err)
		goto out_drop_sta;

	if (sta->sta.valid_links) {
		err = link_sta_info_hash_add(local, &sta->deflink);
		if (err) {
			sta_info_hash_del(local, sta);
			goto out_drop_sta;
		}
	}

	list_add_tail_rcu(&sta->list, &local->sta_list);

	/* update channel context before notifying the driver about state
	 * change, this enables driver using the updated channel context right away.
	 */
	if (sta->sta_state >= IEEE80211_STA_ASSOC) {
		ieee80211_recalc_min_chandef(sta->sdata, -1);
		if (!sta->sta.support_p2p_ps)
			ieee80211_recalc_p2p_go_ps_allowed(sta->sdata);
	}

	/* notify driver */
	err = sta_info_insert_drv_state(local, sdata, sta);
	if (err)
		goto out_remove;

	set_sta_flag(sta, WLAN_STA_INSERTED);

	/* accept BA sessions now */
	clear_sta_flag(sta, WLAN_STA_BLOCK_BA);

	ieee80211_sta_debugfs_add(sta);
	rate_control_add_sta_debugfs(sta);
	if (sta->sta.valid_links) {
		int i;

		for (i = 0; i < ARRAY_SIZE(sta->link); i++) {
			struct link_sta_info *link_sta;

			link_sta = rcu_dereference_protected(sta->link[i],
							     lockdep_is_held(&local->hw.wiphy->mtx));

			if (!link_sta)
				continue;

			ieee80211_link_sta_debugfs_add(link_sta);
			/* driver debugfs only for links active on the vif */
			if (sdata->vif.active_links & BIT(i))
				ieee80211_link_sta_debugfs_drv_add(link_sta);
		}
	} else {
		ieee80211_link_sta_debugfs_add(&sta->deflink);
		ieee80211_link_sta_debugfs_drv_add(&sta->deflink);
	}

	sinfo->generation = local->sta_generation;
	cfg80211_new_sta(sdata->dev, sta->sta.addr, sinfo, GFP_KERNEL);
	kfree(sinfo);

	sta_dbg(sdata, "Inserted STA %pM\n", sta->sta.addr);

	/* move reference to rcu-protected */
	rcu_read_lock();

	if (ieee80211_vif_is_mesh(&sdata->vif))
		mesh_accept_plinks_update(sdata);

	ieee80211_check_fast_xmit(sta);

	return 0;
 out_remove:
	/* undo the hash and list insertion done above */
	if (sta->sta.valid_links)
		link_sta_info_hash_del(local, &sta->deflink);
	sta_info_hash_del(local, sta);
	list_del_rcu(&sta->list);
 out_drop_sta:
	local->num_sta--;
	synchronize_net();
 out_cleanup:
	cleanup_single_sta(sta);
	kfree(sinfo);
	/* taken so that the error return matches __acquires(RCU) */
	rcu_read_lock();
	return err;
}

/*
 * Insert @sta and return with the RCU read lock held, regardless of the
 * outcome. Returns 0 or a negative error code; on error @sta has been
 * freed.
 */
int sta_info_insert_rcu(struct sta_info *sta) __acquires(RCU)
{
	struct ieee80211_local *local = sta->local;
	int err;

	might_sleep();
	lockdep_assert_wiphy(local->hw.wiphy);

	err = sta_info_insert_check(sta);
	if (err) {
		sta_info_free(local, sta);
		rcu_read_lock();
		return err;
	}

	return sta_info_insert_finish(sta);
}

/*
 * Insert @sta, dropping the RCU read lock that sta_info_insert_rcu()
 * returns with. Returns 0 or a negative error code.
 */
int sta_info_insert(struct sta_info *sta)
{
	int err = sta_info_insert_rcu(sta);

	rcu_read_unlock();

	return err;
}

/* Set the bit for @id in the TIM bitmap @tim. */
static inline void __bss_tim_set(u8 *tim, u16 id)
{
	/*
	 * This format has been mandated by the IEEE specifications,
	 * so this line may not be changed to use the __set_bit() format.
	 */
	tim[id / 8] |= (1 << (id % 8));
}

/* Clear the bit for @id in the TIM bitmap @tim. */
static inline void __bss_tim_clear(u8 *tim, u16 id)
{
	/*
	 * This format has been mandated by the IEEE specifications,
	 * so this line may not be changed to use the __clear_bit() format.
	 */
	tim[id / 8] &= ~(1 << (id % 8));
}

/* Return whether the bit for @id is set in the TIM bitmap @tim. */
static inline bool __bss_tim_get(u8 *tim, u16 id)
{
	/*
	 * This format has been mandated by the IEEE specifications,
	 * so this line may not be changed to use the test_bit() format.
*/
	return tim[id / 8] & (1 << (id % 8));
}

/*
 * Return the bitmask of TIDs (bits 0-7) mapped to access category @ac;
 * warns and returns 0 for an unknown AC.
 */
static unsigned long ieee80211_tids_for_ac(int ac)
{
	/* If we ever support TIDs > 7, this obviously needs to be adjusted */
	switch (ac) {
	case IEEE80211_AC_VO:
		return BIT(6) | BIT(7);
	case IEEE80211_AC_VI:
		return BIT(4) | BIT(5);
	case IEEE80211_AC_BE:
		return BIT(0) | BIT(3);
	case IEEE80211_AC_BK:
		return BIT(1) | BIT(2);
	default:
		WARN_ON(1);
		return 0;
	}
}

/*
 * Recompute the TIM bit for @sta's AID in the ps_data of its AP/AP_VLAN
 * BSS or mesh interface and, if it changed, update the bitmap and notify
 * the driver through drv_set_tim() when it has a set_tim op.
 *
 * @ignore_pending: request that every AC be ignored when deciding
 *	(the mask is set to all ACs), used to drop the bit on removal.
 *
 * Nothing is done for other interface types, or when the hardware has
 * AP_LINK_PS and the driver provides no set_tim op.
 */
static void __sta_info_recalc_tim(struct sta_info *sta, bool ignore_pending)
{
	struct ieee80211_local *local = sta->local;
	struct ps_data *ps;
	bool indicate_tim = false;
	u8 ignore_for_tim = sta->sta.uapsd_queues;
	int ac;
	u16 id = sta->sta.aid;

	if (sta->sdata->vif.type == NL80211_IFTYPE_AP ||
	    sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN) {
		if (WARN_ON_ONCE(!sta->sdata->bss))
			return;

		ps = &sta->sdata->bss->ps;
#ifdef CONFIG_MAC80211_MESH
	} else if (ieee80211_vif_is_mesh(&sta->sdata->vif)) {
		ps = &sta->sdata->u.mesh.ps;
#endif
	} else {
		return;
	}

	/* No need to do anything if the driver does all */
	if (ieee80211_hw_check(&local->hw, AP_LINK_PS) && !local->ops->set_tim)
		return;

	/* a dead station skips the scan, so indicate_tim stays false */
	if (sta->dead)
		goto done;

	/*
	 * If all ACs are delivery-enabled then we should build
	 * the TIM bit for all ACs anyway; if only some are then
	 * we ignore those and build the TIM bit using only the
	 * non-enabled ones.
	 */
	if (ignore_for_tim == BIT(IEEE80211_NUM_ACS) - 1)
		ignore_for_tim = 0;

	if (ignore_pending)
		ignore_for_tim = BIT(IEEE80211_NUM_ACS) - 1;

	for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) {
		unsigned long tids;

		if (ignore_for_tim & ieee80211_ac_to_qos_mask[ac])
			continue;

		indicate_tim |= !skb_queue_empty(&sta->tx_filtered[ac]) ||
				!skb_queue_empty(&sta->ps_tx_buf[ac]);
		if (indicate_tim)
			break;

		/* also count frames buffered by the driver or in TXQs */
		tids = ieee80211_tids_for_ac(ac);

		indicate_tim |=
			sta->driver_buffered_tids & tids;
		indicate_tim |=
			sta->txq_buffered_tids & tids;
	}

 done:
	spin_lock_bh(&local->tim_lock);

	/* nothing to do when the bitmap already has the wanted value */
	if (indicate_tim == __bss_tim_get(ps->tim, id))
		goto out_unlock;

	if (indicate_tim)
		__bss_tim_set(ps->tim, id);
	else
		__bss_tim_clear(ps->tim, id);

	if (local->ops->set_tim && !WARN_ON(sta->dead)) {
		local->tim_in_locked_section = true;
		drv_set_tim(local, &sta->sta, indicate_tim);
		local->tim_in_locked_section = false;
	}

out_unlock:
	spin_unlock_bh(&local->tim_lock);
}

/* Recompute the TIM bit for @sta, taking all pending frames into account. */
void sta_info_recalc_tim(struct sta_info *sta)
{
	__sta_info_recalc_tim(sta, false);
}

/*
 * Return whether the buffered frame @skb has been queued for longer than
 * the station's expiry timeout. A NULL @skb is never expired.
 */
static bool sta_info_buffer_expired(struct sta_info *sta, struct sk_buff *skb)
{
	struct ieee80211_tx_info *info;
	int timeout;

	if (!skb)
		return false;

	info = IEEE80211_SKB_CB(skb);

	/* Timeout: (2 * listen_interval * beacon_int * 1024 / 1000000) sec */
	timeout = (sta->listen_interval *
		   sta->sdata->vif.bss_conf.beacon_int *
		   32 / 15625) * HZ;
	/* never expire earlier than STA_TX_BUFFER_EXPIRE */
	if (timeout < STA_TX_BUFFER_EXPIRE)
		timeout = STA_TX_BUFFER_EXPIRE;
	return time_after(jiffies, info->control.jiffies + timeout);
}

/*
 * Free expired frames from the filtered and PS-buffered queues of access
 * category @ac, then recalculate the station's TIM bit. Returns true when
 * either queue still holds frames afterwards.
 */
static bool sta_info_cleanup_expire_buffered_ac(struct ieee80211_local *local,
						struct sta_info *sta, int ac)
{
	unsigned long flags;
	struct sk_buff *skb;

	/*
	 * First check for frames that should expire on the filtered
	 * queue. Frames here were rejected by the driver and are on
	 * a separate queue to avoid reordering with normal PS-buffered
	 * frames. They also aren't accounted for right now in the
	 * total_ps_buffered counter.
	 */
	for (;;) {
		spin_lock_irqsave(&sta->tx_filtered[ac].lock, flags);
		skb = skb_peek(&sta->tx_filtered[ac]);
		if (sta_info_buffer_expired(sta, skb))
			skb = __skb_dequeue(&sta->tx_filtered[ac]);
		else
			skb = NULL;
		spin_unlock_irqrestore(&sta->tx_filtered[ac].lock, flags);

		/*
		 * Frames are queued in order, so if this one
		 * hasn't expired yet we can stop testing. If
		 * we actually reached the end of the queue we
		 * also need to stop, of course.
		 */
		if (!skb)
			break;
		ieee80211_free_txskb(&local->hw, skb);
	}

	/*
	 * Now also check the normal PS-buffered queue, this will
	 * only find something if the filtered queue was emptied
	 * since the filtered frames are all before the normal PS
	 * buffered frames.
	 */
	for (;;) {
		spin_lock_irqsave(&sta->ps_tx_buf[ac].lock, flags);
		skb = skb_peek(&sta->ps_tx_buf[ac]);
		if (sta_info_buffer_expired(sta, skb))
			skb = __skb_dequeue(&sta->ps_tx_buf[ac]);
		else
			skb = NULL;
		spin_unlock_irqrestore(&sta->ps_tx_buf[ac].lock, flags);

		/*
		 * frames are queued in order, so if this one
		 * hasn't expired yet (or we reached the end of
		 * the queue) we can stop testing
		 */
		if (!skb)
			break;

		local->total_ps_buffered--;
		ps_dbg(sta->sdata, "Buffered frame expired (STA %pM)\n",
		       sta->sta.addr);
		ieee80211_free_txskb(&local->hw, skb);
	}

	/*
	 * Finally, recalculate the TIM bit for this station -- it might
	 * now be clear because the station was too slow to retrieve its
	 * frames.
	 */
	sta_info_recalc_tim(sta);

	/*
	 * Return whether there are any frames still buffered, this is
	 * used to check whether the cleanup timer still needs to run,
	 * if there are no frames we don't need to rearm the timer.
	 */
	return !(skb_queue_empty(&sta->ps_tx_buf[ac]) &&
		 skb_queue_empty(&sta->tx_filtered[ac]));
}

/*
 * Expire buffered frames on every access category of @sta. Returns true
 * when any AC still has frames buffered afterwards.
 */
static bool sta_info_cleanup_expire_buffered(struct ieee80211_local *local,
					     struct sta_info *sta)
{
	bool have_buffered = false;
	int ac;

	/* This is only necessary for stations on BSS/MBSS interfaces */
	if (!sta->sdata->bss &&
	    !ieee80211_vif_is_mesh(&sta->sdata->vif))
		return false;

	for (ac = 0; ac < IEEE80211_NUM_ACS; ac++)
		have_buffered |=
			sta_info_cleanup_expire_buffered_ac(local, sta, ac);

	return have_buffered;
}

/*
 * First stage of station removal: block and tear down BA sessions, then
 * take the station out of the hash tables and sta_list so that no new
 * lookups find it.
 *
 * Returns 0 on success, -ENOENT when @sta is NULL, or the error from
 * removing the station from the hash table.
 */
static int __must_check __sta_info_destroy_part1(struct sta_info *sta)
{
	struct ieee80211_local *local;
	struct ieee80211_sub_if_data *sdata;
	int ret, i;

	might_sleep();

	if (!sta)
		return -ENOENT;

	local = sta->local;
	sdata = sta->sdata;

	lockdep_assert_wiphy(local->hw.wiphy);

	/*
	 * Before removing the station from the driver and
	 * rate control, it might still start new aggregation
	 * sessions -- block that to make sure the tear-down
	 * will be sufficient.
	 */
	set_sta_flag(sta, WLAN_STA_BLOCK_BA);
	ieee80211_sta_tear_down_BA_sessions(sta, AGG_STOP_DESTROY_STA);

	/*
	 * Before removing the station from the driver there might be pending
	 * rx frames on RSS queues sent prior to the disassociation - wait for
	 * all such frames to be processed.
	 */
	drv_sync_rx_queues(local, sta);

	/* unhash every link that is marked valid */
	for (i = 0; i < ARRAY_SIZE(sta->link); i++) {
		struct link_sta_info *link_sta;

		if (!(sta->sta.valid_links & BIT(i)))
			continue;

		link_sta = rcu_dereference_protected(sta->link[i],
						     lockdep_is_held(&local->hw.wiphy->mtx));

		link_sta_info_hash_del(local, link_sta);
	}

	ret = sta_info_hash_del(local, sta);
	if (WARN_ON(ret))
		return ret;

	/*
	 * for TDLS peers, make sure to return to the base channel before
	 * removal.
*/
	if (test_sta_flag(sta, WLAN_STA_TDLS_OFF_CHANNEL)) {
		drv_tdls_cancel_channel_switch(local, sdata, &sta->sta);
		clear_sta_flag(sta, WLAN_STA_TDLS_OFF_CHANNEL);
	}

	list_del_rcu(&sta->list);
	sta->removed = true;

	if (sta->uploaded)
		drv_sta_pre_rcu_remove(local, sta->sdata, sta);

	/* drop the AP_VLAN's pointer to this station if it refers to it */
	if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN &&
	    rcu_access_pointer(sdata->u.vlan.sta) == sta)
		RCU_INIT_POINTER(sdata->u.vlan.sta, NULL);

	return 0;
}

/*
 * Move @sta to @new_state, one step at a time.
 *
 * Only transitions between neighbouring states in the order
 * NONE - AUTH - ASSOC - AUTHORIZED are accepted. For an inserted station
 * the driver is told first and may reject the transition.
 *
 * @recalc: when true, the minimum chandef and the P2P GO powersave
 *	permission are recalculated whenever the station enters or leaves
 *	the ASSOC state from/to AUTH.
 *
 * Returns 0 on success (including when the station is already in
 * @new_state), -EINVAL for a disallowed transition, or the driver's
 * error code.
 */
static int _sta_info_move_state(struct sta_info *sta,
				enum ieee80211_sta_state new_state,
				bool recalc)
{
	struct ieee80211_local *local = sta->local;

	might_sleep();

	if (sta->sta_state == new_state)
		return 0;

	/* check allowed transitions first */

	switch (new_state) {
	case IEEE80211_STA_NONE:
		if (sta->sta_state != IEEE80211_STA_AUTH)
			return -EINVAL;
		break;
	case IEEE80211_STA_AUTH:
		if (sta->sta_state != IEEE80211_STA_NONE &&
		    sta->sta_state != IEEE80211_STA_ASSOC)
			return -EINVAL;
		break;
	case IEEE80211_STA_ASSOC:
		if (sta->sta_state != IEEE80211_STA_AUTH &&
		    sta->sta_state != IEEE80211_STA_AUTHORIZED)
			return -EINVAL;
		break;
	case IEEE80211_STA_AUTHORIZED:
		if (sta->sta_state != IEEE80211_STA_ASSOC)
			return -EINVAL;
		break;
	default:
		WARN(1, "invalid state %d", new_state);
		return -EINVAL;
	}

	sta_dbg(sta->sdata, "moving STA %pM to state %d\n",
		sta->sta.addr, new_state);

	/* notify the driver before the actual changes so it can
	 * fail the transition
	 */
	if (test_sta_flag(sta, WLAN_STA_INSERTED)) {
		int err = drv_sta_state(sta->local, sta->sdata, sta,
					sta->sta_state, new_state);
		if (err)
			return err;
	}

	/* reflect the change in all state variables */

	switch (new_state) {
	case IEEE80211_STA_NONE:
		if (sta->sta_state == IEEE80211_STA_AUTH)
			clear_bit(WLAN_STA_AUTH, &sta->_flags);
		break;
	case IEEE80211_STA_AUTH:
		if (sta->sta_state == IEEE80211_STA_NONE) {
			set_bit(WLAN_STA_AUTH, &sta->_flags);
		} else if (sta->sta_state == IEEE80211_STA_ASSOC) {
			clear_bit(WLAN_STA_ASSOC, &sta->_flags);
			if (recalc) {
				ieee80211_recalc_min_chandef(sta->sdata, -1);
				if (!sta->sta.support_p2p_ps)
					ieee80211_recalc_p2p_go_ps_allowed(sta->sdata);
			}
		}
		break;
	case IEEE80211_STA_ASSOC:
		if (sta->sta_state == IEEE80211_STA_AUTH) {
			set_bit(WLAN_STA_ASSOC, &sta->_flags);
			sta->assoc_at = ktime_get_boottime_ns();
			if (recalc) {
				ieee80211_recalc_min_chandef(sta->sdata, -1);
				if (!sta->sta.support_p2p_ps)
					ieee80211_recalc_p2p_go_ps_allowed(sta->sdata);
			}
		} else if (sta->sta_state == IEEE80211_STA_AUTHORIZED) {
			ieee80211_vif_dec_num_mcast(sta->sdata);
			clear_bit(WLAN_STA_AUTHORIZED, &sta->_flags);

			/*
			 * If we have encryption offload, flush (station) queues
			 * (after ensuring concurrent TX completed) so we won't
			 * transmit anything later unencrypted if/when keys are
			 * also removed, which might otherwise happen depending
			 * on how the hardware offload works.
			 */
			if (local->ops->set_key) {
				synchronize_net();
				if (local->ops->flush_sta)
					drv_flush_sta(local, sta->sdata, sta);
				else
					ieee80211_flush_queues(local,
							       sta->sdata,
							       false);
			}

			ieee80211_clear_fast_xmit(sta);
			ieee80211_clear_fast_rx(sta);
		}
		break;
	case IEEE80211_STA_AUTHORIZED:
		if (sta->sta_state == IEEE80211_STA_ASSOC) {
			ieee80211_vif_inc_num_mcast(sta->sdata);
			set_bit(WLAN_STA_AUTHORIZED, &sta->_flags);
			ieee80211_check_fast_xmit(sta);
			ieee80211_check_fast_rx(sta);
		}
		if (sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN ||
		    sta->sdata->vif.type == NL80211_IFTYPE_AP)
			cfg80211_send_layer2_update(sta->sdata->dev,
						    sta->sta.addr);
		break;
	default:
		break;
	}

	sta->sta_state = new_state;

	return 0;
}

/*
 * Move @sta to @new_state with recalculation enabled; see
 * _sta_info_move_state() for the transition rules and return values.
 */
int sta_info_move_state(struct sta_info *sta,
			enum ieee80211_sta_state new_state)
{
	return _sta_info_move_state(sta, new_state, true);
}

/*
 * Second stage of station removal: walk the station down to the NONE
 * state, free its keys, clear its TIM bit, remove it from the driver,
 * report the removal to cfg80211 and finally clean up and free it.
 *
 * @recalc is passed on to _sta_info_move_state() for every transition.
 */
static void __sta_info_destroy_part2(struct sta_info *sta, bool recalc)
{
	struct ieee80211_local *local = sta->local;
	struct ieee80211_sub_if_data *sdata = sta->sdata;
	struct station_info *sinfo;
	int ret;

	/*
	 * NOTE: This assumes at least synchronize_net() was done
	 *	 after _part1 and before _part2!
*/ /* * There's a potential race in _part1 where we set WLAN_STA_BLOCK_BA * but someone might have just gotten past a check, and not yet into * queuing the work/creating the data/etc. * * Do another round of destruction so that the worker is certainly * canceled before we later free the station. * * Since this is after synchronize_rcu()/synchronize_net() we're now * certain that nobody can actually hold a reference to the STA and * be calling e.g. ieee80211_start_tx_ba_session(). */ ieee80211_sta_tear_down_BA_sessions(sta, AGG_STOP_DESTROY_STA); might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (sta->sta_state == IEEE80211_STA_AUTHORIZED) { ret = _sta_info_move_state(sta, IEEE80211_STA_ASSOC, recalc); WARN_ON_ONCE(ret); } /* now keys can no longer be reached */ ieee80211_free_sta_keys(local, sta); /* disable TIM bit - last chance to tell driver */ __sta_info_recalc_tim(sta, true); sta->dead = true; local->num_sta--; local->sta_generation++; while (sta->sta_state > IEEE80211_STA_NONE) { ret = _sta_info_move_state(sta, sta->sta_state - 1, recalc); if (ret) { WARN_ON_ONCE(1); break; } } if (sta->uploaded) { ret = drv_sta_state(local, sdata, sta, IEEE80211_STA_NONE, IEEE80211_STA_NOTEXIST); WARN_ON_ONCE(ret != 0); } sta_dbg(sdata, "Removed STA %pM\n", sta->sta.addr); sinfo = kzalloc(sizeof(*sinfo), GFP_KERNEL); if (sinfo) sta_set_sinfo(sta, sinfo, true); cfg80211_del_sta_sinfo(sdata->dev, sta->sta.addr, sinfo, GFP_KERNEL); kfree(sinfo); ieee80211_sta_debugfs_remove(sta); ieee80211_destroy_frag_cache(&sta->frags); cleanup_single_sta(sta); } int __must_check __sta_info_destroy(struct sta_info *sta) { int err = __sta_info_destroy_part1(sta); if (err) return err; synchronize_net(); __sta_info_destroy_part2(sta, true); return 0; } int sta_info_destroy_addr(struct ieee80211_sub_if_data *sdata, const u8 *addr) { struct sta_info *sta; lockdep_assert_wiphy(sdata->local->hw.wiphy); sta = sta_info_get(sdata, addr); return __sta_info_destroy(sta); } int 
sta_info_destroy_addr_bss(struct ieee80211_sub_if_data *sdata, const u8 *addr) { struct sta_info *sta; lockdep_assert_wiphy(sdata->local->hw.wiphy); sta = sta_info_get_bss(sdata, addr); return __sta_info_destroy(sta); } static void sta_info_cleanup(struct timer_list *t) { struct ieee80211_local *local = from_timer(local, t, sta_cleanup); struct sta_info *sta; bool timer_needed = false; rcu_read_lock(); list_for_each_entry_rcu(sta, &local->sta_list, list) if (sta_info_cleanup_expire_buffered(local, sta)) timer_needed = true; rcu_read_unlock(); if (local->quiescing) return; if (!timer_needed) return; mod_timer(&local->sta_cleanup, round_jiffies(jiffies + STA_INFO_CLEANUP_INTERVAL)); } int sta_info_init(struct ieee80211_local *local) { int err; err = rhltable_init(&local->sta_hash, &sta_rht_params); if (err) return err; err = rhltable_init(&local->link_sta_hash, &link_sta_rht_params); if (err) { rhltable_destroy(&local->sta_hash); return err; } spin_lock_init(&local->tim_lock); INIT_LIST_HEAD(&local->sta_list); timer_setup(&local->sta_cleanup, sta_info_cleanup, 0); return 0; } void sta_info_stop(struct ieee80211_local *local) { del_timer_sync(&local->sta_cleanup); rhltable_destroy(&local->sta_hash); rhltable_destroy(&local->link_sta_hash); } int __sta_info_flush(struct ieee80211_sub_if_data *sdata, bool vlans, int link_id, struct sta_info *do_not_flush_sta) { struct ieee80211_local *local = sdata->local; struct sta_info *sta, *tmp; LIST_HEAD(free_list); int ret = 0; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); WARN_ON(vlans && sdata->vif.type != NL80211_IFTYPE_AP); WARN_ON(vlans && !sdata->bss); list_for_each_entry_safe(sta, tmp, &local->sta_list, list) { if (sdata != sta->sdata && (!vlans || sdata->bss != sta->sdata->bss)) continue; if (sta == do_not_flush_sta) continue; if (link_id >= 0 && sta->sta.valid_links && !(sta->sta.valid_links & BIT(link_id))) continue; if (!WARN_ON(__sta_info_destroy_part1(sta))) list_add(&sta->free_list, &free_list); ret++; } if 
(!list_empty(&free_list)) { bool support_p2p_ps = true; synchronize_net(); list_for_each_entry_safe(sta, tmp, &free_list, free_list) { if (!sta->sta.support_p2p_ps) support_p2p_ps = false; __sta_info_destroy_part2(sta, false); } ieee80211_recalc_min_chandef(sdata, -1); if (!support_p2p_ps) ieee80211_recalc_p2p_go_ps_allowed(sdata); } return ret; } void ieee80211_sta_expire(struct ieee80211_sub_if_data *sdata, unsigned long exp_time) { struct ieee80211_local *local = sdata->local; struct sta_info *sta, *tmp; lockdep_assert_wiphy(local->hw.wiphy); list_for_each_entry_safe(sta, tmp, &local->sta_list, list) { unsigned long last_active = ieee80211_sta_last_active(sta); if (sdata != sta->sdata) continue; if (time_is_before_jiffies(last_active + exp_time)) { sta_dbg(sta->sdata, "expiring inactive STA %pM\n", sta->sta.addr); if (ieee80211_vif_is_mesh(&sdata->vif) && test_sta_flag(sta, WLAN_STA_PS_STA)) atomic_dec(&sdata->u.mesh.ps.num_sta_ps); WARN_ON(__sta_info_destroy(sta)); } } } struct ieee80211_sta *ieee80211_find_sta_by_ifaddr(struct ieee80211_hw *hw, const u8 *addr, const u8 *localaddr) { struct ieee80211_local *local = hw_to_local(hw); struct rhlist_head *tmp; struct sta_info *sta; /* * Just return a random station if localaddr is NULL * ... first in list. 
*/
	for_each_sta_info(local, addr, sta, tmp) {
		if (localaddr &&
		    !ether_addr_equal(sta->sdata->vif.addr, localaddr))
			continue;
		/* the first match decides; it is hidden unless uploaded */
		if (!sta->uploaded)
			return NULL;
		return &sta->sta;
	}

	return NULL;
}
EXPORT_SYMBOL_GPL(ieee80211_find_sta_by_ifaddr);

/*
 * Look up the station with address @addr on @vif (via sta_info_get_bss())
 * and return its public part.
 *
 * Returns NULL when @vif is NULL, no station is found, or the station has
 * not been uploaded.
 */
struct ieee80211_sta *ieee80211_find_sta(struct ieee80211_vif *vif,
					 const u8 *addr)
{
	struct sta_info *sta;

	if (!vif)
		return NULL;

	sta = sta_info_get_bss(vif_to_sdata(vif), addr);
	if (!sta)
		return NULL;

	if (!sta->uploaded)
		return NULL;

	return &sta->sta;
}
EXPORT_SYMBOL(ieee80211_find_sta);

/* powersave support code */

/*
 * Handle a station leaving powersave: reset the service-period and
 * driver/txq buffered state, wake its TXQs, move all filtered and
 * PS-buffered frames onto the pending queue, clear the PS delivery flags
 * and update the powersave accounting and the TIM.
 *
 * Does nothing for interfaces that are neither AP (or AP_VLAN, which is
 * redirected to its AP) nor mesh.
 */
void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta)
{
	struct ieee80211_sub_if_data *sdata = sta->sdata;
	struct ieee80211_local *local = sdata->local;
	struct sk_buff_head pending;
	int filtered = 0, buffered = 0, ac, i;
	unsigned long flags;
	struct ps_data *ps;

	/* an AP_VLAN shares the powersave state of its AP interface */
	if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
		sdata = container_of(sdata->bss, struct ieee80211_sub_if_data,
				     u.ap);

	if (sdata->vif.type == NL80211_IFTYPE_AP)
		ps = &sdata->bss->ps;
	else if (ieee80211_vif_is_mesh(&sdata->vif))
		ps = &sdata->u.mesh.ps;
	else
		return;

	clear_sta_flag(sta, WLAN_STA_SP);

	/* the plain assignments below rely on the bitmaps fitting one long */
	BUILD_BUG_ON(BITS_TO_LONGS(IEEE80211_NUM_TIDS) > 1);
	sta->driver_buffered_tids = 0;
	sta->txq_buffered_tids = 0;

	if (!ieee80211_hw_check(&local->hw, AP_LINK_PS))
		drv_sta_notify(local, sdata, STA_NOTIFY_AWAKE, &sta->sta);

	/* kick every TXQ of this station that has something queued */
	for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) {
		if (!sta->sta.txq[i] || !txq_has_queue(sta->sta.txq[i]))
			continue;

		schedule_and_wake_txq(local, to_txq_info(sta->sta.txq[i]));
	}

	skb_queue_head_init(&pending);

	/* sync with ieee80211_tx_h_unicast_ps_buf */
	spin_lock_bh(&sta->ps_lock);
	/* Send all buffered frames to the station */
	for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) {
		int count = skb_queue_len(&pending), tmp;

		/* filtered frames first, counted by the growth of 'pending' */
		spin_lock_irqsave(&sta->tx_filtered[ac].lock, flags);
		skb_queue_splice_tail_init(&sta->tx_filtered[ac], &pending);
		spin_unlock_irqrestore(&sta->tx_filtered[ac].lock, flags);
		tmp = skb_queue_len(&pending);
		filtered += tmp - count;
		count = tmp;

		/* then the PS-buffered frames of the same AC */
		spin_lock_irqsave(&sta->ps_tx_buf[ac].lock, flags);
		skb_queue_splice_tail_init(&sta->ps_tx_buf[ac], &pending);
		spin_unlock_irqrestore(&sta->ps_tx_buf[ac].lock, flags);
		tmp = skb_queue_len(&pending);
		buffered += tmp - count;
	}

	ieee80211_add_pending_skbs(local, &pending);

	/* now we're no longer in the deliver code */
	clear_sta_flag(sta, WLAN_STA_PS_DELIVER);

	/* The station might have polled and then woken up before we responded,
	 * so clear these flags now to avoid them sticking around.
	 */
	clear_sta_flag(sta, WLAN_STA_PSPOLL);
	clear_sta_flag(sta, WLAN_STA_UAPSD);
	spin_unlock_bh(&sta->ps_lock);

	atomic_dec(&ps->num_sta_ps);

	/* only the PS-buffered frames were part of this counter */
	local->total_ps_buffered -= buffered;

	sta_info_recalc_tim(sta);

	ps_dbg(sdata,
	       "STA %pM aid %d sending %d filtered/%d PS frames since STA woke up\n",
	       sta->sta.addr, sta->sta.aid, filtered, buffered);

	ieee80211_check_fast_xmit(sta);
}

static void ieee80211_send_null_response(struct sta_info *sta, int tid,
					 enum ieee80211_frame_release_type reason,
					 bool call_driver, bool more_data)
{
	struct ieee80211_sub_if_data *sdata = sta->sdata;
	struct ieee80211_local *local = sdata->local;
	struct ieee80211_qos_hdr *nullfunc;
	struct sk_buff *skb;
	int size = sizeof(*nullfunc);
	__le16 fc;
	bool qos = sta->sta.wme;
	struct ieee80211_tx_info *info;
	struct ieee80211_chanctx_conf *chanctx_conf;

	if (qos) {
		fc = cpu_to_le16(IEEE80211_FTYPE_DATA |
				 IEEE80211_STYPE_QOS_NULLFUNC |
				 IEEE80211_FCTL_FROMDS);
	} else {
		/* frame is built 2 bytes shorter without the QoS control field */
		size -= 2;
		fc = cpu_to_le16(IEEE80211_FTYPE_DATA |
				 IEEE80211_STYPE_NULLFUNC |
				 IEEE80211_FCTL_FROMDS);
	}

	skb = dev_alloc_skb(local->hw.extra_tx_headroom + size);
	if (!skb)
		return;

	skb_reserve(skb, local->hw.extra_tx_headroom);

	nullfunc = skb_put(skb, size);
	nullfunc->frame_control = fc;
	nullfunc->duration_id = 0;
	memcpy(nullfunc->addr1, sta->sta.addr, ETH_ALEN);
	memcpy(nullfunc->addr2, sdata->vif.addr, ETH_ALEN);
	memcpy(nullfunc->addr3, sdata->vif.addr, ETH_ALEN);
	nullfunc->seq_ctrl = 0;

	skb->priority = tid;
	skb_set_queue_mapping(skb, ieee802_1d_to_ac[tid]);
	if (qos)
{ nullfunc->qos_ctrl = cpu_to_le16(tid); if (reason == IEEE80211_FRAME_RELEASE_UAPSD) { nullfunc->qos_ctrl |= cpu_to_le16(IEEE80211_QOS_CTL_EOSP); if (more_data) nullfunc->frame_control |= cpu_to_le16(IEEE80211_FCTL_MOREDATA); } } info = IEEE80211_SKB_CB(skb); /* * Tell TX path to send this frame even though the * STA may still remain is PS mode after this frame * exchange. Also set EOSP to indicate this packet * ends the poll/service period. */ info->flags |= IEEE80211_TX_CTL_NO_PS_BUFFER | IEEE80211_TX_STATUS_EOSP | IEEE80211_TX_CTL_REQ_TX_STATUS; info->control.flags |= IEEE80211_TX_CTRL_PS_RESPONSE; if (call_driver) drv_allow_buffered_frames(local, sta, BIT(tid), 1, reason, false); skb->dev = sdata->dev; rcu_read_lock(); chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf); if (WARN_ON(!chanctx_conf)) { rcu_read_unlock(); kfree_skb(skb); return; } info->band = chanctx_conf->def.chan->band; ieee80211_xmit(sdata, sta, skb); rcu_read_unlock(); } static int find_highest_prio_tid(unsigned long tids) { /* lower 3 TIDs aren't ordered perfectly */ if (tids & 0xF8) return fls(tids) - 1; /* TID 0 is BE just like TID 3 */ if (tids & BIT(0)) return 0; return fls(tids) - 1; } /* Indicates if the MORE_DATA bit should be set in the last * frame obtained by ieee80211_sta_ps_get_frames. * Note that driver_release_tids is relevant only if * reason = IEEE80211_FRAME_RELEASE_PSPOLL */ static bool ieee80211_sta_ps_more_data(struct sta_info *sta, u8 ignored_acs, enum ieee80211_frame_release_type reason, unsigned long driver_release_tids) { int ac; /* If the driver has data on more than one TID then * certainly there's more data if we release just a * single frame now (from a single TID). This will * only happen for PS-Poll. 
*/ if (reason == IEEE80211_FRAME_RELEASE_PSPOLL && hweight16(driver_release_tids) > 1) return true; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { if (ignored_acs & ieee80211_ac_to_qos_mask[ac]) continue; if (!skb_queue_empty(&sta->tx_filtered[ac]) || !skb_queue_empty(&sta->ps_tx_buf[ac])) return true; } return false; } static void ieee80211_sta_ps_get_frames(struct sta_info *sta, int n_frames, u8 ignored_acs, enum ieee80211_frame_release_type reason, struct sk_buff_head *frames, unsigned long *driver_release_tids) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; int ac; /* Get response frame(s) and more data bit for the last one. */ for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { unsigned long tids; if (ignored_acs & ieee80211_ac_to_qos_mask[ac]) continue; tids = ieee80211_tids_for_ac(ac); /* if we already have frames from software, then we can't also * release from hardware queues */ if (skb_queue_empty(frames)) { *driver_release_tids |= sta->driver_buffered_tids & tids; *driver_release_tids |= sta->txq_buffered_tids & tids; } if (!*driver_release_tids) { struct sk_buff *skb; while (n_frames > 0) { skb = skb_dequeue(&sta->tx_filtered[ac]); if (!skb) { skb = skb_dequeue( &sta->ps_tx_buf[ac]); if (skb) local->total_ps_buffered--; } if (!skb) break; n_frames--; __skb_queue_tail(frames, skb); } } /* If we have more frames buffered on this AC, then abort the * loop since we can't send more data from other ACs before * the buffered frames from this. 
*/
		if (!skb_queue_empty(&sta->tx_filtered[ac]) ||
		    !skb_queue_empty(&sta->ps_tx_buf[ac]))
			break;
	}
}

/*
 * Answer a PS-Poll or start a U-APSD service period for @sta.
 *
 * Up to @n_frames frames are collected from the ACs not masked out by
 * @ignored_acs. Depending on what was found, one of three things happens:
 *  - nothing is buffered anywhere: a (QoS-)null frame is sent instead;
 *  - frames were taken from mac80211's own queues: they are marked as PS
 *    responses (more-data / EOSP) and queued for transmission;
 *  - the frames sit in the driver or in TXQs: the driver is asked to
 *    release them.
 */
static void
ieee80211_sta_ps_deliver_response(struct sta_info *sta,
				  int n_frames, u8 ignored_acs,
				  enum ieee80211_frame_release_type reason)
{
	struct ieee80211_sub_if_data *sdata = sta->sdata;
	struct ieee80211_local *local = sdata->local;
	unsigned long driver_release_tids = 0;
	struct sk_buff_head frames;
	bool more_data;

	/* Service or PS-Poll period starts */
	set_sta_flag(sta, WLAN_STA_SP);

	__skb_queue_head_init(&frames);

	ieee80211_sta_ps_get_frames(sta, n_frames, ignored_acs, reason,
				    &frames, &driver_release_tids);

	more_data = ieee80211_sta_ps_more_data(sta, ignored_acs, reason,
					       driver_release_tids);

	/* a PS-Poll releases from exactly one TID: keep only the best one */
	if (driver_release_tids && reason == IEEE80211_FRAME_RELEASE_PSPOLL)
		driver_release_tids =
			BIT(find_highest_prio_tid(driver_release_tids));

	if (skb_queue_empty(&frames) && !driver_release_tids) {
		int tid, ac;

		/*
		 * For PS-Poll, this can only happen due to a race condition
		 * when we set the TIM bit and the station notices it, but
		 * before it can poll for the frame we expire it.
		 *
		 * For uAPSD, this is said in the standard (11.2.1.5 h):
		 *	At each unscheduled SP for a non-AP STA, the AP shall
		 *	attempt to transmit at least one MSDU or MMPDU, but no
		 *	more than the value specified in the Max SP Length field
		 *	in the QoS Capability element from delivery-enabled ACs,
		 *	that are destined for the non-AP STA.
		 *
		 * Since we have no other MSDU/MMPDU, transmit a QoS null frame.
		 */

		/* This will evaluate to 1, 3, 5 or 7. */
		for (ac = IEEE80211_AC_VO; ac < IEEE80211_NUM_ACS; ac++)
			if (!(ignored_acs & ieee80211_ac_to_qos_mask[ac]))
				break;
		tid = 7 - 2 * ac;

		ieee80211_send_null_response(sta, tid, reason, true, false);
	} else if (!driver_release_tids) {
		struct sk_buff_head pending;
		struct sk_buff *skb;
		int num = 0;
		u16 tids = 0;
		bool need_null = false;

		skb_queue_head_init(&pending);

		while ((skb = __skb_dequeue(&frames))) {
			struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
			struct ieee80211_hdr *hdr = (void *) skb->data;
			u8 *qoshdr = NULL;

			num++;

			/*
			 * Tell TX path to send this frame even though the
			 * STA may still remain in PS mode after this frame
			 * exchange.
			 */
			info->flags |= IEEE80211_TX_CTL_NO_PS_BUFFER;
			info->control.flags |= IEEE80211_TX_CTRL_PS_RESPONSE;

			/*
			 * Use MoreData flag to indicate whether there are
			 * more buffered frames for this STA
			 */
			if (more_data || !skb_queue_empty(&frames))
				hdr->frame_control |=
					cpu_to_le16(IEEE80211_FCTL_MOREDATA);
			else
				hdr->frame_control &=
					cpu_to_le16(~IEEE80211_FCTL_MOREDATA);

			if (ieee80211_is_data_qos(hdr->frame_control) ||
			    ieee80211_is_qos_nullfunc(hdr->frame_control))
				qoshdr = ieee80211_get_qos_ctl(hdr);

			/* remember which TIDs we release from, for the driver */
			tids |= BIT(skb->priority);

			__skb_queue_tail(&pending, skb);

			/* end service period after last frame or add one */
			if (!skb_queue_empty(&frames))
				continue;

			if (reason != IEEE80211_FRAME_RELEASE_UAPSD) {
				/* for PS-Poll, there's only one frame */
				info->flags |= IEEE80211_TX_STATUS_EOSP |
					       IEEE80211_TX_CTL_REQ_TX_STATUS;
				break;
			}

			/* For uAPSD, things are a bit more complicated. If the
			 * last frame has a QoS header (i.e. is a QoS-data or
			 * QoS-nulldata frame) then just set the EOSP bit there
			 * and be done.
			 * If the frame doesn't have a QoS header (which means
			 * it should be a bufferable MMPDU) then we can't set
			 * the EOSP bit in the QoS header; add a QoS-nulldata
			 * frame to the list to send it after the MMPDU.
			 *
			 * Note that this code is only in the mac80211-release
			 * code path, we assume that the driver will not buffer
			 * anything but QoS-data frames, or if it does, will
			 * create the QoS-nulldata frame by itself if needed.
			 *
			 * Cf. 802.11-2012 10.2.1.10 (c).
			 */
			if (qoshdr) {
				*qoshdr |= IEEE80211_QOS_CTL_EOSP;

				info->flags |= IEEE80211_TX_STATUS_EOSP |
					       IEEE80211_TX_CTL_REQ_TX_STATUS;
			} else {
				/* The standard isn't completely clear on this
				 * as it says the more-data bit should be set
				 * if there are more BUs. The QoS-Null frame
				 * we're about to send isn't buffered yet, we
				 * only create it below, but let's pretend it
				 * was buffered just in case some clients only
				 * expect more-data=0 when eosp=1.
				 */
				hdr->frame_control |=
					cpu_to_le16(IEEE80211_FCTL_MOREDATA);
				need_null = true;
				/* account for the extra null frame sent below */
				num++;
			}
			break;
		}

		drv_allow_buffered_frames(local, sta, tids, num,
					  reason, more_data);

		ieee80211_add_pending_skbs(local, &pending);

		if (need_null)
			ieee80211_send_null_response(
				sta, find_highest_prio_tid(tids),
				reason, false, false);

		sta_info_recalc_tim(sta);
	} else {
		int tid;

		/*
		 * We need to release a frame that is buffered somewhere in the
		 * driver ... it'll have to handle that.
		 * Note that the driver also has to check the number of frames
		 * on the TIDs we're releasing from - if there are more than
		 * n_frames it has to set the more-data bit (if we didn't ask
		 * it to set it anyway due to other buffered frames); if there
		 * are fewer than n_frames it has to make sure to adjust that
		 * to allow the service period to end properly.
		 */
		drv_release_buffered_frames(local, sta, driver_release_tids,
					    n_frames, reason, more_data);

		/*
		 * Note that we don't recalculate the TIM bit here as it would
		 * most likely have no effect at all unless the driver told us
		 * that the TID(s) became empty before returning here from the
		 * release function.
		 * Either way, however, when the driver tells us that the TID(s)
		 * became empty or we find that a txq became empty, we'll do the
		 * TIM recalculation.
		 */

		/* one recalculation is enough: stop at the first emptied TXQ */
		for (tid = 0; tid < ARRAY_SIZE(sta->sta.txq); tid++) {
			if (!sta->sta.txq[tid] ||
			    !(driver_release_tids & BIT(tid)) ||
			    txq_has_queue(sta->sta.txq[tid]))
				continue;

			sta_info_recalc_tim(sta);
			break;
		}
	}
}

/*
 * Respond to a PS-Poll from @sta by releasing a single frame, taken from
 * the ACs that are not U-APSD delivery-enabled (or from any AC when all of
 * them are).
 */
void ieee80211_sta_ps_deliver_poll_response(struct sta_info *sta)
{
	u8 ignore_for_response = sta->sta.uapsd_queues;

	/*
	 * If all ACs are delivery-enabled then we should reply
	 * from any of them, if only some are enabled we reply
	 * only from the non-enabled ones.
	 */
	if (ignore_for_response == BIT(IEEE80211_NUM_ACS) - 1)
		ignore_for_response = 0;

	ieee80211_sta_ps_deliver_response(sta, 1, ignore_for_response,
					  IEEE80211_FRAME_RELEASE_PSPOLL);
}

void ieee80211_sta_ps_deliver_uapsd(struct sta_info *sta)
{
	int n_frames = sta->sta.max_sp;
	u8 delivery_enabled = sta->sta.uapsd_queues;

	/*
	 * If we ever grow support for TSPEC this might happen if
	 * the TSPEC update from hostapd comes in between a trigger
	 * frame setting WLAN_STA_UAPSD in the RX path and this
	 * actually getting called.
	 */
	if (!delivery_enabled)
		return;

	switch (sta->sta.max_sp) {
	case 1:
		n_frames = 2;
		break;
	case 2:
		n_frames = 4;
		break;
	case 3:
		n_frames = 6;
		break;
	case 0:
		/* XXX: what is a good value?
*/
		n_frames = 128;
		break;
	}

	/* release only from the delivery-enabled ACs */
	ieee80211_sta_ps_deliver_response(sta, n_frames, ~delivery_enabled,
					  IEEE80211_FRAME_RELEASE_UAPSD);
}

/*
 * Driver API: set (@block == true) or lift (@block == false) the driver's
 * powersave block (WLAN_STA_PS_DRIVER) on @pubsta.
 *
 * When lifting the block, the follow-up depends on the station's state:
 *  - station is awake: mark delivery in progress and queue the deliver work;
 *  - station is asleep but has polled or triggered U-APSD: queue the
 *    deliver work;
 *  - otherwise: just clear the flag and re-evaluate fast-xmit.
 * Lifting a block that is not set is a no-op.
 */
void ieee80211_sta_block_awake(struct ieee80211_hw *hw,
			       struct ieee80211_sta *pubsta, bool block)
{
	struct sta_info *sta = container_of(pubsta, struct sta_info, sta);

	trace_api_sta_block_awake(sta->local, pubsta, block);

	if (block) {
		set_sta_flag(sta, WLAN_STA_PS_DRIVER);
		ieee80211_clear_fast_xmit(sta);
		return;
	}

	if (!test_sta_flag(sta, WLAN_STA_PS_DRIVER))
		return;

	if (!test_sta_flag(sta, WLAN_STA_PS_STA)) {
		/* PS_DELIVER is set before PS_DRIVER is cleared */
		set_sta_flag(sta, WLAN_STA_PS_DELIVER);
		clear_sta_flag(sta, WLAN_STA_PS_DRIVER);
		ieee80211_queue_work(hw, &sta->drv_deliver_wk);
	} else if (test_sta_flag(sta, WLAN_STA_PSPOLL) ||
		   test_sta_flag(sta, WLAN_STA_UAPSD)) {
		/* must be asleep in this case */
		clear_sta_flag(sta, WLAN_STA_PS_DRIVER);
		ieee80211_queue_work(hw, &sta->drv_deliver_wk);
	} else {
		clear_sta_flag(sta, WLAN_STA_PS_DRIVER);
		ieee80211_check_fast_xmit(sta);
	}
}
EXPORT_SYMBOL(ieee80211_sta_block_awake);

/*
 * Driver API: the service period of @pubsta has ended; clears WLAN_STA_SP.
 */
void ieee80211_sta_eosp(struct ieee80211_sta *pubsta)
{
	struct sta_info *sta = container_of(pubsta, struct sta_info, sta);
	struct ieee80211_local *local = sta->local;

	trace_api_eosp(local, pubsta);

	clear_sta_flag(sta, WLAN_STA_SP);
}
EXPORT_SYMBOL(ieee80211_sta_eosp);

/*
 * Driver API: send a null frame on @tid that ends a U-APSD service period
 * of @pubsta. The more-data indication is computed from the frames still
 * buffered on the delivery-enabled ACs; the driver is not called back
 * (call_driver == false).
 */
void ieee80211_send_eosp_nullfunc(struct ieee80211_sta *pubsta, int tid)
{
	struct sta_info *sta = container_of(pubsta, struct sta_info, sta);
	enum ieee80211_frame_release_type reason;
	bool more_data;

	trace_api_send_eosp_nullfunc(sta->local, pubsta, tid);

	reason = IEEE80211_FRAME_RELEASE_UAPSD;
	more_data = ieee80211_sta_ps_more_data(sta, ~sta->sta.uapsd_queues,
					       reason, 0);

	ieee80211_send_null_response(sta, tid, reason, false, more_data);
}
EXPORT_SYMBOL(ieee80211_send_eosp_nullfunc);

/*
 * Driver API: record whether the driver has frames buffered for @pubsta on
 * @tid and recalculate the TIM accordingly. A @tid outside the valid range
 * triggers a warning and is ignored.
 */
void ieee80211_sta_set_buffered(struct ieee80211_sta *pubsta,
				u8 tid, bool buffered)
{
	struct sta_info *sta = container_of(pubsta, struct sta_info, sta);

	if (WARN_ON(tid >= IEEE80211_NUM_TIDS))
		return;

	trace_api_sta_set_buffered(sta->local, pubsta, tid, buffered);

	if (buffered)
		set_bit(tid, &sta->driver_buffered_tids);
	else
		clear_bit(tid, &sta->driver_buffered_tids);

	sta_info_recalc_tim(sta);
}
EXPORT_SYMBOL(ieee80211_sta_set_buffered);

/*
 * Driver API: account @tx_airtime and @rx_airtime used by @pubsta on the
 * AC that @tid maps to.
 *
 * The per-AC tx/rx totals are always updated. The deficit is charged only
 * while ieee80211_sta_keep_active() holds, and only with the directions
 * enabled in airtime_flags (AIRTIME_USE_TX / AIRTIME_USE_RX).
 */
void ieee80211_sta_register_airtime(struct ieee80211_sta *pubsta, u8 tid,
				    u32 tx_airtime, u32 rx_airtime)
{
	struct sta_info *sta = container_of(pubsta, struct sta_info, sta);
	struct ieee80211_local *local = sta->sdata->local;
	u8 ac = ieee80211_ac_from_tid(tid);
	u32 airtime = 0;

	if (sta->local->airtime_flags & AIRTIME_USE_TX)
		airtime += tx_airtime;
	if (sta->local->airtime_flags & AIRTIME_USE_RX)
		airtime += rx_airtime;

	spin_lock_bh(&local->active_txq_lock[ac]);
	sta->airtime[ac].tx_airtime += tx_airtime;
	sta->airtime[ac].rx_airtime += rx_airtime;

	if (ieee80211_sta_keep_active(sta, ac))
		sta->airtime[ac].deficit -= airtime;

	spin_unlock_bh(&local->active_txq_lock[ac]);
}
EXPORT_SYMBOL(ieee80211_sta_register_airtime);

/*
 * Recompute the aggregation limits published through sta->sta.cur.
 *
 * Without valid links or MLO the default link's limits are used directly.
 * Otherwise sta->cur is built from the links selected by @active_links:
 * every limit becomes the minimum over those links.
 *
 * NOTE(review): the first active link seeds sta->cur from
 * sta->sta.deflink.agg rather than from that link's own agg data - confirm
 * this is intended.
 */
void __ieee80211_sta_recalc_aggregates(struct sta_info *sta, u16 active_links)
{
	bool first = true;
	int link_id;

	if (!sta->sta.valid_links || !sta->sta.mlo) {
		sta->sta.cur = &sta->sta.deflink.agg;
		return;
	}

	rcu_read_lock();
	for (link_id = 0; link_id < ARRAY_SIZE((sta)->link); link_id++) {
		struct ieee80211_link_sta *link_sta;
		int i;

		if (!(active_links & BIT(link_id)))
			continue;

		link_sta = rcu_dereference(sta->sta.link[link_id]);
		if (!link_sta)
			continue;

		if (first) {
			sta->cur = sta->sta.deflink.agg;
			first = false;
			continue;
		}

		/* every further link can only lower the limits */
		sta->cur.max_amsdu_len =
			min(sta->cur.max_amsdu_len,
			    link_sta->agg.max_amsdu_len);
		sta->cur.max_rc_amsdu_len =
			min(sta->cur.max_rc_amsdu_len,
			    link_sta->agg.max_rc_amsdu_len);

		for (i = 0; i < ARRAY_SIZE(sta->cur.max_tid_amsdu_len); i++)
			sta->cur.max_tid_amsdu_len[i] =
				min(sta->cur.max_tid_amsdu_len[i],
				    link_sta->agg.max_tid_amsdu_len[i]);
	}
	rcu_read_unlock();

	sta->sta.cur = &sta->cur;
}

void ieee80211_sta_recalc_aggregates(struct ieee80211_sta *pubsta)
{
	struct sta_info *sta = container_of(pubsta, struct
sta_info, sta); __ieee80211_sta_recalc_aggregates(sta, sta->sdata->vif.active_links); } EXPORT_SYMBOL(ieee80211_sta_recalc_aggregates); void ieee80211_sta_update_pending_airtime(struct ieee80211_local *local, struct sta_info *sta, u8 ac, u16 tx_airtime, bool tx_completed) { int tx_pending; if (!wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_AQL)) return; if (!tx_completed) { if (sta) atomic_add(tx_airtime, &sta->airtime[ac].aql_tx_pending); atomic_add(tx_airtime, &local->aql_total_pending_airtime); atomic_add(tx_airtime, &local->aql_ac_pending_airtime[ac]); return; } if (sta) { tx_pending = atomic_sub_return(tx_airtime, &sta->airtime[ac].aql_tx_pending); if (tx_pending < 0) atomic_cmpxchg(&sta->airtime[ac].aql_tx_pending, tx_pending, 0); } atomic_sub(tx_airtime, &local->aql_total_pending_airtime); tx_pending = atomic_sub_return(tx_airtime, &local->aql_ac_pending_airtime[ac]); if (WARN_ONCE(tx_pending < 0, "Device %s AC %d pending airtime underflow: %u, %u", wiphy_name(local->hw.wiphy), ac, tx_pending, tx_airtime)) { atomic_cmpxchg(&local->aql_ac_pending_airtime[ac], tx_pending, 0); atomic_sub(tx_pending, &local->aql_total_pending_airtime); } } static struct ieee80211_sta_rx_stats * sta_get_last_rx_stats(struct sta_info *sta) { struct ieee80211_sta_rx_stats *stats = &sta->deflink.rx_stats; int cpu; if (!sta->deflink.pcpu_rx_stats) return stats; for_each_possible_cpu(cpu) { struct ieee80211_sta_rx_stats *cpustats; cpustats = per_cpu_ptr(sta->deflink.pcpu_rx_stats, cpu); if (time_after(cpustats->last_rx, stats->last_rx)) stats = cpustats; } return stats; } static void sta_stats_decode_rate(struct ieee80211_local *local, u32 rate, struct rate_info *rinfo) { rinfo->bw = STA_STATS_GET(BW, rate); switch (STA_STATS_GET(TYPE, rate)) { case STA_STATS_RATE_TYPE_VHT: rinfo->flags = RATE_INFO_FLAGS_VHT_MCS; rinfo->mcs = STA_STATS_GET(VHT_MCS, rate); rinfo->nss = STA_STATS_GET(VHT_NSS, rate); if (STA_STATS_GET(SGI, rate)) rinfo->flags |= RATE_INFO_FLAGS_SHORT_GI; 
break; case STA_STATS_RATE_TYPE_HT: rinfo->flags = RATE_INFO_FLAGS_MCS; rinfo->mcs = STA_STATS_GET(HT_MCS, rate); if (STA_STATS_GET(SGI, rate)) rinfo->flags |= RATE_INFO_FLAGS_SHORT_GI; break; case STA_STATS_RATE_TYPE_LEGACY: { struct ieee80211_supported_band *sband; u16 brate; unsigned int shift; int band = STA_STATS_GET(LEGACY_BAND, rate); int rate_idx = STA_STATS_GET(LEGACY_IDX, rate); sband = local->hw.wiphy->bands[band]; if (WARN_ON_ONCE(!sband->bitrates)) break; brate = sband->bitrates[rate_idx].bitrate; if (rinfo->bw == RATE_INFO_BW_5) shift = 2; else if (rinfo->bw == RATE_INFO_BW_10) shift = 1; else shift = 0; rinfo->legacy = DIV_ROUND_UP(brate, 1 << shift); break; } case STA_STATS_RATE_TYPE_HE: rinfo->flags = RATE_INFO_FLAGS_HE_MCS; rinfo->mcs = STA_STATS_GET(HE_MCS, rate); rinfo->nss = STA_STATS_GET(HE_NSS, rate); rinfo->he_gi = STA_STATS_GET(HE_GI, rate); rinfo->he_ru_alloc = STA_STATS_GET(HE_RU, rate); rinfo->he_dcm = STA_STATS_GET(HE_DCM, rate); break; case STA_STATS_RATE_TYPE_EHT: rinfo->flags = RATE_INFO_FLAGS_EHT_MCS; rinfo->mcs = STA_STATS_GET(EHT_MCS, rate); rinfo->nss = STA_STATS_GET(EHT_NSS, rate); rinfo->eht_gi = STA_STATS_GET(EHT_GI, rate); rinfo->eht_ru_alloc = STA_STATS_GET(EHT_RU, rate); break; } } static int sta_set_rate_info_rx(struct sta_info *sta, struct rate_info *rinfo) { u32 rate = READ_ONCE(sta_get_last_rx_stats(sta)->last_rate); if (rate == STA_STATS_RATE_INVALID) return -EINVAL; sta_stats_decode_rate(sta->local, rate, rinfo); return 0; } static inline u64 sta_get_tidstats_msdu(struct ieee80211_sta_rx_stats *rxstats, int tid) { unsigned int start; u64 value; do { start = u64_stats_fetch_begin(&rxstats->syncp); value = rxstats->msdu[tid]; } while (u64_stats_fetch_retry(&rxstats->syncp, start)); return value; } static void sta_set_tidstats(struct sta_info *sta, struct cfg80211_tid_stats *tidstats, int tid) { struct ieee80211_local *local = sta->local; int cpu; if (!(tidstats->filled & BIT(NL80211_TID_STATS_RX_MSDU))) { 
tidstats->rx_msdu += sta_get_tidstats_msdu(&sta->deflink.rx_stats,
							   tid);

		/* add the per-CPU counters on top, when they exist */
		if (sta->deflink.pcpu_rx_stats) {
			for_each_possible_cpu(cpu) {
				struct ieee80211_sta_rx_stats *cpurxs;

				cpurxs = per_cpu_ptr(sta->deflink.pcpu_rx_stats,
						     cpu);
				tidstats->rx_msdu +=
					sta_get_tidstats_msdu(cpurxs, tid);
			}
		}

		tidstats->filled |= BIT(NL80211_TID_STATS_RX_MSDU);
	}

	if (!(tidstats->filled & BIT(NL80211_TID_STATS_TX_MSDU))) {
		tidstats->filled |= BIT(NL80211_TID_STATS_TX_MSDU);
		tidstats->tx_msdu = sta->deflink.tx_stats.msdu[tid];
	}

	/* retry/failure counts are only reported with TX ack status support */
	if (!(tidstats->filled & BIT(NL80211_TID_STATS_TX_MSDU_RETRIES)) &&
	    ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) {
		tidstats->filled |= BIT(NL80211_TID_STATS_TX_MSDU_RETRIES);
		tidstats->tx_msdu_retries = sta->deflink.status_stats.msdu_retries[tid];
	}

	if (!(tidstats->filled & BIT(NL80211_TID_STATS_TX_MSDU_FAILED)) &&
	    ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) {
		tidstats->filled |= BIT(NL80211_TID_STATS_TX_MSDU_FAILED);
		tidstats->tx_msdu_failed = sta->deflink.status_stats.msdu_failed[tid];
	}

	/*
	 * TXQ statistics exist only for tid < IEEE80211_NUM_TIDS; callers may
	 * pass a larger tid, for which this part is skipped.
	 */
	if (tid < IEEE80211_NUM_TIDS) {
		spin_lock_bh(&local->fq.lock);
		rcu_read_lock();

		tidstats->filled |= BIT(NL80211_TID_STATS_TXQ_STATS);
		ieee80211_fill_txq_stats(&tidstats->txq_stats,
					 to_txq_info(sta->sta.txq[tid]));

		rcu_read_unlock();
		spin_unlock_bh(&local->fq.lock);
	}
}

/*
 * Read the received-bytes counter from @rxstats, retrying until a
 * consistent value was seen under the u64_stats sequence.
 */
static inline u64 sta_get_stats_bytes(struct ieee80211_sta_rx_stats *rxstats)
{
	unsigned int start;
	u64 value;

	do {
		start = u64_stats_fetch_begin(&rxstats->syncp);
		value = rxstats->bytes;
	} while (u64_stats_fetch_retry(&rxstats->syncp, start));

	return value;
}

void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo,
		   bool tidstats)
{
	struct ieee80211_sub_if_data *sdata = sta->sdata;
	struct ieee80211_local *local = sdata->local;
	u32 thr = 0;
	int i, ac, cpu;
	struct ieee80211_sta_rx_stats *last_rxstats;

	last_rxstats = sta_get_last_rx_stats(sta);

	sinfo->generation = sdata->local->sta_generation;

	/* do before driver, so beacon filtering drivers have a
	 * chance to e.g.
just add the number of filtered beacons * (or just modify the value entirely, of course) */ if (sdata->vif.type == NL80211_IFTYPE_STATION) sinfo->rx_beacon = sdata->deflink.u.mgd.count_beacon_signal; drv_sta_statistics(local, sdata, &sta->sta, sinfo); sinfo->filled |= BIT_ULL(NL80211_STA_INFO_INACTIVE_TIME) | BIT_ULL(NL80211_STA_INFO_STA_FLAGS) | BIT_ULL(NL80211_STA_INFO_BSS_PARAM) | BIT_ULL(NL80211_STA_INFO_CONNECTED_TIME) | BIT_ULL(NL80211_STA_INFO_ASSOC_AT_BOOTTIME) | BIT_ULL(NL80211_STA_INFO_RX_DROP_MISC); if (sdata->vif.type == NL80211_IFTYPE_STATION) { sinfo->beacon_loss_count = sdata->deflink.u.mgd.beacon_loss_count; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_BEACON_LOSS); } sinfo->connected_time = ktime_get_seconds() - sta->last_connected; sinfo->assoc_at = sta->assoc_at; sinfo->inactive_time = jiffies_to_msecs(jiffies - ieee80211_sta_last_active(sta)); if (!(sinfo->filled & (BIT_ULL(NL80211_STA_INFO_TX_BYTES64) | BIT_ULL(NL80211_STA_INFO_TX_BYTES)))) { sinfo->tx_bytes = 0; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) sinfo->tx_bytes += sta->deflink.tx_stats.bytes[ac]; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_BYTES64); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_PACKETS))) { sinfo->tx_packets = 0; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) sinfo->tx_packets += sta->deflink.tx_stats.packets[ac]; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_PACKETS); } if (!(sinfo->filled & (BIT_ULL(NL80211_STA_INFO_RX_BYTES64) | BIT_ULL(NL80211_STA_INFO_RX_BYTES)))) { sinfo->rx_bytes += sta_get_stats_bytes(&sta->deflink.rx_stats); if (sta->deflink.pcpu_rx_stats) { for_each_possible_cpu(cpu) { struct ieee80211_sta_rx_stats *cpurxs; cpurxs = per_cpu_ptr(sta->deflink.pcpu_rx_stats, cpu); sinfo->rx_bytes += sta_get_stats_bytes(cpurxs); } } sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_BYTES64); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_RX_PACKETS))) { sinfo->rx_packets = sta->deflink.rx_stats.packets; if (sta->deflink.pcpu_rx_stats) { for_each_possible_cpu(cpu) { 
struct ieee80211_sta_rx_stats *cpurxs; cpurxs = per_cpu_ptr(sta->deflink.pcpu_rx_stats, cpu); sinfo->rx_packets += cpurxs->packets; } } sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_PACKETS); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_RETRIES))) { sinfo->tx_retries = sta->deflink.status_stats.retry_count; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_RETRIES); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_FAILED))) { sinfo->tx_failed = sta->deflink.status_stats.retry_failed; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_FAILED); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_RX_DURATION))) { for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) sinfo->rx_duration += sta->airtime[ac].rx_airtime; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_DURATION); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_DURATION))) { for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) sinfo->tx_duration += sta->airtime[ac].tx_airtime; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_DURATION); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_AIRTIME_WEIGHT))) { sinfo->airtime_weight = sta->airtime_weight; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_AIRTIME_WEIGHT); } sinfo->rx_dropped_misc = sta->deflink.rx_stats.dropped; if (sta->deflink.pcpu_rx_stats) { for_each_possible_cpu(cpu) { struct ieee80211_sta_rx_stats *cpurxs; cpurxs = per_cpu_ptr(sta->deflink.pcpu_rx_stats, cpu); sinfo->rx_dropped_misc += cpurxs->dropped; } } if (sdata->vif.type == NL80211_IFTYPE_STATION && !(sdata->vif.driver_flags & IEEE80211_VIF_BEACON_FILTER)) { sinfo->filled |= BIT_ULL(NL80211_STA_INFO_BEACON_RX) | BIT_ULL(NL80211_STA_INFO_BEACON_SIGNAL_AVG); sinfo->rx_beacon_signal_avg = ieee80211_ave_rssi(&sdata->vif); } if (ieee80211_hw_check(&sta->local->hw, SIGNAL_DBM) || ieee80211_hw_check(&sta->local->hw, SIGNAL_UNSPEC)) { if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_SIGNAL))) { sinfo->signal = (s8)last_rxstats->last_signal; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_SIGNAL); } if (!sta->deflink.pcpu_rx_stats && 
!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_SIGNAL_AVG))) { sinfo->signal_avg = -ewma_signal_read(&sta->deflink.rx_stats_avg.signal); sinfo->filled |= BIT_ULL(NL80211_STA_INFO_SIGNAL_AVG); } } /* for the average - if pcpu_rx_stats isn't set - rxstats must point to * the sta->rx_stats struct, so the check here is fine with and without * pcpu statistics */ if (last_rxstats->chains && !(sinfo->filled & (BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL) | BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL_AVG)))) { sinfo->filled |= BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL); if (!sta->deflink.pcpu_rx_stats) sinfo->filled |= BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL_AVG); sinfo->chains = last_rxstats->chains; for (i = 0; i < ARRAY_SIZE(sinfo->chain_signal); i++) { sinfo->chain_signal[i] = last_rxstats->chain_signal_last[i]; sinfo->chain_signal_avg[i] = -ewma_signal_read(&sta->deflink.rx_stats_avg.chain_signal[i]); } } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_BITRATE)) && !sta->sta.valid_links && ieee80211_rate_valid(&sta->deflink.tx_stats.last_rate)) { sta_set_rate_info_tx(sta, &sta->deflink.tx_stats.last_rate, &sinfo->txrate); sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_BITRATE); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_RX_BITRATE)) && !sta->sta.valid_links) { if (sta_set_rate_info_rx(sta, &sinfo->rxrate) == 0) sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_BITRATE); } if (tidstats && !cfg80211_sinfo_alloc_tid_stats(sinfo, GFP_KERNEL)) { for (i = 0; i < IEEE80211_NUM_TIDS + 1; i++) sta_set_tidstats(sta, &sinfo->pertid[i], i); } if (ieee80211_vif_is_mesh(&sdata->vif)) { #ifdef CONFIG_MAC80211_MESH sinfo->filled |= BIT_ULL(NL80211_STA_INFO_LLID) | BIT_ULL(NL80211_STA_INFO_PLID) | BIT_ULL(NL80211_STA_INFO_PLINK_STATE) | BIT_ULL(NL80211_STA_INFO_LOCAL_PM) | BIT_ULL(NL80211_STA_INFO_PEER_PM) | BIT_ULL(NL80211_STA_INFO_NONPEER_PM) | BIT_ULL(NL80211_STA_INFO_CONNECTED_TO_GATE) | BIT_ULL(NL80211_STA_INFO_CONNECTED_TO_AS); sinfo->llid = sta->mesh->llid; sinfo->plid = sta->mesh->plid; 
sinfo->plink_state = sta->mesh->plink_state; if (test_sta_flag(sta, WLAN_STA_TOFFSET_KNOWN)) { sinfo->filled |= BIT_ULL(NL80211_STA_INFO_T_OFFSET); sinfo->t_offset = sta->mesh->t_offset; } sinfo->local_pm = sta->mesh->local_pm; sinfo->peer_pm = sta->mesh->peer_pm; sinfo->nonpeer_pm = sta->mesh->nonpeer_pm; sinfo->connected_to_gate = sta->mesh->connected_to_gate; sinfo->connected_to_as = sta->mesh->connected_to_as; #endif } sinfo->bss_param.flags = 0; if (sdata->vif.bss_conf.use_cts_prot) sinfo->bss_param.flags |= BSS_PARAM_FLAGS_CTS_PROT; if (sdata->vif.bss_conf.use_short_preamble) sinfo->bss_param.flags |= BSS_PARAM_FLAGS_SHORT_PREAMBLE; if (sdata->vif.bss_conf.use_short_slot) sinfo->bss_param.flags |= BSS_PARAM_FLAGS_SHORT_SLOT_TIME; sinfo->bss_param.dtim_period = sdata->vif.bss_conf.dtim_period; sinfo->bss_param.beacon_interval = sdata->vif.bss_conf.beacon_int; sinfo->sta_flags.set = 0; sinfo->sta_flags.mask = BIT(NL80211_STA_FLAG_AUTHORIZED) | BIT(NL80211_STA_FLAG_SHORT_PREAMBLE) | BIT(NL80211_STA_FLAG_WME) | BIT(NL80211_STA_FLAG_MFP) | BIT(NL80211_STA_FLAG_AUTHENTICATED) | BIT(NL80211_STA_FLAG_ASSOCIATED) | BIT(NL80211_STA_FLAG_TDLS_PEER); if (test_sta_flag(sta, WLAN_STA_AUTHORIZED)) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_AUTHORIZED); if (test_sta_flag(sta, WLAN_STA_SHORT_PREAMBLE)) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_SHORT_PREAMBLE); if (sta->sta.wme) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_WME); if (test_sta_flag(sta, WLAN_STA_MFP)) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_MFP); if (test_sta_flag(sta, WLAN_STA_AUTH)) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_AUTHENTICATED); if (test_sta_flag(sta, WLAN_STA_ASSOC)) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_ASSOCIATED); if (test_sta_flag(sta, WLAN_STA_TDLS_PEER)) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_TDLS_PEER); thr = sta_get_expected_throughput(sta); if (thr != 0) { sinfo->filled |= BIT_ULL(NL80211_STA_INFO_EXPECTED_THROUGHPUT); sinfo->expected_throughput = thr; } if 
(!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL)) && sta->deflink.status_stats.ack_signal_filled) { sinfo->ack_signal = sta->deflink.status_stats.last_ack_signal; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL_AVG)) && sta->deflink.status_stats.ack_signal_filled) { sinfo->avg_ack_signal = -(s8)ewma_avg_signal_read( &sta->deflink.status_stats.avg_ack_signal); sinfo->filled |= BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL_AVG); } if (ieee80211_vif_is_mesh(&sdata->vif)) { sinfo->filled |= BIT_ULL(NL80211_STA_INFO_AIRTIME_LINK_METRIC); sinfo->airtime_link_metric = airtime_link_metric_get(local, sta); } } u32 sta_get_expected_throughput(struct sta_info *sta) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; struct rate_control_ref *ref = NULL; u32 thr = 0; if (test_sta_flag(sta, WLAN_STA_RATE_CONTROL)) ref = local->rate_ctrl; /* check if the driver has a SW RC implementation */ if (ref && ref->ops->get_expected_throughput) thr = ref->ops->get_expected_throughput(sta->rate_ctrl_priv); else thr = drv_get_expected_throughput(local, sta); return thr; } unsigned long ieee80211_sta_last_active(struct sta_info *sta) { struct ieee80211_sta_rx_stats *stats = sta_get_last_rx_stats(sta); if (!sta->deflink.status_stats.last_ack || time_after(stats->last_rx, sta->deflink.status_stats.last_ack)) return stats->last_rx; return sta->deflink.status_stats.last_ack; } static void sta_update_codel_params(struct sta_info *sta, u32 thr) { if (thr && thr < STA_SLOW_THRESHOLD * sta->local->num_sta) { sta->cparams.target = MS2TIME(50); sta->cparams.interval = MS2TIME(300); sta->cparams.ecn = false; } else { sta->cparams.target = MS2TIME(20); sta->cparams.interval = MS2TIME(100); sta->cparams.ecn = true; } } void ieee80211_sta_set_expected_throughput(struct ieee80211_sta *pubsta, u32 thr) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); sta_update_codel_params(sta, 
thr);
}

/*
 * Allocate a new link for @sta and attach it under @link_id.
 *
 * Must be called with the wiphy lock held (asserted via lockdep). Warns if the
 * station has not been inserted yet.
 *
 * Returns 0 on success, -EINVAL if the station has no valid links at all,
 * -EBUSY if @link_id is already valid or already has a link attached,
 * -ENOMEM if the allocation fails, or the error from sta_info_alloc_link().
 */
int ieee80211_sta_allocate_link(struct sta_info *sta, unsigned int link_id)
{
	struct ieee80211_sub_if_data *sdata = sta->sdata;
	struct sta_link_alloc *alloc;
	int ret;

	lockdep_assert_wiphy(sdata->local->hw.wiphy);

	WARN_ON(!test_sta_flag(sta, WLAN_STA_INSERTED));

	/* must represent an MLD from the start */
	if (WARN_ON(!sta->sta.valid_links))
		return -EINVAL;

	/* refuse to overwrite a link that is already valid or allocated */
	if (WARN_ON(sta->sta.valid_links & BIT(link_id) ||
		    sta->link[link_id]))
		return -EBUSY;

	alloc = kzalloc(sizeof(*alloc), GFP_KERNEL);
	if (!alloc)
		return -ENOMEM;

	ret = sta_info_alloc_link(sdata->local, &alloc->info, GFP_KERNEL);
	if (ret) {
		/* nothing references the allocation yet, so just drop it */
		kfree(alloc);
		return ret;
	}

	sta_info_add_link(sta, link_id, &alloc->info, &alloc->sta);

	ieee80211_link_sta_debugfs_add(&alloc->info);

	return 0;
}

/*
 * Remove the link @link_id from @sta via sta_remove_link() with its last
 * argument set to false. Requires the wiphy lock; warns if the station has
 * not been inserted.
 */
void ieee80211_sta_free_link(struct sta_info *sta, unsigned int link_id)
{
	lockdep_assert_wiphy(sta->sdata->local->hw.wiphy);

	WARN_ON(!test_sta_flag(sta, WLAN_STA_INSERTED));

	sta_remove_link(sta, link_id, false);
}

/*
 * Mark the previously allocated link @link_id of @sta as valid, notify the
 * driver of the changed link set and add the link to the link-station hash.
 *
 * Returns 0 on success, -EINVAL if the link is already valid or was never
 * allocated, -EALREADY if the link address is already present in the hash,
 * or the error returned by drv_change_sta_links().
 */
int ieee80211_sta_activate_link(struct sta_info *sta, unsigned int link_id)
{
	struct ieee80211_sub_if_data *sdata = sta->sdata;
	struct link_sta_info *link_sta;
	u16 old_links = sta->sta.valid_links;
	u16 new_links = old_links | BIT(link_id);
	int ret;

	link_sta = rcu_dereference_protected(sta->link[link_id],
					     lockdep_is_held(&sdata->local->hw.wiphy->mtx));

	if (WARN_ON(old_links == new_links || !link_sta))
		return -EINVAL;

	rcu_read_lock();
	if (link_sta_info_hash_lookup(sdata->local, link_sta->addr)) {
		rcu_read_unlock();
		return -EALREADY;
	}
	/* we only modify under the mutex so this is fine */
	rcu_read_unlock();

	sta->sta.valid_links = new_links;

	/* a station that is not inserted skips the driver notification */
	if (WARN_ON(!test_sta_flag(sta, WLAN_STA_INSERTED)))
		goto hash;

	ieee80211_recalc_min_chandef(sdata, link_id);

	/* Ensure the values are updated for the driver,
	 * redone by sta_remove_link on failure.
*/ ieee80211_sta_recalc_aggregates(&sta->sta); ret = drv_change_sta_links(sdata->local, sdata, &sta->sta, old_links, new_links); if (ret) { sta->sta.valid_links = old_links; sta_remove_link(sta, link_id, false); return ret; } hash: ret = link_sta_info_hash_add(sdata->local, link_sta); WARN_ON(ret); return 0; } void ieee80211_sta_remove_link(struct sta_info *sta, unsigned int link_id) { struct ieee80211_sub_if_data *sdata = sta->sdata; u16 old_links = sta->sta.valid_links; lockdep_assert_wiphy(sdata->local->hw.wiphy); sta->sta.valid_links &= ~BIT(link_id); if (!WARN_ON(!test_sta_flag(sta, WLAN_STA_INSERTED))) drv_change_sta_links(sdata->local, sdata, &sta->sta, old_links, sta->sta.valid_links); sta_remove_link(sta, link_id, true); } void ieee80211_sta_set_max_amsdu_subframes(struct sta_info *sta, const u8 *ext_capab, unsigned int ext_capab_len) { u8 val; sta->sta.max_amsdu_subframes = 0; if (ext_capab_len < 8) return; /* The sender might not have sent the last bit, consider it to be 0 */ val = u8_get_bits(ext_capab[7], WLAN_EXT_CAPA8_MAX_MSDU_IN_AMSDU_LSB); /* we did get all the bits, take the MSB as well */ if (ext_capab_len >= 9) val |= u8_get_bits(ext_capab[8], WLAN_EXT_CAPA9_MAX_MSDU_IN_AMSDU_MSB) << 1; if (val) sta->sta.max_amsdu_subframes = 4 << (4 - val); } #ifdef CONFIG_LOCKDEP bool lockdep_sta_mutex_held(struct ieee80211_sta *pubsta) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); return lockdep_is_held(&sta->local->hw.wiphy->mtx); } EXPORT_SYMBOL(lockdep_sta_mutex_held); #endif
5 1 3 2 3 2 2 8 6 2 1 1 3 2 1 1 1 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/ext4/symlink.c
 *
 * Only fast symlinks left here - the rest is done by generic code. AV, 1999
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  from
 *
 *  linux/fs/minix/symlink.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  ext4 symlink handling code
 */

#include <linux/fs.h>
#include <linux/namei.h>
#include "ext4.h"
#include "xattr.h"

/*
 * ->get_link for encrypted symlinks.
 *
 * Locates the stored symlink data (the in-inode i_data area for a fast
 * symlink, otherwise block 0 of the inode) and passes it to
 * fscrypt_get_symlink() together with the size of that area.
 *
 * Returns the pointer produced by fscrypt_get_symlink(), or an ERR_PTR:
 * -ECHILD when no dentry was supplied, the ext4_bread() error, or
 * -EFSCORRUPTED when block 0 does not exist.
 */
static const char *ext4_encrypted_get_link(struct dentry *dentry,
					   struct inode *inode,
					   struct delayed_call *done)
{
	/* stays NULL for fast symlinks; brelse(NULL) below is then a no-op */
	struct buffer_head *bh = NULL;
	const void *caddr;
	unsigned int max_size;
	const char *paddr;

	if (!dentry)
		return ERR_PTR(-ECHILD);

	if (ext4_inode_is_fast_symlink(inode)) {
		caddr = EXT4_I(inode)->i_data;
		max_size = sizeof(EXT4_I(inode)->i_data);
	} else {
		bh = ext4_bread(NULL, inode, 0, 0);
		if (IS_ERR(bh))
			return ERR_CAST(bh);
		if (!bh) {
			EXT4_ERROR_INODE(inode, "bad symlink.");
			return ERR_PTR(-EFSCORRUPTED);
		}
		caddr = bh->b_data;
		max_size = inode->i_sb->s_blocksize;
	}

	paddr = fscrypt_get_symlink(inode, caddr, max_size, done);
	/* the buffer is released here, before the result is returned */
	brelse(bh);
	return paddr;
}

/*
 * ->getattr for encrypted symlinks: fill @stat with ext4_getattr() and then
 * let fscrypt_symlink_getattr() post-process it. The return value of
 * ext4_getattr() is not checked; the fscrypt result is what gets returned.
 */
static int ext4_encrypted_symlink_getattr(struct mnt_idmap *idmap,
					  const struct path *path,
					  struct kstat *stat, u32 request_mask,
					  unsigned int query_flags)
{
	ext4_getattr(idmap, path, stat, request_mask, query_flags);

	return fscrypt_symlink_getattr(path, stat);
}

/* Delayed-call callback used by ext4_get_link(): drops the buffer head. */
static void ext4_free_link(void *bh)
{
	brelse(bh);
}

/*
 * ->get_link for regular (non-fast, unencrypted) symlinks. The returned
 * string lives either in a kmalloc'ed copy of the inline data or in the
 * buffer of block 0; @callback is armed to release whichever was used.
 */
static const char
*ext4_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *callback) { struct buffer_head *bh; char *inline_link; /* * Create a new inlined symlink is not supported, just provide a * method to read the leftovers. */ if (ext4_has_inline_data(inode)) { if (!dentry) return ERR_PTR(-ECHILD); inline_link = ext4_read_inline_link(inode); if (!IS_ERR(inline_link)) set_delayed_call(callback, kfree_link, inline_link); return inline_link; } if (!dentry) { bh = ext4_getblk(NULL, inode, 0, EXT4_GET_BLOCKS_CACHED_NOWAIT); if (IS_ERR(bh) || !bh) return ERR_PTR(-ECHILD); if (!ext4_buffer_uptodate(bh)) { brelse(bh); return ERR_PTR(-ECHILD); } } else { bh = ext4_bread(NULL, inode, 0, 0); if (IS_ERR(bh)) return ERR_CAST(bh); if (!bh) { EXT4_ERROR_INODE(inode, "bad symlink."); return ERR_PTR(-EFSCORRUPTED); } } set_delayed_call(callback, ext4_free_link, bh); nd_terminate_link(bh->b_data, inode->i_size, inode->i_sb->s_blocksize - 1); return bh->b_data; } const struct inode_operations ext4_encrypted_symlink_inode_operations = { .get_link = ext4_encrypted_get_link, .setattr = ext4_setattr, .getattr = ext4_encrypted_symlink_getattr, .listxattr = ext4_listxattr, }; const struct inode_operations ext4_symlink_inode_operations = { .get_link = ext4_get_link, .setattr = ext4_setattr, .getattr = ext4_getattr, .listxattr = ext4_listxattr, }; const struct inode_operations ext4_fast_symlink_inode_operations = { .get_link = simple_get_link, .setattr = ext4_setattr, .getattr = ext4_getattr, .listxattr = ext4_listxattr, };
83 83 58 58 76 76 76 76 145 146 119 118 393 394 383 381 383 127 128 568 568 411 411 16 16 81 82 75 75 73 73 4 4 48 49 72 73 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 
496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 
996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 
1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 
1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 
2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 
2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 // SPDX-License-Identifier: GPL-2.0 #include <linux/bitops.h> #include <linux/slab.h> #include <linux/blkdev.h> #include <linux/sched/mm.h> #include <linux/atomic.h> #include <linux/vmalloc.h> #include "ctree.h" #include "volumes.h" #include "zoned.h" #include "rcu-string.h" #include "disk-io.h" #include "block-group.h" #include "dev-replace.h" #include "space-info.h" #include "fs.h" #include "accessors.h" #include "bio.h" /* Maximum number of zones to report per blkdev_report_zones() call */ #define BTRFS_REPORT_NR_ZONES 4096 /* Invalid allocation pointer value for missing devices */ #define WP_MISSING_DEV ((u64)-1) /* Pseudo write pointer value for conventional zone */ #define WP_CONVENTIONAL ((u64)-2) /* * Location of the first zone of superblock logging zone pairs. 
* * - primary superblock: 0B (zone 0) * - first copy: 512G (zone starting at that offset) * - second copy: 4T (zone starting at that offset) */ #define BTRFS_SB_LOG_PRIMARY_OFFSET (0ULL) #define BTRFS_SB_LOG_FIRST_OFFSET (512ULL * SZ_1G) #define BTRFS_SB_LOG_SECOND_OFFSET (4096ULL * SZ_1G) #define BTRFS_SB_LOG_FIRST_SHIFT const_ilog2(BTRFS_SB_LOG_FIRST_OFFSET) #define BTRFS_SB_LOG_SECOND_SHIFT const_ilog2(BTRFS_SB_LOG_SECOND_OFFSET) /* Number of superblock log zones */ #define BTRFS_NR_SB_LOG_ZONES 2 /* * Minimum of active zones we need: * * - BTRFS_SUPER_MIRROR_MAX zones for superblock mirrors * - 3 zones to ensure at least one zone per SYSTEM, META and DATA block group * - 1 zone for tree-log dedicated block group * - 1 zone for relocation */ #define BTRFS_MIN_ACTIVE_ZONES (BTRFS_SUPER_MIRROR_MAX + 5) /* * Minimum / maximum supported zone size. Currently, SMR disks have a zone * size of 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range. * We do not expect the zone size to become larger than 8GiB or smaller than * 4MiB in the near future. 
*/
#define BTRFS_MAX_ZONE_SIZE		SZ_8G
#define BTRFS_MIN_ZONE_SIZE		SZ_4M

/* Size of one superblock copy expressed in sectors. */
#define SUPER_INFO_SECTORS	((u64)BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT)

static void wait_eb_writebacks(struct btrfs_block_group *block_group);
static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written);

/*
 * A superblock log zone counts as full when the device reports it as FULL or
 * when the space left between its write pointer and the end of its capacity
 * is smaller than one superblock copy.
 */
static inline bool sb_zone_is_full(const struct blk_zone *zone)
{
	return (zone->cond == BLK_ZONE_COND_FULL) ||
		(zone->wp + SUPER_INFO_SECTORS > zone->start + zone->capacity);
}

/*
 * Zone report callback: copy the reported @zone into slot @idx of the
 * blk_zone array passed through @data. Always returns 0.
 */
static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data)
{
	struct blk_zone *zones = data;

	memcpy(&zones[idx], zone, sizeof(*zone));

	return 0;
}

/*
 * Work out the byte position selected by the state table below for a pair
 * of superblock log zones.
 *
 * @bdev:   block device holding the zones
 * @zones:  the BTRFS_NR_SB_LOG_ZONES zones of the pair; none of them may be
 *          conventional (asserted)
 * @wp_ret: receives the resulting position in bytes
 *
 * Returns 0 on success, -ENOENT when both zones are empty (*wp_ret is then
 * set to the start of zones[0]), -EUCLEAN for a zone state combination that
 * is not valid, or the error from reading a superblock page.
 */
static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
			    u64 *wp_ret)
{
	bool empty[BTRFS_NR_SB_LOG_ZONES];
	bool full[BTRFS_NR_SB_LOG_ZONES];
	sector_t sector;

	for (int i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
		ASSERT(zones[i].type != BLK_ZONE_TYPE_CONVENTIONAL);
		empty[i] = (zones[i].cond == BLK_ZONE_COND_EMPTY);
		full[i] = sb_zone_is_full(&zones[i]);
	}

	/*
	 * Possible states of log buffer zones
	 *
	 *           Empty[0]  In use[0]  Full[0]
	 * Empty[1]         *          0        1
	 * In use[1]        x          x        1
	 * Full[1]          0          0        C
	 *
	 * Log position:
	 *   *: Special case, no superblock is written
	 *   0: Use write pointer of zones[0]
	 *   1: Use write pointer of zones[1]
	 *   C: Compare super blocks from zones[0] and zones[1], use the latest
	 *      one determined by generation
	 *   x: Invalid state
	 */

	if (empty[0] && empty[1]) {
		/* Special case to distinguish no superblock to read */
		*wp_ret = zones[0].start << SECTOR_SHIFT;
		return -ENOENT;
	} else if (full[0] && full[1]) {
		/* Compare two super blocks */
		struct address_space *mapping = bdev->bd_mapping;
		struct page *page[BTRFS_NR_SB_LOG_ZONES];
		struct btrfs_super_block *super[BTRFS_NR_SB_LOG_ZONES];

		for (int i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
			/* the last superblock-sized slot inside the zone's capacity */
			u64 zone_end = (zones[i].start + zones[i].capacity) << SECTOR_SHIFT;
			u64 bytenr = ALIGN_DOWN(zone_end, BTRFS_SUPER_INFO_SIZE) -
						BTRFS_SUPER_INFO_SIZE;

			page[i] = read_cache_page_gfp(mapping,
					bytenr
>> PAGE_SHIFT, GFP_NOFS); if (IS_ERR(page[i])) { if (i == 1) btrfs_release_disk_super(super[0]); return PTR_ERR(page[i]); } super[i] = page_address(page[i]); } if (btrfs_super_generation(super[0]) > btrfs_super_generation(super[1])) sector = zones[1].start; else sector = zones[0].start; for (int i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) btrfs_release_disk_super(super[i]); } else if (!full[0] && (empty[1] || full[1])) { sector = zones[0].wp; } else if (full[0]) { sector = zones[1].wp; } else { return -EUCLEAN; } *wp_ret = sector << SECTOR_SHIFT; return 0; } /* * Get the first zone number of the superblock mirror */ static inline u32 sb_zone_number(int shift, int mirror) { u64 zone = U64_MAX; ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX); switch (mirror) { case 0: zone = 0; break; case 1: zone = 1ULL << (BTRFS_SB_LOG_FIRST_SHIFT - shift); break; case 2: zone = 1ULL << (BTRFS_SB_LOG_SECOND_SHIFT - shift); break; } ASSERT(zone <= U32_MAX); return (u32)zone; } static inline sector_t zone_start_sector(u32 zone_number, struct block_device *bdev) { return (sector_t)zone_number << ilog2(bdev_zone_sectors(bdev)); } static inline u64 zone_start_physical(u32 zone_number, struct btrfs_zoned_device_info *zone_info) { return (u64)zone_number << zone_info->zone_size_shift; } /* * Emulate blkdev_report_zones() for a non-zoned device. It slices up the block * device into static sized chunks and fake a conventional zone on each of * them. 
*/ static int emulate_report_zones(struct btrfs_device *device, u64 pos, struct blk_zone *zones, unsigned int nr_zones) { const sector_t zone_sectors = device->fs_info->zone_size >> SECTOR_SHIFT; sector_t bdev_size = bdev_nr_sectors(device->bdev); unsigned int i; pos >>= SECTOR_SHIFT; for (i = 0; i < nr_zones; i++) { zones[i].start = i * zone_sectors + pos; zones[i].len = zone_sectors; zones[i].capacity = zone_sectors; zones[i].wp = zones[i].start + zone_sectors; zones[i].type = BLK_ZONE_TYPE_CONVENTIONAL; zones[i].cond = BLK_ZONE_COND_NOT_WP; if (zones[i].wp >= bdev_size) { i++; break; } } return i; } static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, struct blk_zone *zones, unsigned int *nr_zones) { struct btrfs_zoned_device_info *zinfo = device->zone_info; int ret; if (!*nr_zones) return 0; if (!bdev_is_zoned(device->bdev)) { ret = emulate_report_zones(device, pos, zones, *nr_zones); *nr_zones = ret; return 0; } /* Check cache */ if (zinfo->zone_cache) { unsigned int i; u32 zno; ASSERT(IS_ALIGNED(pos, zinfo->zone_size)); zno = pos >> zinfo->zone_size_shift; /* * We cannot report zones beyond the zone end. So, it is OK to * cap *nr_zones to at the end. 
*/ *nr_zones = min_t(u32, *nr_zones, zinfo->nr_zones - zno); for (i = 0; i < *nr_zones; i++) { struct blk_zone *zone_info; zone_info = &zinfo->zone_cache[zno + i]; if (!zone_info->len) break; } if (i == *nr_zones) { /* Cache hit on all the zones */ memcpy(zones, zinfo->zone_cache + zno, sizeof(*zinfo->zone_cache) * *nr_zones); return 0; } } ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones, copy_zone_info_cb, zones); if (ret < 0) { btrfs_err_in_rcu(device->fs_info, "zoned: failed to read zone %llu on %s (devid %llu)", pos, rcu_str_deref(device->name), device->devid); return ret; } *nr_zones = ret; if (!ret) return -EIO; /* Populate cache */ if (zinfo->zone_cache) { u32 zno = pos >> zinfo->zone_size_shift; memcpy(zinfo->zone_cache + zno, zones, sizeof(*zinfo->zone_cache) * *nr_zones); } return 0; } /* The emulated zone size is determined from the size of device extent */ static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info) { BTRFS_PATH_AUTO_FREE(path); struct btrfs_root *root = fs_info->dev_root; struct btrfs_key key; struct extent_buffer *leaf; struct btrfs_dev_extent *dext; int ret = 0; key.objectid = 1; key.type = BTRFS_DEV_EXTENT_KEY; key.offset = 0; path = btrfs_alloc_path(); if (!path) return -ENOMEM; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) return ret; if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { ret = btrfs_next_leaf(root, path); if (ret < 0) return ret; /* No dev extents at all? Not good */ if (ret > 0) return -EUCLEAN; } leaf = path->nodes[0]; dext = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); fs_info->zone_size = btrfs_dev_extent_length(leaf, dext); return 0; } int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info) { struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; int ret = 0; /* fs_info->zone_size might not set yet. Use the incomapt flag here. 
*/ if (!btrfs_fs_incompat(fs_info, ZONED)) return 0; mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry(device, &fs_devices->devices, dev_list) { /* We can skip reading of zone info for missing devices */ if (!device->bdev) continue; ret = btrfs_get_dev_zone_info(device, true); if (ret) break; } mutex_unlock(&fs_devices->device_list_mutex); return ret; } int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) { struct btrfs_fs_info *fs_info = device->fs_info; struct btrfs_zoned_device_info *zone_info = NULL; struct block_device *bdev = device->bdev; unsigned int max_active_zones; unsigned int nactive; sector_t nr_sectors; sector_t sector = 0; struct blk_zone *zones = NULL; unsigned int i, nreported = 0, nr_zones; sector_t zone_sectors; char *model, *emulated; int ret; /* * Cannot use btrfs_is_zoned here, since fs_info::zone_size might not * yet be set. */ if (!btrfs_fs_incompat(fs_info, ZONED)) return 0; if (device->zone_info) return 0; zone_info = kzalloc(sizeof(*zone_info), GFP_KERNEL); if (!zone_info) return -ENOMEM; device->zone_info = zone_info; if (!bdev_is_zoned(bdev)) { if (!fs_info->zone_size) { ret = calculate_emulated_zone_size(fs_info); if (ret) goto out; } ASSERT(fs_info->zone_size); zone_sectors = fs_info->zone_size >> SECTOR_SHIFT; } else { zone_sectors = bdev_zone_sectors(bdev); } ASSERT(is_power_of_two_u64(zone_sectors)); zone_info->zone_size = zone_sectors << SECTOR_SHIFT; /* We reject devices with a zone size larger than 8GB */ if (zone_info->zone_size > BTRFS_MAX_ZONE_SIZE) { btrfs_err_in_rcu(fs_info, "zoned: %s: zone size %llu larger than supported maximum %llu", rcu_str_deref(device->name), zone_info->zone_size, BTRFS_MAX_ZONE_SIZE); ret = -EINVAL; goto out; } else if (zone_info->zone_size < BTRFS_MIN_ZONE_SIZE) { btrfs_err_in_rcu(fs_info, "zoned: %s: zone size %llu smaller than supported minimum %u", rcu_str_deref(device->name), zone_info->zone_size, BTRFS_MIN_ZONE_SIZE); ret = -EINVAL; goto out; } 
nr_sectors = bdev_nr_sectors(bdev); zone_info->zone_size_shift = ilog2(zone_info->zone_size); zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors); if (!IS_ALIGNED(nr_sectors, zone_sectors)) zone_info->nr_zones++; max_active_zones = bdev_max_active_zones(bdev); if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) { btrfs_err_in_rcu(fs_info, "zoned: %s: max active zones %u is too small, need at least %u active zones", rcu_str_deref(device->name), max_active_zones, BTRFS_MIN_ACTIVE_ZONES); ret = -EINVAL; goto out; } zone_info->max_active_zones = max_active_zones; zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL); if (!zone_info->seq_zones) { ret = -ENOMEM; goto out; } zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL); if (!zone_info->empty_zones) { ret = -ENOMEM; goto out; } zone_info->active_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL); if (!zone_info->active_zones) { ret = -ENOMEM; goto out; } zones = kvcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL); if (!zones) { ret = -ENOMEM; goto out; } /* * Enable zone cache only for a zoned device. On a non-zoned device, we * fill the zone info with emulated CONVENTIONAL zones, so no need to * use the cache. 
*/ if (populate_cache && bdev_is_zoned(device->bdev)) { zone_info->zone_cache = vcalloc(zone_info->nr_zones, sizeof(struct blk_zone)); if (!zone_info->zone_cache) { btrfs_err_in_rcu(device->fs_info, "zoned: failed to allocate zone cache for %s", rcu_str_deref(device->name)); ret = -ENOMEM; goto out; } } /* Get zones type */ nactive = 0; while (sector < nr_sectors) { nr_zones = BTRFS_REPORT_NR_ZONES; ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT, zones, &nr_zones); if (ret) goto out; for (i = 0; i < nr_zones; i++) { if (zones[i].type == BLK_ZONE_TYPE_SEQWRITE_REQ) __set_bit(nreported, zone_info->seq_zones); switch (zones[i].cond) { case BLK_ZONE_COND_EMPTY: __set_bit(nreported, zone_info->empty_zones); break; case BLK_ZONE_COND_IMP_OPEN: case BLK_ZONE_COND_EXP_OPEN: case BLK_ZONE_COND_CLOSED: __set_bit(nreported, zone_info->active_zones); nactive++; break; } nreported++; } sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len; } if (nreported != zone_info->nr_zones) { btrfs_err_in_rcu(device->fs_info, "inconsistent number of zones on %s (%u/%u)", rcu_str_deref(device->name), nreported, zone_info->nr_zones); ret = -EIO; goto out; } if (max_active_zones) { if (nactive > max_active_zones) { btrfs_err_in_rcu(device->fs_info, "zoned: %u active zones on %s exceeds max_active_zones %u", nactive, rcu_str_deref(device->name), max_active_zones); ret = -EIO; goto out; } atomic_set(&zone_info->active_zones_left, max_active_zones - nactive); set_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags); } /* Validate superblock log */ nr_zones = BTRFS_NR_SB_LOG_ZONES; for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { u32 sb_zone; u64 sb_wp; int sb_pos = BTRFS_NR_SB_LOG_ZONES * i; sb_zone = sb_zone_number(zone_info->zone_size_shift, i); if (sb_zone + 1 >= zone_info->nr_zones) continue; ret = btrfs_get_dev_zones(device, zone_start_physical(sb_zone, zone_info), &zone_info->sb_zones[sb_pos], &nr_zones); if (ret) goto out; if (nr_zones != BTRFS_NR_SB_LOG_ZONES) { 
btrfs_err_in_rcu(device->fs_info, "zoned: failed to read super block log zone info at devid %llu zone %u", device->devid, sb_zone); ret = -EUCLEAN; goto out; } /* * If zones[0] is conventional, always use the beginning of the * zone to record superblock. No need to validate in that case. */ if (zone_info->sb_zones[BTRFS_NR_SB_LOG_ZONES * i].type == BLK_ZONE_TYPE_CONVENTIONAL) continue; ret = sb_write_pointer(device->bdev, &zone_info->sb_zones[sb_pos], &sb_wp); if (ret != -ENOENT && ret) { btrfs_err_in_rcu(device->fs_info, "zoned: super block log zone corrupted devid %llu zone %u", device->devid, sb_zone); ret = -EUCLEAN; goto out; } } kvfree(zones); if (bdev_is_zoned(bdev)) { model = "host-managed zoned"; emulated = ""; } else { model = "regular"; emulated = "emulated "; } btrfs_info_in_rcu(fs_info, "%s block device %s, %u %szones of %llu bytes", model, rcu_str_deref(device->name), zone_info->nr_zones, emulated, zone_info->zone_size); return 0; out: kvfree(zones); btrfs_destroy_dev_zone_info(device); return ret; } void btrfs_destroy_dev_zone_info(struct btrfs_device *device) { struct btrfs_zoned_device_info *zone_info = device->zone_info; if (!zone_info) return; bitmap_free(zone_info->active_zones); bitmap_free(zone_info->seq_zones); bitmap_free(zone_info->empty_zones); vfree(zone_info->zone_cache); kfree(zone_info); device->zone_info = NULL; } struct btrfs_zoned_device_info *btrfs_clone_dev_zone_info(struct btrfs_device *orig_dev) { struct btrfs_zoned_device_info *zone_info; zone_info = kmemdup(orig_dev->zone_info, sizeof(*zone_info), GFP_KERNEL); if (!zone_info) return NULL; zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL); if (!zone_info->seq_zones) goto out; bitmap_copy(zone_info->seq_zones, orig_dev->zone_info->seq_zones, zone_info->nr_zones); zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL); if (!zone_info->empty_zones) goto out; bitmap_copy(zone_info->empty_zones, orig_dev->zone_info->empty_zones, 
zone_info->nr_zones); zone_info->active_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL); if (!zone_info->active_zones) goto out; bitmap_copy(zone_info->active_zones, orig_dev->zone_info->active_zones, zone_info->nr_zones); zone_info->zone_cache = NULL; return zone_info; out: bitmap_free(zone_info->seq_zones); bitmap_free(zone_info->empty_zones); bitmap_free(zone_info->active_zones); kfree(zone_info); return NULL; } static int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone) { unsigned int nr_zones = 1; int ret; ret = btrfs_get_dev_zones(device, pos, zone, &nr_zones); if (ret != 0 || !nr_zones) return ret ? ret : -EIO; return 0; } static int btrfs_check_for_zoned_device(struct btrfs_fs_info *fs_info) { struct btrfs_device *device; list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) { if (device->bdev && bdev_is_zoned(device->bdev)) { btrfs_err(fs_info, "zoned: mode not enabled but zoned device found: %pg", device->bdev); return -EINVAL; } } return 0; } int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) { struct queue_limits *lim = &fs_info->limits; struct btrfs_device *device; u64 zone_size = 0; int ret; /* * Host-Managed devices can't be used without the ZONED flag. With the * ZONED all devices can be used, using zone emulation if required. */ if (!btrfs_fs_incompat(fs_info, ZONED)) return btrfs_check_for_zoned_device(fs_info); blk_set_stacking_limits(lim); list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) { struct btrfs_zoned_device_info *zone_info = device->zone_info; if (!device->bdev) continue; if (!zone_size) { zone_size = zone_info->zone_size; } else if (zone_info->zone_size != zone_size) { btrfs_err(fs_info, "zoned: unequal block device zone sizes: have %llu found %llu", zone_info->zone_size, zone_size); return -EINVAL; } /* * With the zoned emulation, we can have non-zoned device on the * zoned mode. In this case, we don't have a valid max zone * append size. 
*/ if (bdev_is_zoned(device->bdev)) blk_stack_limits(lim, bdev_limits(device->bdev), 0); } ret = blk_validate_limits(lim); if (ret) { btrfs_err(fs_info, "zoned: failed to validate queue limits"); return ret; } /* * stripe_size is always aligned to BTRFS_STRIPE_LEN in * btrfs_create_chunk(). Since we want stripe_len == zone_size, * check the alignment here. */ if (!IS_ALIGNED(zone_size, BTRFS_STRIPE_LEN)) { btrfs_err(fs_info, "zoned: zone size %llu not aligned to stripe %u", zone_size, BTRFS_STRIPE_LEN); return -EINVAL; } if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) { btrfs_err(fs_info, "zoned: mixed block groups not supported"); return -EINVAL; } fs_info->zone_size = zone_size; /* * Also limit max_zone_append_size by max_segments * PAGE_SIZE. * Technically, we can have multiple pages per segment. But, since * we add the pages one by one to a bio, and cannot increase the * metadata reservation even if it increases the number of extents, it * is safe to stick with the limit. */ fs_info->max_zone_append_size = ALIGN_DOWN( min3((u64)lim->max_zone_append_sectors << SECTOR_SHIFT, (u64)lim->max_sectors << SECTOR_SHIFT, (u64)lim->max_segments << PAGE_SHIFT), fs_info->sectorsize); fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED; fs_info->max_extent_size = min_not_zero(fs_info->max_extent_size, fs_info->max_zone_append_size); /* * Check mount options here, because we might change fs_info->zoned * from fs_info->zone_size. */ ret = btrfs_check_mountopts_zoned(fs_info, &fs_info->mount_opt); if (ret) return ret; btrfs_info(fs_info, "zoned mode enabled with zone size %llu", zone_size); return 0; } int btrfs_check_mountopts_zoned(const struct btrfs_fs_info *info, unsigned long long *mount_opt) { if (!btrfs_is_zoned(info)) return 0; /* * Space cache writing is not COWed. Disable that to avoid write errors * in sequential zones. 
*/ if (btrfs_raw_test_opt(*mount_opt, SPACE_CACHE)) { btrfs_err(info, "zoned: space cache v1 is not supported"); return -EINVAL; } if (btrfs_raw_test_opt(*mount_opt, NODATACOW)) { btrfs_err(info, "zoned: NODATACOW not supported"); return -EINVAL; } if (btrfs_raw_test_opt(*mount_opt, DISCARD_ASYNC)) { btrfs_info(info, "zoned: async discard ignored and disabled for zoned mode"); btrfs_clear_opt(*mount_opt, DISCARD_ASYNC); } return 0; } static int sb_log_location(struct block_device *bdev, struct blk_zone *zones, int rw, u64 *bytenr_ret) { u64 wp; int ret; if (zones[0].type == BLK_ZONE_TYPE_CONVENTIONAL) { *bytenr_ret = zones[0].start << SECTOR_SHIFT; return 0; } ret = sb_write_pointer(bdev, zones, &wp); if (ret != -ENOENT && ret < 0) return ret; if (rw == WRITE) { struct blk_zone *reset = NULL; if (wp == zones[0].start << SECTOR_SHIFT) reset = &zones[0]; else if (wp == zones[1].start << SECTOR_SHIFT) reset = &zones[1]; if (reset && reset->cond != BLK_ZONE_COND_EMPTY) { unsigned int nofs_flags; ASSERT(sb_zone_is_full(reset)); nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, reset->start, reset->len); memalloc_nofs_restore(nofs_flags); if (ret) return ret; reset->cond = BLK_ZONE_COND_EMPTY; reset->wp = reset->start; } } else if (ret != -ENOENT) { /* * For READ, we want the previous one. Move write pointer to * the end of a zone, if it is at the head of a zone. 
*/ u64 zone_end = 0; if (wp == zones[0].start << SECTOR_SHIFT) zone_end = zones[1].start + zones[1].capacity; else if (wp == zones[1].start << SECTOR_SHIFT) zone_end = zones[0].start + zones[0].capacity; if (zone_end) wp = ALIGN_DOWN(zone_end << SECTOR_SHIFT, BTRFS_SUPER_INFO_SIZE); wp -= BTRFS_SUPER_INFO_SIZE; } *bytenr_ret = wp; return 0; } int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw, u64 *bytenr_ret) { struct blk_zone zones[BTRFS_NR_SB_LOG_ZONES]; sector_t zone_sectors; u32 sb_zone; int ret; u8 zone_sectors_shift; sector_t nr_sectors; u32 nr_zones; if (!bdev_is_zoned(bdev)) { *bytenr_ret = btrfs_sb_offset(mirror); return 0; } ASSERT(rw == READ || rw == WRITE); zone_sectors = bdev_zone_sectors(bdev); if (!is_power_of_2(zone_sectors)) return -EINVAL; zone_sectors_shift = ilog2(zone_sectors); nr_sectors = bdev_nr_sectors(bdev); nr_zones = nr_sectors >> zone_sectors_shift; sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror); if (sb_zone + 1 >= nr_zones) return -ENOENT; ret = blkdev_report_zones(bdev, zone_start_sector(sb_zone, bdev), BTRFS_NR_SB_LOG_ZONES, copy_zone_info_cb, zones); if (ret < 0) return ret; if (ret != BTRFS_NR_SB_LOG_ZONES) return -EIO; return sb_log_location(bdev, zones, rw, bytenr_ret); } int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw, u64 *bytenr_ret) { struct btrfs_zoned_device_info *zinfo = device->zone_info; u32 zone_num; /* * For a zoned filesystem on a non-zoned block device, use the same * super block locations as regular filesystem. Doing so, the super * block can always be retrieved and the zoned flag of the volume * detected from the super block information. 
*/ if (!bdev_is_zoned(device->bdev)) { *bytenr_ret = btrfs_sb_offset(mirror); return 0; } zone_num = sb_zone_number(zinfo->zone_size_shift, mirror); if (zone_num + 1 >= zinfo->nr_zones) return -ENOENT; return sb_log_location(device->bdev, &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror], rw, bytenr_ret); } static inline bool is_sb_log_zone(struct btrfs_zoned_device_info *zinfo, int mirror) { u32 zone_num; if (!zinfo) return false; zone_num = sb_zone_number(zinfo->zone_size_shift, mirror); if (zone_num + 1 >= zinfo->nr_zones) return false; if (!test_bit(zone_num, zinfo->seq_zones)) return false; return true; } int btrfs_advance_sb_log(struct btrfs_device *device, int mirror) { struct btrfs_zoned_device_info *zinfo = device->zone_info; struct blk_zone *zone; int i; if (!is_sb_log_zone(zinfo, mirror)) return 0; zone = &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror]; for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) { /* Advance the next zone */ if (zone->cond == BLK_ZONE_COND_FULL) { zone++; continue; } if (zone->cond == BLK_ZONE_COND_EMPTY) zone->cond = BLK_ZONE_COND_IMP_OPEN; zone->wp += SUPER_INFO_SECTORS; if (sb_zone_is_full(zone)) { /* * No room left to write new superblock. Since * superblock is written with REQ_SYNC, it is safe to * finish the zone now. * * If the write pointer is exactly at the capacity, * explicit ZONE_FINISH is not necessary. */ if (zone->wp != zone->start + zone->capacity) { unsigned int nofs_flags; int ret; nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH, zone->start, zone->len); memalloc_nofs_restore(nofs_flags); if (ret) return ret; } zone->wp = zone->start + zone->len; zone->cond = BLK_ZONE_COND_FULL; } return 0; } /* All the zones are FULL. Should not reach here. 
*/ ASSERT(0); return -EIO; } int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror) { unsigned int nofs_flags; sector_t zone_sectors; sector_t nr_sectors; u8 zone_sectors_shift; u32 sb_zone; u32 nr_zones; int ret; zone_sectors = bdev_zone_sectors(bdev); zone_sectors_shift = ilog2(zone_sectors); nr_sectors = bdev_nr_sectors(bdev); nr_zones = nr_sectors >> zone_sectors_shift; sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror); if (sb_zone + 1 >= nr_zones) return -ENOENT; nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, zone_start_sector(sb_zone, bdev), zone_sectors * BTRFS_NR_SB_LOG_ZONES); memalloc_nofs_restore(nofs_flags); return ret; } /* * Find allocatable zones within a given region. * * @device: the device to allocate a region on * @hole_start: the position of the hole to allocate the region * @num_bytes: size of wanted region * @hole_end: the end of the hole * @return: position of allocatable zones * * Allocatable region should not contain any superblock locations. 
*/
u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
				 u64 hole_end, u64 num_bytes)
{
	struct btrfs_zoned_device_info *zinfo = device->zone_info;
	const u8 shift = zinfo->zone_size_shift;
	u64 nzones = num_bytes >> shift;
	u64 pos = hole_start;
	u64 begin, end;
	bool have_sb;
	int i;

	ASSERT(IS_ALIGNED(hole_start, zinfo->zone_size));
	ASSERT(IS_ALIGNED(num_bytes, zinfo->zone_size));

	while (pos < hole_end) {
		/* Candidate region as a zone range [begin, end). */
		begin = pos >> shift;
		end = begin + nzones;

		/* The region would run past the last zone: nothing fits. */
		if (end > zinfo->nr_zones)
			return hole_end;

		/* Check if zones in the region are all empty */
		if (btrfs_dev_is_sequential(device, pos) &&
		    !bitmap_test_range_all_set(zinfo->empty_zones, begin, nzones)) {
			pos += zinfo->zone_size;
			continue;
		}

		have_sb = false;
		for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
			u32 sb_zone;
			u64 sb_pos;

			/* Overlap with the mirror's log zones: skip past them. */
			sb_zone = sb_zone_number(shift, i);
			if (!(end <= sb_zone ||
			      sb_zone + BTRFS_NR_SB_LOG_ZONES <= begin)) {
				have_sb = true;
				pos = zone_start_physical(
					sb_zone + BTRFS_NR_SB_LOG_ZONES, zinfo);
				break;
			}

			/* We also need to exclude regular superblock positions */
			sb_pos = btrfs_sb_offset(i);
			if (!(pos + num_bytes <= sb_pos ||
			      sb_pos + BTRFS_SUPER_INFO_SIZE <= pos)) {
				have_sb = true;
				pos = ALIGN(sb_pos + BTRFS_SUPER_INFO_SIZE,
					    zinfo->zone_size);
				break;
			}
		}
		/* No superblock in the region: pos is the answer. */
		if (!have_sb)
			break;
	}

	return pos;
}

/*
 * Mark the zone containing byte offset @pos as active on @device.
 *
 * Returns true if the zone is (now) active or the device has no active zone
 * limit, false if no active zone slot is left.
 */
static bool btrfs_dev_set_active_zone(struct btrfs_device *device, u64 pos)
{
	struct btrfs_zoned_device_info *zone_info = device->zone_info;
	unsigned int zno = (pos >> zone_info->zone_size_shift);

	/* We can use any number of zones */
	if (zone_info->max_active_zones == 0)
		return true;

	if (!test_bit(zno, zone_info->active_zones)) {
		/* Active zone left? */
		if (atomic_dec_if_positive(&zone_info->active_zones_left) < 0)
			return false;
		if (test_and_set_bit(zno, zone_info->active_zones)) {
			/* Someone already set the bit */
			atomic_inc(&zone_info->active_zones_left);
		}
	}

	return true;
}

/*
 * Clear the active bit of the zone containing byte offset @pos and, if it
 * was set, give the slot back to active_zones_left. No-op when the device
 * has no active zone limit.
 */
static void btrfs_dev_clear_active_zone(struct btrfs_device *device, u64 pos)
{
	struct btrfs_zoned_device_info *zone_info = device->zone_info;
	unsigned int zno = (pos >> zone_info->zone_size_shift);

	/* We can use any number of zones */
	if (zone_info->max_active_zones == 0)
		return;

	if (test_and_clear_bit(zno, zone_info->active_zones))
		atomic_inc(&zone_info->active_zones_left);
}

/*
 * Reset the zones covering [@physical, @physical + @length) on @device and
 * update the cached empty/active state of each of them.
 *
 * *@bytes is set to @length on success and to 0 on failure.
 * Returns 0 or the error from blkdev_zone_mgmt().
 */
int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
			    u64 length, u64 *bytes)
{
	unsigned int nofs_flags;
	int ret;

	*bytes = 0;
	nofs_flags = memalloc_nofs_save();
	ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET,
			       physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT);
	memalloc_nofs_restore(nofs_flags);
	if (ret)
		return ret;

	*bytes = length;
	/* NOTE(review): terminates only if length is a multiple of zone_size. */
	while (length) {
		btrfs_dev_set_zone_empty(device, physical);
		btrfs_dev_clear_active_zone(device, physical);
		physical += device->zone_info->zone_size;
		length -= device->zone_info->zone_size;
	}

	return 0;
}

/*
 * Make sure every sequential zone in [@start, @start + @size) is empty,
 * resetting (with a warning) any that is not.
 *
 * Returns 0 on success, -ERANGE if the range extends past the last zone, or
 * the error from btrfs_reset_device_zone().
 */
int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size)
{
	struct btrfs_zoned_device_info *zinfo = device->zone_info;
	const u8 shift = zinfo->zone_size_shift;
	unsigned long begin = start >> shift;
	unsigned long nbits = size >> shift;
	u64 pos;
	int ret;

	ASSERT(IS_ALIGNED(start, zinfo->zone_size));
	ASSERT(IS_ALIGNED(size, zinfo->zone_size));

	if (begin + nbits > zinfo->nr_zones)
		return -ERANGE;

	/* All the zones are conventional */
	if (bitmap_test_range_all_zero(zinfo->seq_zones, begin, nbits))
		return 0;

	/* All the zones are sequential and empty */
	if (bitmap_test_range_all_set(zinfo->seq_zones, begin, nbits) &&
	    bitmap_test_range_all_set(zinfo->empty_zones, begin, nbits))
		return 0;

	/* Mixed range: check zone by zone. */
	for (pos = start; pos < start + size; pos += zinfo->zone_size) {
		u64 reset_bytes;

		if (!btrfs_dev_is_sequential(device, pos) ||
		    btrfs_dev_is_empty_zone(device, pos))
			continue;

		/* Free regions should be empty */
		btrfs_warn_in_rcu(
			device->fs_info,
		"zoned: resetting device %s (devid %llu) zone %llu for allocation",
			rcu_str_deref(device->name), device->devid, pos >> shift);
		WARN_ON_ONCE(1);

		ret = btrfs_reset_device_zone(device, pos, zinfo->zone_size,
					      &reset_bytes);
		if (ret)
			return ret;
	}

	return 0;
}

/*
 * Calculate an allocation pointer from the extent allocation information
 * for a block group consisting of conventional zones. It points to the
 * end of the highest addressed extent in the block group, as an offset
 * from the start of the block group.
 *
 * Returns 0 with *@offset_ret set (0 for a new block group or when no
 * extent item is found), -ENOMEM, -EUCLEAN if the search key unexpectedly
 * matches or the found extent lies outside the block group, or another
 * negative errno from the tree search.
 */
static int calculate_alloc_pointer(struct btrfs_block_group *cache,
				   u64 *offset_ret, bool new)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;
	struct btrfs_root *root;
	BTRFS_PATH_AUTO_FREE(path);
	struct btrfs_key key;
	struct btrfs_key found_key;
	int ret;
	u64 length;

	/*
	 * Avoid tree lookups for a new block group, there's no use for it.
	 * It must always be 0.
	 *
	 * Also, we have a lock chain of extent buffer lock -> chunk mutex.
	 * For a new block group, this function is called from
	 * btrfs_make_block_group() which is already taking the chunk mutex.
	 * Thus, we cannot call calculate_alloc_pointer() which takes extent
	 * buffer locks to avoid deadlock.
	 */
	if (new) {
		*offset_ret = 0;
		return 0;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* Search from the first byte past the block group, then step back. */
	key.objectid = cache->start + cache->length;
	key.type = 0;
	key.offset = 0;

	root = btrfs_extent_root(fs_info, key.objectid);
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	/* We should not find the exact match */
	if (!ret)
		ret = -EUCLEAN;
	if (ret < 0)
		return ret;

	ret = btrfs_previous_extent_item(root, path, cache->start);
	if (ret) {
		/* A return of 1 is treated as "nothing allocated yet". */
		if (ret == 1) {
			ret = 0;
			*offset_ret = 0;
		}
		return ret;
	}

	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);

	/* Other item types are sized as one tree node (fs_info->nodesize). */
	if (found_key.type == BTRFS_EXTENT_ITEM_KEY)
		length = found_key.offset;
	else
		length = fs_info->nodesize;

	if (!(found_key.objectid >= cache->start &&
	       found_key.objectid + length <= cache->start + cache->length)) {
		return -EUCLEAN;
	}
	*offset_ret = found_key.objectid + length - cache->start;
	return 0;
}

/* Per-stripe zone state gathered while loading a block group. */
struct zone_info {
	/* Physical start of the stripe, from map->stripes[].physical. */
	u64 physical;
	/* Zone capacity in bytes. */
	u64 capacity;
	/* Byte offset of the write pointer, or WP_MISSING_DEV / WP_CONVENTIONAL. */
	u64 alloc_offset;
};

/*
 * Fill @info for stripe @zone_idx of @map and set bit @zone_idx in @active
 * when the zone is considered active.
 *
 * info->alloc_offset is set to WP_MISSING_DEV when the device is missing,
 * the zone cannot be read (-EIO / -EOPNOTSUPP) or is offline/read-only, to
 * WP_CONVENTIONAL for a non-sequential zone, and otherwise to the write
 * pointer offset within the zone.
 *
 * dev_replace->rwsem is held for reading while the device is examined and
 * released on every exit path.
 *
 * Returns 0 on success, -EIO if a conventional zone is reported where a
 * sequential one is expected, or another negative errno from reading the
 * zone.
 */
static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
				struct zone_info *info, unsigned long *active,
				struct btrfs_chunk_map *map)
{
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
	struct btrfs_device *device;
	int dev_replace_is_ongoing = 0;
	unsigned int nofs_flag;
	struct blk_zone zone;
	int ret;

	info->physical = map->stripes[zone_idx].physical;

	down_read(&dev_replace->rwsem);
	device = map->stripes[zone_idx].dev;

	if (!device->bdev) {
		up_read(&dev_replace->rwsem);
		info->alloc_offset = WP_MISSING_DEV;
		return 0;
	}

	/* Consider a zone as active if we can allow any number of active zones. */
	if (!device->zone_info->max_active_zones)
		__set_bit(zone_idx, active);

	if (!btrfs_dev_is_sequential(device, info->physical)) {
		up_read(&dev_replace->rwsem);
		info->alloc_offset = WP_CONVENTIONAL;
		return 0;
	}

	/* This zone will be used for allocation, so mark this zone non-empty. */
	btrfs_dev_clear_zone_empty(device, info->physical);

	dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
	if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
		btrfs_dev_clear_zone_empty(dev_replace->tgtdev, info->physical);

	/*
	 * The group is mapped to a sequential zone. Get the zone write pointer
	 * to determine the allocation offset within the zone.
	 */
	WARN_ON(!IS_ALIGNED(info->physical, fs_info->zone_size));
	nofs_flag = memalloc_nofs_save();
	ret = btrfs_get_dev_zone(device, info->physical, &zone);
	memalloc_nofs_restore(nofs_flag);
	if (ret) {
		up_read(&dev_replace->rwsem);
		/* -EIO and -EOPNOTSUPP are handled like a missing device. */
		if (ret != -EIO && ret != -EOPNOTSUPP)
			return ret;
		info->alloc_offset = WP_MISSING_DEV;
		return 0;
	}

	if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) {
		btrfs_err_in_rcu(fs_info,
		"zoned: unexpected conventional zone %llu on device %s (devid %llu)",
			zone.start << SECTOR_SHIFT, rcu_str_deref(device->name),
			device->devid);
		up_read(&dev_replace->rwsem);
		return -EIO;
	}

	info->capacity = (zone.capacity << SECTOR_SHIFT);

	switch (zone.cond) {
	case BLK_ZONE_COND_OFFLINE:
	case BLK_ZONE_COND_READONLY:
		btrfs_err_in_rcu(fs_info,
		"zoned: offline/readonly zone %llu on device %s (devid %llu)",
			  (info->physical >> device->zone_info->zone_size_shift),
			  rcu_str_deref(device->name), device->devid);
		info->alloc_offset = WP_MISSING_DEV;
		break;
	case BLK_ZONE_COND_EMPTY:
		info->alloc_offset = 0;
		break;
	case BLK_ZONE_COND_FULL:
		info->alloc_offset = info->capacity;
		break;
	default:
		/* Partially used zone. */
		info->alloc_offset = ((zone.wp - zone.start) << SECTOR_SHIFT);
		__set_bit(zone_idx, active);
		break;
	}

	up_read(&dev_replace->rwsem);

	return 0;
}

/*
 * Set up @bg from the zone state of its only stripe.
 * Returns -EIO if the write pointer could not be recovered, 0 otherwise.
 */
static int btrfs_load_block_group_single(struct btrfs_block_group *bg,
					 struct zone_info *info,
					 unsigned long *active)
{
	if (info->alloc_offset == WP_MISSING_DEV) {
		btrfs_err(bg->fs_info,
			"zoned: cannot recover write pointer for zone %llu",
			info->physical);
		return -EIO;
	}

	bg->alloc_offset = info->alloc_offset;
	bg->zone_capacity = info->capacity;
	if (test_bit(0, active))
		set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
	return 0;
}

/*
 * Set up @bg from the zone state of its two DUP stripes.
 *
 * Both write pointers must be known and equal. Returns -EINVAL for a data
 * block group without a raid-stripe-tree, -EIO on a missing or mismatched
 * write pointer or when the zone cannot be activated, 0 otherwise.
 */
static int btrfs_load_block_group_dup(struct btrfs_block_group *bg,
				      struct btrfs_chunk_map *map,
				      struct zone_info *zone_info,
				      unsigned long *active)
{
	struct btrfs_fs_info *fs_info = bg->fs_info;

	if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
		btrfs_err(fs_info, "zoned: data DUP profile needs raid-stripe-tree");
		return -EINVAL;
	}

	bg->zone_capacity = min_not_zero(zone_info[0].capacity, zone_info[1].capacity);

	if (zone_info[0].alloc_offset == WP_MISSING_DEV) {
		btrfs_err(bg->fs_info,
			  "zoned: cannot recover write pointer for zone %llu",
			  zone_info[0].physical);
		return -EIO;
	}
	if (zone_info[1].alloc_offset == WP_MISSING_DEV) {
		btrfs_err(bg->fs_info,
			  "zoned: cannot recover write pointer for zone %llu",
			  zone_info[1].physical);
		return -EIO;
	}
	if (zone_info[0].alloc_offset != zone_info[1].alloc_offset) {
		btrfs_err(bg->fs_info,
			  "zoned: write pointer offset mismatch of zones in DUP profile");
		return -EIO;
	}

	/* Only one of the two zones is active: try to activate the group. */
	if (test_bit(0, active) != test_bit(1, active)) {
		if (!btrfs_zone_activate(bg))
			return -EIO;
	} else if (test_bit(0, active)) {
		set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
	}

	bg->alloc_offset = zone_info[0].alloc_offset;
	return 0;
}

/*
 * Set up @bg from the zone state of its RAID1 stripes.
 *
 * Stripes on missing devices or conventional zones are skipped. Without the
 * DEGRADED mount option every remaining stripe must have the same write
 * pointer as stripe 0. Returns -EINVAL for a data block group without a
 * raid-stripe-tree, -EIO on a write pointer mismatch or failed zone
 * activation, 0 otherwise.
 */
static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg,
					struct btrfs_chunk_map *map,
					struct zone_info *zone_info,
					unsigned long *active)
{
	struct btrfs_fs_info *fs_info = bg->fs_info;
	int i;

	if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
		btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
			  btrfs_bg_type_to_raid_name(map->type));
		return -EINVAL;
	}

	/* In case a device is missing we have a cap of 0, so don't use it. */
	bg->zone_capacity = min_not_zero(zone_info[0].capacity, zone_info[1].capacity);

	for (i = 0; i < map->num_stripes; i++) {
		if (zone_info[i].alloc_offset == WP_MISSING_DEV ||
		    zone_info[i].alloc_offset == WP_CONVENTIONAL)
			continue;

		if ((zone_info[0].alloc_offset != zone_info[i].alloc_offset) &&
		    !btrfs_test_opt(fs_info, DEGRADED)) {
			btrfs_err(fs_info,
			"zoned: write pointer offset mismatch of zones in %s profile",
				  btrfs_bg_type_to_raid_name(map->type));
			return -EIO;
		}
		if (test_bit(0, active) != test_bit(i, active)) {
			if (!btrfs_test_opt(fs_info, DEGRADED) &&
			    !btrfs_zone_activate(bg)) {
				return -EIO;
			}
		} else {
			if (test_bit(0, active))
				set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
					&bg->runtime_flags);
		}
	}

	/* After the loop i == map->num_stripes, so i - 1 is the last stripe. */
	if (zone_info[0].alloc_offset != WP_MISSING_DEV)
		bg->alloc_offset = zone_info[0].alloc_offset;
	else
		bg->alloc_offset = zone_info[i - 1].alloc_offset;

	return 0;
}

/*
 * Set up @bg from the zone state of its RAID0 stripes: capacity and
 * allocation offset are accumulated over all stripes that are neither
 * missing nor conventional.
 *
 * Returns -EINVAL for a data block group without a raid-stripe-tree, -EIO
 * when the zone cannot be activated, 0 otherwise.
 */
static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
					struct btrfs_chunk_map *map,
					struct zone_info *zone_info,
					unsigned long *active)
{
	struct btrfs_fs_info *fs_info = bg->fs_info;

	if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
		btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
			  btrfs_bg_type_to_raid_name(map->type));
		return -EINVAL;
	}

	for (int i = 0; i < map->num_stripes; i++) {
		if (zone_info[i].alloc_offset == WP_MISSING_DEV ||
		    zone_info[i].alloc_offset == WP_CONVENTIONAL)
			continue;

		if (test_bit(0, active) != test_bit(i, active)) {
			if (!btrfs_zone_activate(bg))
				return -EIO;
		} else {
			if (test_bit(0, active))
				set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
					&bg->runtime_flags);
		}
		bg->zone_capacity += zone_info[i].capacity;
		bg->alloc_offset += zone_info[i].alloc_offset;
	}

	return 0;
}

/*
 * Set up @bg from the zone state of its RAID10 stripes. Like the RAID0
 * variant, but capacity and allocation offset are accumulated only for the
 * first stripe of each group of map->sub_stripes stripes.
 *
 * Returns -EINVAL for a data block group without a raid-stripe-tree, -EIO
 * when the zone cannot be activated, 0 otherwise.
 */
static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg,
					 struct btrfs_chunk_map *map,
					 struct zone_info *zone_info,
					 unsigned long *active)
{
	struct btrfs_fs_info *fs_info = bg->fs_info;

	if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
		btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
			  btrfs_bg_type_to_raid_name(map->type));
		return -EINVAL;
	}

	for (int i = 0; i < map->num_stripes; i++) {
		if (zone_info[i].alloc_offset == WP_MISSING_DEV ||
		    zone_info[i].alloc_offset == WP_CONVENTIONAL)
			continue;

		if (test_bit(0, active) != test_bit(i, active)) {
			if (!btrfs_zone_activate(bg))
				return -EIO;
		} else {
			if (test_bit(0, active))
				set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
					&bg->runtime_flags);
		}

		if ((i % map->sub_stripes) == 0) {
			bg->zone_capacity += zone_info[i].capacity;
			bg->alloc_offset += zone_info[i].alloc_offset;
		}
	}

	return 0;
}

int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;
	struct btrfs_chunk_map *map;
	u64 logical = cache->start;
	u64 length = cache->length;
	struct zone_info *zone_info = NULL;
	int ret;
	int i;
	unsigned long *active = NULL;
	u64 last_alloc = 0;
	u32 num_sequential = 0, num_conventional = 0;
	u64 profile;

	if (!btrfs_is_zoned(fs_info))
		return 0;

	/* Sanity check */
	if (!IS_ALIGNED(length, fs_info->zone_size)) {
		btrfs_err(fs_info,
		"zoned: block group %llu len %llu unaligned to zone size %llu",
			  logical, length, fs_info->zone_size);
		return -EIO;
	}

	map = btrfs_find_chunk_map(fs_info, logical, length);
	if (!map)
		return -EINVAL;

	cache->physical_map = map;

	zone_info = kcalloc(map->num_stripes, sizeof(*zone_info), GFP_NOFS);
	if (!zone_info) {
		ret = -ENOMEM;
		goto out;
	}

	active = bitmap_zalloc(map->num_stripes, GFP_NOFS);
	if (!active) {
		ret = -ENOMEM;
		goto out;
	}

	for (i = 0; i < map->num_stripes; i++) {
		ret = btrfs_load_zone_info(fs_info, i, &zone_info[i], active, map);
		if (ret)
			goto out;

		if (zone_info[i].alloc_offset == WP_CONVENTIONAL)
			num_conventional++;
		else
			num_sequential++;
	}

	if (num_sequential >
0) set_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &cache->runtime_flags); if (num_conventional > 0) { /* Zone capacity is always zone size in emulation */ cache->zone_capacity = cache->length; ret = calculate_alloc_pointer(cache, &last_alloc, new); if (ret) { btrfs_err(fs_info, "zoned: failed to determine allocation offset of bg %llu", cache->start); goto out; } else if (map->num_stripes == num_conventional) { cache->alloc_offset = last_alloc; set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags); goto out; } } profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK; switch (profile) { case 0: /* single */ ret = btrfs_load_block_group_single(cache, &zone_info[0], active); break; case BTRFS_BLOCK_GROUP_DUP: ret = btrfs_load_block_group_dup(cache, map, zone_info, active); break; case BTRFS_BLOCK_GROUP_RAID1: case BTRFS_BLOCK_GROUP_RAID1C3: case BTRFS_BLOCK_GROUP_RAID1C4: ret = btrfs_load_block_group_raid1(cache, map, zone_info, active); break; case BTRFS_BLOCK_GROUP_RAID0: ret = btrfs_load_block_group_raid0(cache, map, zone_info, active); break; case BTRFS_BLOCK_GROUP_RAID10: ret = btrfs_load_block_group_raid10(cache, map, zone_info, active); break; case BTRFS_BLOCK_GROUP_RAID5: case BTRFS_BLOCK_GROUP_RAID6: default: btrfs_err(fs_info, "zoned: profile %s not yet supported", btrfs_bg_type_to_raid_name(map->type)); ret = -EINVAL; goto out; } if (ret == -EIO && profile != 0 && profile != BTRFS_BLOCK_GROUP_RAID0 && profile != BTRFS_BLOCK_GROUP_RAID10) { /* * Detected broken write pointer. Make this block group * unallocatable by setting the allocation pointer at the end of * allocatable region. Relocating this block group will fix the * mismatch. * * Currently, we cannot handle RAID0 or RAID10 case like this * because we don't have a proper zone_capacity value. But, * reading from this block group won't work anyway by a missing * stripe. 
*/ cache->alloc_offset = cache->zone_capacity; ret = 0; } out: /* Reject non SINGLE data profiles without RST */ if ((map->type & BTRFS_BLOCK_GROUP_DATA) && (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) && !fs_info->stripe_root) { btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree", btrfs_bg_type_to_raid_name(map->type)); return -EINVAL; } if (cache->alloc_offset > cache->zone_capacity) { btrfs_err(fs_info, "zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu", cache->alloc_offset, cache->zone_capacity, cache->start); ret = -EIO; } /* An extent is allocated after the write pointer */ if (!ret && num_conventional && last_alloc > cache->alloc_offset) { btrfs_err(fs_info, "zoned: got wrong write pointer in BG %llu: %llu > %llu", logical, last_alloc, cache->alloc_offset); ret = -EIO; } if (!ret) { cache->meta_write_pointer = cache->alloc_offset + cache->start; if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags)) { btrfs_get_block_group(cache); spin_lock(&fs_info->zone_active_bgs_lock); list_add_tail(&cache->active_bg_list, &fs_info->zone_active_bgs); spin_unlock(&fs_info->zone_active_bgs_lock); } } else { btrfs_free_chunk_map(cache->physical_map); cache->physical_map = NULL; } bitmap_free(active); kfree(zone_info); return ret; } void btrfs_calc_zone_unusable(struct btrfs_block_group *cache) { u64 unusable, free; if (!btrfs_is_zoned(cache->fs_info)) return; WARN_ON(cache->bytes_super != 0); unusable = (cache->alloc_offset - cache->used) + (cache->length - cache->zone_capacity); free = cache->zone_capacity - cache->alloc_offset; /* We only need ->free_space in ALLOC_SEQ block groups */ cache->cached = BTRFS_CACHE_FINISHED; cache->free_space_ctl->free_space = free; cache->zone_unusable = unusable; } bool btrfs_use_zone_append(struct btrfs_bio *bbio) { u64 start = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT); struct btrfs_inode *inode = bbio->inode; struct btrfs_fs_info *fs_info = bbio->fs_info; struct 
btrfs_block_group *cache;
	bool ret = false;

	if (!btrfs_is_zoned(fs_info))
		return false;

	if (!inode || !is_data_inode(inode))
		return false;

	if (btrfs_op(&bbio->bio) != BTRFS_MAP_WRITE)
		return false;

	/*
	 * Using REQ_OP_ZONE_APPEND for relocation can break assumptions on the
	 * extent layout the relocation code has.
	 * Furthermore we have set aside own block-group from which only the
	 * relocation "process" can allocate and make sure only one process at a
	 * time can add pages to an extent that gets relocated, so it's safe to
	 * use regular REQ_OP_WRITE for this special case.
	 */
	if (btrfs_is_data_reloc_root(inode->root))
		return false;

	cache = btrfs_lookup_block_group(fs_info, start);
	ASSERT(cache);
	/* Still handle a failed lookup when ASSERT() does not stop us. */
	if (!cache)
		return false;

	ret = !!test_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &cache->runtime_flags);
	btrfs_put_block_group(cache);

	return ret;
}

/*
 * Shift the logical address stored in @bbio's ordered sum by the distance
 * between the bio's current sector position and bbio->orig_physical.
 */
void btrfs_record_physical_zoned(struct btrfs_bio *bbio)
{
	const u64 physical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
	struct btrfs_ordered_sum *sum = bbio->sums;

	/* Two branches keep the unsigned subtraction from wrapping. */
	if (physical < bbio->orig_physical)
		sum->logical -= bbio->orig_physical - physical;
	else
		sum->logical += physical - bbio->orig_physical;
}

/*
 * Point @ordered and the extent map covering its file range at @logical.
 * The extent map is updated under the extent map tree's write lock.
 */
static void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered,
					u64 logical)
{
	struct extent_map_tree *em_tree = &ordered->inode->extent_tree;
	struct extent_map *em;

	ordered->disk_bytenr = logical;

	write_lock(&em_tree->lock);
	em = search_extent_mapping(em_tree, ordered->file_offset,
				   ordered->num_bytes);
	/* The em should be a new COW extent, thus it should not have an offset. */
	ASSERT(em->offset == 0);
	em->disk_bytenr = logical;
	free_extent_map(em);
	write_unlock(&em_tree->lock);
}

/*
 * Split the first @len bytes off @ordered, assign them @logical as disk
 * address and finish the split-off ordered extent.
 *
 * Unless the ordered extent is NOCOW, the extent map is split first.
 *
 * Return: true on success, false if either split failed.
 */
static bool btrfs_zoned_split_ordered(struct btrfs_ordered_extent *ordered,
				      u64 logical, u64 len)
{
	struct btrfs_ordered_extent *new;

	if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) &&
	    split_extent_map(ordered->inode, ordered->file_offset,
			     ordered->num_bytes, len, logical))
		return false;

	new = btrfs_split_ordered_extent(ordered, len);
	if (IS_ERR(new))
		return false;
	new->disk_bytenr = logical;
	btrfs_finish_one_ordered(new);
	return true;
}

/*
 * Fix up @ordered using the logical addresses recorded in its ordered sums.
 *
 * Runs of sums with contiguous logical addresses are merged; whenever the
 * next sum is not contiguous, the accumulated range is split off as its own
 * ordered extent. Finally the remaining ordered extent is rewritten if its
 * disk_bytenr differs from the recorded logical address. A failed split sets
 * BTRFS_ORDERED_IOERR on @ordered.
 */
void btrfs_finish_ordered_zoned(struct btrfs_ordered_extent *ordered)
{
	struct btrfs_inode *inode = ordered->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct btrfs_ordered_sum *sum;
	u64 logical, len;

	/*
	 * Write to pre-allocated region is for the data relocation, and so
	 * it should use WRITE operation. No split/rewrite are necessary.
	 */
	if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags))
		return;

	ASSERT(!list_empty(&ordered->list));
	/* The ordered->list can be empty in the above pre-alloc case. */
	sum = list_first_entry(&ordered->list, struct btrfs_ordered_sum, list);
	logical = sum->logical;
	len = sum->len;

	while (len < ordered->disk_num_bytes) {
		sum = list_next_entry(sum, list);
		/* Contiguous with the current run: just extend it. */
		if (sum->logical == logical + len) {
			len += sum->len;
			continue;
		}
		if (!btrfs_zoned_split_ordered(ordered, logical, len)) {
			set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
			btrfs_err(fs_info, "failed to split ordered extent");
			goto out;
		}
		/* Start a new run at the non-contiguous sum. */
		logical = sum->logical;
		len = sum->len;
	}

	if (ordered->disk_bytenr != logical)
		btrfs_rewrite_logical_zoned(ordered, logical);

out:
	/*
	 * If we end up here for nodatasum I/O, the btrfs_ordered_sum structures
	 * were allocated by btrfs_alloc_dummy_sum only to record the logical
	 * addresses and don't contain actual checksums.  We thus must free them
	 * here so that we don't attempt to log the csums later.
	 */
	if ((inode->flags & BTRFS_INODE_NODATASUM) ||
	    test_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state)) {
		while ((sum = list_first_entry_or_null(&ordered->list,
						       typeof(*sum), list))) {
			list_del(&sum->list);
			kfree(sum);
		}
	}
}

/*
 * Make sure the block group in ctx->zoned_bg is active so that ctx->eb can
 * be written to it.
 *
 * @ctx:       write context; ctx->zoned_bg must be set
 * @active_bg: slot tracking the currently active metadata or system block
 *             group. It may be finished and replaced by ctx->zoned_bg; the
 *             slot holds a block group reference.
 *
 * Return: true if the block group is (now) active, false otherwise.
 */
static bool check_bg_is_active(struct btrfs_eb_write_context *ctx,
			       struct btrfs_block_group **active_bg)
{
	const struct writeback_control *wbc = ctx->wbc;
	struct btrfs_block_group *block_group = ctx->zoned_bg;
	struct btrfs_fs_info *fs_info = block_group->fs_info;

	if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags))
		return true;

	if (fs_info->treelog_bg == block_group->start) {
		/* Tree-log block group: finish another one to make room. */
		if (!btrfs_zone_activate(block_group)) {
			int ret_fin = btrfs_zone_finish_one_bg(fs_info);

			if (ret_fin != 1 || !btrfs_zone_activate(block_group))
				return false;
		}
	} else if (*active_bg != block_group) {
		struct btrfs_block_group *tgt = *active_bg;

		/* zoned_meta_io_lock protects fs_info->active_{meta,system}_bg. */
		lockdep_assert_held(&fs_info->zoned_meta_io_lock);

		if (tgt) {
			/*
			 * If there is an unsent IO left in the allocated area,
			 * we cannot wait for them as it may cause a deadlock.
			 */
			if (tgt->meta_write_pointer < tgt->start + tgt->alloc_offset) {
				if (wbc->sync_mode == WB_SYNC_NONE ||
				    (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync))
					return false;
			}

			/* Pivot active metadata/system block group. */
			btrfs_zoned_meta_io_unlock(fs_info);
			wait_eb_writebacks(tgt);
			do_zone_finish(tgt, true);
			btrfs_zoned_meta_io_lock(fs_info);
			/* The lock was dropped above, so re-check the slot. */
			if (*active_bg == tgt) {
				btrfs_put_block_group(tgt);
				*active_bg = NULL;
			}
		}
		if (!btrfs_zone_activate(block_group))
			return false;
		if (*active_bg != block_group) {
			ASSERT(*active_bg == NULL);
			*active_bg = block_group;
			btrfs_get_block_group(block_group);
		}
	}

	return true;
}

/*
 * Check if @ctx->eb is aligned to the write pointer.
 *
 * Return:
 *   0:        @ctx->eb is at the write pointer. You can write it.
 *   -EAGAIN:  There is a hole. The caller should handle the case.
 *   -EBUSY:   There is a hole, but the caller can just bail out.
*/ int btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, struct btrfs_eb_write_context *ctx) { const struct writeback_control *wbc = ctx->wbc; const struct extent_buffer *eb = ctx->eb; struct btrfs_block_group *block_group = ctx->zoned_bg; if (!btrfs_is_zoned(fs_info)) return 0; if (block_group) { if (block_group->start > eb->start || block_group->start + block_group->length <= eb->start) { btrfs_put_block_group(block_group); block_group = NULL; ctx->zoned_bg = NULL; } } if (!block_group) { block_group = btrfs_lookup_block_group(fs_info, eb->start); if (!block_group) return 0; ctx->zoned_bg = block_group; } if (block_group->meta_write_pointer == eb->start) { struct btrfs_block_group **tgt; if (!test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags)) return 0; if (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM) tgt = &fs_info->active_system_bg; else tgt = &fs_info->active_meta_bg; if (check_bg_is_active(ctx, tgt)) return 0; } /* * Since we may release fs_info->zoned_meta_io_lock, someone can already * start writing this eb. In that case, we can just bail out. */ if (block_group->meta_write_pointer > eb->start) return -EBUSY; /* If for_sync, this hole will be filled with transaction commit. 
*/ if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) return -EAGAIN; return -EBUSY; } int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length) { if (!btrfs_dev_is_sequential(device, physical)) return -EOPNOTSUPP; return blkdev_issue_zeroout(device->bdev, physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT, GFP_NOFS, 0); } static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical, struct blk_zone *zone) { struct btrfs_io_context *bioc = NULL; u64 mapped_length = PAGE_SIZE; unsigned int nofs_flag; int nmirrors; int i, ret; ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, &mapped_length, &bioc, NULL, NULL); if (ret || !bioc || mapped_length < PAGE_SIZE) { ret = -EIO; goto out_put_bioc; } if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { ret = -EINVAL; goto out_put_bioc; } nofs_flag = memalloc_nofs_save(); nmirrors = (int)bioc->num_stripes; for (i = 0; i < nmirrors; i++) { u64 physical = bioc->stripes[i].physical; struct btrfs_device *dev = bioc->stripes[i].dev; /* Missing device */ if (!dev->bdev) continue; ret = btrfs_get_dev_zone(dev, physical, zone); /* Failing device */ if (ret == -EIO || ret == -EOPNOTSUPP) continue; break; } memalloc_nofs_restore(nofs_flag); out_put_bioc: btrfs_put_bioc(bioc); return ret; } /* * Synchronize write pointer in a zone at @physical_start on @tgt_dev, by * filling zeros between @physical_pos to a write pointer of dev-replace * source device. 
 */
int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
				    u64 physical_start, u64 physical_pos)
{
	struct btrfs_fs_info *fs_info = tgt_dev->fs_info;
	struct blk_zone zone;
	u64 length;
	u64 wp;
	int ret;

	if (!btrfs_dev_is_sequential(tgt_dev, physical_pos))
		return 0;

	ret = read_zone_info(fs_info, logical, &zone);
	if (ret)
		return ret;

	/* Translate the zone-relative write pointer onto the target. */
	wp = physical_start + ((zone.wp - zone.start) << SECTOR_SHIFT);

	if (physical_pos == wp)
		return 0;

	/* The target is already ahead of the reported write pointer. */
	if (physical_pos > wp)
		return -EUCLEAN;

	length = wp - physical_pos;
	return btrfs_zoned_issue_zeroout(tgt_dev, physical_pos, length);
}

/*
 * Activate block group and underlying device zones
 *
 * @block_group: the block group to activate
 *
 * Return: true on success, false otherwise
 */
bool btrfs_zone_activate(struct btrfs_block_group *block_group)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct btrfs_chunk_map *map;
	struct btrfs_device *device;
	u64 physical;
	const bool is_data = (block_group->flags & BTRFS_BLOCK_GROUP_DATA);
	bool ret;
	int i;

	if (!btrfs_is_zoned(block_group->fs_info))
		return true;

	map = block_group->physical_map;

	/* Lock order: zone_active_bgs_lock, then the block group lock. */
	spin_lock(&fs_info->zone_active_bgs_lock);
	spin_lock(&block_group->lock);
	if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) {
		ret = true;
		goto out_unlock;
	}

	/* No space left */
	if (btrfs_zoned_bg_is_full(block_group)) {
		ret = false;
		goto out_unlock;
	}

	for (i = 0; i < map->num_stripes; i++) {
		struct btrfs_zoned_device_info *zinfo;
		int reserved = 0;

		device = map->stripes[i].dev;
		physical = map->stripes[i].physical;
		zinfo = device->zone_info;

		/* max_active_zones == 0: nothing to account on this device. */
		if (zinfo->max_active_zones == 0)
			continue;

		if (is_data)
			reserved = zinfo->reserved_active_zones;
		/*
		 * For the data block group, leave active zones for one
		 * metadata block group and one system block group.
		 */
		if (atomic_read(&zinfo->active_zones_left) <= reserved) {
			ret = false;
			goto out_unlock;
		}

		if (!btrfs_dev_set_active_zone(device, physical)) {
			/* Cannot activate the zone */
			ret = false;
			goto out_unlock;
		}
		/* A non-data block group consumes one of the reserved zones. */
		if (!is_data)
			zinfo->reserved_active_zones--;
	}

	/* Successfully activated all the zones */
	set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags);
	spin_unlock(&block_group->lock);

	/* For the active block group list */
	btrfs_get_block_group(block_group);
	list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs);
	spin_unlock(&fs_info->zone_active_bgs_lock);

	return true;

out_unlock:
	spin_unlock(&block_group->lock);
	spin_unlock(&fs_info->zone_active_bgs_lock);
	return ret;
}

/*
 * Wait for writeback of every extent buffer found in fs_info->buffer_radix
 * whose start lies inside @block_group.
 */
static void wait_eb_writebacks(struct btrfs_block_group *block_group)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	const u64 end = block_group->start + block_group->length;
	struct radix_tree_iter iter;
	struct extent_buffer *eb;
	void __rcu **slot;

	rcu_read_lock();
	radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter,
				 block_group->start >> fs_info->sectorsize_bits) {
		eb = radix_tree_deref_slot(slot);
		if (!eb)
			continue;
		if (radix_tree_deref_retry(eb)) {
			slot = radix_tree_iter_retry(&iter);
			continue;
		}

		if (eb->start < block_group->start)
			continue;
		if (eb->start >= end)
			break;

		/*
		 * The RCU read lock is dropped around the wait, so the
		 * iterator is resumed first.
		 */
		slot = radix_tree_iter_resume(slot, &iter);
		rcu_read_unlock();
		wait_on_extent_buffer_writeback(eb);
		rcu_read_lock();
	}
	rcu_read_unlock();
}

/*
 * Finish the zones of @block_group and drop it from the active list.
 *
 * @block_group:   block group to finish
 * @fully_written: when false, the block group is first made read-only and
 *                 all pending reservations, ordered extents and (for
 *                 metadata/system) extent buffer writebacks are waited for
 *
 * Return: 0 on success or if the block group is not active, -EAGAIN if it
 * still has unwritten or reserved space or is a data relocation block group,
 * or an error from btrfs_inc_block_group_ro() / blkdev_zone_mgmt().
 */
static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct btrfs_chunk_map *map;
	const bool is_metadata = (block_group->flags &
			(BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM));
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
	int ret = 0;
	int i;

	spin_lock(&block_group->lock);
	if (!test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) {
		spin_unlock(&block_group->lock);
		return 0;
	}

	/* Check if we have unwritten allocated space */
	if (is_metadata &&
	    block_group->start + block_group->alloc_offset > block_group->meta_write_pointer) {
		spin_unlock(&block_group->lock);
		return -EAGAIN;
	}

	/*
	 * If we are sure that the block group is full (= no more room left for
	 * new allocation) and the IO for the last usable block is completed, we
	 * don't need to wait for the other IOs. This holds because we ensure
	 * the sequential IO submissions using the ZONE_APPEND command for data
	 * and block_group->meta_write_pointer for metadata.
	 */
	if (!fully_written) {
		if (test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) {
			spin_unlock(&block_group->lock);
			return -EAGAIN;
		}
		spin_unlock(&block_group->lock);

		ret = btrfs_inc_block_group_ro(block_group, false);
		if (ret)
			return ret;

		/* Ensure all writes in this block group finish */
		btrfs_wait_block_group_reservations(block_group);
		/* No need to wait for NOCOW writers. Zoned mode does not allow that */
		btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group);
		/* Wait for extent buffers to be written. */
		if (is_metadata)
			wait_eb_writebacks(block_group);

		spin_lock(&block_group->lock);

		/*
		 * Bail out if someone already deactivated the block group, or
		 * allocated space is left in the block group.
		 */
		if (!test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
			      &block_group->runtime_flags)) {
			spin_unlock(&block_group->lock);
			btrfs_dec_block_group_ro(block_group);
			return 0;
		}

		if (block_group->reserved ||
		    test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC,
			     &block_group->runtime_flags)) {
			spin_unlock(&block_group->lock);
			btrfs_dec_block_group_ro(block_group);
			return -EAGAIN;
		}
	}

	/* Mark the block group inactive and completely allocated. */
	clear_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags);
	block_group->alloc_offset = block_group->zone_capacity;
	if (block_group->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM))
		block_group->meta_write_pointer = block_group->start +
						  block_group->zone_capacity;
	block_group->free_space_ctl->free_space = 0;
	btrfs_clear_treelog_bg(block_group);
	btrfs_clear_data_reloc_bg(block_group);
	spin_unlock(&block_group->lock);

	down_read(&dev_replace->rwsem);
	map = block_group->physical_map;
	for (i = 0; i < map->num_stripes; i++) {
		struct btrfs_device *device = map->stripes[i].dev;
		const u64 physical = map->stripes[i].physical;
		struct btrfs_zoned_device_info *zinfo = device->zone_info;
		unsigned int nofs_flags;

		if (zinfo->max_active_zones == 0)
			continue;

		nofs_flags = memalloc_nofs_save();
		ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
				       physical >> SECTOR_SHIFT,
				       zinfo->zone_size >> SECTOR_SHIFT);
		memalloc_nofs_restore(nofs_flags);

		/*
		 * NOTE(review): this early return skips the
		 * btrfs_dec_block_group_ro() and active list removal done
		 * below - confirm that is intended.
		 */
		if (ret) {
			up_read(&dev_replace->rwsem);
			return ret;
		}

		/* Give the reserved active zone back for non-data groups. */
		if (!(block_group->flags & BTRFS_BLOCK_GROUP_DATA))
			zinfo->reserved_active_zones++;
		btrfs_dev_clear_active_zone(device, physical);
	}
	up_read(&dev_replace->rwsem);

	if (!fully_written)
		btrfs_dec_block_group_ro(block_group);

	spin_lock(&fs_info->zone_active_bgs_lock);
	ASSERT(!list_empty(&block_group->active_bg_list));
	list_del_init(&block_group->active_bg_list);
	spin_unlock(&fs_info->zone_active_bgs_lock);

	/* For active_bg_list */
	btrfs_put_block_group(block_group);

	clear_and_wake_up_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags);

	return 0;
}

/*
 * Finish @block_group through do_zone_finish() with fully_written == false.
 * Returns 0 without doing anything when the filesystem is not zoned.
 */
int btrfs_zone_finish(struct btrfs_block_group *block_group)
{
	if
(!btrfs_is_zoned(block_group->fs_info)) return 0; return do_zone_finish(block_group, false); } bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags) { struct btrfs_fs_info *fs_info = fs_devices->fs_info; struct btrfs_device *device; bool ret = false; if (!btrfs_is_zoned(fs_info)) return true; /* Check if there is a device with active zones left */ mutex_lock(&fs_info->chunk_mutex); spin_lock(&fs_info->zone_active_bgs_lock); list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { struct btrfs_zoned_device_info *zinfo = device->zone_info; int reserved = 0; if (!device->bdev) continue; if (!zinfo->max_active_zones) { ret = true; break; } if (flags & BTRFS_BLOCK_GROUP_DATA) reserved = zinfo->reserved_active_zones; switch (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) { case 0: /* single */ ret = (atomic_read(&zinfo->active_zones_left) >= (1 + reserved)); break; case BTRFS_BLOCK_GROUP_DUP: ret = (atomic_read(&zinfo->active_zones_left) >= (2 + reserved)); break; } if (ret) break; } spin_unlock(&fs_info->zone_active_bgs_lock); mutex_unlock(&fs_info->chunk_mutex); if (!ret) set_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags); return ret; } void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length) { struct btrfs_block_group *block_group; u64 min_alloc_bytes; if (!btrfs_is_zoned(fs_info)) return; block_group = btrfs_lookup_block_group(fs_info, logical); ASSERT(block_group); /* No MIXED_BG on zoned btrfs. */ if (block_group->flags & BTRFS_BLOCK_GROUP_DATA) min_alloc_bytes = fs_info->sectorsize; else min_alloc_bytes = fs_info->nodesize; /* Bail out if we can allocate more data from this block group. 
*/ if (logical + length + min_alloc_bytes <= block_group->start + block_group->zone_capacity) goto out; do_zone_finish(block_group, true); out: btrfs_put_block_group(block_group); } static void btrfs_zone_finish_endio_workfn(struct work_struct *work) { struct btrfs_block_group *bg = container_of(work, struct btrfs_block_group, zone_finish_work); wait_on_extent_buffer_writeback(bg->last_eb); free_extent_buffer(bg->last_eb); btrfs_zone_finish_endio(bg->fs_info, bg->start, bg->length); btrfs_put_block_group(bg); } void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, struct extent_buffer *eb) { if (!test_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &bg->runtime_flags) || eb->start + eb->len * 2 <= bg->start + bg->zone_capacity) return; if (WARN_ON(bg->zone_finish_work.func == btrfs_zone_finish_endio_workfn)) { btrfs_err(bg->fs_info, "double scheduling of bg %llu zone finishing", bg->start); return; } /* For the work */ btrfs_get_block_group(bg); atomic_inc(&eb->refs); bg->last_eb = eb; INIT_WORK(&bg->zone_finish_work, btrfs_zone_finish_endio_workfn); queue_work(system_unbound_wq, &bg->zone_finish_work); } void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) { struct btrfs_fs_info *fs_info = bg->fs_info; spin_lock(&fs_info->relocation_bg_lock); if (fs_info->data_reloc_bg == bg->start) fs_info->data_reloc_bg = 0; spin_unlock(&fs_info->relocation_bg_lock); } void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) { struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; if (!btrfs_is_zoned(fs_info)) return; mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry(device, &fs_devices->devices, dev_list) { if (device->zone_info) { vfree(device->zone_info->zone_cache); device->zone_info->zone_cache = NULL; } } mutex_unlock(&fs_devices->device_list_mutex); } bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info) { struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; u64 used 
= 0; u64 total = 0; u64 factor; ASSERT(btrfs_is_zoned(fs_info)); if (fs_info->bg_reclaim_threshold == 0) return false; mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry(device, &fs_devices->devices, dev_list) { if (!device->bdev) continue; total += device->disk_total_bytes; used += device->bytes_used; } mutex_unlock(&fs_devices->device_list_mutex); factor = div64_u64(used * 100, total); return factor >= fs_info->bg_reclaim_threshold; } void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical, u64 length) { struct btrfs_block_group *block_group; if (!btrfs_is_zoned(fs_info)) return; block_group = btrfs_lookup_block_group(fs_info, logical); /* It should be called on a previous data relocation block group. */ ASSERT(block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)); spin_lock(&block_group->lock); if (!test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) goto out; /* All relocation extents are written. */ if (block_group->start + block_group->alloc_offset == logical + length) { /* * Now, release this block group for further allocations and * zone finish. 
		 */
		clear_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC,
			  &block_group->runtime_flags);
	}

out:
	spin_unlock(&block_group->lock);
	btrfs_put_block_group(block_group);
}

/*
 * Finish the active block group with the least space left.
 *
 * Block groups that have reserved bytes, have nothing allocated, are SYSTEM
 * block groups, or are data relocation block groups are not considered.
 *
 * Return: 1 if a block group was finished, 0 if there was no candidate,
 * negative errno if btrfs_zone_finish() failed.
 */
int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info)
{
	struct btrfs_block_group *block_group;
	struct btrfs_block_group *min_bg = NULL;
	u64 min_avail = U64_MAX;
	int ret;

	spin_lock(&fs_info->zone_active_bgs_lock);
	list_for_each_entry(block_group, &fs_info->zone_active_bgs,
			    active_bg_list) {
		u64 avail;

		spin_lock(&block_group->lock);
		if (block_group->reserved || block_group->alloc_offset == 0 ||
		    (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM) ||
		    test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) {
			spin_unlock(&block_group->lock);
			continue;
		}

		avail = block_group->zone_capacity - block_group->alloc_offset;
		if (min_avail > avail) {
			/* Swap the reference over to the new best candidate. */
			if (min_bg)
				btrfs_put_block_group(min_bg);
			min_bg = block_group;
			min_avail = avail;
			btrfs_get_block_group(min_bg);
		}
		spin_unlock(&block_group->lock);
	}
	spin_unlock(&fs_info->zone_active_bgs_lock);

	if (!min_bg)
		return 0;

	ret = btrfs_zone_finish(min_bg);
	btrfs_put_block_group(min_bg);

	return ret < 0 ? ret : 1;
}

/*
 * Try to activate one block group of @space_info.
 *
 * @fs_info:    filesystem
 * @space_info: space info to search; data space infos are ignored
 * @do_finish:  when true, finish another block group and retry if every
 *              activation attempt failed
 *
 * Return: 1 if a block group was activated, 0 if none could be (or the
 * filesystem is not zoned), negative errno from btrfs_zone_finish_one_bg().
 */
int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
				struct btrfs_space_info *space_info,
				bool do_finish)
{
	struct btrfs_block_group *bg;
	int index;

	if (!btrfs_is_zoned(fs_info) || (space_info->flags & BTRFS_BLOCK_GROUP_DATA))
		return 0;

	for (;;) {
		int ret;
		bool need_finish = false;

		down_read(&space_info->groups_sem);
		for (index = 0; index < BTRFS_NR_RAID_TYPES; index++) {
			list_for_each_entry(bg, &space_info->block_groups[index],
					    list) {
				/* Block groups whose lock is contended are skipped. */
				if (!spin_trylock(&bg->lock))
					continue;
				if (btrfs_zoned_bg_is_full(bg) ||
				    test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
					     &bg->runtime_flags)) {
					spin_unlock(&bg->lock);
					continue;
				}
				spin_unlock(&bg->lock);

				if (btrfs_zone_activate(bg)) {
					up_read(&space_info->groups_sem);
					return 1;
				}

				/* A candidate exists but could not be activated. */
				need_finish = true;
			}
		}
		up_read(&space_info->groups_sem);

		if (!do_finish || !need_finish)
			break;

		ret = btrfs_zone_finish_one_bg(fs_info);
		if (ret == 0)
			break;
		if (ret < 0)
			return ret;
	}

	return 0;
}

/*
 * Reserve zones for one metadata block group, one tree-log block group, and one
 * system block group.
 */
void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_block_group *block_group;
	struct btrfs_device *device;
	/* Reserve zones for normal SINGLE metadata and tree-log block group. */
	unsigned int metadata_reserve = 2;
	/* Reserve a zone for SINGLE system block group. */
	unsigned int system_reserve = 1;

	if (!test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags))
		return;

	/*
	 * This function is called from the mount context. So, there is no
	 * parallel process touching the bits. No need for read_seqretry().
	 */
	if (fs_info->avail_metadata_alloc_bits & BTRFS_BLOCK_GROUP_DUP)
		metadata_reserve = 4;
	if (fs_info->avail_system_alloc_bits & BTRFS_BLOCK_GROUP_DUP)
		system_reserve = 2;

	/* Apply the reservation on all the devices. */
	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		if (!device->bdev)
			continue;

		device->zone_info->reserved_active_zones =
			metadata_reserve + system_reserve;
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	/* Release reservation for currently active block groups. */
	spin_lock(&fs_info->zone_active_bgs_lock);
	list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) {
		struct btrfs_chunk_map *map = block_group->physical_map;

		if (!(block_group->flags &
		      (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)))
			continue;

		for (int i = 0; i < map->num_stripes; i++)
			map->stripes[i].dev->zone_info->reserved_active_zones--;
	}
	spin_unlock(&fs_info->zone_active_bgs_lock);
}

/*
 * Reset the zones of unused block groups from @space_info->bytes_zone_unusable.
 *
 * @space_info: the space to work on
 * @num_bytes:  targeting reclaim bytes
 *
 * This one resets the zones of a block group, so we can reuse the region
 * without removing the block group. On the other hand, btrfs_delete_unused_bgs()
 * just removes a block group and frees up the underlying zones. So, we still
 * need to allocate a new block group to reuse the zones.
 *
 * Resetting is faster than deleting/recreating a block group. It is similar
 * to freeing the logical space on the regular mode. However, we cannot change
 * the block group's profile with this operation.
 *
 * Return: 0 on success or when no suitable block group is left, otherwise
 * the error returned by blkdev_zone_mgmt().
 */
int btrfs_reset_unused_block_groups(struct btrfs_space_info *space_info, u64 num_bytes)
{
	struct btrfs_fs_info *fs_info = space_info->fs_info;
	const sector_t zone_size_sectors = fs_info->zone_size >> SECTOR_SHIFT;

	if (!btrfs_is_zoned(fs_info))
		return 0;

	while (num_bytes > 0) {
		struct btrfs_chunk_map *map;
		struct btrfs_block_group *bg = NULL;
		bool found = false;
		u64 reclaimed = 0;

		/*
		 * Here, we choose a fully zone_unusable block group. It's
		 * technically possible to reset a partly zone_unusable block
		 * group, which still has some free space left. However,
		 * handling that needs to cope with the allocation side, which
		 * makes the logic more complex. So, let's handle the easy case
		 * for now.
		 */
		spin_lock(&fs_info->unused_bgs_lock);
		list_for_each_entry(bg, &fs_info->unused_bgs, bg_list) {
			if ((bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) != space_info->flags)
				continue;

			/*
			 * Use trylock to avoid locking order violation. In
			 * btrfs_reclaim_bgs_work(), the lock order is
			 * &bg->lock -> &fs_info->unused_bgs_lock. We skip a
			 * block group if we cannot take its lock.
			 */
			if (!spin_trylock(&bg->lock))
				continue;
			if (btrfs_is_block_group_used(bg) || bg->zone_unusable < bg->length) {
				spin_unlock(&bg->lock);
				continue;
			}
			spin_unlock(&bg->lock);
			found = true;
			break;
		}
		if (!found) {
			spin_unlock(&fs_info->unused_bgs_lock);
			return 0;
		}

		/*
		 * NOTE(review): bg is still used after this put; presumably
		 * another reference keeps it alive - confirm.
		 */
		list_del_init(&bg->bg_list);
		btrfs_put_block_group(bg);
		spin_unlock(&fs_info->unused_bgs_lock);

		/*
		 * Since the block group is fully zone_unusable and we cannot
		 * allocate from this block group anymore, we don't need to set
		 * this block group read-only.
		 */

		down_read(&fs_info->dev_replace.rwsem);
		map = bg->physical_map;
		for (int i = 0; i < map->num_stripes; i++) {
			struct btrfs_io_stripe *stripe = &map->stripes[i];
			unsigned int nofs_flags;
			int ret;

			nofs_flags = memalloc_nofs_save();
			ret = blkdev_zone_mgmt(stripe->dev->bdev, REQ_OP_ZONE_RESET,
					       stripe->physical >> SECTOR_SHIFT,
					       zone_size_sectors);
			memalloc_nofs_restore(nofs_flags);

			if (ret) {
				up_read(&fs_info->dev_replace.rwsem);
				return ret;
			}
		}
		up_read(&fs_info->dev_replace.rwsem);

		spin_lock(&space_info->lock);
		spin_lock(&bg->lock);
		ASSERT(!btrfs_is_block_group_used(bg));
		/* A read-only block group is left alone; try the next one. */
		if (bg->ro) {
			spin_unlock(&bg->lock);
			spin_unlock(&space_info->lock);
			continue;
		}

		reclaimed = bg->alloc_offset;
		bg->zone_unusable = bg->length - bg->zone_capacity;
		bg->alloc_offset = 0;
		/*
		 * This holds because we currently reset fully used then freed
		 * block group.
		 */
		ASSERT(reclaimed == bg->zone_capacity);
		bg->free_space_ctl->free_space += reclaimed;
		space_info->bytes_zone_unusable -= reclaimed;
		spin_unlock(&bg->lock);
		btrfs_return_free_space(space_info, reclaimed);
		spin_unlock(&space_info->lock);

		if (num_bytes <= reclaimed)
			break;
		num_bytes -= reclaimed;
	}

	return 0;
}
2 17 2 8 4 13 16 17 16 17 1 16 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 #ifndef DEFUTIL_H #define DEFUTIL_H #include <linux/zutil.h> #define Assert(err, str) #define Trace(dummy) #define Tracev(dummy) #define Tracecv(err, dummy) #define Tracevv(dummy) #define LENGTH_CODES 29 /* number of length codes, not counting the special END_BLOCK code */ #define LITERALS 256 /* number 
of literal bytes 0..255 */

#define L_CODES (LITERALS+1+LENGTH_CODES)
/* number of Literal or Length codes, including the END_BLOCK code */

#define D_CODES   30
/* number of distance codes */

#define BL_CODES  19
/* number of codes used to transfer the bit lengths */

#define HEAP_SIZE (2*L_CODES+1)
/* maximum heap size */

#define MAX_BITS 15
/* All codes must not exceed MAX_BITS bits */

#define INIT_STATE    42
#define BUSY_STATE   113
#define FINISH_STATE 666
/* Stream status */


/*
 * Data structure describing a single value and its code string.
 * Each of the two unions overlays two 16-bit fields, so a node holds
 * either freq or code, and either dad or len, at any one time.
 */
typedef struct ct_data_s {
    union {
        ush  freq;       /* frequency count */
        ush  code;       /* bit string */
    } fc;
    union {
        ush  dad;        /* father node in Huffman tree */
        ush  len;        /* length of bit string */
    } dl;
} ct_data;

/* Shorthand member names for the union fields of ct_data. */
#define Freq fc.freq
#define Code fc.code
#define Dad  dl.dad
#define Len  dl.len

/* Only declared here; the structure is defined elsewhere. */
typedef struct static_tree_desc_s  static_tree_desc;

/* Describes one tree: its dynamic nodes plus the matching static tree. */
typedef struct tree_desc_s {
    ct_data *dyn_tree;           /* the dynamic tree */
    int     max_code;            /* largest code with non zero frequency */
    static_tree_desc *stat_desc; /* the corresponding static tree */
} tree_desc;

typedef ush Pos;
typedef unsigned IPos;

/* A Pos is an index in the character window. We use short instead of int to
 * save space in the various tables. IPos is used only for parameter passing.
*/

/*
 * State of one deflate stream. The first group of fields tracks the
 * stream and its pending output, the second is used by deflate.c and the
 * third by trees.c.
 */
typedef struct deflate_state {
    z_streamp strm;      /* pointer back to this zlib stream */
    int   status;        /* as the name implies */
    Byte *pending_buf;   /* output still pending */
    ulg   pending_buf_size; /* size of pending_buf */
    Byte *pending_out;   /* next pending byte to output to the stream */
    int   pending;       /* nb of bytes in the pending buffer */
    int   noheader;      /* suppress zlib header and adler32 */
    Byte  data_type;     /* UNKNOWN, BINARY or ASCII */
    Byte  method;        /* STORED (for zip only) or DEFLATED */
    int   last_flush;    /* value of flush param for previous deflate call */

                /* used by deflate.c: */

    uInt  w_size;        /* LZ77 window size (32K by default) */
    uInt  w_bits;        /* log2(w_size)  (8..16) */
    uInt  w_mask;        /* w_size - 1 */

    Byte *window;
    /* Sliding window. Input bytes are read into the second half of the window,
     * and move to the first half later to keep a dictionary of at least wSize
     * bytes. With this organization, matches are limited to a distance of
     * wSize-MAX_MATCH bytes, but this ensures that IO is always
     * performed with a length multiple of the block size. Also, it limits
     * the window size to 64K, which is quite useful on MSDOS.
     * To do: use the user input buffer as sliding window.
     */

    ulg window_size;
    /* Actual size of window: 2*wSize, except when the user input buffer
     * is directly used as sliding window.
     */

    Pos *prev;
    /* Link to older string with same hash index. To limit the size of this
     * array to 64K, this link is maintained only for the last 32K strings.
     * An index in this array is thus a window index modulo 32K.
     */

    Pos *head; /* Heads of the hash chains or NIL. */

    uInt  ins_h;          /* hash index of string to be inserted */
    uInt  hash_size;      /* number of elements in hash table */
    uInt  hash_bits;      /* log2(hash_size) */
    uInt  hash_mask;      /* hash_size-1 */

    uInt  hash_shift;
    /* Number of bits by which ins_h must be shifted at each input
     * step. It must be such that after MIN_MATCH steps, the oldest
     * byte no longer takes part in the hash key, that is:
     *   hash_shift * MIN_MATCH >= hash_bits
     */

    long block_start;
    /* Window position at the beginning of the current output block. Gets
     * negative when the window is moved backwards.
     */

    uInt match_length;           /* length of best match */
    IPos prev_match;             /* previous match */
    int match_available;         /* set if previous match exists */
    uInt strstart;               /* start of string to insert */
    uInt match_start;            /* start of matching string */
    uInt lookahead;              /* number of valid bytes ahead in window */

    uInt prev_length;
    /* Length of the best match at previous step. Matches not greater than this
     * are discarded. This is used in the lazy match evaluation.
     */

    uInt max_chain_length;
    /* To speed up deflation, hash chains are never searched beyond this
     * length.  A higher limit improves compression ratio but degrades the
     * speed.
     */

    uInt max_lazy_match;
    /* Attempt to find a better match only when the current match is strictly
     * smaller than this value. This mechanism is used only for compression
     * levels >= 4.
     */
#   define max_insert_length  max_lazy_match
    /* Insert new strings in the hash table only if the match length is not
     * greater than this length. This saves time but degrades compression.
     * max_insert_length is used only for compression levels <= 3.
     * (It is an alias for the max_lazy_match field, not a separate field.)
     */

    int level;    /* compression level (1..9) */
    int strategy; /* favor or force Huffman coding*/

    uInt good_match;
    /* Use a faster search when the previous match is longer than this */

    int nice_match; /* Stop searching when current match exceeds this */

                /* used by trees.c: */
    /* Didn't use ct_data typedef below to suppress compiler warning */
    struct ct_data_s dyn_ltree[HEAP_SIZE];   /* literal and length tree */
    struct ct_data_s dyn_dtree[2*D_CODES+1]; /* distance tree */
    struct ct_data_s bl_tree[2*BL_CODES+1];  /* Huffman tree for bit lengths */

    struct tree_desc_s l_desc;               /* desc. for literal tree */
    struct tree_desc_s d_desc;               /* desc. for distance tree */
    struct tree_desc_s bl_desc;              /* desc. for bit length tree */

    ush bl_count[MAX_BITS+1];
    /* number of codes at each bit length for an optimal tree */

    int heap[2*L_CODES+1];      /* heap used to build the Huffman trees */
    int heap_len;               /* number of elements in the heap */
    int heap_max;               /* element of largest frequency */
    /* The sons of heap[n] are heap[2*n] and heap[2*n+1]. heap[0] is not used.
     * The same heap array is used to build all trees.
     */

    uch depth[2*L_CODES+1];
    /* Depth of each subtree used as tie breaker for trees of equal frequency
     */

    uch *l_buf;          /* buffer for literals or lengths */

    uInt  lit_bufsize;
    /* Size of match buffer for literals/lengths.  There are 4 reasons for
     * limiting lit_bufsize to 64K:
     *   - frequencies can be kept in 16 bit counters
     *   - if compression is not successful for the first block, all input
     *     data is still in the window so we can still emit a stored block even
     *     when input comes from standard input.  (This can also be done for
     *     all blocks if lit_bufsize is not greater than 32K.)
     *   - if compression is not successful for a file smaller than 64K, we can
     *     even emit a stored file instead of a stored block (saving 5 bytes).
     *     This is applicable only for zip (not gzip or zlib).
     *   - creating new Huffman trees less frequently may not provide fast
     *     adaptation to changes in the input data statistics. (Take for
     *     example a binary file with poorly compressible code followed by
     *     a highly compressible string table.) Smaller buffer sizes give
     *     fast adaptation but have of course the overhead of transmitting
     *     trees more frequently.
     *   - I can't count above 4
     */

    uInt last_lit;      /* running index in l_buf */

    ush *d_buf;
    /* Buffer for distances. To simplify the code, d_buf and l_buf have
     * the same number of elements. To use different lengths, an extra flag
     * array would be necessary.
     */

    ulg opt_len;        /* bit length of current block with optimal trees */
    ulg static_len;     /* bit length of current block with static trees */
    ulg compressed_len; /* total bit length of compressed file */
    uInt matches;       /* number of string matches in current block */
    int last_eob_len;   /* bit length of EOB code for last block */

#ifdef DEBUG_ZLIB
    ulg bits_sent;      /* bit length of the compressed data */
#endif

    ush bi_buf;
    /* Output buffer. bits are inserted starting at the bottom (least
     * significant bits).
     */
    int bi_valid;
    /* Number of valid bits in bi_buf.  All bits above the last valid bit
     * are always zero.
     */

} deflate_state;

/*
 * Bytes for the sliding window: 2 * (1 << windowBits) elements of Byte.
 * With CONFIG_ZLIB_DFLTCC one additional PAGE_SIZE is added on top.
 */
#ifdef CONFIG_ZLIB_DFLTCC
#define zlib_deflate_window_memsize(windowBits) \
	(2 * (1 << (windowBits)) * sizeof(Byte) + PAGE_SIZE)
#else
#define zlib_deflate_window_memsize(windowBits) \
	(2 * (1 << (windowBits)) * sizeof(Byte))
#endif
/* Bytes for (1 << windowBits) elements of Pos. */
#define zlib_deflate_prev_memsize(windowBits) \
	((1 << (windowBits)) * sizeof(Pos))
/* Bytes for (1 << (memLevel + 7)) elements of Pos. */
#define zlib_deflate_head_memsize(memLevel) \
	((1 << ((memLevel)+7)) * sizeof(Pos))
/* Bytes for (1 << (memLevel + 6)) elements of sizeof(ush) + 2 bytes each. */
#define zlib_deflate_overlay_memsize(memLevel) \
	((1 << ((memLevel)+6)) * (sizeof(ush)+2))

/* Output a byte on the stream.
 * IN assertion: there is enough room in pending_buf.
 * Note: expands to a braced block and names `s` twice, so `s` must be an
 * expression without side effects.
 */
#define put_byte(s, c) {s->pending_buf[s->pending++] = (c);}

#define MIN_LOOKAHEAD (MAX_MATCH+MIN_MATCH+1)
/* Minimum amount of lookahead, except at the end of the input file.
 * See deflate.c for comments about the MIN_MATCH+1.
 */

#define MAX_DIST(s)  ((s)->w_size-MIN_LOOKAHEAD)
/* In order to simplify the code, particularly on 16 bit machines, match
 * distances are limited to MAX_DIST instead of WSIZE.
*/ /* in trees.c */ void zlib_tr_init (deflate_state *s); int zlib_tr_tally (deflate_state *s, unsigned dist, unsigned lc); ulg zlib_tr_flush_block (deflate_state *s, char *buf, ulg stored_len, int eof); void zlib_tr_align (deflate_state *s); void zlib_tr_stored_block (deflate_state *s, char *buf, ulg stored_len, int eof); void zlib_tr_stored_type_only (deflate_state *); /* =========================================================================== * Output a short LSB first on the stream. * IN assertion: there is enough room in pendingBuf. */ #define put_short(s, w) { \ put_byte(s, (uch)((w) & 0xff)); \ put_byte(s, (uch)((ush)(w) >> 8)); \ } /* =========================================================================== * Reverse the first len bits of a code, using straightforward code (a faster * method would use a table) * IN assertion: 1 <= len <= 15 */ static inline unsigned bi_reverse( unsigned code, /* the value to invert */ int len /* its bit length */ ) { register unsigned res = 0; do { res |= code & 1; code >>= 1, res <<= 1; } while (--len > 0); return res >> 1; } /* =========================================================================== * Flush the bit buffer, keeping at most 7 bits in it. 
*/ static inline void bi_flush(deflate_state *s) { if (s->bi_valid == 16) { put_short(s, s->bi_buf); s->bi_buf = 0; s->bi_valid = 0; } else if (s->bi_valid >= 8) { put_byte(s, (Byte)s->bi_buf); s->bi_buf >>= 8; s->bi_valid -= 8; } } /* =========================================================================== * Flush the bit buffer and align the output on a byte boundary */ static inline void bi_windup(deflate_state *s) { if (s->bi_valid > 8) { put_short(s, s->bi_buf); } else if (s->bi_valid > 0) { put_byte(s, (Byte)s->bi_buf); } s->bi_buf = 0; s->bi_valid = 0; #ifdef DEBUG_ZLIB s->bits_sent = (s->bits_sent+7) & ~7; #endif } typedef enum { need_more, /* block not completed, need more input or more output */ block_done, /* block flush performed */ finish_started, /* finish started, need only more output at next deflate */ finish_done /* finish done, accept no more input or output */ } block_state; #define Buf_size (8 * 2*sizeof(char)) /* Number of bits used within bi_buf. (bi_buf might be implemented on * more than 16 bits on some systems.) */ /* =========================================================================== * Send a value on a given number of bits. * IN assertion: length <= 16 and value fits in length bits. */ #ifdef DEBUG_ZLIB static void send_bits (deflate_state *s, int value, int length); static void send_bits( deflate_state *s, int value, /* value to send */ int length /* number of bits */ ) { Tracevv((stderr," l %2d v %4x ", length, value)); Assert(length > 0 && length <= 15, "invalid length"); s->bits_sent += (ulg)length; /* If not enough room in bi_buf, use (valid) bits from bi_buf and * (16 - bi_valid) bits from value, leaving (width - (16-bi_valid)) * unused bits in value. 
*/ if (s->bi_valid > (int)Buf_size - length) { s->bi_buf |= (value << s->bi_valid); put_short(s, s->bi_buf); s->bi_buf = (ush)value >> (Buf_size - s->bi_valid); s->bi_valid += length - Buf_size; } else { s->bi_buf |= value << s->bi_valid; s->bi_valid += length; } } #else /* !DEBUG_ZLIB */ #define send_bits(s, value, length) \ { int len = length;\ if (s->bi_valid > (int)Buf_size - len) {\ int val = value;\ s->bi_buf |= (val << s->bi_valid);\ put_short(s, s->bi_buf);\ s->bi_buf = (ush)val >> (Buf_size - s->bi_valid);\ s->bi_valid += len - Buf_size;\ } else {\ s->bi_buf |= (value) << s->bi_valid;\ s->bi_valid += len;\ }\ } #endif /* DEBUG_ZLIB */ static inline void zlib_tr_send_bits( deflate_state *s, int value, int length ) { send_bits(s, value, length); } /* ========================================================================= * Flush as much pending output as possible. All deflate() output goes * through this function so some applications may wish to modify it * to avoid allocating a large strm->next_out buffer and copying into it. * (See also read_buf()). */ static inline void flush_pending( z_streamp strm ) { unsigned len; deflate_state *s = (deflate_state *) strm->state; bi_flush(s); len = s->pending; if (len > strm->avail_out) len = strm->avail_out; if (len == 0) return; if (strm->next_out != NULL) { memcpy(strm->next_out, s->pending_out, len); strm->next_out += len; } s->pending_out += len; strm->total_out += len; strm->avail_out -= len; s->pending -= len; if (s->pending == 0) { s->pending_out = s->pending_buf; } } #endif /* DEFUTIL_H */
14 6 14 1 1 6 13 13 5 6 4 12 4 9 9 1 15 15 7 9 9 9 9 9 9 2 2 2 8 2 9 9 2 7 7 9 9 7 7 7 7 7 2 2 6 6 12 10 2 2 2 2 1 2 2 2 2 1 1 1 1 7 1 1 1 1 7 1 1 7 6 7 6 2 2 7 7 23 1 22 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 
485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 
985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 // SPDX-License-Identifier: GPL-2.0-only /* * kexec.c - kexec system call core code. * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/btf.h> #include <linux/capability.h> #include <linux/mm.h> #include <linux/file.h> #include <linux/slab.h> #include <linux/fs.h> #include <linux/kexec.h> #include <linux/mutex.h> #include <linux/list.h> #include <linux/highmem.h> #include <linux/syscalls.h> #include <linux/reboot.h> #include <linux/ioport.h> #include <linux/hardirq.h> #include <linux/elf.h> #include <linux/elfcore.h> #include <linux/utsname.h> #include <linux/numa.h> #include <linux/suspend.h> #include <linux/device.h> #include <linux/freezer.h> #include <linux/panic_notifier.h> #include <linux/pm.h> #include <linux/cpu.h> #include <linux/uaccess.h> #include <linux/io.h> #include <linux/console.h> #include <linux/vmalloc.h> #include <linux/swap.h> #include <linux/syscore_ops.h> #include <linux/compiler.h> #include <linux/hugetlb.h> #include <linux/objtool.h> #include <linux/kmsg_dump.h> #include <asm/page.h> #include <asm/sections.h> #include <crypto/hash.h> #include "kexec_internal.h" atomic_t __kexec_lock = ATOMIC_INIT(0); /* Flag to indicate we are going to kexec a new kernel */ bool kexec_in_progress = false; bool kexec_file_dbg_print; /* * When kexec transitions to the new kernel there is a one-to-one * mapping between physical and virtual addresses. 
On processors * where you can disable the MMU this is trivial, and easy. For * others it is still a simple predictable page table to setup. * * In that environment kexec copies the new kernel to its final * resting place. This means I can only support memory whose * physical address can fit in an unsigned long. In particular * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled. * If the assembly stub has more restrictive requirements * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be * defined more restrictively in <asm/kexec.h>. * * The code for the transition from the current kernel to the * new kernel is placed in the control_code_buffer, whose size * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single * page of memory is necessary, but some architectures require more. * Because this memory must be identity mapped in the transition from * virtual to physical addresses it must live in the range * 0 - TASK_SIZE, as only the user space mappings are arbitrarily * modifiable. * * The assembly stub in the control code buffer is passed a linked list * of descriptor pages detailing the source pages of the new kernel, * and the destination addresses of those source pages. As this data * structure is not used in the context of the current OS, it must * be self-contained. * * The code has been made to work with highmem pages and will use a * destination page in its final resting place (if it happens * to allocate it). The end product of this is that most of the * physical address space, and most of RAM can be used. * * Future directions include: * - allocating a page table with the control code buffer identity * mapped, to simplify machine_kexec and make kexec_on_panic more * reliable. */ /* * KIMAGE_NO_DEST is an impossible destination address..., for * allocating pages whose destination address we do not care about. 
*/
#define KIMAGE_NO_DEST (-1UL)
/* Number of pages needed to hold x bytes, rounding up. */
#define PAGE_COUNT(x) (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)

static struct page *kimage_alloc_page(struct kimage *image,
				       gfp_t gfp_mask,
				       unsigned long dest);

/*
 * Validate the segment list of @image before anything is loaded.
 *
 * Returns 0 when every check passes, -EADDRNOTAVAIL when a segment has an
 * unusable destination address (wrapping, unaligned, beyond the limit, or
 * outside the crash kernel region for crash images), and -EINVAL when
 * segments overlap, a buffer is larger than its memory area, or the image
 * would occupy more than half of RAM.
 */
int sanity_check_segment_list(struct kimage *image)
{
	int i;
	unsigned long nr_segments = image->nr_segments;
	unsigned long total_pages = 0;
	unsigned long nr_pages = totalram_pages();

	/*
	 * Verify we have good destination addresses.  The caller is
	 * responsible for making certain we don't attempt to load
	 * the new image into invalid or reserved areas of RAM.  This
	 * just verifies it is an address we can use.
	 *
	 * Since the kernel does everything in page size chunks ensure
	 * the destination addresses are page aligned.  Too many
	 * special cases crop up when we don't do this.  The most
	 * insidious is getting overlapping destination addresses
	 * simply because addresses are changed to page size
	 * granularity.
	 */
	for (i = 0; i < nr_segments; i++) {
		unsigned long mstart, mend;

		mstart = image->segment[i].mem;
		mend   = mstart + image->segment[i].memsz;
		/* mend below mstart means the addition wrapped around. */
		if (mstart > mend)
			return -EADDRNOTAVAIL;
		if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
			return -EADDRNOTAVAIL;
		if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
			return -EADDRNOTAVAIL;
	}

	/* Verify our destination addresses do not overlap.
	 * If we allowed overlapping destination addresses
	 * through very weird things can happen with no
	 * easy explanation as one segment stops on another.
	 */
	for (i = 0; i < nr_segments; i++) {
		unsigned long mstart, mend;
		unsigned long j;

		mstart = image->segment[i].mem;
		mend   = mstart + image->segment[i].memsz;
		/* Compare against every earlier segment (end addresses exclusive). */
		for (j = 0; j < i; j++) {
			unsigned long pstart, pend;

			pstart = image->segment[j].mem;
			pend   = pstart + image->segment[j].memsz;
			/* Do the segments overlap ? */
			if ((mend > pstart) && (mstart < pend))
				return -EINVAL;
		}
	}

	/* Ensure our buffer sizes are no larger than
	 * our memory sizes.  This should always be the case,
	 * and it is easier to check up front than to be surprised
	 * later on.
	 */
	for (i = 0; i < nr_segments; i++) {
		if (image->segment[i].bufsz > image->segment[i].memsz)
			return -EINVAL;
	}

	/*
	 * Verify that no more than half of memory will be consumed. If the
	 * request from userspace is too large, a large amount of time will be
	 * wasted allocating pages, which can cause a soft lockup.
	 */
	for (i = 0; i < nr_segments; i++) {
		/* Checked per segment first, then again on the running total. */
		if (PAGE_COUNT(image->segment[i].memsz) > nr_pages / 2)
			return -EINVAL;

		total_pages += PAGE_COUNT(image->segment[i].memsz);
	}

	if (total_pages > nr_pages / 2)
		return -EINVAL;

#ifdef CONFIG_CRASH_DUMP
	/*
	 * Verify we have good destination addresses.  Normally
	 * the caller is responsible for making certain we don't
	 * attempt to load the new image into invalid or reserved
	 * areas of RAM.  But crash kernels are preloaded into a
	 * reserved area of ram.  We must ensure the addresses
	 * are in the reserved area otherwise preloading the
	 * kernel could corrupt things.
	 */

	if (image->type == KEXEC_TYPE_CRASH) {
		for (i = 0; i < nr_segments; i++) {
			unsigned long mstart, mend;

			/* Here mend is the last byte of the segment (inclusive). */
			mstart = image->segment[i].mem;
			mend = mstart + image->segment[i].memsz - 1;
			/* Ensure we are within the crash kernel limits */
			if ((mstart < phys_to_boot_phys(crashk_res.start)) ||
			    (mend > phys_to_boot_phys(crashk_res.end)))
				return -EADDRNOTAVAIL;
		}
	}
#endif

	return 0;
}

/*
 * Allocate a zeroed struct kimage and set up its entry cursor and page
 * lists. Returns the new image, or NULL if the allocation fails.
 */
struct kimage *do_kimage_alloc_init(void)
{
	struct kimage *image;

	/* Allocate a controlling structure */
	image = kzalloc(sizeof(*image), GFP_KERNEL);
	if (!image)
		return NULL;

	/* The entry list starts out empty, with both cursors on head. */
	image->head = 0;
	image->entry = &image->head;
	image->last_entry = &image->head;
	image->control_page = ~0; /* By default this does not apply */
	image->type = KEXEC_TYPE_DEFAULT;

	/* Initialize the list of control pages */
	INIT_LIST_HEAD(&image->control_pages);

	/* Initialize the list of destination pages */
	INIT_LIST_HEAD(&image->dest_pages);

	/* Initialize the list of unusable pages */
	INIT_LIST_HEAD(&image->unusable_pages);

#ifdef CONFIG_CRASH_HOTPLUG
	image->hp_action = KEXEC_CRASH_HP_NONE;
	image->elfcorehdr_index = -1;
	image->elfcorehdr_updated = false;
#endif

	return image;
}

/*
 * Return 1 if the inclusive range [@start, @end] overlaps the memory
 * area of any segment of @image, 0 otherwise.
 */
int kimage_is_destination_range(struct kimage *image,
					unsigned long start,
					unsigned long end)
{
	unsigned long i;

	for (i = 0; i < image->nr_segments; i++) {
		unsigned long mstart, mend;

		mstart = image->segment[i].mem;
		mend = mstart + image->segment[i].memsz - 1;
		if ((end >= mstart) && (start <= mend))
			return 1;
	}

	return 0;
}

/*
 * Allocate 1 << @order pages for kexec use. The order is recorded in the
 * first page's private field and every page is marked reserved.
 * Returns NULL if a fatal signal is pending or the allocation fails.
 */
static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
{
	struct page *pages;

	if (fatal_signal_pending(current))
		return NULL;
	/* __GFP_ZERO is masked off here; zeroing, if requested, is done below. */
	pages = alloc_pages(gfp_mask & ~__GFP_ZERO, order);
	if (pages) {
		unsigned int count, i;

		pages->mapping = NULL;
		set_page_private(pages, order);
		count = 1 << order;
		for (i = 0; i < count; i++)
			SetPageReserved(pages + i);

		arch_kexec_post_alloc_pages(page_address(pages), count,
					    gfp_mask);

		/* Clear the pages only after the arch hook has run. */
		if (gfp_mask & __GFP_ZERO)
			for (i = 0; i < count; i++)
				clear_highpage(pages + i);
	}

	return pages;
}

/*
 * Free pages obtained from kimage_alloc_pages(): read the order back from
 * the page's private field, run the arch hook, clear the reserved flag on
 * every page and free the block.
 */
static void kimage_free_pages(struct page *page)
{
	unsigned int order, count, i;

	order = page_private(page);
	count = 1 << order;

	arch_kexec_pre_free_pages(page_address(page), count);

	for (i = 0; i < count; i++)
		ClearPageReserved(page + i);
	__free_pages(page, order);
}

/* Unlink and free every page linked on @list through its lru member. */
void kimage_free_page_list(struct list_head *list)
{
	struct page *page, *next;

	list_for_each_entry_safe(page, next, list, lru) {
		list_del(&page->lru);
		kimage_free_pages(page);
	}
}

/*
 * Allocate 1 << @order control pages for a normal (non-crash) image.
 * Returns the pages, already linked on image->control_pages, or NULL if
 * kimage_alloc_pages() fails.
 */
static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
							unsigned int order)
{
	/* Control pages are special, they are the intermediaries
	 * that are needed while we copy the rest of the pages
	 * to their final resting place.  As such they must
	 * not conflict with either the destination addresses
	 * or memory the kernel is already using.
	 *
	 * The only case where we really need more than one of
	 * these are for architectures where we cannot disable
	 * the MMU and must instead generate an identity mapped
	 * page table for all of the memory.
	 *
	 * At worst this runs in O(N) of the image size.
	 */
	struct list_head extra_pages;
	struct page *pages;
	unsigned int count;

	count = 1 << order;
	INIT_LIST_HEAD(&extra_pages);

	/* Loop while I can allocate a page and the page allocated
	 * is a destination page.
	 */
	do {
		unsigned long pfn, epfn, addr, eaddr;

		pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order);
		if (!pages)
			break;
		/* [addr, eaddr] is the inclusive byte range of the allocation. */
		pfn   = page_to_boot_pfn(pages);
		epfn  = pfn + count;
		addr  = pfn << PAGE_SHIFT;
		eaddr = (epfn << PAGE_SHIFT) - 1;
		/*
		 * Unsuitable pages are parked on extra_pages rather than freed
		 * right away, and are released after the loop.
		 */
		if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
			      kimage_is_destination_range(image, addr, eaddr)) {
			list_add(&pages->lru, &extra_pages);
			pages = NULL;
		}
	} while (!pages);

	if (pages) {
		/* Remember the allocated page... */
		list_add(&pages->lru, &image->control_pages);

		/* Because the page is already in its destination
		 * location we will never allocate another page at
		 * that address.  Therefore kimage_alloc_pages
		 * will not return it (again) and we don't need
		 * to give it an entry in image->segment[].
		 */
	}
	/* Deal with the destination pages I have inadvertently allocated.
	 *
	 * Ideally I would convert multi-page allocations into single
	 * page allocations, and add everything to image->dest_pages.
	 *
	 * For now it is simpler to just free the pages.
	 */
	kimage_free_page_list(&extra_pages);

	return pages;
}

#ifdef CONFIG_CRASH_DUMP
/*
 * Find 1 << @order control pages for a crash image inside the crash
 * kernel region. Returns the first page of the hole, or NULL if no
 * suitable hole is found.
 */
static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
						      unsigned int order)
{
	/* Control pages are special, they are the intermediaries
	 * that are needed while we copy the rest of the pages
	 * to their final resting place.  As such they must
	 * not conflict with either the destination addresses
	 * or memory the kernel is already using.
	 *
	 * Control pages are also the only pages we must allocate
	 * when loading a crash kernel.  All of the other pages
	 * are specified by the segments and we just memcpy
	 * into them directly.
	 *
	 * The only case where we really need more than one of
	 * these are for architectures where we cannot disable
	 * the MMU and must instead generate an identity mapped
	 * page table for all of the memory.
	 *
	 * Given the low demand this implements a very simple
	 * allocator that finds the first hole of the appropriate
	 * size in the reserved memory region, and allocates all
	 * of the memory up to and including the hole.
	 */
	unsigned long hole_start, hole_end, size;
	struct page *pages;

	pages = NULL;
	size = (1 << order) << PAGE_SHIFT;
	/* The search starts at image->control_page, aligned up to the size. */
	hole_start = ALIGN(image->control_page, size);
	hole_end   = hole_start + size - 1;
	while (hole_end <= crashk_res.end) {
		unsigned long i;

		cond_resched();

		if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
			break;
		/* See if I overlap any of the segments */
		for (i = 0; i < image->nr_segments; i++) {
			unsigned long mstart, mend;

			mstart = image->segment[i].mem;
			mend   = mstart + image->segment[i].memsz - 1;
			if ((hole_end >= mstart) && (hole_start <= mend)) {
				/* Advance the hole to the end of the segment */
				hole_start = ALIGN(mend, size);
				hole_end   = hole_start + size - 1;
				break;
			}
		}
		/* If I don't overlap any segments I have found my hole! */
		if (i == image->nr_segments) {
			pages = pfn_to_page(hole_start >> PAGE_SHIFT);
			/* The next search resumes just past this hole. */
			image->control_page = hole_end + 1;
			break;
		}
	}

	/* Ensure that these pages are decrypted if SME is enabled. */
	if (pages)
		arch_kexec_post_alloc_pages(page_address(pages), 1 << order, 0);

	return pages;
}
#endif


/*
 * Allocate 1 << @order control pages using the allocator that matches
 * image->type. Returns NULL on failure or for a type with no case here.
 */
struct page *kimage_alloc_control_pages(struct kimage *image,
					 unsigned int order)
{
	struct page *pages = NULL;

	switch (image->type) {
	case KEXEC_TYPE_DEFAULT:
		pages = kimage_alloc_normal_control_pages(image, order);
		break;
#ifdef CONFIG_CRASH_DUMP
	case KEXEC_TYPE_CRASH:
		pages = kimage_alloc_crash_control_pages(image, order);
		break;
#endif
	}

	return pages;
}

/*
 * Append @entry to the image's entry list, chaining in a new indirection
 * page when the current one is full. The slot after the new entry is
 * always zeroed. Returns 0, or -ENOMEM if no indirection page could be
 * allocated.
 */
static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
{
	/* Step over the slot the cursor points at if it is already in use. */
	if (*image->entry != 0)
		image->entry++;

	if (image->entry == image->last_entry) {
		kimage_entry_t *ind_page;
		struct page *page;

		page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
		if (!page)
			return -ENOMEM;

		/*
		 * The last slot of the old page links to the new page; the
		 * last slot of the new page is kept back for the next link.
		 */
		ind_page = page_address(page);
		*image->entry = virt_to_boot_phys(ind_page) | IND_INDIRECTION;
		image->entry = ind_page;
		image->last_entry = ind_page +
				      ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
	}
	*image->entry = entry;
	image->entry++;
	*image->entry = 0;

	return 0;
}

/* Add an IND_DESTINATION entry for the page containing @destination. */
static int kimage_set_destination(struct kimage *image,
				   unsigned long destination)
{
	destination &= PAGE_MASK;

	return kimage_add_entry(image, destination | IND_DESTINATION);
}


/* Add an IND_SOURCE entry for the page containing address @page. */
static int kimage_add_page(struct kimage *image, unsigned long page)
{
	page &= PAGE_MASK;

	return kimage_add_entry(image, page | IND_SOURCE);
}


/* Free the pages cached on the dest_pages and unusable_pages lists. */
static void kimage_free_extra_pages(struct kimage *image)
{
	/* Walk through and free any extra destination pages I may have */
	kimage_free_page_list(&image->dest_pages);

	/* Walk through and free any unusable pages I have cached */
	kimage_free_page_list(&image->unusable_pages);

}

/* Mark the end of the entry list by storing IND_DONE. */
void kimage_terminate(struct kimage *image)
{
	if (*image->entry != 0)
		image->entry++;

	*image->entry = IND_DONE;
}

/*
 * Walk the entry list from image->head. The walk stops at a zero entry or
 * one with IND_DONE set; an IND_INDIRECTION entry moves the cursor to the
 * page it names, any other entry moves it to the next slot.
 */
#define for_each_kimage_entry(image, ptr, entry) \
	for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
		ptr = (entry & IND_INDIRECTION) ? \
			boot_phys_to_virt((entry & PAGE_MASK)) : ptr + 1)

/* Free the page whose boot pfn is encoded in @entry. */
static void kimage_free_entry(kimage_entry_t entry)
{
	struct page *page;

	page = boot_pfn_to_page(entry >> PAGE_SHIFT);
	kimage_free_pages(page);
}

/*
 * Release everything owned by @image, then the image itself.
 * A NULL @image is a no-op.
 */
void kimage_free(struct kimage *image)
{
	kimage_entry_t *ptr, entry;
	kimage_entry_t ind = 0;

	if (!image)
		return;

#ifdef CONFIG_CRASH_DUMP
	if (image->vmcoreinfo_data_copy) {
		crash_update_vmcoreinfo_safecopy(NULL);
		vunmap(image->vmcoreinfo_data_copy);
	}
#endif

	kimage_free_extra_pages(image);
	for_each_kimage_entry(image, ptr, entry) {
		if (entry & IND_INDIRECTION) {
			/* Free the previous indirection page */
			if (ind & IND_INDIRECTION)
				kimage_free_entry(ind);
			/* Save this indirection page until we are
			 * done with it.
			 */
			ind = entry;
		} else if (entry & IND_SOURCE)
			kimage_free_entry(entry);
	}
	/* Free the final indirection page */
	if (ind & IND_INDIRECTION)
		kimage_free_entry(ind);

	/* Handle any machine specific cleanup */
	machine_kexec_cleanup(image);

	/* Free the kexec control pages... */
	kimage_free_page_list(&image->control_pages);

	/*
	 * Free up any temporary buffers allocated. This might hit if
	 * error occurred much later after buffer allocation.
	 */
	if (image->file_mode)
		kimage_file_post_load_cleanup(image);

	kfree(image);
}

/*
 * Return a pointer to the IND_SOURCE entry whose destination address is
 * @page, or NULL if there is none.
 */
static kimage_entry_t *kimage_dst_used(struct kimage *image,
					unsigned long page)
{
	kimage_entry_t *ptr, entry;
	unsigned long destination = 0;

	for_each_kimage_entry(image, ptr, entry) {
		if (entry & IND_DESTINATION)
			destination = entry & PAGE_MASK;
		else if (entry & IND_SOURCE) {
			if (page == destination)
				return ptr;
			/* Each source page advances the destination by one page. */
			destination += PAGE_SIZE;
		}
	}

	return NULL;
}

static struct page *kimage_alloc_page(struct kimage *image,
					gfp_t gfp_mask,
					unsigned long destination)
{
	/*
	 * Here we implement safeguards to ensure that a source page
	 * is not copied to its destination page before the data on
	 * the destination page is no longer useful.
* * To do this we maintain the invariant that a source page is * either its own destination page, or it is not a * destination page at all. * * That is slightly stronger than required, but the proof * that no problems will not occur is trivial, and the * implementation is simply to verify. * * When allocating all pages normally this algorithm will run * in O(N) time, but in the worst case it will run in O(N^2) * time. If the runtime is a problem the data structures can * be fixed. */ struct page *page; unsigned long addr; /* * Walk through the list of destination pages, and see if I * have a match. */ list_for_each_entry(page, &image->dest_pages, lru) { addr = page_to_boot_pfn(page) << PAGE_SHIFT; if (addr == destination) { list_del(&page->lru); return page; } } page = NULL; while (1) { kimage_entry_t *old; /* Allocate a page, if we run out of memory give up */ page = kimage_alloc_pages(gfp_mask, 0); if (!page) return NULL; /* If the page cannot be used file it away */ if (page_to_boot_pfn(page) > (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { list_add(&page->lru, &image->unusable_pages); continue; } addr = page_to_boot_pfn(page) << PAGE_SHIFT; /* If it is the destination page we want use it */ if (addr == destination) break; /* If the page is not a destination page use it */ if (!kimage_is_destination_range(image, addr, addr + PAGE_SIZE - 1)) break; /* * I know that the page is someones destination page. * See if there is already a source page for this * destination page. And if so swap the source pages. */ old = kimage_dst_used(image, addr); if (old) { /* If so move it */ unsigned long old_addr; struct page *old_page; old_addr = *old & PAGE_MASK; old_page = boot_pfn_to_page(old_addr >> PAGE_SHIFT); copy_highpage(page, old_page); *old = addr | (*old & ~PAGE_MASK); /* The old page I have found cannot be a * destination page, so return it if it's * gfp_flags honor the ones passed in. 
*/ if (!(gfp_mask & __GFP_HIGHMEM) && PageHighMem(old_page)) { kimage_free_pages(old_page); continue; } page = old_page; break; } /* Place the page on the destination list, to be used later */ list_add(&page->lru, &image->dest_pages); } return page; } static int kimage_load_normal_segment(struct kimage *image, struct kexec_segment *segment) { unsigned long maddr; size_t ubytes, mbytes; int result; unsigned char __user *buf = NULL; unsigned char *kbuf = NULL; if (image->file_mode) kbuf = segment->kbuf; else buf = segment->buf; ubytes = segment->bufsz; mbytes = segment->memsz; maddr = segment->mem; result = kimage_set_destination(image, maddr); if (result < 0) goto out; while (mbytes) { struct page *page; char *ptr; size_t uchunk, mchunk; page = kimage_alloc_page(image, GFP_HIGHUSER, maddr); if (!page) { result = -ENOMEM; goto out; } result = kimage_add_page(image, page_to_boot_pfn(page) << PAGE_SHIFT); if (result < 0) goto out; ptr = kmap_local_page(page); /* Start with a clear page */ clear_page(ptr); ptr += maddr & ~PAGE_MASK; mchunk = min_t(size_t, mbytes, PAGE_SIZE - (maddr & ~PAGE_MASK)); uchunk = min(ubytes, mchunk); if (uchunk) { /* For file based kexec, source pages are in kernel memory */ if (image->file_mode) memcpy(ptr, kbuf, uchunk); else result = copy_from_user(ptr, buf, uchunk); ubytes -= uchunk; if (image->file_mode) kbuf += uchunk; else buf += uchunk; } kunmap_local(ptr); if (result) { result = -EFAULT; goto out; } maddr += mchunk; mbytes -= mchunk; cond_resched(); } out: return result; } #ifdef CONFIG_CRASH_DUMP static int kimage_load_crash_segment(struct kimage *image, struct kexec_segment *segment) { /* For crash dumps kernels we simply copy the data from * user space to it's destination. * We do things a page at a time for the sake of kmap. 
*/ unsigned long maddr; size_t ubytes, mbytes; int result; unsigned char __user *buf = NULL; unsigned char *kbuf = NULL; result = 0; if (image->file_mode) kbuf = segment->kbuf; else buf = segment->buf; ubytes = segment->bufsz; mbytes = segment->memsz; maddr = segment->mem; while (mbytes) { struct page *page; char *ptr; size_t uchunk, mchunk; page = boot_pfn_to_page(maddr >> PAGE_SHIFT); if (!page) { result = -ENOMEM; goto out; } arch_kexec_post_alloc_pages(page_address(page), 1, 0); ptr = kmap_local_page(page); ptr += maddr & ~PAGE_MASK; mchunk = min_t(size_t, mbytes, PAGE_SIZE - (maddr & ~PAGE_MASK)); uchunk = min(ubytes, mchunk); if (mchunk > uchunk) { /* Zero the trailing part of the page */ memset(ptr + uchunk, 0, mchunk - uchunk); } if (uchunk) { /* For file based kexec, source pages are in kernel memory */ if (image->file_mode) memcpy(ptr, kbuf, uchunk); else result = copy_from_user(ptr, buf, uchunk); ubytes -= uchunk; if (image->file_mode) kbuf += uchunk; else buf += uchunk; } kexec_flush_icache_page(page); kunmap_local(ptr); arch_kexec_pre_free_pages(page_address(page), 1); if (result) { result = -EFAULT; goto out; } maddr += mchunk; mbytes -= mchunk; cond_resched(); } out: return result; } #endif int kimage_load_segment(struct kimage *image, struct kexec_segment *segment) { int result = -ENOMEM; switch (image->type) { case KEXEC_TYPE_DEFAULT: result = kimage_load_normal_segment(image, segment); break; #ifdef CONFIG_CRASH_DUMP case KEXEC_TYPE_CRASH: result = kimage_load_crash_segment(image, segment); break; #endif } return result; } struct kexec_load_limit { /* Mutex protects the limit count. 
*/ struct mutex mutex; int limit; }; static struct kexec_load_limit load_limit_reboot = { .mutex = __MUTEX_INITIALIZER(load_limit_reboot.mutex), .limit = -1, }; static struct kexec_load_limit load_limit_panic = { .mutex = __MUTEX_INITIALIZER(load_limit_panic.mutex), .limit = -1, }; struct kimage *kexec_image; struct kimage *kexec_crash_image; static int kexec_load_disabled; #ifdef CONFIG_SYSCTL static int kexec_limit_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct kexec_load_limit *limit = table->data; int val; struct ctl_table tmp = { .data = &val, .maxlen = sizeof(val), .mode = table->mode, }; int ret; if (write) { ret = proc_dointvec(&tmp, write, buffer, lenp, ppos); if (ret) return ret; if (val < 0) return -EINVAL; mutex_lock(&limit->mutex); if (limit->limit != -1 && val >= limit->limit) ret = -EINVAL; else limit->limit = val; mutex_unlock(&limit->mutex); return ret; } mutex_lock(&limit->mutex); val = limit->limit; mutex_unlock(&limit->mutex); return proc_dointvec(&tmp, write, buffer, lenp, ppos); } static const struct ctl_table kexec_core_sysctls[] = { { .procname = "kexec_load_disabled", .data = &kexec_load_disabled, .maxlen = sizeof(int), .mode = 0644, /* only handle a transition from default "0" to "1" */ .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE, .extra2 = SYSCTL_ONE, }, { .procname = "kexec_load_limit_panic", .data = &load_limit_panic, .mode = 0644, .proc_handler = kexec_limit_handler, }, { .procname = "kexec_load_limit_reboot", .data = &load_limit_reboot, .mode = 0644, .proc_handler = kexec_limit_handler, }, }; static int __init kexec_core_sysctl_init(void) { register_sysctl_init("kernel", kexec_core_sysctls); return 0; } late_initcall(kexec_core_sysctl_init); #endif bool kexec_load_permitted(int kexec_image_type) { struct kexec_load_limit *limit; /* * Only the superuser can use the kexec syscall and if it has not * been disabled. 
*/ if (!capable(CAP_SYS_BOOT) || kexec_load_disabled) return false; /* Check limit counter and decrease it.*/ limit = (kexec_image_type == KEXEC_TYPE_CRASH) ? &load_limit_panic : &load_limit_reboot; mutex_lock(&limit->mutex); if (!limit->limit) { mutex_unlock(&limit->mutex); return false; } if (limit->limit != -1) limit->limit--; mutex_unlock(&limit->mutex); return true; } /* * Move into place and start executing a preloaded standalone * executable. If nothing was preloaded return an error. */ int kernel_kexec(void) { int error = 0; if (!kexec_trylock()) return -EBUSY; if (!kexec_image) { error = -EINVAL; goto Unlock; } #ifdef CONFIG_KEXEC_JUMP if (kexec_image->preserve_context) { /* * This flow is analogous to hibernation flows that occur * before creating an image and before jumping from the * restore kernel to the image one, so it uses the same * device callbacks as those two flows. */ pm_prepare_console(); error = freeze_processes(); if (error) { error = -EBUSY; goto Restore_console; } suspend_console(); error = dpm_suspend_start(PMSG_FREEZE); if (error) goto Resume_console; /* * dpm_suspend_end() must be called after dpm_suspend_start() * to complete the transition, like in the hibernation flows * mentioned above. */ error = dpm_suspend_end(PMSG_FREEZE); if (error) goto Resume_devices; error = suspend_disable_secondary_cpus(); if (error) goto Enable_cpus; local_irq_disable(); error = syscore_suspend(); if (error) goto Enable_irqs; } else #endif { kexec_in_progress = true; kernel_restart_prepare("kexec reboot"); migrate_to_reboot_cpu(); syscore_shutdown(); /* * migrate_to_reboot_cpu() disables CPU hotplug assuming that * no further code needs to use CPU hotplug (which is true in * the reboot case). However, the kexec path depends on using * CPU hotplug again; so re-enable it here. 
*/ cpu_hotplug_enable(); pr_notice("Starting new kernel\n"); machine_shutdown(); } kmsg_dump(KMSG_DUMP_SHUTDOWN); machine_kexec(kexec_image); #ifdef CONFIG_KEXEC_JUMP if (kexec_image->preserve_context) { /* * This flow is analogous to hibernation flows that occur after * creating an image and after the image kernel has got control * back, and in case the devices have been reset or otherwise * manipulated in the meantime, it uses the device callbacks * used by the latter. */ syscore_resume(); Enable_irqs: local_irq_enable(); Enable_cpus: suspend_enable_secondary_cpus(); dpm_resume_start(PMSG_RESTORE); Resume_devices: dpm_resume_end(PMSG_RESTORE); Resume_console: resume_console(); thaw_processes(); Restore_console: pm_restore_console(); } #endif Unlock: kexec_unlock(); return error; }
18 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 
526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. */ #include <linux/libnvdimm.h> #include "rxe.h" #include "rxe_loc.h" /* Return a random 8 bit key value that is * different than the last_key. 
Set last_key to -1 * if this is the first key for an MR or MW */ u8 rxe_get_next_key(u32 last_key) { u8 key; do { get_random_bytes(&key, 1); } while (key == last_key); return key; } int mr_check_range(struct rxe_mr *mr, u64 iova, size_t length) { switch (mr->ibmr.type) { case IB_MR_TYPE_DMA: return 0; case IB_MR_TYPE_USER: case IB_MR_TYPE_MEM_REG: if (iova < mr->ibmr.iova || iova + length > mr->ibmr.iova + mr->ibmr.length) { rxe_dbg_mr(mr, "iova/length out of range\n"); return -EINVAL; } return 0; default: rxe_dbg_mr(mr, "mr type not supported\n"); return -EINVAL; } } static void rxe_mr_init(int access, struct rxe_mr *mr) { u32 key = mr->elem.index << 8 | rxe_get_next_key(-1); /* set ibmr->l/rkey and also copy into private l/rkey * for user MRs these will always be the same * for cases where caller 'owns' the key portion * they may be different until REG_MR WQE is executed. */ mr->lkey = mr->ibmr.lkey = key; mr->rkey = mr->ibmr.rkey = key; mr->access = access; mr->ibmr.page_size = PAGE_SIZE; mr->page_mask = PAGE_MASK; mr->page_shift = PAGE_SHIFT; mr->state = RXE_MR_STATE_INVALID; } void rxe_mr_init_dma(int access, struct rxe_mr *mr) { rxe_mr_init(access, mr); mr->state = RXE_MR_STATE_VALID; mr->ibmr.type = IB_MR_TYPE_DMA; } static unsigned long rxe_mr_iova_to_index(struct rxe_mr *mr, u64 iova) { return (iova >> mr->page_shift) - (mr->ibmr.iova >> mr->page_shift); } static unsigned long rxe_mr_iova_to_page_offset(struct rxe_mr *mr, u64 iova) { return iova & (mr_page_size(mr) - 1); } static bool is_pmem_page(struct page *pg) { unsigned long paddr = page_to_phys(pg); return REGION_INTERSECTS == region_intersects(paddr, PAGE_SIZE, IORESOURCE_MEM, IORES_DESC_PERSISTENT_MEMORY); } static int rxe_mr_fill_pages_from_sgt(struct rxe_mr *mr, struct sg_table *sgt) { XA_STATE(xas, &mr->page_list, 0); struct sg_page_iter sg_iter; struct page *page; bool persistent = !!(mr->access & IB_ACCESS_FLUSH_PERSISTENT); __sg_page_iter_start(&sg_iter, sgt->sgl, sgt->orig_nents, 0); if 
(!__sg_page_iter_next(&sg_iter)) return 0; do { xas_lock(&xas); while (true) { page = sg_page_iter_page(&sg_iter); if (persistent && !is_pmem_page(page)) { rxe_dbg_mr(mr, "Page can't be persistent\n"); xas_set_err(&xas, -EINVAL); break; } xas_store(&xas, page); if (xas_error(&xas)) break; xas_next(&xas); if (!__sg_page_iter_next(&sg_iter)) break; } xas_unlock(&xas); } while (xas_nomem(&xas, GFP_KERNEL)); return xas_error(&xas); } int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, int access, struct rxe_mr *mr) { struct ib_umem *umem; int err; rxe_mr_init(access, mr); xa_init(&mr->page_list); umem = ib_umem_get(&rxe->ib_dev, start, length, access); if (IS_ERR(umem)) { rxe_dbg_mr(mr, "Unable to pin memory region err = %d\n", (int)PTR_ERR(umem)); return PTR_ERR(umem); } err = rxe_mr_fill_pages_from_sgt(mr, &umem->sgt_append.sgt); if (err) { ib_umem_release(umem); return err; } mr->umem = umem; mr->ibmr.type = IB_MR_TYPE_USER; mr->state = RXE_MR_STATE_VALID; return 0; } static int rxe_mr_alloc(struct rxe_mr *mr, int num_buf) { XA_STATE(xas, &mr->page_list, 0); int i = 0; int err; xa_init(&mr->page_list); do { xas_lock(&xas); while (i != num_buf) { xas_store(&xas, XA_ZERO_ENTRY); if (xas_error(&xas)) break; xas_next(&xas); i++; } xas_unlock(&xas); } while (xas_nomem(&xas, GFP_KERNEL)); err = xas_error(&xas); if (err) return err; mr->num_buf = num_buf; return 0; } int rxe_mr_init_fast(int max_pages, struct rxe_mr *mr) { int err; /* always allow remote access for FMRs */ rxe_mr_init(RXE_ACCESS_REMOTE, mr); err = rxe_mr_alloc(mr, max_pages); if (err) goto err1; mr->state = RXE_MR_STATE_FREE; mr->ibmr.type = IB_MR_TYPE_MEM_REG; return 0; err1: return err; } static int rxe_set_page(struct ib_mr *ibmr, u64 dma_addr) { struct rxe_mr *mr = to_rmr(ibmr); struct page *page = ib_virt_dma_to_page(dma_addr); bool persistent = !!(mr->access & IB_ACCESS_FLUSH_PERSISTENT); int err; if (persistent && !is_pmem_page(page)) { rxe_dbg_mr(mr, "Page cannot be persistent\n"); 
return -EINVAL; } if (unlikely(mr->nbuf == mr->num_buf)) return -ENOMEM; err = xa_err(xa_store(&mr->page_list, mr->nbuf, page, GFP_KERNEL)); if (err) return err; mr->nbuf++; return 0; } int rxe_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sgl, int sg_nents, unsigned int *sg_offset) { struct rxe_mr *mr = to_rmr(ibmr); unsigned int page_size = mr_page_size(mr); mr->nbuf = 0; mr->page_shift = ilog2(page_size); mr->page_mask = ~((u64)page_size - 1); mr->page_offset = mr->ibmr.iova & (page_size - 1); return ib_sg_to_pages(ibmr, sgl, sg_nents, sg_offset, rxe_set_page); } static int rxe_mr_copy_xarray(struct rxe_mr *mr, u64 iova, void *addr, unsigned int length, enum rxe_mr_copy_dir dir) { unsigned int page_offset = rxe_mr_iova_to_page_offset(mr, iova); unsigned long index = rxe_mr_iova_to_index(mr, iova); unsigned int bytes; struct page *page; void *va; while (length) { page = xa_load(&mr->page_list, index); if (!page) return -EFAULT; bytes = min_t(unsigned int, length, mr_page_size(mr) - page_offset); va = kmap_local_page(page); if (dir == RXE_FROM_MR_OBJ) memcpy(addr, va + page_offset, bytes); else memcpy(va + page_offset, addr, bytes); kunmap_local(va); page_offset = 0; addr += bytes; length -= bytes; index++; } return 0; } static void rxe_mr_copy_dma(struct rxe_mr *mr, u64 dma_addr, void *addr, unsigned int length, enum rxe_mr_copy_dir dir) { unsigned int page_offset = dma_addr & (PAGE_SIZE - 1); unsigned int bytes; struct page *page; u8 *va; while (length) { page = ib_virt_dma_to_page(dma_addr); bytes = min_t(unsigned int, length, PAGE_SIZE - page_offset); va = kmap_local_page(page); if (dir == RXE_TO_MR_OBJ) memcpy(va + page_offset, addr, bytes); else memcpy(addr, va + page_offset, bytes); kunmap_local(va); page_offset = 0; dma_addr += bytes; addr += bytes; length -= bytes; } } int rxe_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, unsigned int length, enum rxe_mr_copy_dir dir) { int err; if (length == 0) return 0; if (WARN_ON(!mr)) return -EINVAL; if 
(mr->ibmr.type == IB_MR_TYPE_DMA) { rxe_mr_copy_dma(mr, iova, addr, length, dir); return 0; } err = mr_check_range(mr, iova, length); if (unlikely(err)) { rxe_dbg_mr(mr, "iova out of range\n"); return err; } return rxe_mr_copy_xarray(mr, iova, addr, length, dir); } /* copy data in or out of a wqe, i.e. sg list * under the control of a dma descriptor */ int copy_data( struct rxe_pd *pd, int access, struct rxe_dma_info *dma, void *addr, int length, enum rxe_mr_copy_dir dir) { int bytes; struct rxe_sge *sge = &dma->sge[dma->cur_sge]; int offset = dma->sge_offset; int resid = dma->resid; struct rxe_mr *mr = NULL; u64 iova; int err; if (length == 0) return 0; if (length > resid) { err = -EINVAL; goto err2; } if (sge->length && (offset < sge->length)) { mr = lookup_mr(pd, access, sge->lkey, RXE_LOOKUP_LOCAL); if (!mr) { err = -EINVAL; goto err1; } } while (length > 0) { bytes = length; if (offset >= sge->length) { if (mr) { rxe_put(mr); mr = NULL; } sge++; dma->cur_sge++; offset = 0; if (dma->cur_sge >= dma->num_sge) { err = -ENOSPC; goto err2; } if (sge->length) { mr = lookup_mr(pd, access, sge->lkey, RXE_LOOKUP_LOCAL); if (!mr) { err = -EINVAL; goto err1; } } else { continue; } } if (bytes > sge->length - offset) bytes = sge->length - offset; if (bytes > 0) { iova = sge->addr + offset; err = rxe_mr_copy(mr, iova, addr, bytes, dir); if (err) goto err2; offset += bytes; resid -= bytes; length -= bytes; addr += bytes; } } dma->sge_offset = offset; dma->resid = resid; if (mr) rxe_put(mr); return 0; err2: if (mr) rxe_put(mr); err1: return err; } int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length) { unsigned int page_offset; unsigned long index; struct page *page; unsigned int bytes; int err; u8 *va; /* mr must be valid even if length is zero */ if (WARN_ON(!mr)) return -EINVAL; if (length == 0) return 0; if (mr->ibmr.type == IB_MR_TYPE_DMA) return -EFAULT; err = mr_check_range(mr, iova, length); if (err) return err; while (length > 0) { index = 
rxe_mr_iova_to_index(mr, iova); page = xa_load(&mr->page_list, index); page_offset = rxe_mr_iova_to_page_offset(mr, iova); if (!page) return -EFAULT; bytes = min_t(unsigned int, length, mr_page_size(mr) - page_offset); va = kmap_local_page(page); arch_wb_cache_pmem(va + page_offset, bytes); kunmap_local(va); length -= bytes; iova += bytes; page_offset = 0; } return 0; } /* Guarantee atomicity of atomic operations at the machine level. */ static DEFINE_SPINLOCK(atomic_ops_lock); int rxe_mr_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, u64 compare, u64 swap_add, u64 *orig_val) { unsigned int page_offset; struct page *page; u64 value; u64 *va; if (unlikely(mr->state != RXE_MR_STATE_VALID)) { rxe_dbg_mr(mr, "mr not in valid state\n"); return RESPST_ERR_RKEY_VIOLATION; } if (mr->ibmr.type == IB_MR_TYPE_DMA) { page_offset = iova & (PAGE_SIZE - 1); page = ib_virt_dma_to_page(iova); } else { unsigned long index; int err; err = mr_check_range(mr, iova, sizeof(value)); if (err) { rxe_dbg_mr(mr, "iova out of range\n"); return RESPST_ERR_RKEY_VIOLATION; } page_offset = rxe_mr_iova_to_page_offset(mr, iova); index = rxe_mr_iova_to_index(mr, iova); page = xa_load(&mr->page_list, index); if (!page) return RESPST_ERR_RKEY_VIOLATION; } if (unlikely(page_offset & 0x7)) { rxe_dbg_mr(mr, "iova not aligned\n"); return RESPST_ERR_MISALIGNED_ATOMIC; } va = kmap_local_page(page); spin_lock_bh(&atomic_ops_lock); value = *orig_val = va[page_offset >> 3]; if (opcode == IB_OPCODE_RC_COMPARE_SWAP) { if (value == compare) va[page_offset >> 3] = swap_add; } else { value += swap_add; va[page_offset >> 3] = value; } spin_unlock_bh(&atomic_ops_lock); kunmap_local(va); return 0; } #if defined CONFIG_64BIT /* only implemented or called for 64 bit architectures */ int rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value) { unsigned int page_offset; struct page *page; u64 *va; /* See IBA oA19-28 */ if (unlikely(mr->state != RXE_MR_STATE_VALID)) { rxe_dbg_mr(mr, "mr not in valid 
state\n"); return RESPST_ERR_RKEY_VIOLATION; } if (mr->ibmr.type == IB_MR_TYPE_DMA) { page_offset = iova & (PAGE_SIZE - 1); page = ib_virt_dma_to_page(iova); } else { unsigned long index; int err; /* See IBA oA19-28 */ err = mr_check_range(mr, iova, sizeof(value)); if (unlikely(err)) { rxe_dbg_mr(mr, "iova out of range\n"); return RESPST_ERR_RKEY_VIOLATION; } page_offset = rxe_mr_iova_to_page_offset(mr, iova); index = rxe_mr_iova_to_index(mr, iova); page = xa_load(&mr->page_list, index); if (!page) return RESPST_ERR_RKEY_VIOLATION; } /* See IBA A19.4.2 */ if (unlikely(page_offset & 0x7)) { rxe_dbg_mr(mr, "misaligned address\n"); return RESPST_ERR_MISALIGNED_ATOMIC; } va = kmap_local_page(page); /* Do atomic write after all prior operations have completed */ smp_store_release(&va[page_offset >> 3], value); kunmap_local(va); return 0; } #else int rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value) { return RESPST_ERR_UNSUPPORTED_OPCODE; } #endif int advance_dma_data(struct rxe_dma_info *dma, unsigned int length) { struct rxe_sge *sge = &dma->sge[dma->cur_sge]; int offset = dma->sge_offset; int resid = dma->resid; while (length) { unsigned int bytes; if (offset >= sge->length) { sge++; dma->cur_sge++; offset = 0; if (dma->cur_sge >= dma->num_sge) return -ENOSPC; } bytes = length; if (bytes > sge->length - offset) bytes = sge->length - offset; offset += bytes; resid -= bytes; length -= bytes; } dma->sge_offset = offset; dma->resid = resid; return 0; } struct rxe_mr *lookup_mr(struct rxe_pd *pd, int access, u32 key, enum rxe_mr_lookup_type type) { struct rxe_mr *mr; struct rxe_dev *rxe = to_rdev(pd->ibpd.device); int index = key >> 8; mr = rxe_pool_get_index(&rxe->mr_pool, index); if (!mr) return NULL; if (unlikely((type == RXE_LOOKUP_LOCAL && mr->lkey != key) || (type == RXE_LOOKUP_REMOTE && mr->rkey != key) || mr_pd(mr) != pd || ((access & mr->access) != access) || mr->state != RXE_MR_STATE_VALID)) { rxe_put(mr); mr = NULL; } return mr; } int 
rxe_invalidate_mr(struct rxe_qp *qp, u32 key) { struct rxe_dev *rxe = to_rdev(qp->ibqp.device); struct rxe_mr *mr; int remote; int ret; mr = rxe_pool_get_index(&rxe->mr_pool, key >> 8); if (!mr) { rxe_dbg_qp(qp, "No MR for key %#x\n", key); ret = -EINVAL; goto err; } remote = mr->access & RXE_ACCESS_REMOTE; if (remote ? (key != mr->rkey) : (key != mr->lkey)) { rxe_dbg_mr(mr, "wr key (%#x) doesn't match mr key (%#x)\n", key, (remote ? mr->rkey : mr->lkey)); ret = -EINVAL; goto err_drop_ref; } if (atomic_read(&mr->num_mw) > 0) { rxe_dbg_mr(mr, "Attempt to invalidate an MR while bound to MWs\n"); ret = -EINVAL; goto err_drop_ref; } if (unlikely(mr->ibmr.type != IB_MR_TYPE_MEM_REG)) { rxe_dbg_mr(mr, "Type (%d) is wrong\n", mr->ibmr.type); ret = -EINVAL; goto err_drop_ref; } mr->state = RXE_MR_STATE_FREE; ret = 0; err_drop_ref: rxe_put(mr); err: return ret; } /* user can (re)register fast MR by executing a REG_MR WQE. * user is expected to hold a reference on the ib mr until the * WQE completes. * Once a fast MR is created this is the only way to change the * private keys. It is the responsibility of the user to maintain * the ib mr keys in sync with rxe mr keys. 
*/ int rxe_reg_fast_mr(struct rxe_qp *qp, struct rxe_send_wqe *wqe) { struct rxe_mr *mr = to_rmr(wqe->wr.wr.reg.mr); u32 key = wqe->wr.wr.reg.key; u32 access = wqe->wr.wr.reg.access; /* user can only register MR in free state */ if (unlikely(mr->state != RXE_MR_STATE_FREE)) { rxe_dbg_mr(mr, "mr->lkey = 0x%x not free\n", mr->lkey); return -EINVAL; } /* user can only register mr with qp in same protection domain */ if (unlikely(qp->ibqp.pd != mr->ibmr.pd)) { rxe_dbg_mr(mr, "qp->pd and mr->pd don't match\n"); return -EINVAL; } /* user is only allowed to change key portion of l/rkey */ if (unlikely((mr->lkey & ~0xff) != (key & ~0xff))) { rxe_dbg_mr(mr, "key = 0x%x has wrong index mr->lkey = 0x%x\n", key, mr->lkey); return -EINVAL; } mr->access = access; mr->lkey = key; mr->rkey = key; mr->ibmr.iova = wqe->wr.wr.reg.mr->iova; mr->state = RXE_MR_STATE_VALID; return 0; } void rxe_mr_cleanup(struct rxe_pool_elem *elem) { struct rxe_mr *mr = container_of(elem, typeof(*mr), elem); rxe_put(mr_pd(mr)); ib_umem_release(mr->umem); if (mr->ibmr.type != IB_MR_TYPE_DMA) xa_destroy(&mr->page_list); }
560 560 1208 1208 1208 1102 107 1205 1207 1104 48 59 4077 4880 4880 4732 96 224 2762 2696 4163 4169 4174 3071 2188 4146 37 2754 2748 2015 2746 99 176 231 313 314 109 109 104 104 206 192 2265 2268 2269 2269 225 4435 1278 79 309 2954 2529 1996 1539 4887 4768 238 4785 135 4882 3950 1548 4413 931 2953 4884 1 4880 4884 4915 46 4886 108 4438 1 659 10 3243 46 169 816 66 66 12301 12315 12321 12455 7688 12291 11406 9 4708 103 9 109 12 100 109 1375 1373 1372 1 1097 1097 1101 1099 969 164 164 35 12 150 387 227 190 10 10 10 1 277 277 276 81 275 2104 2099 313 46 46 45 1 276 105 104 105 105 105 102 102 354 355 355 1099 1097 1095 1098 1 22 8 114 8 1098 1096 8 8 4 1100 1097 1339 1340 1 1341 1336 1095 1097 108 15 15 108 1098 108 1097 232 233 233 232 233 231 232 798 796 1 826 3 28 796 1 1 9193 1585 8600 660 9201 1712 5602 146 16 16 4585 2129 2470 50 50 9193 9192 9185 4596 7274 893 7335 771 6113 2647 7812 521 6515 2748 7928 145 7838 501 16 6 3255 2053 447 5363 3293 8209 2 6569 6592 6565 6549 6572 6567 5949 6248 420 419 1366 1369 118 10 27 65 50 65 65 64 65 120 1 217 577 581 61 3 166 8388 577 7872 1571 1 5758 4990 5018 1040 4608 5366 5359 5384 1515 9 1409 5366 27 29 29 436 245 205 5357 5366 5353 58 4134 4126 4132 5 4407 4411 4404 4408 7 1 4395 2 5 5 5 5 1 4 4401 4403 4398 4389 4396 4403 4396 292 5270 1521 4110 3733 2164 2153 2150 1517 4110 3734 2151 2025 2009 46 3 4 437 63 99 409 499 572 572 573 174 129 411 47 551 20 571 570 22 549 26 548 502 14 487 80 80 80 80 14 558 571 570 550 20 413 175 14 556 478 79 80 80 80 80 611 662 11 7 1 1 2 9 9 7 2 7 3 3599 117 1802 1605 904 26 3 11 9 2 14 199 74 143 143 143 143 143 19 19 11 18 243 243 234 233 292 294 293 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 
113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 
613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 
1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 
1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 
1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 
2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 
2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 
3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 // SPDX-License-Identifier: GPL-2.0-only /* * fs/dcache.c * * Complete reimplementation * (C) 1997 Thomas Schoebel-Theuer, * with heavy changes by Linus Torvalds */ /* * Notes on the allocation strategy: * * The dcache is a master of the icache - whenever a dcache entry * exists, the inode will always exist. "iput()" is done either when * the dcache entry is deleted or garbage collected. 
*/
#include <linux/ratelimit.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/fscrypt.h>
#include <linux/fsnotify.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/hash.h>
#include <linux/cache.h>
#include <linux/export.h>
#include <linux/security.h>
#include <linux/seqlock.h>
#include <linux/memblock.h>
#include <linux/bit_spinlock.h>
#include <linux/rculist_bl.h>
#include <linux/list_lru.h>
#include "internal.h"
#include "mount.h"

#include <asm/runtime-const.h>

/*
 * Usage:
 * dcache->d_inode->i_lock protects:
 *   - i_dentry, d_u.d_alias, d_inode of aliases
 * dcache_hash_bucket lock protects:
 *   - the dcache hash table
 * s_roots bl list spinlock protects:
 *   - the s_roots list (see __d_drop)
 * dentry->d_sb->s_dentry_lru_lock protects:
 *   - the dcache lru lists and counters
 * d_lock protects:
 *   - d_flags
 *   - d_name
 *   - d_lru
 *   - d_count
 *   - d_unhashed()
 *   - d_parent and d_children
 *   - children's d_sib and d_parent
 *   - d_u.d_alias, d_inode
 *
 * Ordering:
 * dentry->d_inode->i_lock
 *   dentry->d_lock
 *     dentry->d_sb->s_dentry_lru_lock
 *     dcache_hash_bucket lock
 *     s_roots lock
 *
 * If there is an ancestor relationship:
 * dentry->d_parent->...->d_parent->d_lock
 *   ...
 *     dentry->d_parent->d_lock
 *       dentry->d_lock
 *
 * If no ancestor relationship:
 * arbitrary, since it's serialized on rename_lock
 */
/* Exported tunable; initial value is 100. */
int sysctl_vfs_cache_pressure __read_mostly = 100;
EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure);

/* Seqlock referred to by the "no ancestor relationship" rule above. */
__cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);

EXPORT_SYMBOL(rename_lock);

/* Slab cache that dentries are freed back to (see __d_free()). */
static struct kmem_cache *dentry_cache __ro_after_init;

/* Exported constant names: "", "/" and "..". */
const struct qstr empty_name = QSTR_INIT("", 0);
EXPORT_SYMBOL(empty_name);
const struct qstr slash_name = QSTR_INIT("/", 1);
EXPORT_SYMBOL(slash_name);
const struct qstr dotdot_name = QSTR_INIT("..", 2);
EXPORT_SYMBOL(dotdot_name);

/*
 * This is the single most critical data structure when it comes
 * to the dcache: the hashtable for lookups.
Somebody should try * to make this good - I've just made it work. * * This hash-function tries to avoid losing too many bits of hash * information, yet avoid using a prime hash-size or similar. * * Marking the variables "used" ensures that the compiler doesn't * optimize them away completely on architectures with runtime * constant infrastructure, this allows debuggers to see their * values. But updating these values has no effect on those arches. */ static unsigned int d_hash_shift __ro_after_init __used; static struct hlist_bl_head *dentry_hashtable __ro_after_init __used; static inline struct hlist_bl_head *d_hash(unsigned long hashlen) { return runtime_const_ptr(dentry_hashtable) + runtime_const_shift_right_32(hashlen, d_hash_shift); } #define IN_LOOKUP_SHIFT 10 static struct hlist_bl_head in_lookup_hashtable[1 << IN_LOOKUP_SHIFT]; static inline struct hlist_bl_head *in_lookup_hash(const struct dentry *parent, unsigned int hash) { hash += (unsigned long) parent / L1_CACHE_BYTES; return in_lookup_hashtable + hash_32(hash, IN_LOOKUP_SHIFT); } struct dentry_stat_t { long nr_dentry; long nr_unused; long age_limit; /* age in seconds */ long want_pages; /* pages requested by system */ long nr_negative; /* # of unused negative dentries */ long dummy; /* Reserved for future use */ }; static DEFINE_PER_CPU(long, nr_dentry); static DEFINE_PER_CPU(long, nr_dentry_unused); static DEFINE_PER_CPU(long, nr_dentry_negative); static int dentry_negative_policy; #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) /* Statistics gathering. */ static struct dentry_stat_t dentry_stat = { .age_limit = 45, }; /* * Here we resort to our own counters instead of using generic per-cpu counters * for consistency with what the vfs inode code does. We are expected to harvest * better code and performance by having our own specialized counters. * * Please note that the loop is done over all possible CPUs, not over all online * CPUs. 
The reason for this is that we don't want to play games with CPUs going * on and off. If one of them goes off, we will just keep their counters. * * glommer: See cffbc8a for details, and if you ever intend to change this, * please update all vfs counters to match. */ static long get_nr_dentry(void) { int i; long sum = 0; for_each_possible_cpu(i) sum += per_cpu(nr_dentry, i); return sum < 0 ? 0 : sum; } static long get_nr_dentry_unused(void) { int i; long sum = 0; for_each_possible_cpu(i) sum += per_cpu(nr_dentry_unused, i); return sum < 0 ? 0 : sum; } static long get_nr_dentry_negative(void) { int i; long sum = 0; for_each_possible_cpu(i) sum += per_cpu(nr_dentry_negative, i); return sum < 0 ? 0 : sum; } static int proc_nr_dentry(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { dentry_stat.nr_dentry = get_nr_dentry(); dentry_stat.nr_unused = get_nr_dentry_unused(); dentry_stat.nr_negative = get_nr_dentry_negative(); return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } static const struct ctl_table fs_dcache_sysctls[] = { { .procname = "dentry-state", .data = &dentry_stat, .maxlen = 6*sizeof(long), .mode = 0444, .proc_handler = proc_nr_dentry, }, { .procname = "dentry-negative", .data = &dentry_negative_policy, .maxlen = sizeof(dentry_negative_policy), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, }; static int __init init_fs_dcache_sysctls(void) { register_sysctl_init("fs", fs_dcache_sysctls); return 0; } fs_initcall(init_fs_dcache_sysctls); #endif /* * Compare 2 name strings, return 0 if they match, otherwise non-zero. * The strings are both count bytes long, and count is non-zero. */ #ifdef CONFIG_DCACHE_WORD_ACCESS #include <asm/word-at-a-time.h> /* * NOTE! 'cs' and 'scount' come from a dentry, so it has a * aligned allocation for this particular component. We don't * strictly need the load_unaligned_zeropad() safety, but it * doesn't hurt either. 
*
 * In contrast, 'ct' and 'tcount' can be from a pathname, and do
 * need the careful unaligned handling.
 */
static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char *ct, unsigned tcount)
{
	unsigned long a,b,mask;

	for (;;) {
		/* One word from each string is loaded before tcount is examined. */
		a = read_word_at_a_time(cs);
		b = load_unaligned_zeropad(ct);
		/* Less than a full word left: finish with the masked compare below. */
		if (tcount < sizeof(unsigned long))
			break;
		if (unlikely(a != b))
			return 1;
		cs += sizeof(unsigned long);
		ct += sizeof(unsigned long);
		tcount -= sizeof(unsigned long);
		/* Length was a whole number of words and all of them matched. */
		if (!tcount)
			return 0;
	}
	/*
	 * Compare only the remaining tcount bytes of the last words loaded.
	 * NOTE(review): bytemask_from_count() presumably yields a mask that
	 * covers exactly those bytes - confirm against its definition.
	 */
	mask = bytemask_from_count(tcount);
	return unlikely(!!((a ^ b) & mask));
}

#else

/*
 * Byte-at-a-time variant, built when CONFIG_DCACHE_WORD_ACCESS is not
 * defined. Returns 0 if the first tcount bytes of cs and ct are equal,
 * 1 otherwise. The first byte is compared before the count is tested,
 * so tcount must be non-zero on entry.
 */
static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char *ct, unsigned tcount)
{
	do {
		if (*cs != *ct)
			return 1;
		cs++;
		ct++;
		tcount--;
	} while (tcount);
	return 0;
}

#endif

/*
 * Compare the dentry's current name with the tcount bytes at ct.
 * Returns 0 on a match and non-zero otherwise (the result of
 * dentry_string_cmp()).
 */
static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *ct, unsigned tcount)
{
	/*
	 * Be careful about RCU walk racing with rename:
	 * use 'READ_ONCE' to fetch the name pointer.
	 *
	 * NOTE! Even if a rename will mean that the length
	 * was not loaded atomically, we don't care. The
	 * RCU walk will check the sequence count eventually,
	 * and catch it. And we won't overrun the buffer,
	 * because we're reading the name pointer atomically,
	 * and a dentry name is guaranteed to be properly
	 * terminated with a NUL byte.
	 *
	 * End result: even if 'len' is wrong, we'll exit
	 * early because the data cannot match (there can
	 * be no NUL in the ct/tcount data)
	 */
	const unsigned char *cs = READ_ONCE(dentry->d_name.name);

	return dentry_string_cmp(cs, ct, tcount);
}

/*
 * long names are allocated separately from dentry and never modified.
 * Refcounted, freeing is RCU-delayed. See take_dentry_name_snapshot()
 * for the reason why ->count and ->head can't be combined into a union.
 * dentry_string_cmp() relies upon ->name[] being word-aligned.
*/
struct external_name {
	atomic_t count;		/* reference count on the name */
	struct rcu_head head;	/* used by kfree_rcu() in release_dentry_name_snapshot() */
	unsigned char name[] __aligned(sizeof(unsigned long));
};

/*
 * Map a dentry whose name is external (see dname_external()) back to the
 * struct external_name that embeds the name bytes.
 */
static inline struct external_name *external_name(struct dentry *dentry)
{
	return container_of(dentry->d_name.name, struct external_name, name[0]);
}

/* RCU callback: return the dentry containing @head to dentry_cache. */
static void __d_free(struct rcu_head *head)
{
	struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu);

	kmem_cache_free(dentry_cache, dentry);
}

/* RCU callback: like __d_free(), but also kfree() the external name. */
static void __d_free_external(struct rcu_head *head)
{
	struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu);
	kfree(external_name(dentry));
	kmem_cache_free(dentry_cache, dentry);
}

/* Non-zero when d_name.name does not point at the inline d_shortname buffer. */
static inline int dname_external(const struct dentry *dentry)
{
	return dentry->d_name.name != dentry->d_shortname.string;
}

/*
 * Fill @name with a snapshot of @dentry's name.
 *
 * An inline name is copied into name->inline_name; an external name is
 * shared by taking a reference on it. The whole read is done inside
 * rcu_read_lock() and repeated until the d_seq sequence count shows no
 * concurrent change. Pair with release_dentry_name_snapshot().
 */
void take_dentry_name_snapshot(struct name_snapshot *name, struct dentry *dentry)
{
	unsigned seq;
	const unsigned char *s;

	rcu_read_lock();
retry:
	seq = read_seqcount_begin(&dentry->d_seq);
	s = READ_ONCE(dentry->d_name.name);
	name->name.hash_len = dentry->d_name.hash_len;
	/* start out pointing at the inline buffer; overridden below if external */
	name->name.name = name->inline_name.string;
	if (likely(s == dentry->d_shortname.string)) {
		name->inline_name = dentry->d_shortname;
	} else {
		struct external_name *p;
		p = container_of(s, struct external_name, name[0]);
		// get a valid reference
		if (unlikely(!atomic_inc_not_zero(&p->count)))
			goto retry;
		name->name.name = s;
	}
	if (read_seqcount_retry(&dentry->d_seq, seq)) {
		/* the name changed under us: drop whatever we grabbed and redo */
		release_dentry_name_snapshot(name);
		goto retry;
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL(take_dentry_name_snapshot);

/*
 * Drop a snapshot taken by take_dentry_name_snapshot(). Only an external
 * name holds a reference; the last reference frees it via kfree_rcu().
 */
void release_dentry_name_snapshot(struct name_snapshot *name)
{
	if (unlikely(name->name.name != name->inline_name.string)) {
		struct external_name *p;
		p = container_of(name->name.name, struct external_name, name[0]);
		if (unlikely(atomic_dec_and_test(&p->count)))
			kfree_rcu(p, head);
	}
}
EXPORT_SYMBOL(release_dentry_name_snapshot);

/*
 * Attach @inode and replace the DCACHE_ENTRY_TYPE bits of d_flags with
 * @type_flags. d_inode is written first; the flags are then published
 * with a release store.
 */
static inline void __d_set_inode_and_type(struct dentry *dentry,
					  struct inode *inode,
					  unsigned type_flags)
{
	unsigned flags;

	dentry->d_inode = inode;
	flags = READ_ONCE(dentry->d_flags);
	flags &= ~DCACHE_ENTRY_TYPE;
	flags |= type_flags;
	smp_store_release(&dentry->d_flags, flags);
}

/*
 * Inverse of __d_set_inode_and_type(): clear the DCACHE_ENTRY_TYPE bits
 * first, then d_inode, and account the dentry as negative if it sits on
 * the LRU (and not on a shrink list).
 */
static inline void __d_clear_type_and_inode(struct dentry *dentry)
{
	unsigned flags = READ_ONCE(dentry->d_flags);

	flags &= ~DCACHE_ENTRY_TYPE;
	WRITE_ONCE(dentry->d_flags, flags);
	dentry->d_inode = NULL;
	/*
	 * The negative counter only tracks dentries on the LRU. Don't inc if
	 * d_lru is on another list.
	 */
	if ((flags & (DCACHE_LRU_LIST|DCACHE_SHRINK_LIST)) == DCACHE_LRU_LIST)
		this_cpu_inc(nr_dentry_negative);
}

/*
 * Free a dentry. Warns if it is still on an alias list. When this drops
 * the last reference on an external name, name and dentry are freed
 * together from one RCU callback; otherwise only the dentry is freed,
 * immediately for DCACHE_NORCU dentries and via call_rcu() for the rest.
 */
static void dentry_free(struct dentry *dentry)
{
	WARN_ON(!hlist_unhashed(&dentry->d_u.d_alias));
	if (unlikely(dname_external(dentry))) {
		struct external_name *p = external_name(dentry);
		if (likely(atomic_dec_and_test(&p->count))) {
			call_rcu(&dentry->d_u.d_rcu, __d_free_external);
			return;
		}
	}
	/* if dentry was never visible to RCU, immediate free is OK */
	if (dentry->d_flags & DCACHE_NORCU)
		__d_free(&dentry->d_u.d_rcu);
	else
		call_rcu(&dentry->d_u.d_rcu, __d_free);
}

/*
 * Release the dentry's inode, using the filesystem
 * d_iput() operation if defined.
 *
 * Both dentry->d_lock and the inode's i_lock are dropped here, before
 * the fsnotify / d_iput() / iput() calls are made.
 */
static void dentry_unlink_inode(struct dentry * dentry)
	__releases(dentry->d_lock)
	__releases(dentry->d_inode->i_lock)
{
	struct inode *inode = dentry->d_inode;

	/* the detach itself happens inside a d_seq write section */
	raw_write_seqcount_begin(&dentry->d_seq);
	__d_clear_type_and_inode(dentry);
	hlist_del_init(&dentry->d_u.d_alias);
	raw_write_seqcount_end(&dentry->d_seq);
	spin_unlock(&dentry->d_lock);
	spin_unlock(&inode->i_lock);
	if (!inode->i_nlink)
		fsnotify_inoderemove(inode);
	if (dentry->d_op && dentry->d_op->d_iput)
		dentry->d_op->d_iput(dentry, inode);
	else
		iput(inode);
}

/*
 * The DCACHE_LRU_LIST bit is set whenever the 'd_lru' entry
 * is in use - which includes both the "real" per-superblock
 * LRU list _and_ the DCACHE_SHRINK_LIST use.
 *
 * The DCACHE_SHRINK_LIST bit is set whenever the dentry is
 * on the shrink list (ie not on the superblock LRU list).
 *
 * The per-cpu "nr_dentry_unused" counters are updated with
 * the DCACHE_LRU_LIST bit.
*
 * The per-cpu "nr_dentry_negative" counters are only updated
 * when deleted from or added to the per-superblock LRU list, not
 * from/to the shrink list. That is to avoid an unneeded dec/inc
 * pair when moving from LRU to shrink list in select_collect().
 *
 * These helper functions make sure we always follow the
 * rules. d_lock must be held by the caller.
 */
/* Warn once unless the LRU_LIST/SHRINK_LIST bits of d_flags equal exactly x. */
#define D_FLAG_VERIFY(dentry,x) WARN_ON_ONCE(((dentry)->d_flags & (DCACHE_LRU_LIST | DCACHE_SHRINK_LIST)) != (x))

/*
 * Put a dentry that is on neither list onto its superblock's LRU,
 * bumping the unused counter (and the negative counter for a negative
 * dentry). Warns once if list_lru_add_obj() returns false.
 */
static void d_lru_add(struct dentry *dentry)
{
	D_FLAG_VERIFY(dentry, 0);
	dentry->d_flags |= DCACHE_LRU_LIST;
	this_cpu_inc(nr_dentry_unused);
	if (d_is_negative(dentry))
		this_cpu_inc(nr_dentry_negative);
	WARN_ON_ONCE(!list_lru_add_obj(
			&dentry->d_sb->s_dentry_lru, &dentry->d_lru));
}

/*
 * Inverse of d_lru_add(): the dentry must be on the LRU (not a shrink
 * list); take it off and undo the counter updates.
 */
static void d_lru_del(struct dentry *dentry)
{
	D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
	dentry->d_flags &= ~DCACHE_LRU_LIST;
	this_cpu_dec(nr_dentry_unused);
	if (d_is_negative(dentry))
		this_cpu_dec(nr_dentry_negative);
	WARN_ON_ONCE(!list_lru_del_obj(
			&dentry->d_sb->s_dentry_lru, &dentry->d_lru));
}

/*
 * Remove a dentry from a shrink list. Only the unused counter is
 * touched; the negative counter is not tracked for shrink lists.
 */
static void d_shrink_del(struct dentry *dentry)
{
	D_FLAG_VERIFY(dentry, DCACHE_SHRINK_LIST | DCACHE_LRU_LIST);
	list_del_init(&dentry->d_lru);
	dentry->d_flags &= ~(DCACHE_SHRINK_LIST | DCACHE_LRU_LIST);
	this_cpu_dec(nr_dentry_unused);
}

/* Put a dentry that is on neither list onto the shrink list @list. */
static void d_shrink_add(struct dentry *dentry, struct list_head *list)
{
	D_FLAG_VERIFY(dentry, 0);
	list_add(&dentry->d_lru, list);
	dentry->d_flags |= DCACHE_SHRINK_LIST | DCACHE_LRU_LIST;
	this_cpu_inc(nr_dentry_unused);
}

/*
 * These can only be called under the global LRU lock, ie during the
 * callback for freeing the LRU list. "isolate" removes it from the
 * LRU lists entirely, while shrink_move moves it to the indicated
 * private list.
*/
/*
 * Take the dentry off the LRU @lru altogether: clear DCACHE_LRU_LIST and
 * drop the unused (and, if negative, the negative) counter.
 */
static void d_lru_isolate(struct list_lru_one *lru, struct dentry *dentry)
{
	D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
	dentry->d_flags &= ~DCACHE_LRU_LIST;
	this_cpu_dec(nr_dentry_unused);
	if (d_is_negative(dentry))
		this_cpu_dec(nr_dentry_negative);
	list_lru_isolate(lru, &dentry->d_lru);
}

/*
 * Move the dentry from the LRU @lru to the private list @list.
 * DCACHE_LRU_LIST stays set and the unused counter is left alone; only
 * the negative counter is dropped, since it tracks the LRU proper.
 */
static void d_lru_shrink_move(struct list_lru_one *lru, struct dentry *dentry,
			      struct list_head *list)
{
	D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
	dentry->d_flags |= DCACHE_SHRINK_LIST;
	if (d_is_negative(dentry))
		this_cpu_dec(nr_dentry_negative);
	list_lru_isolate_move(lru, &dentry->d_lru, list);
}

/*
 * Unlink the dentry from whichever hash chain it is on, under that
 * chain's bit lock. d_hash.pprev is not reset here; __d_drop() does that.
 */
static void ___d_drop(struct dentry *dentry)
{
	struct hlist_bl_head *b;
	/*
	 * Hashed dentries are normally on the dentry hashtable,
	 * with the exception of those newly allocated by
	 * d_obtain_root, which are always IS_ROOT:
	 */
	if (unlikely(IS_ROOT(dentry)))
		b = &dentry->d_sb->s_roots;
	else
		b = d_hash(dentry->d_name.hash);

	hlist_bl_lock(b);
	__hlist_bl_del(&dentry->d_hash);
	hlist_bl_unlock(b);
}

/*
 * Unhash the dentry if it is hashed: unlink it, mark it unhashed by
 * clearing d_hash.pprev, and invalidate d_seq. Does nothing for an
 * already unhashed dentry.
 */
void __d_drop(struct dentry *dentry)
{
	if (!d_unhashed(dentry)) {
		___d_drop(dentry);
		dentry->d_hash.pprev = NULL;
		write_seqcount_invalidate(&dentry->d_seq);
	}
}
EXPORT_SYMBOL(__d_drop);

/**
 * d_drop - drop a dentry
 * @dentry: dentry to drop
 *
 * d_drop() unhashes the entry from the parent dentry hashes, so that it won't
 * be found through a VFS lookup any more. Note that this is different from
 * deleting the dentry - d_delete will try to mark the dentry negative if
 * possible, giving a successful _negative_ lookup, while d_drop will
 * just make the cache lookup fail.
 *
 * d_drop() is used mainly for stuff that wants to invalidate a dentry for some
 * reason (NFS timeouts or autofs deletes).
 *
 * __d_drop requires dentry->d_lock
 *
 * ___d_drop doesn't mark dentry as "unhashed"
 * (dentry->d_hash.pprev will be LIST_POISON2, not NULL).
*/
void d_drop(struct dentry *dentry)
{
	/* __d_drop() needs d_lock; take it around the call */
	spin_lock(&dentry->d_lock);
	__d_drop(dentry);
	spin_unlock(&dentry->d_lock);
}
EXPORT_SYMBOL(d_drop);

/*
 * Mark the dentry killed and, if it is linked among its siblings, unlink
 * it while leaving ->d_sib.next pointing past any cursors.
 */
static inline void dentry_unlist(struct dentry *dentry)
{
	struct dentry *next;
	/*
	 * Inform d_walk() and shrink_dentry_list() that we are no longer
	 * attached to the dentry tree
	 */
	dentry->d_flags |= DCACHE_DENTRY_KILLED;
	if (unlikely(hlist_unhashed(&dentry->d_sib)))
		return;
	__hlist_del(&dentry->d_sib);
	/*
	 * Cursors can move around the list of children.  While we'd been
	 * a normal list member, it didn't matter - ->d_sib.next would've
	 * been updated.  However, from now on it won't be and for the
	 * things like d_walk() it might end up with a nasty surprise.
	 * Normally d_walk() doesn't care about cursors moving around -
	 * ->d_lock on parent prevents that and since a cursor has no children
	 * of its own, we get through it without ever unlocking the parent.
	 * There is one exception, though - if we ascend from a child that
	 * gets killed as soon as we unlock it, the next sibling is found
	 * using the value left in its ->d_sib.next.  And if _that_
	 * pointed to a cursor, and cursor got moved (e.g. by lseek())
	 * before d_walk() regains parent->d_lock, we'll end up skipping
	 * everything the cursor had been moved past.
	 *
	 * Solution: make sure that the pointer left behind in ->d_sib.next
	 * points to something that won't be moving around.  I.e. skip the
	 * cursors.
	 */
	while (dentry->d_sib.next) {
		next = hlist_entry(dentry->d_sib.next, struct dentry, d_sib);
		if (likely(!(next->d_flags & DCACHE_DENTRY_CURSOR)))
			break;
		dentry->d_sib.next = next->d_sib.next;
	}
}

/*
 * Tear down a dentry: mark it dead, take it off the LRU and the hash,
 * detach its inode, unlink it from its parent and free it (unless it is
 * still on somebody's shrink list).
 *
 * dentry->d_lock is released on every path: either directly or by
 * dentry_unlink_inode(), which also releases the inode's i_lock.
 *
 * Returns the parent, with parent->d_lock still held, when this dropped
 * the parent's count to zero; returns NULL otherwise (including when the
 * dentry has no parent).
 */
static struct dentry *__dentry_kill(struct dentry *dentry)
{
	struct dentry *parent = NULL;
	bool can_free = true;

	/*
	 * The dentry is now unrecoverably dead to the world.
	 */
	lockref_mark_dead(&dentry->d_lockref);

	/*
	 * inform the fs via d_prune that this dentry is about to be
	 * unhashed and destroyed.
	 */
	if (dentry->d_flags & DCACHE_OP_PRUNE)
		dentry->d_op->d_prune(dentry);

	/* on a shrink list the dentry stays put; only the real LRU is undone here */
	if (dentry->d_flags & DCACHE_LRU_LIST) {
		if (!(dentry->d_flags & DCACHE_SHRINK_LIST))
			d_lru_del(dentry);
	}
	/* if it was on the hash then remove it */
	__d_drop(dentry);
	if (dentry->d_inode)
		dentry_unlink_inode(dentry);
	else
		spin_unlock(&dentry->d_lock);
	this_cpu_dec(nr_dentry);
	if (dentry->d_op && dentry->d_op->d_release)
		dentry->d_op->d_release(dentry);

	cond_resched();
	/* now that it's negative, ->d_parent is stable */
	if (!IS_ROOT(dentry)) {
		parent = dentry->d_parent;
		spin_lock(&parent->d_lock);
	}
	spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
	dentry_unlist(dentry);
	/* a dentry still on a shrink list is not freed here */
	if (dentry->d_flags & DCACHE_SHRINK_LIST)
		can_free = false;
	spin_unlock(&dentry->d_lock);
	if (likely(can_free))
		dentry_free(dentry);
	/* drop our reference on the parent; keep it locked only if it hit zero */
	if (parent && --parent->d_lockref.count) {
		spin_unlock(&parent->d_lock);
		return NULL;
	}
	return parent;
}

/*
 * Lock a dentry for feeding it to __dentry_kill().
 * Called under rcu_read_lock() and dentry->d_lock; the former
 * guarantees that nothing we access will be freed under us.
 * Note that dentry is *not* protected from concurrent dentry_kill(),
 * d_delete(), etc.
 *
 * Return false if dentry is busy.  Otherwise, return true and have
 * that dentry's inode locked.
 */

static bool lock_for_kill(struct dentry *dentry)
{
	struct inode *inode = dentry->d_inode;

	if (unlikely(dentry->d_lockref.count))
		return false;

	/* fast path: no inode, or its i_lock could be taken without waiting */
	if (!inode || likely(spin_trylock(&inode->i_lock)))
		return true;

	/*
	 * Slow path: drop d_lock, take i_lock and d_lock in that order, then
	 * check that d_inode is still the inode we locked; otherwise retry
	 * with the new one, stopping once the dentry has no inode.
	 */
	do {
		spin_unlock(&dentry->d_lock);
		spin_lock(&inode->i_lock);
		spin_lock(&dentry->d_lock);
		if (likely(inode == dentry->d_inode))
			break;
		spin_unlock(&inode->i_lock);
		inode = dentry->d_inode;
	} while (inode);
	/* d_lock was dropped above, so the count has to be rechecked */
	if (likely(!dentry->d_lockref.count))
		return true;
	if (inode)
		spin_unlock(&inode->i_lock);
	return false;
}

/*
 * Decide if dentry is worth retaining.  Usually this is called with dentry
 * locked; if not locked, we are more limited and might not be able to tell
 * without a lock.
False in this case means "punt to locked path and recheck". * * In case we aren't locked, these predicates are not "stable". However, it is * sufficient that at some point after we dropped the reference the dentry was * hashed and the flags had the proper value. Other dentry users may have * re-gotten a reference to the dentry and change that, but our work is done - * we can leave the dentry around with a zero refcount. */ static inline bool retain_dentry(struct dentry *dentry, bool locked) { unsigned int d_flags; smp_rmb(); d_flags = READ_ONCE(dentry->d_flags); // Unreachable? Nobody would be able to look it up, no point retaining if (unlikely(d_unhashed(dentry))) return false; // Same if it's disconnected if (unlikely(d_flags & DCACHE_DISCONNECTED)) return false; // ->d_delete() might tell us not to bother, but that requires // ->d_lock; can't decide without it if (unlikely(d_flags & DCACHE_OP_DELETE)) { if (!locked || dentry->d_op->d_delete(dentry)) return false; } // Explicitly told not to bother if (unlikely(d_flags & DCACHE_DONTCACHE)) return false; // At this point it looks like we ought to keep it. We also might // need to do something - put it on LRU if it wasn't there already // and mark it referenced if it was on LRU, but not marked yet. // Unfortunately, both actions require ->d_lock, so in lockless // case we'd have to punt rather than doing those. 
if (unlikely(!(d_flags & DCACHE_LRU_LIST))) { if (!locked) return false; d_lru_add(dentry); } else if (unlikely(!(d_flags & DCACHE_REFERENCED))) { if (!locked) return false; dentry->d_flags |= DCACHE_REFERENCED; } return true; } void d_mark_dontcache(struct inode *inode) { struct dentry *de; spin_lock(&inode->i_lock); hlist_for_each_entry(de, &inode->i_dentry, d_u.d_alias) { spin_lock(&de->d_lock); de->d_flags |= DCACHE_DONTCACHE; spin_unlock(&de->d_lock); } inode->i_state |= I_DONTCACHE; spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(d_mark_dontcache); /* * Try to do a lockless dput(), and return whether that was successful. * * If unsuccessful, we return false, having already taken the dentry lock. * In that case refcount is guaranteed to be zero and we have already * decided that it's not worth keeping around. * * The caller needs to hold the RCU read lock, so that the dentry is * guaranteed to stay around even if the refcount goes down to zero! */ static inline bool fast_dput(struct dentry *dentry) { int ret; /* * try to decrement the lockref optimistically. */ ret = lockref_put_return(&dentry->d_lockref); /* * If the lockref_put_return() failed due to the lock being held * by somebody else, the fast path has failed. We will need to * get the lock, and then check the count again. */ if (unlikely(ret < 0)) { spin_lock(&dentry->d_lock); if (WARN_ON_ONCE(dentry->d_lockref.count <= 0)) { spin_unlock(&dentry->d_lock); return true; } dentry->d_lockref.count--; goto locked; } /* * If we weren't the last ref, we're done. */ if (ret) return true; /* * Can we decide that decrement of refcount is all we needed without * taking the lock? There's a very common case when it's all we need - * dentry looks like it ought to be retained and there's nothing else * to do. */ if (retain_dentry(dentry, false)) return true; /* * Either not worth retaining or we can't tell without the lock. * Get the lock, then. 
We've already decremented the refcount to 0, * but we'll need to re-check the situation after getting the lock. */ spin_lock(&dentry->d_lock); /* * Did somebody else grab a reference to it in the meantime, and * we're no longer the last user after all? Alternatively, somebody * else could have killed it and marked it dead. Either way, we * don't need to do anything else. */ locked: if (dentry->d_lockref.count || retain_dentry(dentry, true)) { spin_unlock(&dentry->d_lock); return true; } return false; } /* * This is dput * * This is complicated by the fact that we do not want to put * dentries that are no longer on any hash chain on the unused * list: we'd much rather just get rid of them immediately. * * However, that implies that we have to traverse the dentry * tree upwards to the parents which might _also_ now be * scheduled for deletion (it may have been only waiting for * its last child to go away). * * This tail recursion is done by hand as we don't want to depend * on the compiler to always get this right (gcc generally doesn't). * Real recursion would eat up our stack space. */ /* * dput - release a dentry * @dentry: dentry to release * * Release a dentry. This will drop the usage count and if appropriate * call the dentry unlink method as well as removing it from the queues and * releasing its resources. If the parent dentries were scheduled for release * they too may now get deleted. 
*/
void dput(struct dentry *dentry)
{
	if (!dentry)
		return;
	might_sleep();
	rcu_read_lock();
	if (likely(fast_dput(dentry))) {
		rcu_read_unlock();
		return;
	}
	/*
	 * fast_dput() returned false: ->d_lock is held and the refcount is
	 * zero.  Kill the dentry, then keep going with each parent that
	 * __dentry_kill() hands back (locked, refcount dropped to zero)
	 * until one is worth retaining or cannot be locked for killing.
	 */
	while (lock_for_kill(dentry)) {
		rcu_read_unlock();
		dentry = __dentry_kill(dentry);
		if (!dentry)
			return;
		if (retain_dentry(dentry, true)) {
			spin_unlock(&dentry->d_lock);
			return;
		}
		rcu_read_lock();
	}
	rcu_read_unlock();
	spin_unlock(&dentry->d_lock);
}
EXPORT_SYMBOL(dput);

/*
 * Put @dentry on the shrink list @list unless it is already on some shrink
 * list, taking it off the LRU first if it was there.
 */
static void to_shrink_list(struct dentry *dentry, struct list_head *list)
__must_hold(&dentry->d_lock)
{
	if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) {
		if (dentry->d_flags & DCACHE_LRU_LIST)
			d_lru_del(dentry);
		d_shrink_add(dentry, list);
	}
}

/*
 * Drop a reference like dput(), but when fast_dput() decides the dentry is
 * not to be kept, queue it on @list via to_shrink_list() instead of killing
 * it here.  Unlike dput(), @dentry is not checked for NULL.
 */
void dput_to_list(struct dentry *dentry, struct list_head *list)
{
	rcu_read_lock();
	if (likely(fast_dput(dentry))) {
		rcu_read_unlock();
		return;
	}
	rcu_read_unlock();
	/* fast_dput() returned false, so ->d_lock is held here. */
	to_shrink_list(dentry, list);
	spin_unlock(&dentry->d_lock);
}

/*
 * Return @dentry's parent with an extra reference held.  A lockless
 * attempt validated against ->d_seq is tried first; if that fails, the
 * parent is pinned under its own ->d_lock after re-checking ->d_parent.
 */
struct dentry *dget_parent(struct dentry *dentry)
{
	int gotref;
	struct dentry *ret;
	unsigned seq;

	/*
	 * Do optimistic parent lookup without any
	 * locking.
	 */
	rcu_read_lock();
	seq = raw_seqcount_begin(&dentry->d_seq);
	ret = READ_ONCE(dentry->d_parent);
	gotref = lockref_get_not_zero(&ret->d_lockref);
	rcu_read_unlock();
	if (likely(gotref)) {
		if (!read_seqcount_retry(&dentry->d_seq, seq))
			return ret;
		/* ->d_seq changed under us: give the reference back, go slow. */
		dput(ret);
	}

repeat:
	/*
	 * Don't need rcu_dereference because we re-check it was correct under
	 * the lock.
	 */
	rcu_read_lock();
	ret = dentry->d_parent;
	spin_lock(&ret->d_lock);
	if (unlikely(ret != dentry->d_parent)) {
		spin_unlock(&ret->d_lock);
		rcu_read_unlock();
		goto repeat;
	}
	rcu_read_unlock();
	BUG_ON(!ret->d_lockref.count);
	ret->d_lockref.count++;
	spin_unlock(&ret->d_lock);
	return ret;
}
EXPORT_SYMBOL(dget_parent);

/*
 * Return the first dentry on @inode's alias list with its refcount raised,
 * or NULL if the list is empty.  The callers visible in this file hold
 * inode->i_lock around the call.
 */
static struct dentry * __d_find_any_alias(struct inode *inode)
{
	struct dentry *alias;

	if (hlist_empty(&inode->i_dentry))
		return NULL;
	alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias);
	lockref_get(&alias->d_lockref);
	return alias;
}

/**
 * d_find_any_alias - find any alias for a given inode
 * @inode: inode to find an alias for
 *
 * If any aliases exist for the given inode, take and return a
 * reference for one of them.  If no aliases exist, return %NULL.
 */
struct dentry *d_find_any_alias(struct inode *inode)
{
	struct dentry *de;

	spin_lock(&inode->i_lock);
	de = __d_find_any_alias(inode);
	spin_unlock(&inode->i_lock);
	return de;
}
EXPORT_SYMBOL(d_find_any_alias);

/*
 * Worker for d_find_alias(), which calls it with inode->i_lock held.
 * Directories get whatever alias exists; for everything else the first
 * hashed alias is returned with a reference taken, or NULL if none is hashed.
 */
static struct dentry *__d_find_alias(struct inode *inode)
{
	struct dentry *alias;

	if (S_ISDIR(inode->i_mode))
		return __d_find_any_alias(inode);

	hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
		spin_lock(&alias->d_lock);
		if (!d_unhashed(alias)) {
			dget_dlock(alias);
			spin_unlock(&alias->d_lock);
			return alias;
		}
		spin_unlock(&alias->d_lock);
	}
	return NULL;
}

/**
 * d_find_alias - grab a hashed alias of inode
 * @inode: inode in question
 *
 * If inode has a hashed alias, or is a directory and has any alias,
 * acquire the reference to alias and return it. Otherwise return NULL.
 * Notice that if inode is a directory there can be only one alias and
 * it can be unhashed only if it has no children, or if it is the root
 * of a filesystem, or if the directory was renamed and d_revalidate
 * was the first vfs operation to notice.
 *
 * If the inode has an IS_ROOT, DCACHE_DISCONNECTED alias, then prefer
 * any other hashed alias over that one.
*/ struct dentry *d_find_alias(struct inode *inode) { struct dentry *de = NULL; if (!hlist_empty(&inode->i_dentry)) { spin_lock(&inode->i_lock); de = __d_find_alias(inode); spin_unlock(&inode->i_lock); } return de; } EXPORT_SYMBOL(d_find_alias); /* * Caller MUST be holding rcu_read_lock() and be guaranteed * that inode won't get freed until rcu_read_unlock(). */ struct dentry *d_find_alias_rcu(struct inode *inode) { struct hlist_head *l = &inode->i_dentry; struct dentry *de = NULL; spin_lock(&inode->i_lock); // ->i_dentry and ->i_rcu are colocated, but the latter won't be // used without having I_FREEING set, which means no aliases left if (likely(!(inode->i_state & I_FREEING) && !hlist_empty(l))) { if (S_ISDIR(inode->i_mode)) { de = hlist_entry(l->first, struct dentry, d_u.d_alias); } else { hlist_for_each_entry(de, l, d_u.d_alias) if (!d_unhashed(de)) break; } } spin_unlock(&inode->i_lock); return de; } /* * Try to kill dentries associated with this inode. * WARNING: you must own a reference to inode. 
*/
void d_prune_aliases(struct inode *inode)
{
	LIST_HEAD(dispose);
	struct dentry *dentry;

	/* Collect every alias with a zero refcount, then dispose of them. */
	spin_lock(&inode->i_lock);
	hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) {
		spin_lock(&dentry->d_lock);
		if (!dentry->d_lockref.count)
			to_shrink_list(dentry, &dispose);
		spin_unlock(&dentry->d_lock);
	}
	spin_unlock(&inode->i_lock);
	shrink_dentry_list(&dispose);
}
EXPORT_SYMBOL(d_prune_aliases);

/*
 * Kill @victim, then keep killing ancestors for as long as __dentry_kill()
 * hands back a parent that lock_for_kill() accepts.  The callers in this
 * file enter with rcu_read_lock() held and @victim locked by a successful
 * lock_for_kill(); on return the RCU read lock has been dropped and the
 * last parent handed back, if any, has been unlocked.
 */
static inline void shrink_kill(struct dentry *victim)
{
	do {
		rcu_read_unlock();
		victim = __dentry_kill(victim);
		rcu_read_lock();
	} while (victim && lock_for_kill(victim));
	rcu_read_unlock();
	if (victim)
		spin_unlock(&victim->d_lock);
}

/*
 * Dispose of every dentry on the shrink list @list, taking entries from the
 * tail.  Entries that cannot be locked for killing are just removed from the
 * list; if such an entry carries DCACHE_DENTRY_KILLED (set by
 * dentry_unlist() on the __dentry_kill() path, which skips the free for
 * entries on a shrink list) it is freed here.
 */
void shrink_dentry_list(struct list_head *list)
{
	while (!list_empty(list)) {
		struct dentry *dentry;

		dentry = list_entry(list->prev, struct dentry, d_lru);
		spin_lock(&dentry->d_lock);
		rcu_read_lock();
		if (!lock_for_kill(dentry)) {
			bool can_free;
			rcu_read_unlock();
			d_shrink_del(dentry);
			can_free = dentry->d_flags & DCACHE_DENTRY_KILLED;
			spin_unlock(&dentry->d_lock);
			if (can_free)
				dentry_free(dentry);
			continue;
		}
		d_shrink_del(dentry);
		shrink_kill(dentry);
	}
}

/*
 * list_lru walk callback used by prune_dcache_sb().  @arg is the list head
 * that collects dentries to be disposed of.
 */
static enum lru_status dentry_lru_isolate(struct list_head *item,
		struct list_lru_one *lru, void *arg)
{
	struct list_head *freeable = arg;
	struct dentry	*dentry = container_of(item, struct dentry, d_lru);


	/*
	 * we are inverting the lru lock/dentry->d_lock here,
	 * so use a trylock. If we fail to get the lock, just skip
	 * it
	 */
	if (!spin_trylock(&dentry->d_lock))
		return LRU_SKIP;

	/*
	 * Referenced dentries are still in use. If they have active
	 * counts, just remove them from the LRU. Otherwise give them
	 * another pass through the LRU.
	 */
	if (dentry->d_lockref.count) {
		d_lru_isolate(lru, dentry);
		spin_unlock(&dentry->d_lock);
		return LRU_REMOVED;
	}

	if (dentry->d_flags & DCACHE_REFERENCED) {
		dentry->d_flags &= ~DCACHE_REFERENCED;
		spin_unlock(&dentry->d_lock);

		/*
		 * The list move itself will be made by the common LRU code. At
		 * this point, we've dropped the dentry->d_lock but keep the
		 * lru lock. This is safe to do, since every list movement is
		 * protected by the lru lock even if both locks are held.
		 *
		 * This is guaranteed by the fact that all LRU management
		 * functions are intermediated by the LRU API calls like
		 * list_lru_add_obj and list_lru_del_obj. List movement in this file
		 * only ever occurs through these functions or through callbacks
		 * like this one, that are called from the LRU API.
		 *
		 * The only exceptions to this are functions like
		 * shrink_dentry_list, and code that first checks for the
		 * DCACHE_SHRINK_LIST flag.  Those are guaranteed to be
		 * operating only with stack provided lists after they are
		 * properly isolated from the main list.  It is thus, always a
		 * local access.
		 */
		return LRU_ROTATE;
	}

	d_lru_shrink_move(lru, dentry, freeable);
	spin_unlock(&dentry->d_lock);

	return LRU_REMOVED;
}

/**
 * prune_dcache_sb - shrink the dcache
 * @sb: superblock
 * @sc: shrink control, passed to list_lru_shrink_walk()
 *
 * Attempt to shrink the superblock dcache LRU by @sc->nr_to_scan entries. This
 * is done when we need more memory and called from the superblock shrinker
 * function.
 *
 * This function may fail to free any resources if all the dentries are in
 * use.
 */
long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc)
{
	LIST_HEAD(dispose);
	long freed;

	freed = list_lru_shrink_walk(&sb->s_dentry_lru, sc,
				     dentry_lru_isolate, &dispose);
	shrink_dentry_list(&dispose);
	return freed;
}

/*
 * list_lru walk callback used by shrink_dcache_sb(): unlike
 * dentry_lru_isolate() it moves every dentry it can lock to the dispose
 * list, without looking at the refcount or DCACHE_REFERENCED.
 */
static enum lru_status dentry_lru_isolate_shrink(struct list_head *item,
		struct list_lru_one *lru, void *arg)
{
	struct list_head *freeable = arg;
	struct dentry	*dentry = container_of(item, struct dentry, d_lru);

	/*
	 * we are inverting the lru lock/dentry->d_lock here,
	 * so use a trylock. If we fail to get the lock, just skip
	 * it
	 */
	if (!spin_trylock(&dentry->d_lock))
		return LRU_SKIP;

	d_lru_shrink_move(lru, dentry, freeable);
	spin_unlock(&dentry->d_lock);

	return LRU_REMOVED;
}


/**
 * shrink_dcache_sb - shrink dcache for a superblock
 * @sb: superblock
 *
 * Shrink the dcache for the specified super block. This is used to free
 * the dcache before unmounting a file system.
 */
void shrink_dcache_sb(struct super_block *sb)
{
	/* Work in batches of up to 1024 entries until the LRU is empty. */
	do {
		LIST_HEAD(dispose);

		list_lru_walk(&sb->s_dentry_lru,
			dentry_lru_isolate_shrink, &dispose, 1024);
		shrink_dentry_list(&dispose);
	} while (list_lru_count(&sb->s_dentry_lru) > 0);
}
EXPORT_SYMBOL(shrink_dcache_sb);

/**
 * enum d_walk_ret - action to take during tree walk
 * @D_WALK_CONTINUE:	continue walk
 * @D_WALK_QUIT:	quit walk
 * @D_WALK_NORETRY:	quit when retry is needed
 * @D_WALK_SKIP:	skip this dentry and its children
 */
enum d_walk_ret {
	D_WALK_CONTINUE,
	D_WALK_QUIT,
	D_WALK_NORETRY,
	D_WALK_SKIP,
};

/**
 * d_walk - walk the dentry tree
 * @parent:	start of walk
 * @data:	data passed to @enter()
 * @enter:	callback when first entering the dentry
 *
 * The @enter() callbacks are called with d_lock held.
*/
static void d_walk(struct dentry *parent, void *data,
		   enum d_walk_ret (*enter)(void *, struct dentry *))
{
	struct dentry *this_parent, *dentry;
	unsigned seq = 0;
	enum d_walk_ret ret;
	bool retry = true;

again:
	read_seqbegin_or_lock(&rename_lock, &seq);
	this_parent = parent;
	spin_lock(&this_parent->d_lock);

	/* For the starting dentry, D_WALK_SKIP ends the walk like D_WALK_QUIT. */
	ret = enter(data, this_parent);
	switch (ret) {
	case D_WALK_CONTINUE:
		break;
	case D_WALK_QUIT:
	case D_WALK_SKIP:
		goto out_unlock;
	case D_WALK_NORETRY:
		retry = false;
		break;
	}
repeat:
	dentry = d_first_child(this_parent);
resume:
	hlist_for_each_entry_from(dentry, d_sib) {
		if (unlikely(dentry->d_flags & DCACHE_DENTRY_CURSOR))
			continue;

		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);

		ret = enter(data, dentry);
		switch (ret) {
		case D_WALK_CONTINUE:
			break;
		case D_WALK_QUIT:
			spin_unlock(&dentry->d_lock);
			goto out_unlock;
		case D_WALK_NORETRY:
			retry = false;
			break;
		case D_WALK_SKIP:
			spin_unlock(&dentry->d_lock);
			continue;
		}

		/*
		 * Descend: the child's ->d_lock stays held and becomes the
		 * new parent lock; only the lockdep annotation is redone.
		 */
		if (!hlist_empty(&dentry->d_children)) {
			spin_unlock(&this_parent->d_lock);
			spin_release(&dentry->d_lock.dep_map, _RET_IP_);
			this_parent = dentry;
			spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
			goto repeat;
		}
		spin_unlock(&dentry->d_lock);
	}
	/*
	 * All done at this level ... ascend and resume the search.
	 */
	rcu_read_lock();
ascend:
	if (this_parent != parent) {
		dentry = this_parent;
		this_parent = dentry->d_parent;

		spin_unlock(&dentry->d_lock);
		spin_lock(&this_parent->d_lock);

		/* might go back up the wrong parent if we have had a rename. */
		if (need_seqretry(&rename_lock, seq))
			goto rename_retry;
		/* go into the first sibling still alive */
		hlist_for_each_entry_continue(dentry, d_sib) {
			if (likely(!(dentry->d_flags & DCACHE_DENTRY_KILLED))) {
				rcu_read_unlock();
				goto resume;
			}
		}
		goto ascend;
	}
	if (need_seqretry(&rename_lock, seq))
		goto rename_retry;
	rcu_read_unlock();

out_unlock:
	spin_unlock(&this_parent->d_lock);
	done_seqretry(&rename_lock, seq);
	return;

rename_retry:
	spin_unlock(&this_parent->d_lock);
	rcu_read_unlock();
	BUG_ON(seq & 1);
	/* A callback returned D_WALK_NORETRY: give up instead of restarting. */
	if (!retry)
		return;
	/* Odd seq makes read_seqbegin_or_lock() take rename_lock this time. */
	seq = 1;
	goto again;
}

/* Walk state for path_has_submounts(): mount to check against and result. */
struct check_mount {
	struct vfsmount *mnt;
	unsigned int mounted;
};

/*
 * d_walk() callback: stop the walk and record a hit as soon as a dentry
 * is found that is a mountpoint for the (mnt, dentry) pair being checked.
 */
static enum d_walk_ret path_check_mount(void *data, struct dentry *dentry)
{
	struct check_mount *info = data;
	struct path path = { .mnt = info->mnt, .dentry = dentry };

	if (likely(!d_mountpoint(dentry)))
		return D_WALK_CONTINUE;
	if (__path_is_mountpoint(&path)) {
		info->mounted = 1;
		return D_WALK_QUIT;
	}
	return D_WALK_CONTINUE;
}

/**
 * path_has_submounts - check for mounts over a dentry in the
 *                      current namespace.
 * @parent: path to check.
 *
 * Return true if the parent or its subdirectories contain
 * a mount point in the current namespace.
 */
int path_has_submounts(const struct path *parent)
{
	struct check_mount data = { .mnt = parent->mnt, .mounted = 0 };

	read_seqlock_excl(&mount_lock);
	d_walk(parent->dentry, &data, path_check_mount);
	read_sequnlock_excl(&mount_lock);

	return data.mounted;
}
EXPORT_SYMBOL(path_has_submounts);

/*
 * Called by mount code to set a mountpoint and check if the mountpoint is
 * reachable (e.g. NFS can unhash a directory dentry and then the complete
 * subtree can become unreachable).
 *
 * Only one of d_invalidate() and d_set_mounted() must succeed.  For
 * this reason take rename_lock and d_lock on dentry and ancestors.
 *
 * Returns 0 on success, -ENOENT if an ancestor is unhashed or the dentry
 * is unlinked, -EBUSY if the dentry is already a mountpoint.
 */
int d_set_mounted(struct dentry *dentry)
{
	struct dentry *p;
	int ret = -ENOENT;
	write_seqlock(&rename_lock);
	for (p = dentry->d_parent; !IS_ROOT(p); p = p->d_parent) {
		/* Need exclusion wrt. d_invalidate() */
		spin_lock(&p->d_lock);
		if (unlikely(d_unhashed(p))) {
			spin_unlock(&p->d_lock);
			goto out;
		}
		spin_unlock(&p->d_lock);
	}
	spin_lock(&dentry->d_lock);
	if (!d_unlinked(dentry)) {
		ret = -EBUSY;
		if (!d_mountpoint(dentry)) {
			dentry->d_flags |= DCACHE_MOUNTED;
			ret = 0;
		}
	}
 	spin_unlock(&dentry->d_lock);
out:
	write_sequnlock(&rename_lock);
	return ret;
}

/*
 * Search the dentry child list of the specified parent,
 * and move any unused dentries to the end of the unused
 * list for prune_dcache(). We descend to the next level
 * whenever the d_children list is non-empty and continue
 * searching.
 *
 * It returns zero iff there are no unused children,
 * otherwise it returns the number of children moved to
 * the end of the unused list. This may not be the total
 * number of unused children, because select_parent can
 * drop the lock and return early due to latency
 * constraints.
 *
 * NOTE(review): the text above appears to predate the current code - no
 * select_parent() or prune_dcache() is visible here.  In this file the
 * collection is done by select_collect()/select_collect2(), driven by
 * d_walk() from shrink_dcache_parent(), with the state kept in the
 * structure below: @start is the root of the walk, @found/@victim share
 * storage (a counter for select_collect(), a dentry for select_collect2())
 * and @dispose gathers the dentries to be killed.
 */

struct select_data {
	struct dentry *start;
	union {
		long found;
		struct dentry *victim;
	};
	struct list_head dispose;
};

/*
 * d_walk() callback for the first pass of shrink_dcache_parent(): counts
 * dentries that are on a shrink list, unused or have a negative refcount,
 * and moves the unused ones to data->dispose.
 */
static enum d_walk_ret select_collect(void *_data, struct dentry *dentry)
{
	struct select_data *data = _data;
	enum d_walk_ret ret = D_WALK_CONTINUE;

	if (data->start == dentry)
		goto out;

	if (dentry->d_flags & DCACHE_SHRINK_LIST) {
		data->found++;
	} else if (!dentry->d_lockref.count) {
		to_shrink_list(dentry, &data->dispose);
		data->found++;
	} else if (dentry->d_lockref.count < 0) {
		data->found++;
	}
	/*
	 * We can return to the caller if we have found some (this
	 * ensures forward progress). We'll be coming back to find
	 * the rest.
	 */
	if (!list_empty(&data->dispose))
		ret = need_resched() ? D_WALK_QUIT : D_WALK_NORETRY;
out:
	return ret;
}

/*
 * d_walk() callback for the second pass of shrink_dcache_parent(): an
 * unused dentry sitting on somebody else's shrink list is reported as
 * data->victim and the walk stops with rcu_read_lock() held for the
 * caller; other unused dentries are moved to data->dispose.
 */
static enum d_walk_ret select_collect2(void *_data, struct dentry *dentry)
{
	struct select_data *data = _data;
	enum d_walk_ret ret = D_WALK_CONTINUE;

	if (data->start == dentry)
		goto out;

	if (!dentry->d_lockref.count) {
		if (dentry->d_flags & DCACHE_SHRINK_LIST) {
			rcu_read_lock();
			data->victim = dentry;
			return D_WALK_QUIT;
		}
		to_shrink_list(dentry, &data->dispose);
	}
	/*
	 * We can return to the caller if we have found some (this
	 * ensures forward progress). We'll be coming back to find
	 * the rest.
	 */
	if (!list_empty(&data->dispose))
		ret = need_resched() ? D_WALK_QUIT : D_WALK_NORETRY;
out:
	return ret;
}

/**
 * shrink_dcache_parent - prune dcache
 * @parent: parent of entries to prune
 *
 * Prune the dcache to remove unused children of the parent dentry.
 */
void shrink_dcache_parent(struct dentry *parent)
{
	for (;;) {
		struct select_data data = {.start = parent};

		INIT_LIST_HEAD(&data.dispose);
		d_walk(parent, &data, select_collect);

		if (!list_empty(&data.dispose)) {
			shrink_dentry_list(&data.dispose);
			continue;
		}

		cond_resched();
		if (!data.found)
			break;
		/* found and victim share storage: reset before the second pass. */
		data.victim = NULL;
		d_walk(parent, &data, select_collect2);
		if (data.victim) {
			/* select_collect2() left rcu_read_lock() held for us. */
			spin_lock(&data.victim->d_lock);
			if (!lock_for_kill(data.victim)) {
				spin_unlock(&data.victim->d_lock);
				rcu_read_unlock();
			} else {
				shrink_kill(data.victim);
			}
		}
		if (!list_empty(&data.dispose))
			shrink_dentry_list(&data.dispose);
	}
}
EXPORT_SYMBOL(shrink_dcache_parent);

/*
 * d_walk() callback used at unmount: warn about every childless dentry
 * that is still around, except for the tree root holding exactly one
 * reference.  Never stops the walk.
 */
static enum d_walk_ret umount_check(void *_data, struct dentry *dentry)
{
	/* it has busy descendents; complain about those instead */
	if (!hlist_empty(&dentry->d_children))
		return D_WALK_CONTINUE;

	/* root with refcount 1 is fine */
	if (dentry == _data && dentry->d_lockref.count == 1)
		return D_WALK_CONTINUE;

	WARN(1, "BUG: Dentry %p{i=%lx,n=%pd} "
			" still in use (%d) [unmount of %s %s]\n",
		       dentry,
		       dentry->d_inode ?
		       dentry->d_inode->i_ino : 0UL,
		       dentry,
		       dentry->d_lockref.count,
		       dentry->d_sb->s_type->name,
		       dentry->d_sb->s_id);
	return D_WALK_CONTINUE;
}

/*
 * Tear down one dentry tree at unmount: prune it, complain about whatever
 * is left, then unhash the root and drop the caller's reference to it.
 */
static void do_one_tree(struct dentry *dentry)
{
	shrink_dcache_parent(dentry);
	d_walk(dentry, dentry, umount_check);
	d_drop(dentry);
	dput(dentry);
}

/*
 * destroy the dentries attached to a superblock on unmounting
 */
void shrink_dcache_for_umount(struct super_block *sb)
{
	struct dentry *dentry;

	rwsem_assert_held_write(&sb->s_umount);

	dentry = sb->s_root;
	sb->s_root = NULL;
	do_one_tree(dentry);

	/* do_one_tree() unhashes each root, so this list drains. */
	while (!hlist_bl_empty(&sb->s_roots)) {
		dentry = dget(hlist_bl_entry(hlist_bl_first(&sb->s_roots), struct dentry, d_hash));
		do_one_tree(dentry);
	}
}

/*
 * d_walk() callback for d_invalidate(): grab a reference to the first
 * mountpoint encountered, store it in *@_data and stop the walk.
 */
static enum d_walk_ret find_submount(void *_data, struct dentry *dentry)
{
	struct dentry **victim = _data;
	if (d_mountpoint(dentry)) {
		*victim = dget_dlock(dentry);
		return D_WALK_QUIT;
	}
	return D_WALK_CONTINUE;
}

/**
 * d_invalidate - detach submounts, prune dcache, and drop
 * @dentry: dentry to invalidate (aka detach, prune and drop)
 */
void d_invalidate(struct dentry *dentry)
{
	bool had_submounts = false;
	spin_lock(&dentry->d_lock);
	if (d_unhashed(dentry)) {
		spin_unlock(&dentry->d_lock);
		return;
	}
	__d_drop(dentry);
	spin_unlock(&dentry->d_lock);

	/* Negative dentries can be dropped without further checks */
	if (!dentry->d_inode)
		return;

	shrink_dcache_parent(dentry);
	/* Detach submounts one at a time until none are left in the subtree. */
	for (;;) {
		struct dentry *victim = NULL;
		d_walk(dentry, &victim, find_submount);
		if (!victim) {
			if (had_submounts)
				shrink_dcache_parent(dentry);
			return;
		}
		had_submounts = true;
		detach_mounts(victim);
		dput(victim);
	}
}
EXPORT_SYMBOL(d_invalidate);

/**
 * __d_alloc	-	allocate a dcache entry
 * @sb: filesystem it will belong to
 * @name: qstr of the name
 *
 * Allocates a dentry. It returns %NULL if there is insufficient memory
 * available. On a success the dentry is returned. The name passed in is
 * copied and the copy passed in may be reused after this call.
*/
static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
{
	struct dentry *dentry;
	char *namebuf;

	dentry = kmem_cache_alloc_lru(dentry_cache, &sb->s_dentry_lru,
				      GFP_KERNEL);
	if (!dentry)
		return NULL;

	/*
	 * The inline name buffer is kept NUL-terminated at all times, so
	 * that the memcpy() done by the name switching in rename still
	 * leaves a NUL at the end even when it overwrites an internal one.
	 */
	dentry->d_shortname.string[DNAME_INLINE_LEN-1] = 0;

	/* Default to the inline buffer; only long names go external. */
	namebuf = dentry->d_shortname.string;
	if (unlikely(!name)) {
		/* Anonymous dentry: it gets the "/" name. */
		name = &slash_name;
	} else if (name->len > DNAME_INLINE_LEN-1) {
		struct external_name *ext;

		ext = kmalloc(offsetof(struct external_name, name[1]) + name->len,
			      GFP_KERNEL_ACCOUNT | __GFP_RECLAIMABLE);
		if (!ext) {
			kmem_cache_free(dentry_cache, dentry);
			return NULL;
		}
		atomic_set(&ext->count, 1);
		namebuf = ext->name;
	}

	dentry->d_name.len = name->len;
	dentry->d_name.hash = name->hash;
	memcpy(namebuf, name->name, name->len);
	namebuf[name->len] = 0;

	/* Publish the name only after its terminating NUL has been written. */
	smp_store_release(&dentry->d_name.name, namebuf); /* ^^^ */

	/* Fresh dentry: no flags, no inode, its own parent, nothing linked. */
	dentry->d_flags = 0;
	lockref_init(&dentry->d_lockref);
	seqcount_spinlock_init(&dentry->d_seq, &dentry->d_lock);
	dentry->d_inode = NULL;
	dentry->d_parent = dentry;
	dentry->d_sb = sb;
	dentry->d_op = NULL;
	dentry->d_fsdata = NULL;
	INIT_HLIST_BL_NODE(&dentry->d_hash);
	INIT_LIST_HEAD(&dentry->d_lru);
	INIT_HLIST_HEAD(&dentry->d_children);
	INIT_HLIST_NODE(&dentry->d_u.d_alias);
	INIT_HLIST_NODE(&dentry->d_sib);
	d_set_d_op(dentry, dentry->d_sb->s_d_op);

	/* Let the filesystem veto or set up the new dentry. */
	if (dentry->d_op && dentry->d_op->d_init &&
	    dentry->d_op->d_init(dentry)) {
		if (dname_external(dentry))
			kfree(external_name(dentry));
		kmem_cache_free(dentry_cache, dentry);
		return NULL;
	}

	this_cpu_inc(nr_dentry);

	return dentry;
}

/**
 * d_alloc	-	allocate a dcache entry
 * @parent: parent of entry to allocate
 * @name: qstr of the name
 *
Allocates a dentry. It returns %NULL if there is insufficient memory * available. On a success the dentry is returned. The name passed in is * copied and the copy passed in may be reused after this call. */ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) { struct dentry *dentry = __d_alloc(parent->d_sb, name); if (!dentry) return NULL; spin_lock(&parent->d_lock); /* * don't need child lock because it is not subject * to concurrency here */ dentry->d_parent = dget_dlock(parent); hlist_add_head(&dentry->d_sib, &parent->d_children); spin_unlock(&parent->d_lock); return dentry; } EXPORT_SYMBOL(d_alloc); struct dentry *d_alloc_anon(struct super_block *sb) { return __d_alloc(sb, NULL); } EXPORT_SYMBOL(d_alloc_anon); struct dentry *d_alloc_cursor(struct dentry * parent) { struct dentry *dentry = d_alloc_anon(parent->d_sb); if (dentry) { dentry->d_flags |= DCACHE_DENTRY_CURSOR; dentry->d_parent = dget(parent); } return dentry; } /** * d_alloc_pseudo - allocate a dentry (for lookup-less filesystems) * @sb: the superblock * @name: qstr of the name * * For a filesystem that just pins its dentries in memory and never * performs lookups at all, return an unhashed IS_ROOT dentry. * This is used for pipes, sockets et.al. - the stuff that should * never be anyone's children or parents. Unlike all other * dentries, these will not have RCU delay between dropping the * last reference and freeing them. * * The only user is alloc_file_pseudo() and that's what should * be considered a public interface. Don't use directly. 
*/
struct dentry *d_alloc_pseudo(struct super_block *sb, const struct qstr *name)
{
	static const struct dentry_operations anon_ops = {
		.d_dname = simple_dname
	};
	struct dentry *dentry = __d_alloc(sb, name);
	if (likely(dentry)) {
		dentry->d_flags |= DCACHE_NORCU;
		/* Superblocks without default dentry ops get the ->d_dname stub. */
		if (!sb->s_d_op)
			d_set_d_op(dentry, &anon_ops);
	}
	return dentry;
}

/*
 * Convenience wrapper around d_alloc() for a NUL-terminated C string:
 * builds the qstr (hash and length via hashlen_string()) and allocates a
 * child of @parent with that name.
 */
struct dentry *d_alloc_name(struct dentry *parent, const char *name)
{
	struct qstr q;

	q.name = name;
	q.hash_len = hashlen_string(parent, name);
	return d_alloc(parent, &q);
}
EXPORT_SYMBOL(d_alloc_name);

/*
 * Install @op as @dentry's operations and cache in ->d_flags which of the
 * optional methods are present.  Warns if operations were already set or
 * if one of the checked DCACHE_OP_* bits is already raised.  A NULL @op
 * only stores the pointer.
 */
void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
{
	WARN_ON_ONCE(dentry->d_op);
	WARN_ON_ONCE(dentry->d_flags & (DCACHE_OP_HASH	|
				DCACHE_OP_COMPARE	|
				DCACHE_OP_REVALIDATE	|
				DCACHE_OP_WEAK_REVALIDATE	|
				DCACHE_OP_DELETE	|
				DCACHE_OP_REAL));
	dentry->d_op = op;
	if (!op)
		return;
	if (op->d_hash)
		dentry->d_flags |= DCACHE_OP_HASH;
	if (op->d_compare)
		dentry->d_flags |= DCACHE_OP_COMPARE;
	if (op->d_revalidate)
		dentry->d_flags |= DCACHE_OP_REVALIDATE;
	if (op->d_weak_revalidate)
		dentry->d_flags |= DCACHE_OP_WEAK_REVALIDATE;
	if (op->d_delete)
		dentry->d_flags |= DCACHE_OP_DELETE;
	if (op->d_prune)
		dentry->d_flags |= DCACHE_OP_PRUNE;
	if (op->d_real)
		dentry->d_flags |= DCACHE_OP_REAL;

}
EXPORT_SYMBOL(d_set_d_op);

/*
 * Compute the dentry type bits matching @inode (DCACHE_MISS_TYPE for a
 * NULL inode), plus DCACHE_NEED_AUTOMOUNT for automount inodes.  As a side
 * effect the result of probing ->lookup / ->get_link is cached in
 * inode->i_opflags (IOP_LOOKUP, IOP_NOFOLLOW) so later calls skip the probe.
 */
static unsigned d_flags_for_inode(struct inode *inode)
{
	unsigned add_flags = DCACHE_REGULAR_TYPE;

	if (!inode)
		return DCACHE_MISS_TYPE;

	if (S_ISDIR(inode->i_mode)) {
		add_flags = DCACHE_DIRECTORY_TYPE;
		if (unlikely(!(inode->i_opflags & IOP_LOOKUP))) {
			if (unlikely(!inode->i_op->lookup))
				add_flags = DCACHE_AUTODIR_TYPE;
			else
				inode->i_opflags |= IOP_LOOKUP;
		}
		goto type_determined;
	}

	if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) {
		if (unlikely(inode->i_op->get_link)) {
			add_flags = DCACHE_SYMLINK_TYPE;
			goto type_determined;
		}
		inode->i_opflags |= IOP_NOFOLLOW;
	}

	if (unlikely(!S_ISREG(inode->i_mode)))
		add_flags = DCACHE_SPECIAL_TYPE;

type_determined:
	if (unlikely(IS_AUTOMOUNT(inode)))
		add_flags |= DCACHE_NEED_AUTOMOUNT;
	return add_flags;
}

/*
 * Attach @inode to @dentry: link the dentry into the inode's alias list
 * and set ->d_inode and the type flags inside a ->d_seq write section, all
 * under dentry->d_lock.  Both callers visible in this file hold
 * inode->i_lock and pass a non-NULL @inode.
 */
static void __d_instantiate(struct dentry *dentry, struct inode *inode)
{
	unsigned add_flags = d_flags_for_inode(inode);
	WARN_ON(d_in_lookup(dentry));

	spin_lock(&dentry->d_lock);
	/*
	 * The negative counter only tracks dentries on the LRU. Don't dec if
	 * d_lru is on another list.
	 */
	if ((dentry->d_flags &
	     (DCACHE_LRU_LIST|DCACHE_SHRINK_LIST)) == DCACHE_LRU_LIST)
		this_cpu_dec(nr_dentry_negative);
	hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
	raw_write_seqcount_begin(&dentry->d_seq);
	__d_set_inode_and_type(dentry, inode, add_flags);
	raw_write_seqcount_end(&dentry->d_seq);
	fsnotify_update_flags(dentry);
	spin_unlock(&dentry->d_lock);
}

/**
 * d_instantiate - fill in inode information for a dentry
 * @entry: dentry to complete
 * @inode: inode to attach to this dentry
 *
 * Fill in inode information in the entry.
 *
 * This turns negative dentries into productive full members
 * of society.
 *
 * NOTE! This assumes that the inode count has been incremented
 * (or otherwise set) by the caller to indicate that it is now
 * in use by the dcache.
 *
 * A %NULL @inode is accepted and leaves @entry untouched.
 */

void d_instantiate(struct dentry *entry, struct inode * inode)
{
	BUG_ON(!hlist_unhashed(&entry->d_u.d_alias));
	if (inode) {
		security_d_instantiate(entry, inode);
		spin_lock(&inode->i_lock);
		__d_instantiate(entry, inode);
		spin_unlock(&inode->i_lock);
	}
}
EXPORT_SYMBOL(d_instantiate);

/*
 * This should be equivalent to d_instantiate() + unlock_new_inode(),
 * with lockdep-related part of unlock_new_inode() done before
 * anything else.  Use that instead of open-coding d_instantiate()/
 * unlock_new_inode() combinations.
*/
void d_instantiate_new(struct dentry *entry, struct inode *inode)
{
	BUG_ON(!hlist_unhashed(&entry->d_u.d_alias));
	BUG_ON(!inode);
	lockdep_annotate_inode_mutex_key(inode);
	security_d_instantiate(entry, inode);
	spin_lock(&inode->i_lock);
	__d_instantiate(entry, inode);
	WARN_ON(!(inode->i_state & I_NEW));
	inode->i_state &= ~I_NEW & ~I_CREATING;
	/*
	 * Pairs with the barrier in prepare_to_wait_event() to make sure
	 * ___wait_var_event() either sees the bit cleared or
	 * waitqueue_active() check in wake_up_var() sees the waiter.
	 */
	smp_mb();
	inode_wake_up_bit(inode, __I_NEW);
	spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL(d_instantiate_new);

/*
 * Allocate an anonymous dentry on @root_inode's superblock and attach the
 * inode to it.  Returns NULL for a NULL @root_inode or when the allocation
 * fails; in the latter case the inode reference is dropped with iput().
 */
struct dentry *d_make_root(struct inode *root_inode)
{
	struct dentry *res = NULL;

	if (root_inode) {
		res = d_alloc_anon(root_inode->i_sb);
		if (res)
			d_instantiate(res, root_inode);
		else
			iput(root_inode);
	}
	return res;
}
EXPORT_SYMBOL(d_make_root);

/*
 * Common worker for d_obtain_alias() and d_obtain_root(): return an
 * existing alias of @inode if there is one, otherwise attach a freshly
 * allocated anonymous dentry.  With @disconnected the new dentry is
 * flagged DCACHE_DISCONNECTED; without it the dentry is hashed on the
 * superblock's ->s_roots list instead.
 *
 * The caller's inode reference is consumed on every path: it is either
 * transferred to the new dentry or released by the iput() at "out".
 */
static struct dentry *__d_obtain_alias(struct inode *inode, bool disconnected)
{
	struct super_block *sb;
	struct dentry *new, *res;

	if (!inode)
		return ERR_PTR(-ESTALE);
	if (IS_ERR(inode))
		return ERR_CAST(inode);

	sb = inode->i_sb;

	res = d_find_any_alias(inode); /* existing alias? */
	if (res)
		goto out;

	new = d_alloc_anon(sb);
	if (!new) {
		res = ERR_PTR(-ENOMEM);
		goto out;
	}

	security_d_instantiate(new, inode);
	spin_lock(&inode->i_lock);
	res = __d_find_any_alias(inode); /* recheck under lock */
	if (likely(!res)) { /* still no alias, attach a disconnected dentry */
		unsigned add_flags = d_flags_for_inode(inode);

		if (disconnected)
			add_flags |= DCACHE_DISCONNECTED;

		spin_lock(&new->d_lock);
		__d_set_inode_and_type(new, inode, add_flags);
		hlist_add_head(&new->d_u.d_alias, &inode->i_dentry);
		if (!disconnected) {
			hlist_bl_lock(&sb->s_roots);
			hlist_bl_add_head(&new->d_hash, &sb->s_roots);
			hlist_bl_unlock(&sb->s_roots);
		}
		spin_unlock(&new->d_lock);
		spin_unlock(&inode->i_lock);
		inode = NULL; /* consumed by new->d_inode */
		res = new;
	} else {
		/* Lost the race: somebody attached an alias; discard ours. */
		spin_unlock(&inode->i_lock);
		dput(new);
	}

 out:
	iput(inode);
	return res;
}

/**
 * d_obtain_alias - find or allocate a DISCONNECTED dentry for a given inode
 * @inode: inode to allocate the dentry for
 *
 * Obtain a dentry for an inode resulting from NFS filehandle conversion or
 * similar open by handle operations.  The returned dentry may be anonymous,
 * or may have a full name (if the inode was already in the cache).
 *
 * When called on a directory inode, we must ensure that the inode only ever
 * has one dentry.  If a dentry is found, that is returned instead of
 * allocating a new one.
 *
 * On successful return, the reference to the inode has been transferred
 * to the dentry.  In case of an error the reference on the inode is released.
 * To make it easier to use in export operations a %NULL or IS_ERR inode may
 * be passed in and the error will be propagated to the return value,
 * with a %NULL @inode replaced by ERR_PTR(-ESTALE).
 */
struct dentry *d_obtain_alias(struct inode *inode)
{
	return __d_obtain_alias(inode, true);
}
EXPORT_SYMBOL(d_obtain_alias);

/**
 * d_obtain_root - find or allocate a dentry for a given inode
 * @inode: inode to allocate the dentry for
 *
 * Obtain an IS_ROOT dentry for the root of a filesystem.
*
 * We must ensure that directory inodes only ever have one dentry. If a
 * dentry is found, that is returned instead of allocating a new one.
 *
 * On successful return, the reference to the inode has been transferred
 * to the dentry. In case of an error the reference on the inode is
 * released. A %NULL or IS_ERR inode may be passed in and will be the
 * error will be propagate to the return value, with a %NULL @inode
 * replaced by ERR_PTR(-ESTALE).
 */
struct dentry *d_obtain_root(struct inode *inode)
{
	/* same as d_obtain_alias() but without DCACHE_DISCONNECTED */
	return __d_obtain_alias(inode, false);
}
EXPORT_SYMBOL(d_obtain_root);

/**
 * d_add_ci - lookup or allocate new dentry with case-exact name
 * @dentry: the negative dentry that was passed to the parent's lookup func
 * @inode:  the inode case-insensitive lookup has found
 * @name:   the case-exact name to be associated with the returned dentry
 *
 * This is to avoid filling the dcache with case-insensitive names to the
 * same inode, only the actual correct case is stored in the dcache for
 * case-insensitive filesystems.
 *
 * For a case-insensitive lookup match and if the case-exact dentry
 * already exists in the dcache, use it and return it.
 *
 * If no entry exists with the exact case name, allocate new dentry with
 * the exact case, and return the spliced entry.
 *
 * The reference on @inode is dropped with iput() on every path that does
 * not hand it to d_splice_alias().
 */
struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
			struct qstr *name)
{
	struct dentry *found, *res;

	/*
	 * First check if a dentry matching the name already exists,
	 * if not go ahead and create it now.
	 */
	found = d_hash_and_lookup(dentry->d_parent, name);
	if (found) {
		/* also taken when d_hash_and_lookup() returned an ERR_PTR */
		iput(inode);
		return found;
	}
	if (d_in_lookup(dentry)) {
		found = d_alloc_parallel(dentry->d_parent, name,
					dentry->d_wait);
		/* error, or somebody else already completed the lookup */
		if (IS_ERR(found) || !d_in_lookup(found)) {
			iput(inode);
			return found;
		}
	} else {
		found = d_alloc(dentry->d_parent, name);
		if (!found) {
			iput(inode);
			return ERR_PTR(-ENOMEM);
		}
	}
	res = d_splice_alias(inode, found);
	if (res) {
		/* an alias (or error) is returned instead of @found */
		d_lookup_done(found);
		dput(found);
		return res;
	}
	return found;
}
EXPORT_SYMBOL(d_add_ci);

/**
 * d_same_name - compare a dentry's name with a given name
 * @dentry: the dentry whose name is compared
 * @parent: parent dentry; its ->d_compare() is used when it has
 *          DCACHE_OP_COMPARE set
 * @name:   the name to compare against
 *
 * Without DCACHE_OP_COMPARE the lengths must match and dentry_cmp()
 * must report equality.
 *
 * Return: true if names are same, or false
 */
bool d_same_name(const struct dentry *dentry, const struct dentry *parent,
		 const struct qstr *name)
{
	if (likely(!(parent->d_flags & DCACHE_OP_COMPARE))) {
		if (dentry->d_name.len != name->len)
			return false;
		return dentry_cmp(dentry, name->name, name->len) == 0;
	}
	return parent->d_op->d_compare(dentry,
				       dentry->d_name.len, dentry->d_name.name,
				       name) == 0;
}
EXPORT_SYMBOL_GPL(d_same_name);

/*
 * This is __d_lookup_rcu() when the parent dentry has
 * DCACHE_OP_COMPARE, which makes things much nastier.
*/
static noinline struct dentry *__d_lookup_rcu_op_compare(
	const struct dentry *parent,
	const struct qstr *name,
	unsigned *seqp)
{
	u64 hashlen = name->hash_len;
	struct hlist_bl_head *b = d_hash(hashlen);
	struct hlist_bl_node *node;
	struct dentry *dentry;

	hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) {
		int tlen;
		const char *tname;
		unsigned seq;

seqretry:
		/* sample d_seq, then validate it after reading name and len */
		seq = raw_seqcount_begin(&dentry->d_seq);
		if (dentry->d_parent != parent)
			continue;
		if (d_unhashed(dentry))
			continue;
		/* only the hash half of hash_len is compared here */
		if (dentry->d_name.hash != hashlen_hash(hashlen))
			continue;
		tlen = dentry->d_name.len;
		tname = dentry->d_name.name;
		/* we want a consistent (name,len) pair */
		if (read_seqcount_retry(&dentry->d_seq, seq)) {
			cpu_relax();
			goto seqretry;
		}
		if (parent->d_op->d_compare(dentry, tlen, tname, name) != 0)
			continue;
		/* hand the sampled sequence number back to the caller */
		*seqp = seq;
		return dentry;
	}
	return NULL;
}

/**
 * __d_lookup_rcu - search for a dentry (racy, store-free)
 * @parent: parent dentry
 * @name: qstr of name we wish to find
 * @seqp: returns d_seq value at the point where the dentry was found
 * Returns: dentry, or NULL
 *
 * __d_lookup_rcu is the dcache lookup function for rcu-walk name
 * resolution (store-free path walking) design described in
 * Documentation/filesystems/path-lookup.txt.
 *
 * This is not to be used outside core vfs.
 *
 * __d_lookup_rcu must only be used in rcu-walk mode, ie. with vfsmount lock
 * held, and rcu_read_lock held. The returned dentry must not be stored into
 * without taking d_lock and checking d_seq sequence count against @seq
 * returned here.
 *
 * Alternatively, __d_lookup_rcu may be called again to look up the child of
 * the returned dentry, so long as its parent's seqlock is checked after the
 * child is looked up. Thus, an interlocking stepping of sequence lock checks
 * is formed, giving integrity down the path walk.
 *
 * NOTE! The caller *has* to check the resulting dentry against the sequence
 * number we've returned before using any of the resulting dentry state!
*/
struct dentry *__d_lookup_rcu(const struct dentry *parent,
				const struct qstr *name,
				unsigned *seqp)
{
	u64 hashlen = name->hash_len;
	const unsigned char *str = name->name;
	struct hlist_bl_head *b = d_hash(hashlen);
	struct hlist_bl_node *node;
	struct dentry *dentry;

	/*
	 * Note: There is significant duplication with __d_lookup which is
	 * required to prevent single threaded performance regressions
	 * especially on architectures where smp_rmb (in seqcounts) are costly.
	 * Keep the two functions in sync.
	 */

	/* parents with their own ->d_compare() take the slow variant */
	if (unlikely(parent->d_flags & DCACHE_OP_COMPARE))
		return __d_lookup_rcu_op_compare(parent, name, seqp);

	/*
	 * The hash list is protected using RCU.
	 *
	 * Carefully use d_seq when comparing a candidate dentry, to avoid
	 * races with d_move().
	 *
	 * It is possible that concurrent renames can mess up our list
	 * walk here and result in missing our dentry, resulting in the
	 * false-negative result. d_lookup() protects against concurrent
	 * renames using rename_lock seqlock.
	 *
	 * See Documentation/filesystems/path-lookup.txt for more details.
	 */
	hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) {
		unsigned seq;

		/*
		 * The dentry sequence count protects us from concurrent
		 * renames, and thus protects parent and name fields.
		 *
		 * The caller must perform a seqcount check in order
		 * to do anything useful with the returned dentry.
		 *
		 * NOTE! We do a "raw" seqcount_begin here. That means that
		 * we don't wait for the sequence count to stabilize if it
		 * is in the middle of a sequence change. If we do the slow
		 * dentry compare, we will do seqretries until it is stable,
		 * and if we end up with a successful lookup, we actually
		 * want to exit RCU lookup anyway.
		 *
		 * Note that raw_seqcount_begin still *does* smp_rmb(), so
		 * we are still guaranteed NUL-termination of ->d_name.name.
		 */
		seq = raw_seqcount_begin(&dentry->d_seq);
		if (dentry->d_parent != parent)
			continue;
		if (d_unhashed(dentry))
			continue;
		/* hash and length are compared together as one 64-bit value */
		if (dentry->d_name.hash_len != hashlen)
			continue;
		if (dentry_cmp(dentry, str, hashlen_len(hashlen)) != 0)
			continue;
		*seqp = seq;
		return dentry;
	}
	return NULL;
}

/**
 * d_lookup - search for a dentry
 * @parent: parent dentry
 * @name: qstr of name we wish to find
 * Returns: dentry, or NULL
 *
 * d_lookup searches the children of the parent dentry for the name in
 * question. If the dentry is found its reference count is incremented and the
 * dentry is returned. The caller must use dput to free the entry when it has
 * finished using it. %NULL is returned if the dentry does not exist.
 */
struct dentry *d_lookup(const struct dentry *parent, const struct qstr *name)
{
	struct dentry *dentry;
	unsigned seq;

	/*
	 * A hit is returned right away; only a miss is re-validated
	 * against rename_lock and retried if a rename raced with us.
	 */
	do {
		seq = read_seqbegin(&rename_lock);
		dentry = __d_lookup(parent, name);
		if (dentry)
			break;
	} while (read_seqretry(&rename_lock, seq));
	return dentry;
}
EXPORT_SYMBOL(d_lookup);

/**
 * __d_lookup - search for a dentry (racy)
 * @parent: parent dentry
 * @name: qstr of name we wish to find
 * Returns: dentry, or NULL
 *
 * __d_lookup is like d_lookup, however it may (rarely) return a
 * false-negative result due to unrelated rename activity.
 *
 * __d_lookup is slightly faster by avoiding rename_lock read seqlock,
 * however it must be used carefully, eg. with a following d_lookup in
 * the case of failure.
 *
 * __d_lookup callers must be commented.
 */
struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name)
{
	unsigned int hash = name->hash;
	struct hlist_bl_head *b = d_hash(hash);
	struct hlist_bl_node *node;
	struct dentry *found = NULL;
	struct dentry *dentry;

	/*
	 * Note: There is significant duplication with __d_lookup_rcu which is
	 * required to prevent single threaded performance regressions
	 * especially on architectures where smp_rmb (in seqcounts) are costly.
	 * Keep the two functions in sync.
	 */

	/*
	 * The hash list is protected using RCU.
	 *
	 * Take d_lock when comparing a candidate dentry, to avoid races
	 * with d_move().
	 *
	 * It is possible that concurrent renames can mess up our list
	 * walk here and result in missing our dentry, resulting in the
	 * false-negative result. d_lookup() protects against concurrent
	 * renames using rename_lock seqlock.
	 *
	 * See Documentation/filesystems/path-lookup.txt for more details.
	 */
	rcu_read_lock();

	hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) {

		/* cheap unlocked filter before taking d_lock */
		if (dentry->d_name.hash != hash)
			continue;

		spin_lock(&dentry->d_lock);
		if (dentry->d_parent != parent)
			goto next;
		if (d_unhashed(dentry))
			goto next;

		if (!d_same_name(dentry, parent, name))
			goto next;

		/* take the reference while still holding d_lock */
		dentry->d_lockref.count++;
		found = dentry;
		spin_unlock(&dentry->d_lock);
		break;
next:
		spin_unlock(&dentry->d_lock);
	}
	rcu_read_unlock();

	return found;
}

/**
 * d_hash_and_lookup - hash the qstr then search for a dentry
 * @dir: Directory to search in
 * @name: qstr of name we wish to find
 *
 * On lookup failure NULL is returned; on bad name - ERR_PTR(-error)
 */
struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name)
{
	/*
	 * Check for a fs-specific hash function. Note that we must
	 * calculate the standard hash first, as the d_op->d_hash()
	 * routine may choose to leave the hash value unchanged.
	 */
	name->hash = full_name_hash(dir, name->name, name->len);
	if (dir->d_flags & DCACHE_OP_HASH) {
		int err = dir->d_op->d_hash(dir, name);
		if (unlikely(err < 0))
			return ERR_PTR(err);
	}
	return d_lookup(dir, name);
}
EXPORT_SYMBOL(d_hash_and_lookup);

/*
 * When a file is deleted, we have two options:
 * - turn this dentry into a negative dentry
 * - unhash this dentry and free it.
*
 * Usually, we want to just turn this into
 * a negative dentry, but if anybody else is
 * currently using the dentry or the inode
 * we can't do that and we fall back on removing
 * it from the hash queues and waiting for
 * it to be deleted later when it has no users
 */

/**
 * d_delete - delete a dentry
 * @dentry: The dentry to delete
 *
 * Turn the dentry into a negative dentry if possible, otherwise
 * remove it from the hash queues so it can be deleted later
 */
void d_delete(struct dentry * dentry)
{
	struct inode *inode = dentry->d_inode;

	spin_lock(&inode->i_lock);
	spin_lock(&dentry->d_lock);
	/*
	 * Are we the only user?
	 */
	if (dentry->d_lockref.count == 1) {
		/* policy knob: also unhash instead of keeping a negative */
		if (dentry_negative_policy)
			__d_drop(dentry);
		dentry->d_flags &= ~DCACHE_CANT_MOUNT;
		/*
		 * NOTE(review): no explicit unlock on this path; presumably
		 * dentry_unlink_inode() releases d_lock and i_lock - confirm
		 * against its definition.
		 */
		dentry_unlink_inode(dentry);
	} else {
		/* other users exist: just unhash and drop both locks */
		__d_drop(dentry);
		spin_unlock(&dentry->d_lock);
		spin_unlock(&inode->i_lock);
	}
}
EXPORT_SYMBOL(d_delete);

/*
 * Insert @entry at the head of the hash chain selected by its name
 * hash, holding hlist_bl_lock() on that chain for the insertion.
 */
static void __d_rehash(struct dentry *entry)
{
	struct hlist_bl_head *b = d_hash(entry->d_name.hash);

	hlist_bl_lock(b);
	hlist_bl_add_head_rcu(&entry->d_hash, b);
	hlist_bl_unlock(b);
}

/**
 * d_rehash - add an entry back to the hash
 * @entry: dentry to add to the hash
 *
 * Adds a dentry to the hash according to its name.
*/
void d_rehash(struct dentry * entry)
{
	spin_lock(&entry->d_lock);
	__d_rehash(entry);
	spin_unlock(&entry->d_lock);
}
EXPORT_SYMBOL(d_rehash);

/*
 * Spin until dir->i_dir_seq is even and the cmpxchg() bumping it to the
 * next (odd) value succeeds; returns the even value that was observed.
 * preempt_disable_nested() is called on entry; the matching enable is
 * done by end_dir_add().
 */
static inline unsigned start_dir_add(struct inode *dir)
{
	preempt_disable_nested();
	for (;;) {
		unsigned n = dir->i_dir_seq;
		if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n)
			return n;
		cpu_relax();
	}
}

/*
 * Counterpart of start_dir_add(): store @n + 2 into dir->i_dir_seq with
 * release semantics, re-enable preemption and wake everybody on @d_wait.
 */
static inline void end_dir_add(struct inode *dir, unsigned int n,
			       wait_queue_head_t *d_wait)
{
	smp_store_release(&dir->i_dir_seq, n + 2);
	preempt_enable_nested();
	wake_up_all(d_wait);
}

/*
 * Sleep (TASK_UNINTERRUPTIBLE) until d_in_lookup(@dentry) is false.
 * dentry->d_lock is dropped around schedule() and re-taken afterwards;
 * the caller in d_alloc_parallel() holds it across the call.
 */
static void d_wait_lookup(struct dentry *dentry)
{
	if (d_in_lookup(dentry)) {
		DECLARE_WAITQUEUE(wait, current);
		add_wait_queue(dentry->d_wait, &wait);
		do {
			set_current_state(TASK_UNINTERRUPTIBLE);
			spin_unlock(&dentry->d_lock);
			schedule();
			spin_lock(&dentry->d_lock);
		} while (d_in_lookup(dentry));
	}
}

/*
 * d_alloc_parallel - allocate a dentry for a lookup, or find a competitor
 * @parent: directory the name is being looked up in
 * @name:   name being looked up
 * @wq:     wait queue stored in the new dentry's ->d_wait
 *
 * Returns ERR_PTR(-ENOMEM) if the allocation fails; an already hashed
 * matching dentry (with a reference taken) if one is found, either
 * directly or after waiting for a competing in-lookup dentry; otherwise
 * the newly allocated dentry, flagged DCACHE_PAR_LOOKUP and inserted
 * into the in-lookup hash chain.
 */
struct dentry *d_alloc_parallel(struct dentry *parent,
				const struct qstr *name,
				wait_queue_head_t *wq)
{
	unsigned int hash = name->hash;
	struct hlist_bl_head *b = in_lookup_hash(parent, hash);
	struct hlist_bl_node *node;
	struct dentry *new = d_alloc(parent, name);
	struct dentry *dentry;
	unsigned seq, r_seq, d_seq;

	if (unlikely(!new))
		return ERR_PTR(-ENOMEM);

retry:
	rcu_read_lock();
	/* sample the directory and rename sequence counters first */
	seq = smp_load_acquire(&parent->d_inode->i_dir_seq);
	r_seq = read_seqbegin(&rename_lock);
	dentry = __d_lookup_rcu(parent, name, &d_seq);
	if (unlikely(dentry)) {
		/* found in the primary hash: grab a reference and validate */
		if (!lockref_get_not_dead(&dentry->d_lockref)) {
			rcu_read_unlock();
			goto retry;
		}
		if (read_seqcount_retry(&dentry->d_seq, d_seq)) {
			rcu_read_unlock();
			dput(dentry);
			goto retry;
		}
		rcu_read_unlock();
		dput(new);
		return dentry;
	}
	/* a miss is only trusted if no rename happened meanwhile */
	if (unlikely(read_seqretry(&rename_lock, r_seq))) {
		rcu_read_unlock();
		goto retry;
	}

	/* odd i_dir_seq: see start_dir_add()/end_dir_add() */
	if (unlikely(seq & 1)) {
		rcu_read_unlock();
		goto retry;
	}

	hlist_bl_lock(b);
	if (unlikely(READ_ONCE(parent->d_inode->i_dir_seq) != seq)) {
		hlist_bl_unlock(b);
		rcu_read_unlock();
		goto retry;
	}
	/*
	 * No changes for the parent since the beginning of d_lookup().
	 * Since all removals from the chain happen with hlist_bl_lock(),
	 * any potential in-lookup matches are going to stay here until
	 * we unlock the chain. All fields are stable in everything
	 * we encounter.
	 */
	hlist_bl_for_each_entry(dentry, node, b, d_u.d_in_lookup_hash) {
		if (dentry->d_name.hash != hash)
			continue;
		if (dentry->d_parent != parent)
			continue;
		if (!d_same_name(dentry, parent, name))
			continue;
		hlist_bl_unlock(b);
		/* now we can try to grab a reference */
		if (!lockref_get_not_dead(&dentry->d_lockref)) {
			rcu_read_unlock();
			goto retry;
		}

		rcu_read_unlock();
		/*
		 * somebody is likely to be still doing lookup for it;
		 * wait for them to finish
		 */
		spin_lock(&dentry->d_lock);
		d_wait_lookup(dentry);
		/*
		 * it's not in-lookup anymore; in principle we should repeat
		 * everything from dcache lookup, but it's likely to be what
		 * d_lookup() would've found anyway. If it is, just return it;
		 * otherwise we really have to repeat the whole thing.
		 */
		if (unlikely(dentry->d_name.hash != hash))
			goto mismatch;
		if (unlikely(dentry->d_parent != parent))
			goto mismatch;
		if (unlikely(d_unhashed(dentry)))
			goto mismatch;
		if (unlikely(!d_same_name(dentry, parent, name)))
			goto mismatch;
		/* OK, it *is* a hashed match; return it */
		spin_unlock(&dentry->d_lock);
		dput(new);
		return dentry;
	}
	rcu_read_unlock();
	/* we can't take ->d_lock here; it's OK, though.
 */
	new->d_flags |= DCACHE_PAR_LOOKUP;
	new->d_wait = wq;
	hlist_bl_add_head(&new->d_u.d_in_lookup_hash, b);
	hlist_bl_unlock(b);
	return new;
mismatch:
	/* the dentry we waited for is not what we want: drop it, start over */
	spin_unlock(&dentry->d_lock);
	dput(dentry);
	goto retry;
}
EXPORT_SYMBOL(d_alloc_parallel);

/*
 * - Unhash the dentry
 * - Retrieve and clear the waitqueue head in dentry
 * - Return the waitqueue head
 *
 * Must be called with dentry->d_lock held (asserted via lockdep).
 * Also clears DCACHE_PAR_LOOKUP and re-initialises d_u.d_alias and d_lru.
 */
static wait_queue_head_t *__d_lookup_unhash(struct dentry *dentry)
{
	wait_queue_head_t *d_wait;
	struct hlist_bl_head *b;

	lockdep_assert_held(&dentry->d_lock);

	b = in_lookup_hash(dentry->d_parent, dentry->d_name.hash);
	hlist_bl_lock(b);
	dentry->d_flags &= ~DCACHE_PAR_LOOKUP;
	__hlist_bl_del(&dentry->d_u.d_in_lookup_hash);
	d_wait = dentry->d_wait;
	dentry->d_wait = NULL;
	hlist_bl_unlock(b);
	INIT_HLIST_NODE(&dentry->d_u.d_alias);
	INIT_LIST_HEAD(&dentry->d_lru);
	return d_wait;
}

/*
 * Take dentry->d_lock, remove the dentry from the in-lookup hash and
 * wake everybody waiting on the wait queue it carried.
 */
void __d_lookup_unhash_wake(struct dentry *dentry)
{
	spin_lock(&dentry->d_lock);
	wake_up_all(__d_lookup_unhash(dentry));
	spin_unlock(&dentry->d_lock);
}
EXPORT_SYMBOL(__d_lookup_unhash_wake);

/* inode->i_lock held if inode is non-NULL */

static inline void __d_add(struct dentry *dentry, struct inode *inode)
{
	wait_queue_head_t *d_wait;
	struct inode *dir = NULL;
	unsigned n;
	spin_lock(&dentry->d_lock);
	/* n and d_wait are only set, and only used, when dir is non-NULL */
	if (unlikely(d_in_lookup(dentry))) {
		dir = dentry->d_parent->d_inode;
		n = start_dir_add(dir);
		d_wait = __d_lookup_unhash(dentry);
	}
	if (inode) {
		unsigned add_flags = d_flags_for_inode(inode);
		hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
		raw_write_seqcount_begin(&dentry->d_seq);
		__d_set_inode_and_type(dentry, inode, add_flags);
		raw_write_seqcount_end(&dentry->d_seq);
		fsnotify_update_flags(dentry);
	}
	__d_rehash(dentry);
	if (dir)
		end_dir_add(dir, n, d_wait);
	spin_unlock(&dentry->d_lock);
	/* drops the i_lock taken by the caller */
	if (inode)
		spin_unlock(&inode->i_lock);
}

/**
 * d_add - add dentry to hash queues
 * @entry: dentry to add
 * @inode: The inode to attach to this dentry
 *
 * This adds the entry to the hash queues and initializes @inode.
* The entry was actually filled in earlier during d_alloc().
 */
void d_add(struct dentry *entry, struct inode *inode)
{
	if (inode) {
		security_d_instantiate(entry, inode);
		/* released inside __d_add() */
		spin_lock(&inode->i_lock);
	}
	__d_add(entry, inode);
}
EXPORT_SYMBOL(d_add);

/**
 * d_exact_alias - find and hash an exact unhashed alias
 * @entry: dentry to add
 * @inode: The inode to go with this dentry
 *
 * If an unhashed dentry with the same name/parent and desired
 * inode already exists, hash and return it.  Otherwise, return
 * NULL.
 *
 * Parent directory should be locked.
 */
struct dentry *d_exact_alias(struct dentry *entry, struct inode *inode)
{
	struct dentry *alias;
	unsigned int hash = entry->d_name.hash;

	spin_lock(&inode->i_lock);
	hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
		/*
		 * Don't need alias->d_lock here, because aliases with
		 * d_parent == entry->d_parent are not subject to name or
		 * parent changes, because the parent inode i_mutex is held.
		 */
		if (alias->d_name.hash != hash)
			continue;
		if (alias->d_parent != entry->d_parent)
			continue;
		if (!d_same_name(alias, entry->d_parent, &entry->d_name))
			continue;
		/* first name/parent match decides the result either way */
		spin_lock(&alias->d_lock);
		if (!d_unhashed(alias)) {
			/* already hashed: not what the caller asked for */
			spin_unlock(&alias->d_lock);
			alias = NULL;
		} else {
			dget_dlock(alias);
			__d_rehash(alias);
			spin_unlock(&alias->d_lock);
		}
		spin_unlock(&inode->i_lock);
		return alias;
	}
	spin_unlock(&inode->i_lock);
	return NULL;
}
EXPORT_SYMBOL(d_exact_alias);

/*
 * Exchange the names of @dentry and @target, handling every combination
 * of inline (d_shortname) and external name storage, then swap the
 * hash_len fields as well.
 */
static void swap_names(struct dentry *dentry, struct dentry *target)
{
	if (unlikely(dname_external(target))) {
		if (unlikely(dname_external(dentry))) {
			/*
			 * Both external: swap the pointers
			 */
			swap(target->d_name.name, dentry->d_name.name);
		} else {
			/*
			 * dentry:internal, target:external.  Steal target's
			 * storage and make target internal.
			 */
			dentry->d_name.name = target->d_name.name;
			target->d_shortname = dentry->d_shortname;
			target->d_name.name = target->d_shortname.string;
		}
	} else {
		if (unlikely(dname_external(dentry))) {
			/*
			 * dentry:external, target:internal.
 Give dentry's
			 * storage to target and make dentry internal
			 */
			target->d_name.name = dentry->d_name.name;
			dentry->d_shortname = target->d_shortname;
			dentry->d_name.name = dentry->d_shortname.string;
		} else {
			/*
			 * Both are internal.
			 */
			for (int i = 0; i < DNAME_INLINE_WORDS; i++)
				swap(dentry->d_shortname.words[i],
				     target->d_shortname.words[i]);
		}
	}
	swap(dentry->d_name.hash_len, target->d_name.hash_len);
}

/*
 * Make @dentry's name a copy of @target's.  An external target name is
 * shared by bumping its reference count; an inline one is copied into
 * @dentry's own d_shortname.  If @dentry previously had an external
 * name, its count is dropped and the name is freed via kfree_rcu()
 * once the count reaches zero.
 */
static void copy_name(struct dentry *dentry, struct dentry *target)
{
	struct external_name *old_name = NULL;
	if (unlikely(dname_external(dentry)))
		old_name = external_name(dentry);
	if (unlikely(dname_external(target))) {
		atomic_inc(&external_name(target)->count);
		dentry->d_name = target->d_name;
	} else {
		dentry->d_shortname = target->d_shortname;
		dentry->d_name.name = dentry->d_shortname.string;
		dentry->d_name.hash_len = target->d_name.hash_len;
	}
	if (old_name && likely(atomic_dec_and_test(&old_name->count)))
		kfree_rcu(old_name, head);
}

/*
 * __d_move - move a dentry
 * @dentry: entry to move
 * @target: new dentry
 * @exchange: exchange the two dentries
 *
 * Update the dcache to reflect the move of a file name. Negative
 * dcache entries should not be moved in this way. Caller must hold
 * rename_lock, the i_mutex of the source and target directories,
 * and the sb->s_vfs_rename_mutex if they differ. See lock_rename().
*/
static void __d_move(struct dentry *dentry, struct dentry *target,
		     bool exchange)
{
	struct dentry *old_parent, *p;
	wait_queue_head_t *d_wait;
	struct inode *dir = NULL;
	unsigned n;

	WARN_ON(!dentry->d_inode);
	if (WARN_ON(dentry == target))
		return;

	BUG_ON(d_ancestor(target, dentry));
	old_parent = dentry->d_parent;
	p = d_ancestor(old_parent, target);
	/*
	 * Take the parents' d_lock first; which parent goes first depends
	 * on whether target lives below dentry's old parent.
	 */
	if (IS_ROOT(dentry)) {
		BUG_ON(p);
		spin_lock(&target->d_parent->d_lock);
	} else if (!p) {
		/* target is not a descendent of dentry->d_parent */
		spin_lock(&target->d_parent->d_lock);
		spin_lock_nested(&old_parent->d_lock, DENTRY_D_LOCK_NESTED);
	} else {
		BUG_ON(p == dentry);
		spin_lock(&old_parent->d_lock);
		if (p != target)
			spin_lock_nested(&target->d_parent->d_lock,
					DENTRY_D_LOCK_NESTED);
	}
	spin_lock_nested(&dentry->d_lock, 2);
	spin_lock_nested(&target->d_lock, 3);

	/* target still in-lookup: take it out of the in-lookup hash */
	if (unlikely(d_in_lookup(target))) {
		dir = target->d_parent->d_inode;
		n = start_dir_add(dir);
		d_wait = __d_lookup_unhash(target);
	}

	write_seqcount_begin(&dentry->d_seq);
	write_seqcount_begin_nested(&target->d_seq, DENTRY_D_LOCK_NESTED);

	/* unhash both */
	if (!d_unhashed(dentry))
		___d_drop(dentry);
	if (!d_unhashed(target))
		___d_drop(target);

	/* ...
 and switch them in the tree */
	dentry->d_parent = target->d_parent;
	if (!exchange) {
		copy_name(dentry, target);
		target->d_hash.pprev = NULL;
		/* new parent gains a reference, old parent loses one */
		dentry->d_parent->d_lockref.count++;
		if (dentry != old_parent) /* wasn't IS_ROOT */
			WARN_ON(!--old_parent->d_lockref.count);
	} else {
		target->d_parent = old_parent;
		swap_names(dentry, target);
		if (!hlist_unhashed(&target->d_sib))
			__hlist_del(&target->d_sib);
		hlist_add_head(&target->d_sib, &target->d_parent->d_children);
		__d_rehash(target);
		fsnotify_update_flags(target);
	}
	/* re-link dentry under its new parent and hash it again */
	if (!hlist_unhashed(&dentry->d_sib))
		__hlist_del(&dentry->d_sib);
	hlist_add_head(&dentry->d_sib, &dentry->d_parent->d_children);
	__d_rehash(dentry);
	fsnotify_update_flags(dentry);
	fscrypt_handle_d_move(dentry);

	write_seqcount_end(&target->d_seq);
	write_seqcount_end(&dentry->d_seq);

	if (dir)
		end_dir_add(dir, n, d_wait);

	/* dentry->d_parent is the new parent at this point */
	if (dentry->d_parent != old_parent)
		spin_unlock(&dentry->d_parent->d_lock);
	if (dentry != old_parent)
		spin_unlock(&old_parent->d_lock);
	spin_unlock(&target->d_lock);
	spin_unlock(&dentry->d_lock);
}

/*
 * d_move - move a dentry
 * @dentry: entry to move
 * @target: new dentry
 *
 * Update the dcache to reflect the move of a file name. Negative
 * dcache entries should not be moved in this way. See the locking
 * requirements for __d_move.
*/
void d_move(struct dentry *dentry, struct dentry *target)
{
	/* rename_lock is held for writing across the whole move */
	write_seqlock(&rename_lock);
	__d_move(dentry, target, false);
	write_sequnlock(&rename_lock);
}
EXPORT_SYMBOL(d_move);

/*
 * d_exchange - exchange two dentries
 * @dentry1: first dentry
 * @dentry2: second dentry
 *
 * Warns if either dentry is negative or IS_ROOT, then swaps the two
 * with rename_lock held for writing.
 */
void d_exchange(struct dentry *dentry1, struct dentry *dentry2)
{
	write_seqlock(&rename_lock);

	WARN_ON(!dentry1->d_inode);
	WARN_ON(!dentry2->d_inode);
	WARN_ON(IS_ROOT(dentry1));
	WARN_ON(IS_ROOT(dentry2));

	__d_move(dentry1, dentry2, true);

	write_sequnlock(&rename_lock);
}

/**
 * d_ancestor - search for an ancestor
 * @p1: ancestor dentry
 * @p2: child dentry
 *
 * Returns the ancestor dentry of p2 which is a child of p1, if p1 is
 * an ancestor of p2, else NULL.
 */
struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2)
{
	struct dentry *p;

	/* walk up from p2 until the root; stop at the child of p1 */
	for (p = p2; !IS_ROOT(p); p = p->d_parent) {
		if (p->d_parent == p1)
			return p;
	}
	return NULL;
}

/*
 * This helper attempts to cope with remotely renamed directories
 *
 * It assumes that the caller is already holding
 * dentry->d_parent->d_inode->i_mutex, and rename_lock
 *
 * Note: If ever the locking in lock_rename() changes, then please
 * remember to update this too...
*/
static int __d_unalias(struct dentry *dentry, struct dentry *alias)
{
	/* m1/m2 remember which locks were actually taken, for out_err */
	struct mutex *m1 = NULL;
	struct rw_semaphore *m2 = NULL;
	/* returned unchanged if any of the trylocks below fails */
	int ret = -ESTALE;

	/* If alias and dentry share a parent, then no extra locks required */
	if (alias->d_parent == dentry->d_parent)
		goto out_unalias;

	/* See lock_rename() */
	if (!mutex_trylock(&dentry->d_sb->s_vfs_rename_mutex))
		goto out_err;
	m1 = &dentry->d_sb->s_vfs_rename_mutex;
	if (!inode_trylock_shared(alias->d_parent->d_inode))
		goto out_err;
	m2 = &alias->d_parent->d_inode->i_rwsem;
out_unalias:
	/* optional filesystem hook; a false return aborts the move */
	if (alias->d_op && alias->d_op->d_unalias_trylock &&
	    !alias->d_op->d_unalias_trylock(alias))
		goto out_err;
	__d_move(alias, dentry, false);
	if (alias->d_op && alias->d_op->d_unalias_unlock)
		alias->d_op->d_unalias_unlock(alias);
	ret = 0;
out_err:
	/* also reached on success: release whatever was taken */
	if (m2)
		up_read(m2);
	if (m1)
		mutex_unlock(m1);
	return ret;
}

/**
 * d_splice_alias - splice a disconnected dentry into the tree if one exists
 * @inode: the inode which may have a disconnected dentry
 * @dentry: a negative dentry which we want to point to the inode.
 *
 * If inode is a directory and has an IS_ROOT alias, then d_move that in
 * place of the given dentry and return it, else simply d_add the inode
 * to the dentry and return NULL.
 *
 * If a non-IS_ROOT directory is found, the filesystem is corrupt, and
 * we should error out: directories can't have multiple aliases.
 *
 * This is needed in the lookup routine of any filesystem that is exportable
 * (via knfsd) so that we can build dcache paths to directories effectively.
 *
 * If a dentry was found and moved, then it is returned. Otherwise NULL
 * is returned. This matches the expected return value of ->lookup.
 *
 * Cluster filesystems may call this function with a negative, hashed dentry.
 * In that case, we know that the inode will be a regular file, and also this
 * will only occur during atomic_open. So we need to check for the dentry
 * being already hashed only in the final case.
*/
struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
{
	/* an ERR_PTR inode is passed straight through as the result */
	if (IS_ERR(inode))
		return ERR_CAST(inode);

	BUG_ON(!d_unhashed(dentry));

	if (!inode)
		goto out;

	security_d_instantiate(dentry, inode);
	spin_lock(&inode->i_lock);
	if (S_ISDIR(inode->i_mode)) {
		struct dentry *new = __d_find_any_alias(inode);
		if (unlikely(new)) {
			/* The reference to new ensures it remains an alias */
			spin_unlock(&inode->i_lock);
			write_seqlock(&rename_lock);
			if (unlikely(d_ancestor(new, dentry))) {
				/* the alias is an ancestor of dentry: refuse */
				write_sequnlock(&rename_lock);
				dput(new);
				new = ERR_PTR(-ELOOP);
				pr_warn_ratelimited(
					"VFS: Lookup of '%s' in %s %s"
					" would have caused loop\n",
					dentry->d_name.name,
					inode->i_sb->s_type->name,
					inode->i_sb->s_id);
			} else if (!IS_ROOT(new)) {
				/* pin the old parent across the move */
				struct dentry *old_parent = dget(new->d_parent);
				int err = __d_unalias(dentry, new);
				write_sequnlock(&rename_lock);
				if (err) {
					dput(new);
					new = ERR_PTR(err);
				}
				dput(old_parent);
			} else {
				__d_move(new, dentry, false);
				write_sequnlock(&rename_lock);
			}
			/* the alias already holds the inode; drop ours */
			iput(inode);
			return new;
		}
	}
out:
	/* __d_add() releases inode->i_lock when inode is non-NULL */
	__d_add(dentry, inode);
	return NULL;
}
EXPORT_SYMBOL(d_splice_alias);

/*
 * Test whether new_dentry is a subdirectory of old_dentry.
 *
 * Trivially implemented using the dcache structure
 */

/**
 * is_subdir - is new dentry a subdirectory of old_dentry
 * @new_dentry: new dentry
 * @old_dentry: old dentry
 *
 * Returns true if new_dentry is old_dentry itself or a subdirectory of
 * old_dentry (at any depth).
 * Returns false otherwise.
 * Caller must ensure that "new_dentry" is pinned before calling is_subdir()
 */

bool is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
{
	bool subdir;
	unsigned seq;

	if (new_dentry == old_dentry)
		return true;

	/* Access d_parent under rcu as d_move() may change it. */
	rcu_read_lock();
	seq = read_seqbegin(&rename_lock);
	/* the pointer result of d_ancestor() is converted to bool */
	subdir = d_ancestor(old_dentry, new_dentry);
	 /* Try lockless once... */
	if (read_seqretry(&rename_lock, seq)) {
		/* ...else acquire lock for progress even on deep chains.
 */
		read_seqlock_excl(&rename_lock);
		subdir = d_ancestor(old_dentry, new_dentry);
		read_sequnlock_excl(&rename_lock);
	}
	rcu_read_unlock();
	return subdir;
}
EXPORT_SYMBOL(is_subdir);

/*
 * d_walk() callback used by d_genocide().  @data is the walk root,
 * which is left alone.  For any other dentry: returns D_WALK_SKIP if
 * it is unhashed or negative; otherwise sets DCACHE_GENOCIDE (once)
 * and decrements d_lockref.count.
 */
static enum d_walk_ret d_genocide_kill(void *data, struct dentry *dentry)
{
	struct dentry *root = data;
	if (dentry != root) {
		if (d_unhashed(dentry) || !dentry->d_inode)
			return D_WALK_SKIP;

		if (!(dentry->d_flags & DCACHE_GENOCIDE)) {
			dentry->d_flags |= DCACHE_GENOCIDE;
			dentry->d_lockref.count--;
		}
	}
	return D_WALK_CONTINUE;
}

/* Run d_genocide_kill() over the tree rooted at @parent. */
void d_genocide(struct dentry *parent)
{
	d_walk(parent, parent, d_genocide_kill);
}

/*
 * Rename the dentry of @file to "#<inode number>".  The dentry must
 * have an inline name, no inode alias linkage and be unlinked
 * (enforced by BUG_ON); the name is rewritten under the parent's and
 * the dentry's own d_lock.
 */
void d_mark_tmpfile(struct file *file, struct inode *inode)
{
	struct dentry *dentry = file->f_path.dentry;

	BUG_ON(dname_external(dentry) ||
		!hlist_unhashed(&dentry->d_u.d_alias) ||
		!d_unlinked(dentry));
	spin_lock(&dentry->d_parent->d_lock);
	spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
	dentry->d_name.len = sprintf(dentry->d_shortname.string, "#%llu",
				(unsigned long long)inode->i_ino);
	spin_unlock(&dentry->d_lock);
	spin_unlock(&dentry->d_parent->d_lock);
}
EXPORT_SYMBOL(d_mark_tmpfile);

/*
 * Drop one link from @inode, give @file's dentry its "#<ino>" name and
 * attach @inode to it.
 */
void d_tmpfile(struct file *file, struct inode *inode)
{
	struct dentry *dentry = file->f_path.dentry;

	inode_dec_link_count(inode);
	d_mark_tmpfile(file, inode);
	d_instantiate(dentry, inode);
}
EXPORT_SYMBOL(d_tmpfile);

/*
 * Obtain inode number of the parent dentry.
*/
ino_t d_parent_ino(struct dentry *dentry)
{
	struct dentry *parent;
	struct inode *iparent;
	unsigned seq;
	ino_t ret;

	/*
	 * Fast path: read the parent's inode number under RCU and accept
	 * it only if dentry->d_seq did not change in the meantime.
	 */
	scoped_guard(rcu) {
		seq = raw_seqcount_begin(&dentry->d_seq);
		parent = READ_ONCE(dentry->d_parent);
		iparent = d_inode_rcu(parent);
		if (likely(iparent)) {
			ret = iparent->i_ino;
			if (!read_seqcount_retry(&dentry->d_seq, seq))
				return ret;
		}
	}

	/* Slow path: repeat the read with dentry->d_lock held. */
	spin_lock(&dentry->d_lock);
	ret = dentry->d_parent->d_inode->i_ino;
	spin_unlock(&dentry->d_lock);
	return ret;
}
EXPORT_SYMBOL(d_parent_ino);

/* value of the "dhash_entries=" boot parameter, if given */
static __initdata unsigned long dhash_entries;
/* __setup handler: parse "dhash_entries=" into dhash_entries */
static int __init set_dhash_entries(char *str)
{
	if (!str)
		return 0;
	dhash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("dhash_entries=", set_dhash_entries);

/*
 * Early allocation of the dentry hash table; skipped when hashdist is
 * set, in which case dcache_init() does the allocation instead.
 */
static void __init dcache_init_early(void)
{
	/* If hashes are distributed across NUMA nodes, defer
	 * hash allocation until vmalloc space is available.
	 */
	if (hashdist)
		return;

	dentry_hashtable =
		alloc_large_system_hash("Dentry cache",
					sizeof(struct hlist_bl_head),
					dhash_entries,
					13,
					HASH_EARLY | HASH_ZERO,
					&d_hash_shift,
					NULL,
					0,
					0);
	/* convert the value written by the allocator to 32 minus itself */
	d_hash_shift = 32 - d_hash_shift;

	runtime_const_init(shift, d_hash_shift);
	runtime_const_init(ptr, dentry_hashtable);
}

/*
 * Create the dentry slab cache and, when hashdist is set, allocate the
 * dentry hash table that dcache_init_early() deferred.
 */
static void __init dcache_init(void)
{
	/*
	 * A constructor could be added for stable state like the lists,
	 * but it is probably not worth it because of the cache nature
	 * of the dcache.
 */
	dentry_cache = KMEM_CACHE_USERCOPY(dentry,
		SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_ACCOUNT,
		d_shortname.string);

	/* Hash may have been set up in dcache_init_early */
	if (!hashdist)
		return;

	dentry_hashtable =
		alloc_large_system_hash("Dentry cache",
					sizeof(struct hlist_bl_head),
					dhash_entries,
					13,
					HASH_ZERO,
					&d_hash_shift,
					NULL,
					0,
					0);
	d_hash_shift = 32 - d_hash_shift;

	runtime_const_init(shift, d_hash_shift);
	runtime_const_init(ptr, dentry_hashtable);
}

/* SLAB cache for __getname() consumers */
struct kmem_cache *names_cachep __ro_after_init;
EXPORT_SYMBOL(names_cachep);

/*
 * Early VFS cache setup: initialise every in-lookup hash chain head,
 * then run the early dcache and inode initialisers.
 */
void __init vfs_caches_init_early(void)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(in_lookup_hashtable); i++)
		INIT_HLIST_BL_HEAD(&in_lookup_hashtable[i]);

	dcache_init_early();
	inode_init_early();
}

/*
 * Main VFS cache setup: create the PATH_MAX-sized "names_cache" slab,
 * then call the dcache, inode, files, mount, block-device and
 * character-device initialisers in that order.
 */
void __init vfs_caches_init(void)
{
	names_cachep = kmem_cache_create_usercopy("names_cache", PATH_MAX, 0,
			SLAB_HWCACHE_ALIGN|SLAB_PANIC, 0, PATH_MAX, NULL);

	dcache_init();
	inode_init();
	files_init();
	files_maxfiles_init();
	mnt_init();
	bdev_cache_init();
	chrdev_init();
}
1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 
526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 
1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 
1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2012 Bjørn Mork <bjorn@mork.no> * * The probing code is heavily inspired by cdc_ether, which is: * Copyright (C) 2003-2005 by David Brownell * Copyright (C) 2006 by Ole Andre Vadla Ravnas (ActiveSync) */ #include <linux/module.h> #include <linux/sched/signal.h> #include <linux/netdevice.h> #include <linux/ethtool.h> #include <linux/etherdevice.h> #include <linux/if_arp.h> #include <linux/kstrtox.h> #include <linux/mii.h> #include <linux/rtnetlink.h> #include <linux/usb.h> #include <linux/usb/cdc.h> #include <linux/usb/usbnet.h> #include <linux/usb/cdc-wdm.h> #include <linux/u64_stats_sync.h> /* This driver supports wwan (3G/LTE/?) devices using a vendor * specific management protocol called Qualcomm MSM Interface (QMI) - * in addition to the more common AT commands over serial interface * management * * QMI is wrapped in CDC, using CDC encapsulated commands on the * control ("master") interface of a two-interface CDC Union * resembling standard CDC ECM. 
The devices do not use the control * interface for any other CDC messages. Most likely because the * management protocol is used in place of the standard CDC * notifications NOTIFY_NETWORK_CONNECTION and NOTIFY_SPEED_CHANGE * * Alternatively, control and data functions can be combined in a * single USB interface. * * Handling a protocol like QMI is out of the scope for any driver. * It is exported as a character device using the cdc-wdm driver as * a subdriver, enabling userspace applications ("modem managers") to * handle it. * * These devices may alternatively/additionally be configured using AT * commands on a serial interface */ /* driver specific data */ struct qmi_wwan_state { struct usb_driver *subdriver; atomic_t pmcount; unsigned long flags; struct usb_interface *control; struct usb_interface *data; }; enum qmi_wwan_flags { QMI_WWAN_FLAG_RAWIP = 1 << 0, QMI_WWAN_FLAG_MUX = 1 << 1, QMI_WWAN_FLAG_PASS_THROUGH = 1 << 2, }; enum qmi_wwan_quirks { QMI_WWAN_QUIRK_DTR = 1 << 0, /* needs "set DTR" request */ }; struct qmimux_hdr { u8 pad; u8 mux_id; __be16 pkt_len; }; struct qmimux_priv { struct net_device *real_dev; u8 mux_id; }; static int qmimux_open(struct net_device *dev) { struct qmimux_priv *priv = netdev_priv(dev); struct net_device *real_dev = priv->real_dev; if (!(priv->real_dev->flags & IFF_UP)) return -ENETDOWN; if (netif_carrier_ok(real_dev)) netif_carrier_on(dev); return 0; } static int qmimux_stop(struct net_device *dev) { netif_carrier_off(dev); return 0; } static netdev_tx_t qmimux_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct qmimux_priv *priv = netdev_priv(dev); unsigned int len = skb->len; struct qmimux_hdr *hdr; netdev_tx_t ret; hdr = skb_push(skb, sizeof(struct qmimux_hdr)); hdr->pad = 0; hdr->mux_id = priv->mux_id; hdr->pkt_len = cpu_to_be16(len); skb->dev = priv->real_dev; ret = dev_queue_xmit(skb); if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) dev_sw_netstats_tx_add(dev, 1, len); else 
dev->stats.tx_dropped++; return ret; } static const struct net_device_ops qmimux_netdev_ops = { .ndo_open = qmimux_open, .ndo_stop = qmimux_stop, .ndo_start_xmit = qmimux_start_xmit, }; static void qmimux_setup(struct net_device *dev) { dev->header_ops = NULL; /* No header */ dev->type = ARPHRD_NONE; dev->hard_header_len = 0; dev->addr_len = 0; dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST; dev->netdev_ops = &qmimux_netdev_ops; dev->mtu = 1500; dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; dev->needs_free_netdev = true; } static struct net_device *qmimux_find_dev(struct usbnet *dev, u8 mux_id) { struct qmimux_priv *priv; struct list_head *iter; struct net_device *ldev; rcu_read_lock(); netdev_for_each_upper_dev_rcu(dev->net, ldev, iter) { priv = netdev_priv(ldev); if (priv->mux_id == mux_id) { rcu_read_unlock(); return ldev; } } rcu_read_unlock(); return NULL; } static bool qmimux_has_slaves(struct usbnet *dev) { return !list_empty(&dev->net->adj_list.upper); } static int qmimux_rx_fixup(struct usbnet *dev, struct sk_buff *skb) { unsigned int len, offset = 0, pad_len, pkt_len; struct qmimux_hdr *hdr; struct net_device *net; struct sk_buff *skbn; u8 qmimux_hdr_sz = sizeof(*hdr); while (offset + qmimux_hdr_sz < skb->len) { hdr = (struct qmimux_hdr *)(skb->data + offset); len = be16_to_cpu(hdr->pkt_len); /* drop the packet, bogus length */ if (offset + len + qmimux_hdr_sz > skb->len) return 0; /* control packet, we do not know what to do */ if (hdr->pad & 0x80) goto skip; /* extract padding length and check for valid length info */ pad_len = hdr->pad & 0x3f; if (len == 0 || pad_len >= len) goto skip; pkt_len = len - pad_len; net = qmimux_find_dev(dev, hdr->mux_id); if (!net) goto skip; skbn = netdev_alloc_skb(net, pkt_len + LL_MAX_HEADER); if (!skbn) return 0; switch (skb->data[offset + qmimux_hdr_sz] & 0xf0) { case 0x40: skbn->protocol = htons(ETH_P_IP); break; case 0x60: skbn->protocol = htons(ETH_P_IPV6); break; default: /* not ip - do not know what to 
do */ kfree_skb(skbn); goto skip; } skb_reserve(skbn, LL_MAX_HEADER); skb_put_data(skbn, skb->data + offset + qmimux_hdr_sz, pkt_len); if (netif_rx(skbn) != NET_RX_SUCCESS) { net->stats.rx_errors++; return 0; } else { dev_sw_netstats_rx_add(net, pkt_len); } skip: offset += len + qmimux_hdr_sz; } return 1; } static ssize_t mux_id_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_device *dev = to_net_dev(d); struct qmimux_priv *priv; priv = netdev_priv(dev); return sysfs_emit(buf, "0x%02x\n", priv->mux_id); } static DEVICE_ATTR_RO(mux_id); static struct attribute *qmi_wwan_sysfs_qmimux_attrs[] = { &dev_attr_mux_id.attr, NULL, }; static struct attribute_group qmi_wwan_sysfs_qmimux_attr_group = { .name = "qmap", .attrs = qmi_wwan_sysfs_qmimux_attrs, }; static int qmimux_register_device(struct net_device *real_dev, u8 mux_id) { struct net_device *new_dev; struct qmimux_priv *priv; int err; new_dev = alloc_netdev(sizeof(struct qmimux_priv), "qmimux%d", NET_NAME_UNKNOWN, qmimux_setup); if (!new_dev) return -ENOBUFS; dev_net_set(new_dev, dev_net(real_dev)); priv = netdev_priv(new_dev); priv->mux_id = mux_id; priv->real_dev = real_dev; new_dev->sysfs_groups[0] = &qmi_wwan_sysfs_qmimux_attr_group; err = register_netdevice(new_dev); if (err < 0) goto out_free_newdev; /* Account for reference in struct qmimux_priv_priv */ dev_hold(real_dev); err = netdev_upper_dev_link(real_dev, new_dev, NULL); if (err) goto out_unregister_netdev; netif_stacked_transfer_operstate(real_dev, new_dev); return 0; out_unregister_netdev: unregister_netdevice(new_dev); dev_put(real_dev); out_free_newdev: free_netdev(new_dev); return err; } static void qmimux_unregister_device(struct net_device *dev, struct list_head *head) { struct qmimux_priv *priv = netdev_priv(dev); struct net_device *real_dev = priv->real_dev; netdev_upper_dev_unlink(real_dev, dev); unregister_netdevice_queue(dev, head); /* Get rid of the reference to real_dev */ dev_put(real_dev); } static void 
qmi_wwan_netdev_setup(struct net_device *net) { struct usbnet *dev = netdev_priv(net); struct qmi_wwan_state *info = (void *)&dev->data; if (info->flags & QMI_WWAN_FLAG_RAWIP) { net->header_ops = NULL; /* No header */ net->type = ARPHRD_NONE; net->hard_header_len = 0; net->addr_len = 0; net->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST; set_bit(EVENT_NO_IP_ALIGN, &dev->flags); netdev_dbg(net, "mode: raw IP\n"); } else if (!net->header_ops) { /* don't bother if already set */ ether_setup(net); /* Restoring min/max mtu values set originally by usbnet */ net->min_mtu = 0; net->max_mtu = ETH_MAX_MTU; clear_bit(EVENT_NO_IP_ALIGN, &dev->flags); netdev_dbg(net, "mode: Ethernet\n"); } /* recalculate buffers after changing hard_header_len */ usbnet_change_mtu(net, net->mtu); } static ssize_t raw_ip_show(struct device *d, struct device_attribute *attr, char *buf) { struct usbnet *dev = netdev_priv(to_net_dev(d)); struct qmi_wwan_state *info = (void *)&dev->data; return sprintf(buf, "%c\n", info->flags & QMI_WWAN_FLAG_RAWIP ? 'Y' : 'N'); } static ssize_t raw_ip_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { struct usbnet *dev = netdev_priv(to_net_dev(d)); struct qmi_wwan_state *info = (void *)&dev->data; bool enable; int ret; if (kstrtobool(buf, &enable)) return -EINVAL; /* no change? 
*/ if (enable == (info->flags & QMI_WWAN_FLAG_RAWIP)) return len; /* ip mode cannot be cleared when pass through mode is set */ if (!enable && (info->flags & QMI_WWAN_FLAG_PASS_THROUGH)) { netdev_err(dev->net, "Cannot clear ip mode on pass through device\n"); return -EINVAL; } if (!rtnl_trylock()) return restart_syscall(); /* we don't want to modify a running netdev */ if (netif_running(dev->net)) { netdev_err(dev->net, "Cannot change a running device\n"); ret = -EBUSY; goto err; } /* let other drivers deny the change */ ret = call_netdevice_notifiers(NETDEV_PRE_TYPE_CHANGE, dev->net); ret = notifier_to_errno(ret); if (ret) { netdev_err(dev->net, "Type change was refused\n"); goto err; } if (enable) info->flags |= QMI_WWAN_FLAG_RAWIP; else info->flags &= ~QMI_WWAN_FLAG_RAWIP; qmi_wwan_netdev_setup(dev->net); call_netdevice_notifiers(NETDEV_POST_TYPE_CHANGE, dev->net); ret = len; err: rtnl_unlock(); return ret; } static ssize_t add_mux_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_device *dev = to_net_dev(d); struct qmimux_priv *priv; struct list_head *iter; struct net_device *ldev; ssize_t count = 0; rcu_read_lock(); netdev_for_each_upper_dev_rcu(dev, ldev, iter) { priv = netdev_priv(ldev); count += scnprintf(&buf[count], PAGE_SIZE - count, "0x%02x\n", priv->mux_id); } rcu_read_unlock(); return count; } static ssize_t add_mux_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { struct usbnet *dev = netdev_priv(to_net_dev(d)); struct qmi_wwan_state *info = (void *)&dev->data; u8 mux_id; int ret; if (kstrtou8(buf, 0, &mux_id)) return -EINVAL; /* mux_id [1 - 254] for compatibility with ip(8) and the rmnet driver */ if (mux_id < 1 || mux_id > 254) return -EINVAL; if (!rtnl_trylock()) return restart_syscall(); if (qmimux_find_dev(dev, mux_id)) { netdev_err(dev->net, "mux_id already present\n"); ret = -EINVAL; goto err; } ret = qmimux_register_device(dev->net, mux_id); if (!ret) { info->flags |= 
QMI_WWAN_FLAG_MUX; ret = len; } err: rtnl_unlock(); return ret; } static ssize_t del_mux_show(struct device *d, struct device_attribute *attr, char *buf) { return add_mux_show(d, attr, buf); } static ssize_t del_mux_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { struct usbnet *dev = netdev_priv(to_net_dev(d)); struct qmi_wwan_state *info = (void *)&dev->data; struct net_device *del_dev; u8 mux_id; int ret = 0; if (kstrtou8(buf, 0, &mux_id)) return -EINVAL; if (!rtnl_trylock()) return restart_syscall(); del_dev = qmimux_find_dev(dev, mux_id); if (!del_dev) { netdev_err(dev->net, "mux_id not present\n"); ret = -EINVAL; goto err; } qmimux_unregister_device(del_dev, NULL); if (!qmimux_has_slaves(dev)) info->flags &= ~QMI_WWAN_FLAG_MUX; ret = len; err: rtnl_unlock(); return ret; } static ssize_t pass_through_show(struct device *d, struct device_attribute *attr, char *buf) { struct usbnet *dev = netdev_priv(to_net_dev(d)); struct qmi_wwan_state *info; info = (void *)&dev->data; return sprintf(buf, "%c\n", info->flags & QMI_WWAN_FLAG_PASS_THROUGH ? 'Y' : 'N'); } static ssize_t pass_through_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { struct usbnet *dev = netdev_priv(to_net_dev(d)); struct qmi_wwan_state *info; bool enable; if (kstrtobool(buf, &enable)) return -EINVAL; info = (void *)&dev->data; /* no change? 
*/ if (enable == (info->flags & QMI_WWAN_FLAG_PASS_THROUGH)) return len; /* pass through mode can be set for raw ip devices only */ if (!(info->flags & QMI_WWAN_FLAG_RAWIP)) { netdev_err(dev->net, "Cannot set pass through mode on non ip device\n"); return -EINVAL; } if (enable) info->flags |= QMI_WWAN_FLAG_PASS_THROUGH; else info->flags &= ~QMI_WWAN_FLAG_PASS_THROUGH; return len; } static DEVICE_ATTR_RW(raw_ip); static DEVICE_ATTR_RW(add_mux); static DEVICE_ATTR_RW(del_mux); static DEVICE_ATTR_RW(pass_through); static struct attribute *qmi_wwan_sysfs_attrs[] = { &dev_attr_raw_ip.attr, &dev_attr_add_mux.attr, &dev_attr_del_mux.attr, &dev_attr_pass_through.attr, NULL, }; static struct attribute_group qmi_wwan_sysfs_attr_group = { .name = "qmi", .attrs = qmi_wwan_sysfs_attrs, }; /* default ethernet address used by the modem */ static const u8 default_modem_addr[ETH_ALEN] = {0x02, 0x50, 0xf3}; static const u8 buggy_fw_addr[ETH_ALEN] = {0x00, 0xa0, 0xc6, 0x00, 0x00, 0x00}; /* Make up an ethernet header if the packet doesn't have one. * * A firmware bug common among several devices cause them to send raw * IP packets under some circumstances. There is no way for the * driver/host to know when this will happen. And even when the bug * hits, some packets will still arrive with an intact header. * * The supported devices are only capably of sending IPv4, IPv6 and * ARP packets on a point-to-point link. Any packet with an ethernet * header will have either our address or a broadcast/multicast * address as destination. ARP packets will always have a header. * * This means that this function will reliably add the appropriate * header iff necessary, provided our hardware address does not start * with 4 or 6. * * Another common firmware bug results in all packets being addressed * to 00:a0:c6:00:00:00 despite the host address being different. * This function will also fixup such packets. 
*/
static int qmi_wwan_rx_fixup(struct usbnet *dev, struct sk_buff *skb)
{
	struct qmi_wwan_state *info = (void *)&dev->data;
	bool rawip = info->flags & QMI_WWAN_FLAG_RAWIP;
	__be16 proto;

	/* This check is no longer done by usbnet */
	if (skb->len < dev->net->hard_header_len)
		return 0;

	/* multiplexed buffers are handled entirely by the demuxer */
	if (info->flags & QMI_WWAN_FLAG_MUX)
		return qmimux_rx_fixup(dev, skb);

	/* pass-through: tag the buffer as ETH_P_MAP and leave it untouched */
	if (info->flags & QMI_WWAN_FLAG_PASS_THROUGH) {
		skb->protocol = htons(ETH_P_MAP);
		return 1;
	}

	/* classify by the high nibble of the first byte */
	switch (skb->data[0] & 0xf0) {
	case 0x40:
		proto = htons(ETH_P_IP);
		break;
	case 0x60:
		proto = htons(ETH_P_IPV6);
		break;
	case 0x00:
		if (rawip)
			return 0;
		if (is_multicast_ether_addr(skb->data))
			return 1;
		/* possibly bogus destination - rewrite just in case */
		skb_reset_mac_header(skb);
		goto fix_dest;
	default:
		if (rawip)
			return 0;
		/* pass along other packets without modifications */
		return 1;
	}
	/* only IPv4/IPv6 first bytes (0x4x/0x6x) reach this point */
	if (rawip) {
		skb_reset_mac_header(skb);
		skb->dev = dev->net; /* normally set by eth_type_trans */
		skb->protocol = proto;
		return 1;
	}

	/* Ethernet mode but no header present: fabricate one in the headroom */
	if (skb_headroom(skb) < ETH_HLEN)
		return 0;
	skb_push(skb, ETH_HLEN);
	skb_reset_mac_header(skb);
	eth_hdr(skb)->h_proto = proto;
	eth_zero_addr(eth_hdr(skb)->h_source);
fix_dest:
	memcpy(eth_hdr(skb)->h_dest, dev->net->dev_addr, ETH_ALEN);
	return 1;
}

/* very simplistic detection of IPv4 or IPv6 headers */
static bool possibly_iphdr(const char *data)
{
	/* mask 0xd0 ignores bit 5, so high nibbles 4 and 6 both match */
	return (data[0] & 0xd0) == 0x40;
}

/* disallow addresses which may be confused with IP headers */
static int qmi_wwan_mac_addr(struct net_device *dev, void *p)
{
	int ret;
	struct sockaddr *addr = p;

	ret = eth_prepare_mac_addr_change(dev, p);
	if (ret < 0)
		return ret;
	if (possibly_iphdr(addr->sa_data))
		return -EADDRNOTAVAIL;
	eth_commit_mac_addr_change(dev, p);
	return 0;
}

/* usbnet defaults except for the MAC address setter above */
static const struct net_device_ops qmi_wwan_netdev_ops = {
	.ndo_open		= usbnet_open,
	.ndo_stop		= usbnet_stop,
	.ndo_start_xmit		= usbnet_start_xmit,
	.ndo_tx_timeout		= usbnet_tx_timeout,
	.ndo_change_mtu		= usbnet_change_mtu,
	.ndo_set_mac_address	= qmi_wwan_mac_addr,
	.ndo_validate_addr	= eth_validate_addr,
};

/* using a counter to merge subdriver requests with our own into a
 * combined state
 *
 * needs_remote_wakeup is only rewritten when pmcount goes 0 -> 1 (on)
 * or 1 -> 0 (off).  Always returns 0.
 */
static int qmi_wwan_manage_power(struct usbnet *dev, int on)
{
	struct qmi_wwan_state *info = (void *)&dev->data;
	int rv;

	dev_dbg(&dev->intf->dev, "%s() pmcount=%d, on=%d\n", __func__,
		atomic_read(&info->pmcount), on);

	if ((on && atomic_add_return(1, &info->pmcount) == 1) ||
	    (!on && atomic_dec_and_test(&info->pmcount))) {
		/* need autopm_get/put here to ensure the usbcore sees
		 * the new value
		 */
		rv = usb_autopm_get_interface(dev->intf);
		dev->intf->needs_remote_wakeup = on;
		/* only drop the PM reference if the get succeeded */
		if (!rv)
			usb_autopm_put_interface(dev->intf);
	}
	return 0;
}

/* Power-management callback handed to the cdc-wdm subdriver */
static int qmi_wwan_cdc_wdm_manage_power(struct usb_interface *intf, int on)
{
	struct usbnet *dev = usb_get_intfdata(intf);

	/* can be called while disconnecting */
	if (!dev)
		return 0;
	return qmi_wwan_manage_power(dev, on);
}

/* collect all three endpoints and register subdriver
 *
 * Returns 0 on success or a negative errno; -EINVAL when no interrupt
 * (status) endpoint is available.  On success dev->status is cleared and
 * the subdriver is stored in the driver state.
 */
static int qmi_wwan_register_subdriver(struct usbnet *dev)
{
	int rv;
	struct usb_driver *subdriver = NULL;
	struct qmi_wwan_state *info = (void *)&dev->data;

	/* collect bulk endpoints */
	rv = usbnet_get_endpoints(dev, info->data);
	if (rv < 0)
		goto err;

	/* update status endpoint if separate control interface */
	if (info->control != info->data)
		dev->status = &info->control->cur_altsetting->endpoint[0];

	/* require interrupt endpoint for subdriver */
	if (!dev->status) {
		rv = -EINVAL;
		goto err;
	}

	/* for subdriver power management */
	atomic_set(&info->pmcount, 0);

	/* register subdriver */
	subdriver = usb_cdc_wdm_register(info->control, &dev->status->desc,
					 4096, WWAN_PORT_QMI,
					 &qmi_wwan_cdc_wdm_manage_power);
	if (IS_ERR(subdriver)) {
		dev_err(&info->control->dev, "subdriver registration failed\n");
		rv = PTR_ERR(subdriver);
		goto err;
	}

	/* prevent usbnet from using status endpoint */
	dev->status = NULL;

	/* save subdriver struct for suspend/resume wrappers */
	info->subdriver = subdriver;

err:
	return rv;
}

/* Send CDC SetControlLineState request, setting or clearing the
DTR. * "Required for Autoconnect and 9x30 to wake up" according to the * GobiNet driver. The requirement has been verified on an MDM9230 * based Sierra Wireless MC7455 */ static int qmi_wwan_change_dtr(struct usbnet *dev, bool on) { u8 intf = dev->intf->cur_altsetting->desc.bInterfaceNumber; return usbnet_write_cmd(dev, USB_CDC_REQ_SET_CONTROL_LINE_STATE, USB_DIR_OUT | USB_TYPE_CLASS | USB_RECIP_INTERFACE, on ? 0x01 : 0x00, intf, NULL, 0); } static int qmi_wwan_bind(struct usbnet *dev, struct usb_interface *intf) { int status; u8 *buf = intf->cur_altsetting->extra; int len = intf->cur_altsetting->extralen; struct usb_interface_descriptor *desc = &intf->cur_altsetting->desc; struct usb_cdc_union_desc *cdc_union; struct usb_cdc_ether_desc *cdc_ether; struct usb_driver *driver = driver_of(intf); struct qmi_wwan_state *info = (void *)&dev->data; struct usb_cdc_parsed_header hdr; BUILD_BUG_ON((sizeof(((struct usbnet *)0)->data) < sizeof(struct qmi_wwan_state))); /* set up initial state */ info->control = intf; info->data = intf; /* and a number of CDC descriptors */ cdc_parse_cdc_header(&hdr, intf, buf, len); cdc_union = hdr.usb_cdc_union_desc; cdc_ether = hdr.usb_cdc_ether_desc; /* Use separate control and data interfaces if we found a CDC Union */ if (cdc_union) { info->data = usb_ifnum_to_if(dev->udev, cdc_union->bSlaveInterface0); if (desc->bInterfaceNumber != cdc_union->bMasterInterface0 || !info->data) { dev_err(&intf->dev, "bogus CDC Union: master=%u, slave=%u\n", cdc_union->bMasterInterface0, cdc_union->bSlaveInterface0); /* ignore and continue... 
*/ cdc_union = NULL; info->data = intf; } } /* errors aren't fatal - we can live with the dynamic address */ if (cdc_ether && cdc_ether->wMaxSegmentSize) { dev->hard_mtu = le16_to_cpu(cdc_ether->wMaxSegmentSize); usbnet_get_ethernet_addr(dev, cdc_ether->iMACAddress); } /* claim data interface and set it up */ if (info->control != info->data) { status = usb_driver_claim_interface(driver, info->data, dev); if (status < 0) goto err; } status = qmi_wwan_register_subdriver(dev); if (status < 0 && info->control != info->data) { usb_set_intfdata(info->data, NULL); usb_driver_release_interface(driver, info->data); } /* disabling remote wakeup on MDM9x30 devices has the same * effect as clearing DTR. The device will not respond to QMI * requests until we set DTR again. This is similar to a * QMI_CTL SYNC request, clearing a lot of firmware state * including the client ID allocations. * * Our usage model allows a session to span multiple * open/close events, so we must prevent the firmware from * clearing out state the clients might need. * * MDM9x30 is the first QMI chipset with USB3 support. Abuse * this fact to enable the quirk for all USB3 devices. * * There are also chipsets with the same "set DTR" requirement * but without USB3 support. Devices based on these chips * need a quirk flag in the device ID table. */ if (dev->driver_info->data & QMI_WWAN_QUIRK_DTR || le16_to_cpu(dev->udev->descriptor.bcdUSB) >= 0x0201) { qmi_wwan_manage_power(dev, 1); qmi_wwan_change_dtr(dev, true); } /* Never use the same address on both ends of the link, even if the * buggy firmware told us to. 
Or, if device is assigned the well-known * buggy firmware MAC address, replace it with a random address, */ if (ether_addr_equal(dev->net->dev_addr, default_modem_addr) || ether_addr_equal(dev->net->dev_addr, buggy_fw_addr)) eth_hw_addr_random(dev->net); /* make MAC addr easily distinguishable from an IP header */ if (possibly_iphdr(dev->net->dev_addr)) { u8 addr = dev->net->dev_addr[0]; addr |= 0x02; /* set local assignment bit */ addr &= 0xbf; /* clear "IP" bit */ dev_addr_mod(dev->net, 0, &addr, 1); } dev->net->netdev_ops = &qmi_wwan_netdev_ops; dev->net->sysfs_groups[0] = &qmi_wwan_sysfs_attr_group; err: return status; } static void qmi_wwan_unbind(struct usbnet *dev, struct usb_interface *intf) { struct qmi_wwan_state *info = (void *)&dev->data; struct usb_driver *driver = driver_of(intf); struct usb_interface *other; if (info->subdriver && info->subdriver->disconnect) info->subdriver->disconnect(info->control); /* disable MDM9x30 quirk */ if (le16_to_cpu(dev->udev->descriptor.bcdUSB) >= 0x0201) { qmi_wwan_change_dtr(dev, false); qmi_wwan_manage_power(dev, 0); } /* allow user to unbind using either control or data */ if (intf == info->control) other = info->data; else other = info->control; /* only if not shared */ if (other && intf != other) { usb_set_intfdata(other, NULL); usb_driver_release_interface(driver, other); } info->subdriver = NULL; info->data = NULL; info->control = NULL; } /* suspend/resume wrappers calling both usbnet and the cdc-wdm * subdriver if present. * * NOTE: cdc-wdm also supports pre/post_reset, but we cannot provide * wrappers for those without adding usbnet reset support first. */ static int qmi_wwan_suspend(struct usb_interface *intf, pm_message_t message) { struct usbnet *dev = usb_get_intfdata(intf); struct qmi_wwan_state *info = (void *)&dev->data; int ret; /* Both usbnet_suspend() and subdriver->suspend() MUST return 0 * in system sleep context, otherwise, the resume callback has * to recover device from previous suspend failure. 
*/ ret = usbnet_suspend(intf, message); if (ret < 0) goto err; if (intf == info->control && info->subdriver && info->subdriver->suspend) ret = info->subdriver->suspend(intf, message); if (ret < 0) usbnet_resume(intf); err: return ret; } static int qmi_wwan_resume(struct usb_interface *intf) { struct usbnet *dev = usb_get_intfdata(intf); struct qmi_wwan_state *info = (void *)&dev->data; int ret = 0; bool callsub = (intf == info->control && info->subdriver && info->subdriver->resume); if (callsub) ret = info->subdriver->resume(intf); if (ret < 0) goto err; ret = usbnet_resume(intf); if (ret < 0 && callsub) info->subdriver->suspend(intf, PMSG_SUSPEND); err: return ret; } static const struct driver_info qmi_wwan_info = { .description = "WWAN/QMI device", .flags = FLAG_WWAN | FLAG_SEND_ZLP, .bind = qmi_wwan_bind, .unbind = qmi_wwan_unbind, .manage_power = qmi_wwan_manage_power, .rx_fixup = qmi_wwan_rx_fixup, }; static const struct driver_info qmi_wwan_info_quirk_dtr = { .description = "WWAN/QMI device", .flags = FLAG_WWAN | FLAG_SEND_ZLP, .bind = qmi_wwan_bind, .unbind = qmi_wwan_unbind, .manage_power = qmi_wwan_manage_power, .rx_fixup = qmi_wwan_rx_fixup, .data = QMI_WWAN_QUIRK_DTR, }; #define HUAWEI_VENDOR_ID 0x12D1 /* map QMI/wwan function by a fixed interface number */ #define QMI_FIXED_INTF(vend, prod, num) \ USB_DEVICE_INTERFACE_NUMBER(vend, prod, num), \ .driver_info = (unsigned long)&qmi_wwan_info /* devices requiring "set DTR" quirk */ #define QMI_QUIRK_SET_DTR(vend, prod, num) \ USB_DEVICE_INTERFACE_NUMBER(vend, prod, num), \ .driver_info = (unsigned long)&qmi_wwan_info_quirk_dtr /* Gobi 1000 QMI/wwan interface number is 3 according to qcserial */ #define QMI_GOBI1K_DEVICE(vend, prod) \ QMI_FIXED_INTF(vend, prod, 3) /* Gobi 2000/3000 QMI/wwan interface number is 0 according to qcserial */ #define QMI_GOBI_DEVICE(vend, prod) \ QMI_FIXED_INTF(vend, prod, 0) /* Many devices have QMI and DIAG functions which are distinguishable * from other vendor specific 
functions by class, subclass and * protocol all being 0xff. The DIAG function has exactly 2 endpoints * and is silently rejected when probed. * * This makes it possible to match dynamically numbered QMI functions * as seen on e.g. many Quectel modems. */ #define QMI_MATCH_FF_FF_FF(vend, prod) \ USB_DEVICE_AND_INTERFACE_INFO(vend, prod, USB_CLASS_VENDOR_SPEC, \ USB_SUBCLASS_VENDOR_SPEC, 0xff), \ .driver_info = (unsigned long)&qmi_wwan_info_quirk_dtr static const struct usb_device_id products[] = { /* 1. CDC ECM like devices match on the control interface */ { /* Huawei E392, E398 and possibly others sharing both device id and more... */ USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, USB_CLASS_VENDOR_SPEC, 1, 9), .driver_info = (unsigned long)&qmi_wwan_info, }, { /* Vodafone/Huawei K5005 (12d1:14c8) and similar modems */ USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, USB_CLASS_VENDOR_SPEC, 1, 57), .driver_info = (unsigned long)&qmi_wwan_info, }, { /* HUAWEI_INTERFACE_NDIS_CONTROL_QUALCOMM */ USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, USB_CLASS_VENDOR_SPEC, 0x01, 0x69), .driver_info = (unsigned long)&qmi_wwan_info, }, { /* Motorola Mapphone devices with MDM6600 */ USB_VENDOR_AND_INTERFACE_INFO(0x22b8, USB_CLASS_VENDOR_SPEC, 0xfb, 0xff), .driver_info = (unsigned long)&qmi_wwan_info, }, /* 2. 
Combined interface devices matching on class+protocol */ { /* Huawei E367 and possibly others in "Windows mode" */ USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, USB_CLASS_VENDOR_SPEC, 1, 7), .driver_info = (unsigned long)&qmi_wwan_info, }, { /* Huawei E392, E398 and possibly others in "Windows mode" */ USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, USB_CLASS_VENDOR_SPEC, 1, 17), .driver_info = (unsigned long)&qmi_wwan_info, }, { /* HUAWEI_NDIS_SINGLE_INTERFACE_VDF */ USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, USB_CLASS_VENDOR_SPEC, 0x01, 0x37), .driver_info = (unsigned long)&qmi_wwan_info, }, { /* HUAWEI_INTERFACE_NDIS_HW_QUALCOMM */ USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, USB_CLASS_VENDOR_SPEC, 0x01, 0x67), .driver_info = (unsigned long)&qmi_wwan_info, }, { /* Pantech UML290, P4200 and more */ USB_VENDOR_AND_INTERFACE_INFO(0x106c, USB_CLASS_VENDOR_SPEC, 0xf0, 0xff), .driver_info = (unsigned long)&qmi_wwan_info, }, { /* Pantech UML290 - newer firmware */ USB_VENDOR_AND_INTERFACE_INFO(0x106c, USB_CLASS_VENDOR_SPEC, 0xf1, 0xff), .driver_info = (unsigned long)&qmi_wwan_info, }, { /* Novatel USB551L and MC551 */ USB_DEVICE_AND_INTERFACE_INFO(0x1410, 0xb001, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&qmi_wwan_info, }, { /* Novatel E362 */ USB_DEVICE_AND_INTERFACE_INFO(0x1410, 0x9010, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&qmi_wwan_info, }, { /* Novatel Expedite E371 */ USB_DEVICE_AND_INTERFACE_INFO(0x1410, 0x9011, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&qmi_wwan_info, }, { /* Dell Wireless 5800 (Novatel E362) */ USB_DEVICE_AND_INTERFACE_INFO(0x413C, 0x8195, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&qmi_wwan_info, }, { /* Dell Wireless 5800 V2 (Novatel E362) */ USB_DEVICE_AND_INTERFACE_INFO(0x413C, 0x8196, USB_CLASS_COMM, 
USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&qmi_wwan_info, }, { /* Dell Wireless 5804 (Novatel E371) */ USB_DEVICE_AND_INTERFACE_INFO(0x413C, 0x819b, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&qmi_wwan_info, }, { /* ADU960S */ USB_DEVICE_AND_INTERFACE_INFO(0x16d5, 0x650a, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&qmi_wwan_info, }, { /* HP lt2523 (Novatel E371) */ USB_DEVICE_AND_INTERFACE_INFO(0x03f0, 0x421d, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&qmi_wwan_info, }, { /* HP lt4112 LTE/HSPA+ Gobi 4G Module (Huawei me906e) */ USB_DEVICE_AND_INTERFACE_INFO(0x03f0, 0x581d, USB_CLASS_VENDOR_SPEC, 1, 7), .driver_info = (unsigned long)&qmi_wwan_info, }, {QMI_MATCH_FF_FF_FF(0x2c7c, 0x0122)}, /* Quectel RG650V */ {QMI_MATCH_FF_FF_FF(0x2c7c, 0x0125)}, /* Quectel EC25, EC20 R2.0 Mini PCIe */ {QMI_MATCH_FF_FF_FF(0x2c7c, 0x0306)}, /* Quectel EP06/EG06/EM06 */ {QMI_MATCH_FF_FF_FF(0x2c7c, 0x0512)}, /* Quectel EG12/EM12 */ {QMI_MATCH_FF_FF_FF(0x2c7c, 0x0620)}, /* Quectel EM160R-GL */ {QMI_MATCH_FF_FF_FF(0x2c7c, 0x0800)}, /* Quectel RM500Q-GL */ {QMI_MATCH_FF_FF_FF(0x2c7c, 0x0801)}, /* Quectel RM520N */ /* 3. 
Combined interface devices matching on interface number */ {QMI_FIXED_INTF(0x0408, 0xea42, 4)}, /* Yota / Megafon M100-1 */ {QMI_FIXED_INTF(0x05c6, 0x6001, 3)}, /* 4G LTE usb-modem U901 */ {QMI_FIXED_INTF(0x05c6, 0x7000, 0)}, {QMI_FIXED_INTF(0x05c6, 0x7001, 1)}, {QMI_FIXED_INTF(0x05c6, 0x7002, 1)}, {QMI_FIXED_INTF(0x05c6, 0x7101, 1)}, {QMI_FIXED_INTF(0x05c6, 0x7101, 2)}, {QMI_FIXED_INTF(0x05c6, 0x7101, 3)}, {QMI_FIXED_INTF(0x05c6, 0x7102, 1)}, {QMI_FIXED_INTF(0x05c6, 0x7102, 2)}, {QMI_FIXED_INTF(0x05c6, 0x7102, 3)}, {QMI_FIXED_INTF(0x05c6, 0x8000, 7)}, {QMI_FIXED_INTF(0x05c6, 0x8001, 6)}, {QMI_FIXED_INTF(0x05c6, 0x9000, 4)}, {QMI_FIXED_INTF(0x05c6, 0x9003, 4)}, {QMI_FIXED_INTF(0x05c6, 0x9005, 2)}, {QMI_FIXED_INTF(0x05c6, 0x900a, 4)}, {QMI_FIXED_INTF(0x05c6, 0x900b, 2)}, {QMI_FIXED_INTF(0x05c6, 0x900c, 4)}, {QMI_FIXED_INTF(0x05c6, 0x900c, 5)}, {QMI_FIXED_INTF(0x05c6, 0x900c, 6)}, {QMI_FIXED_INTF(0x05c6, 0x900d, 5)}, {QMI_FIXED_INTF(0x05c6, 0x900f, 3)}, {QMI_FIXED_INTF(0x05c6, 0x900f, 4)}, {QMI_FIXED_INTF(0x05c6, 0x900f, 5)}, {QMI_FIXED_INTF(0x05c6, 0x9010, 4)}, {QMI_FIXED_INTF(0x05c6, 0x9010, 5)}, {QMI_FIXED_INTF(0x05c6, 0x9011, 3)}, {QMI_FIXED_INTF(0x05c6, 0x9011, 4)}, {QMI_FIXED_INTF(0x05c6, 0x9021, 1)}, {QMI_FIXED_INTF(0x05c6, 0x9022, 2)}, {QMI_QUIRK_SET_DTR(0x05c6, 0x9025, 4)}, /* Alcatel-sbell ASB TL131 TDD LTE (China Mobile) */ {QMI_FIXED_INTF(0x05c6, 0x9026, 3)}, {QMI_FIXED_INTF(0x05c6, 0x902e, 5)}, {QMI_FIXED_INTF(0x05c6, 0x9031, 5)}, {QMI_FIXED_INTF(0x05c6, 0x9032, 4)}, {QMI_FIXED_INTF(0x05c6, 0x9033, 3)}, {QMI_FIXED_INTF(0x05c6, 0x9033, 4)}, {QMI_FIXED_INTF(0x05c6, 0x9033, 5)}, {QMI_FIXED_INTF(0x05c6, 0x9033, 6)}, {QMI_FIXED_INTF(0x05c6, 0x9034, 3)}, {QMI_FIXED_INTF(0x05c6, 0x9034, 4)}, {QMI_FIXED_INTF(0x05c6, 0x9034, 5)}, {QMI_FIXED_INTF(0x05c6, 0x9034, 6)}, {QMI_FIXED_INTF(0x05c6, 0x9034, 7)}, {QMI_FIXED_INTF(0x05c6, 0x9035, 4)}, {QMI_FIXED_INTF(0x05c6, 0x9036, 3)}, {QMI_FIXED_INTF(0x05c6, 0x9037, 5)}, {QMI_FIXED_INTF(0x05c6, 0x9038, 4)}, 
{QMI_FIXED_INTF(0x05c6, 0x903b, 7)}, {QMI_FIXED_INTF(0x05c6, 0x903c, 6)}, {QMI_FIXED_INTF(0x05c6, 0x903d, 6)}, {QMI_FIXED_INTF(0x05c6, 0x903e, 5)}, {QMI_FIXED_INTF(0x05c6, 0x9043, 3)}, {QMI_FIXED_INTF(0x05c6, 0x9046, 3)}, {QMI_FIXED_INTF(0x05c6, 0x9046, 4)}, {QMI_FIXED_INTF(0x05c6, 0x9046, 5)}, {QMI_FIXED_INTF(0x05c6, 0x9047, 2)}, {QMI_FIXED_INTF(0x05c6, 0x9047, 3)}, {QMI_FIXED_INTF(0x05c6, 0x9047, 4)}, {QMI_FIXED_INTF(0x05c6, 0x9048, 4)}, {QMI_FIXED_INTF(0x05c6, 0x9048, 5)}, {QMI_FIXED_INTF(0x05c6, 0x9048, 6)}, {QMI_FIXED_INTF(0x05c6, 0x9048, 7)}, {QMI_FIXED_INTF(0x05c6, 0x9048, 8)}, {QMI_FIXED_INTF(0x05c6, 0x904c, 5)}, {QMI_FIXED_INTF(0x05c6, 0x904c, 6)}, {QMI_FIXED_INTF(0x05c6, 0x904c, 7)}, {QMI_FIXED_INTF(0x05c6, 0x904c, 8)}, {QMI_FIXED_INTF(0x05c6, 0x9050, 3)}, {QMI_FIXED_INTF(0x05c6, 0x9052, 4)}, {QMI_FIXED_INTF(0x05c6, 0x9053, 6)}, {QMI_FIXED_INTF(0x05c6, 0x9053, 7)}, {QMI_FIXED_INTF(0x05c6, 0x9054, 5)}, {QMI_FIXED_INTF(0x05c6, 0x9054, 6)}, {QMI_FIXED_INTF(0x05c6, 0x9055, 3)}, {QMI_FIXED_INTF(0x05c6, 0x9055, 4)}, {QMI_FIXED_INTF(0x05c6, 0x9055, 5)}, {QMI_FIXED_INTF(0x05c6, 0x9055, 6)}, {QMI_FIXED_INTF(0x05c6, 0x9055, 7)}, {QMI_FIXED_INTF(0x05c6, 0x9056, 3)}, {QMI_FIXED_INTF(0x05c6, 0x9062, 2)}, {QMI_FIXED_INTF(0x05c6, 0x9062, 3)}, {QMI_FIXED_INTF(0x05c6, 0x9062, 4)}, {QMI_FIXED_INTF(0x05c6, 0x9062, 5)}, {QMI_FIXED_INTF(0x05c6, 0x9062, 6)}, {QMI_FIXED_INTF(0x05c6, 0x9062, 7)}, {QMI_FIXED_INTF(0x05c6, 0x9062, 8)}, {QMI_FIXED_INTF(0x05c6, 0x9062, 9)}, {QMI_FIXED_INTF(0x05c6, 0x9064, 3)}, {QMI_FIXED_INTF(0x05c6, 0x9065, 6)}, {QMI_FIXED_INTF(0x05c6, 0x9065, 7)}, {QMI_FIXED_INTF(0x05c6, 0x9066, 5)}, {QMI_FIXED_INTF(0x05c6, 0x9066, 6)}, {QMI_FIXED_INTF(0x05c6, 0x9067, 1)}, {QMI_FIXED_INTF(0x05c6, 0x9068, 2)}, {QMI_FIXED_INTF(0x05c6, 0x9068, 3)}, {QMI_FIXED_INTF(0x05c6, 0x9068, 4)}, {QMI_FIXED_INTF(0x05c6, 0x9068, 5)}, {QMI_FIXED_INTF(0x05c6, 0x9068, 6)}, {QMI_FIXED_INTF(0x05c6, 0x9068, 7)}, {QMI_FIXED_INTF(0x05c6, 0x9069, 5)}, {QMI_FIXED_INTF(0x05c6, 0x9069, 6)}, 
{QMI_FIXED_INTF(0x05c6, 0x9069, 7)}, {QMI_FIXED_INTF(0x05c6, 0x9069, 8)}, {QMI_FIXED_INTF(0x05c6, 0x9070, 4)}, {QMI_FIXED_INTF(0x05c6, 0x9070, 5)}, {QMI_FIXED_INTF(0x05c6, 0x9075, 5)}, {QMI_FIXED_INTF(0x05c6, 0x9076, 4)}, {QMI_FIXED_INTF(0x05c6, 0x9076, 5)}, {QMI_FIXED_INTF(0x05c6, 0x9076, 6)}, {QMI_FIXED_INTF(0x05c6, 0x9076, 7)}, {QMI_FIXED_INTF(0x05c6, 0x9076, 8)}, {QMI_FIXED_INTF(0x05c6, 0x9077, 3)}, {QMI_FIXED_INTF(0x05c6, 0x9077, 4)}, {QMI_FIXED_INTF(0x05c6, 0x9077, 5)}, {QMI_FIXED_INTF(0x05c6, 0x9077, 6)}, {QMI_FIXED_INTF(0x05c6, 0x9078, 3)}, {QMI_FIXED_INTF(0x05c6, 0x9079, 4)}, {QMI_FIXED_INTF(0x05c6, 0x9079, 5)}, {QMI_FIXED_INTF(0x05c6, 0x9079, 6)}, {QMI_FIXED_INTF(0x05c6, 0x9079, 7)}, {QMI_FIXED_INTF(0x05c6, 0x9079, 8)}, {QMI_FIXED_INTF(0x05c6, 0x9080, 5)}, {QMI_FIXED_INTF(0x05c6, 0x9080, 6)}, {QMI_FIXED_INTF(0x05c6, 0x9080, 7)}, {QMI_FIXED_INTF(0x05c6, 0x9080, 8)}, {QMI_FIXED_INTF(0x05c6, 0x9083, 3)}, {QMI_FIXED_INTF(0x05c6, 0x9084, 4)}, {QMI_QUIRK_SET_DTR(0x05c6, 0x9091, 2)}, /* Compal RXM-G1 */ {QMI_FIXED_INTF(0x05c6, 0x90b2, 3)}, /* ublox R410M */ {QMI_QUIRK_SET_DTR(0x05c6, 0x90db, 2)}, /* Compal RXM-G1 */ {QMI_FIXED_INTF(0x05c6, 0x920d, 0)}, {QMI_FIXED_INTF(0x05c6, 0x920d, 5)}, {QMI_QUIRK_SET_DTR(0x05c6, 0x9625, 4)}, /* YUGA CLM920-NC5 */ {QMI_FIXED_INTF(0x0846, 0x68a2, 8)}, {QMI_FIXED_INTF(0x0846, 0x68d3, 8)}, /* Netgear Aircard 779S */ {QMI_FIXED_INTF(0x12d1, 0x140c, 1)}, /* Huawei E173 */ {QMI_FIXED_INTF(0x12d1, 0x14ac, 1)}, /* Huawei E1820 */ {QMI_FIXED_INTF(0x1435, 0x0918, 3)}, /* Wistron NeWeb D16Q1 */ {QMI_FIXED_INTF(0x1435, 0x0918, 4)}, /* Wistron NeWeb D16Q1 */ {QMI_FIXED_INTF(0x1435, 0x0918, 5)}, /* Wistron NeWeb D16Q1 */ {QMI_FIXED_INTF(0x1435, 0x3185, 4)}, /* Wistron NeWeb M18Q5 */ {QMI_FIXED_INTF(0x1435, 0xd111, 4)}, /* M9615A DM11-1 D51QC */ {QMI_FIXED_INTF(0x1435, 0xd181, 3)}, /* Wistron NeWeb D18Q1 */ {QMI_FIXED_INTF(0x1435, 0xd181, 4)}, /* Wistron NeWeb D18Q1 */ {QMI_FIXED_INTF(0x1435, 0xd181, 5)}, /* Wistron NeWeb D18Q1 */ 
{QMI_FIXED_INTF(0x1435, 0xd182, 4)}, /* Wistron NeWeb D18 */ {QMI_FIXED_INTF(0x1435, 0xd182, 5)}, /* Wistron NeWeb D18 */ {QMI_FIXED_INTF(0x1435, 0xd191, 4)}, /* Wistron NeWeb D19Q1 */ {QMI_QUIRK_SET_DTR(0x1508, 0x1001, 4)}, /* Fibocom NL668 series */ {QMI_FIXED_INTF(0x1690, 0x7588, 4)}, /* ASKEY WWHC050 */ {QMI_FIXED_INTF(0x16d8, 0x6003, 0)}, /* CMOTech 6003 */ {QMI_FIXED_INTF(0x16d8, 0x6007, 0)}, /* CMOTech CHE-628S */ {QMI_FIXED_INTF(0x16d8, 0x6008, 0)}, /* CMOTech CMU-301 */ {QMI_FIXED_INTF(0x16d8, 0x6280, 0)}, /* CMOTech CHU-628 */ {QMI_FIXED_INTF(0x16d8, 0x7001, 0)}, /* CMOTech CHU-720S */ {QMI_FIXED_INTF(0x16d8, 0x7002, 0)}, /* CMOTech 7002 */ {QMI_FIXED_INTF(0x16d8, 0x7003, 4)}, /* CMOTech CHU-629K */ {QMI_FIXED_INTF(0x16d8, 0x7004, 3)}, /* CMOTech 7004 */ {QMI_FIXED_INTF(0x16d8, 0x7006, 5)}, /* CMOTech CGU-629 */ {QMI_FIXED_INTF(0x16d8, 0x700a, 4)}, /* CMOTech CHU-629S */ {QMI_FIXED_INTF(0x16d8, 0x7211, 0)}, /* CMOTech CHU-720I */ {QMI_FIXED_INTF(0x16d8, 0x7212, 0)}, /* CMOTech 7212 */ {QMI_FIXED_INTF(0x16d8, 0x7213, 0)}, /* CMOTech 7213 */ {QMI_FIXED_INTF(0x16d8, 0x7251, 1)}, /* CMOTech 7251 */ {QMI_FIXED_INTF(0x16d8, 0x7252, 1)}, /* CMOTech 7252 */ {QMI_FIXED_INTF(0x16d8, 0x7253, 1)}, /* CMOTech 7253 */ {QMI_FIXED_INTF(0x19d2, 0x0002, 1)}, {QMI_FIXED_INTF(0x19d2, 0x0012, 1)}, {QMI_FIXED_INTF(0x19d2, 0x0017, 3)}, {QMI_FIXED_INTF(0x19d2, 0x0019, 3)}, /* ONDA MT689DC */ {QMI_FIXED_INTF(0x19d2, 0x0021, 4)}, {QMI_FIXED_INTF(0x19d2, 0x0025, 1)}, {QMI_FIXED_INTF(0x19d2, 0x0031, 4)}, {QMI_FIXED_INTF(0x19d2, 0x0042, 4)}, {QMI_FIXED_INTF(0x19d2, 0x0049, 5)}, {QMI_FIXED_INTF(0x19d2, 0x0052, 4)}, {QMI_FIXED_INTF(0x19d2, 0x0055, 1)}, /* ZTE (Vodafone) K3520-Z */ {QMI_FIXED_INTF(0x19d2, 0x0058, 4)}, {QMI_FIXED_INTF(0x19d2, 0x0063, 4)}, /* ZTE (Vodafone) K3565-Z */ {QMI_FIXED_INTF(0x19d2, 0x0104, 4)}, /* ZTE (Vodafone) K4505-Z */ {QMI_FIXED_INTF(0x19d2, 0x0113, 5)}, {QMI_FIXED_INTF(0x19d2, 0x0118, 5)}, {QMI_FIXED_INTF(0x19d2, 0x0121, 5)}, {QMI_FIXED_INTF(0x19d2, 
0x0123, 4)}, {QMI_FIXED_INTF(0x19d2, 0x0124, 5)}, {QMI_FIXED_INTF(0x19d2, 0x0125, 6)}, {QMI_FIXED_INTF(0x19d2, 0x0126, 5)}, {QMI_FIXED_INTF(0x19d2, 0x0130, 1)}, {QMI_FIXED_INTF(0x19d2, 0x0133, 3)}, {QMI_FIXED_INTF(0x19d2, 0x0141, 5)}, {QMI_FIXED_INTF(0x19d2, 0x0157, 5)}, /* ZTE MF683 */ {QMI_FIXED_INTF(0x19d2, 0x0158, 3)}, {QMI_FIXED_INTF(0x19d2, 0x0167, 4)}, /* ZTE MF820D */ {QMI_FIXED_INTF(0x19d2, 0x0168, 4)}, {QMI_FIXED_INTF(0x19d2, 0x0176, 3)}, {QMI_FIXED_INTF(0x19d2, 0x0178, 3)}, {QMI_FIXED_INTF(0x19d2, 0x0189, 4)}, /* ZTE MF290 */ {QMI_FIXED_INTF(0x19d2, 0x0191, 4)}, /* ZTE EuFi890 */ {QMI_FIXED_INTF(0x19d2, 0x0199, 1)}, /* ZTE MF820S */ {QMI_FIXED_INTF(0x19d2, 0x0200, 1)}, {QMI_FIXED_INTF(0x19d2, 0x0257, 3)}, /* ZTE MF821 */ {QMI_FIXED_INTF(0x19d2, 0x0265, 4)}, /* ONDA MT8205 4G LTE */ {QMI_FIXED_INTF(0x19d2, 0x0284, 4)}, /* ZTE MF880 */ {QMI_FIXED_INTF(0x19d2, 0x0326, 4)}, /* ZTE MF821D */ {QMI_FIXED_INTF(0x19d2, 0x0396, 3)}, /* ZTE ZM8620 */ {QMI_FIXED_INTF(0x19d2, 0x0412, 4)}, /* Telewell TW-LTE 4G */ {QMI_FIXED_INTF(0x19d2, 0x1008, 4)}, /* ZTE (Vodafone) K3570-Z */ {QMI_FIXED_INTF(0x19d2, 0x1010, 4)}, /* ZTE (Vodafone) K3571-Z */ {QMI_FIXED_INTF(0x19d2, 0x1012, 4)}, {QMI_FIXED_INTF(0x19d2, 0x1018, 3)}, /* ZTE (Vodafone) K5006-Z */ {QMI_FIXED_INTF(0x19d2, 0x1021, 2)}, {QMI_FIXED_INTF(0x19d2, 0x1245, 4)}, {QMI_FIXED_INTF(0x19d2, 0x1247, 4)}, {QMI_FIXED_INTF(0x19d2, 0x1252, 4)}, {QMI_FIXED_INTF(0x19d2, 0x1254, 4)}, {QMI_FIXED_INTF(0x19d2, 0x1255, 3)}, {QMI_FIXED_INTF(0x19d2, 0x1255, 4)}, {QMI_FIXED_INTF(0x19d2, 0x1256, 4)}, {QMI_FIXED_INTF(0x19d2, 0x1270, 5)}, /* ZTE MF667 */ {QMI_FIXED_INTF(0x19d2, 0x1275, 3)}, /* ZTE P685M */ {QMI_FIXED_INTF(0x19d2, 0x1401, 2)}, {QMI_FIXED_INTF(0x19d2, 0x1402, 2)}, /* ZTE MF60 */ {QMI_FIXED_INTF(0x19d2, 0x1424, 2)}, {QMI_FIXED_INTF(0x19d2, 0x1425, 2)}, {QMI_FIXED_INTF(0x19d2, 0x1426, 2)}, /* ZTE MF91 */ {QMI_FIXED_INTF(0x19d2, 0x1428, 2)}, /* Telewell TW-LTE 4G v2 */ {QMI_FIXED_INTF(0x19d2, 0x1432, 3)}, /* ZTE ME3620 */ 
{QMI_FIXED_INTF(0x19d2, 0x1485, 5)}, /* ZTE MF286D */ {QMI_FIXED_INTF(0x19d2, 0x2002, 4)}, /* ZTE (Vodafone) K3765-Z */ {QMI_FIXED_INTF(0x2001, 0x7e16, 3)}, /* D-Link DWM-221 */ {QMI_FIXED_INTF(0x2001, 0x7e19, 4)}, /* D-Link DWM-221 B1 */ {QMI_FIXED_INTF(0x2001, 0x7e35, 4)}, /* D-Link DWM-222 */ {QMI_FIXED_INTF(0x2001, 0x7e3d, 4)}, /* D-Link DWM-222 A2 */ {QMI_FIXED_INTF(0x2020, 0x2031, 4)}, /* Olicard 600 */ {QMI_FIXED_INTF(0x2020, 0x2033, 4)}, /* BroadMobi BM806U */ {QMI_QUIRK_SET_DTR(0x2020, 0x2060, 4)}, /* BroadMobi BM818 */ {QMI_FIXED_INTF(0x0f3d, 0x68a2, 8)}, /* Sierra Wireless MC7700 */ {QMI_FIXED_INTF(0x114f, 0x68a2, 8)}, /* Sierra Wireless MC7750 */ {QMI_FIXED_INTF(0x1199, 0x68a2, 8)}, /* Sierra Wireless MC7710 in QMI mode */ {QMI_FIXED_INTF(0x1199, 0x68a2, 19)}, /* Sierra Wireless MC7710 in QMI mode */ {QMI_QUIRK_SET_DTR(0x1199, 0x68c0, 8)}, /* Sierra Wireless MC7304/MC7354, WP76xx */ {QMI_QUIRK_SET_DTR(0x1199, 0x68c0, 10)},/* Sierra Wireless MC7304/MC7354 */ {QMI_FIXED_INTF(0x1199, 0x901c, 8)}, /* Sierra Wireless EM7700 */ {QMI_FIXED_INTF(0x1199, 0x901f, 8)}, /* Sierra Wireless EM7355 */ {QMI_FIXED_INTF(0x1199, 0x9041, 8)}, /* Sierra Wireless MC7305/MC7355 */ {QMI_FIXED_INTF(0x1199, 0x9041, 10)}, /* Sierra Wireless MC7305/MC7355 */ {QMI_FIXED_INTF(0x1199, 0x9051, 8)}, /* Netgear AirCard 340U */ {QMI_FIXED_INTF(0x1199, 0x9053, 8)}, /* Sierra Wireless Modem */ {QMI_FIXED_INTF(0x1199, 0x9054, 8)}, /* Sierra Wireless Modem */ {QMI_FIXED_INTF(0x1199, 0x9055, 8)}, /* Netgear AirCard 341U */ {QMI_FIXED_INTF(0x1199, 0x9056, 8)}, /* Sierra Wireless Modem */ {QMI_FIXED_INTF(0x1199, 0x9057, 8)}, {QMI_FIXED_INTF(0x1199, 0x9061, 8)}, /* Sierra Wireless Modem */ {QMI_FIXED_INTF(0x1199, 0x9063, 8)}, /* Sierra Wireless EM7305 */ {QMI_FIXED_INTF(0x1199, 0x9063, 10)}, /* Sierra Wireless EM7305 */ {QMI_QUIRK_SET_DTR(0x1199, 0x9071, 8)}, /* Sierra Wireless MC74xx */ {QMI_QUIRK_SET_DTR(0x1199, 0x9071, 10)},/* Sierra Wireless MC74xx */ {QMI_QUIRK_SET_DTR(0x1199, 0x9079, 8)}, 
/* Sierra Wireless EM74xx */ {QMI_QUIRK_SET_DTR(0x1199, 0x9079, 10)},/* Sierra Wireless EM74xx */ {QMI_QUIRK_SET_DTR(0x1199, 0x907b, 8)}, /* Sierra Wireless EM74xx */ {QMI_QUIRK_SET_DTR(0x1199, 0x907b, 10)},/* Sierra Wireless EM74xx */ {QMI_QUIRK_SET_DTR(0x1199, 0x9091, 8)}, /* Sierra Wireless EM7565 */ {QMI_QUIRK_SET_DTR(0x1199, 0xc081, 8)}, /* Sierra Wireless EM7590 */ {QMI_FIXED_INTF(0x1bbb, 0x011e, 4)}, /* Telekom Speedstick LTE II (Alcatel One Touch L100V LTE) */ {QMI_FIXED_INTF(0x1bbb, 0x0203, 2)}, /* Alcatel L800MA */ {QMI_FIXED_INTF(0x2357, 0x0201, 4)}, /* TP-LINK HSUPA Modem MA180 */ {QMI_FIXED_INTF(0x2357, 0x9000, 4)}, /* TP-LINK MA260 */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x1031, 3)}, /* Telit LE910C1-EUX */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x103a, 0)}, /* Telit LE910C4-WWX */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x1040, 2)}, /* Telit LE922A */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x1050, 2)}, /* Telit FN980 */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x1057, 2)}, /* Telit FN980 */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x1060, 2)}, /* Telit LN920 */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x1070, 2)}, /* Telit FN990 */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x1080, 2)}, /* Telit FE990 */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x10a0, 0)}, /* Telit FN920C04 */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x10a4, 0)}, /* Telit FN920C04 */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x10a9, 0)}, /* Telit FN920C04 */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x10c0, 0)}, /* Telit FE910C04 */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x10c4, 0)}, /* Telit FE910C04 */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x10c8, 0)}, /* Telit FE910C04 */ {QMI_FIXED_INTF(0x1bc7, 0x1100, 3)}, /* Telit ME910 */ {QMI_FIXED_INTF(0x1bc7, 0x1101, 3)}, /* Telit ME910 dual modem */ {QMI_FIXED_INTF(0x1bc7, 0x1200, 5)}, /* Telit LE920 */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x1201, 2)}, /* Telit LE920, LE920A4 */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x1230, 2)}, /* Telit LE910Cx */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x1250, 0)}, /* Telit LE910Cx */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x1260, 2)}, /* Telit LE910Cx */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x1261, 2)}, /* Telit LE910Cx */ 
{QMI_QUIRK_SET_DTR(0x1bc7, 0x1900, 1)}, /* Telit LN940 series */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x3000, 0)}, /* Telit FN912 series */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x3001, 0)}, /* Telit FN912 series */ {QMI_FIXED_INTF(0x1c9e, 0x9801, 3)}, /* Telewell TW-3G HSPA+ */ {QMI_FIXED_INTF(0x1c9e, 0x9803, 4)}, /* Telewell TW-3G HSPA+ */ {QMI_FIXED_INTF(0x1c9e, 0x9b01, 3)}, /* XS Stick W100-2 from 4G Systems */ {QMI_QUIRK_SET_DTR(0x1c9e, 0x9b05, 4)}, /* Longsung U8300 */ {QMI_QUIRK_SET_DTR(0x1c9e, 0x9b3c, 4)}, /* Longsung U9300 */ {QMI_FIXED_INTF(0x0b3c, 0xc000, 4)}, /* Olivetti Olicard 100 */ {QMI_FIXED_INTF(0x0b3c, 0xc001, 4)}, /* Olivetti Olicard 120 */ {QMI_FIXED_INTF(0x0b3c, 0xc002, 4)}, /* Olivetti Olicard 140 */ {QMI_FIXED_INTF(0x0b3c, 0xc004, 6)}, /* Olivetti Olicard 155 */ {QMI_FIXED_INTF(0x0b3c, 0xc005, 6)}, /* Olivetti Olicard 200 */ {QMI_FIXED_INTF(0x0b3c, 0xc00a, 6)}, /* Olivetti Olicard 160 */ {QMI_FIXED_INTF(0x0b3c, 0xc00b, 4)}, /* Olivetti Olicard 500 */ {QMI_FIXED_INTF(0x1e2d, 0x0060, 4)}, /* Cinterion PLxx */ {QMI_QUIRK_SET_DTR(0x1e2d, 0x006f, 8)}, /* Cinterion PLS83/PLS63 */ {QMI_FIXED_INTF(0x1e2d, 0x0053, 4)}, /* Cinterion PHxx,PXxx */ {QMI_FIXED_INTF(0x1e2d, 0x0063, 10)}, /* Cinterion ALASxx (1 RmNet) */ {QMI_FIXED_INTF(0x1e2d, 0x0082, 4)}, /* Cinterion PHxx,PXxx (2 RmNet) */ {QMI_FIXED_INTF(0x1e2d, 0x0082, 5)}, /* Cinterion PHxx,PXxx (2 RmNet) */ {QMI_FIXED_INTF(0x1e2d, 0x0083, 4)}, /* Cinterion PHxx,PXxx (1 RmNet + USB Audio)*/ {QMI_QUIRK_SET_DTR(0x1e2d, 0x00b0, 4)}, /* Cinterion CLS8 */ {QMI_FIXED_INTF(0x1e2d, 0x00b7, 0)}, /* Cinterion MV31 RmNet */ {QMI_FIXED_INTF(0x1e2d, 0x00b9, 0)}, /* Cinterion MV31 RmNet based on new baseline */ {QMI_FIXED_INTF(0x1e2d, 0x00f3, 0)}, /* Cinterion MV32-W-A RmNet */ {QMI_FIXED_INTF(0x1e2d, 0x00f4, 0)}, /* Cinterion MV32-W-B RmNet */ {QMI_FIXED_INTF(0x413c, 0x81a2, 8)}, /* Dell Wireless 5806 Gobi(TM) 4G LTE Mobile Broadband Card */ {QMI_FIXED_INTF(0x413c, 0x81a3, 8)}, /* Dell Wireless 5570 HSPA+ (42Mbps) Mobile Broadband 
Card */ {QMI_FIXED_INTF(0x413c, 0x81a4, 8)}, /* Dell Wireless 5570e HSPA+ (42Mbps) Mobile Broadband Card */ {QMI_FIXED_INTF(0x413c, 0x81a8, 8)}, /* Dell Wireless 5808 Gobi(TM) 4G LTE Mobile Broadband Card */ {QMI_FIXED_INTF(0x413c, 0x81a9, 8)}, /* Dell Wireless 5808e Gobi(TM) 4G LTE Mobile Broadband Card */ {QMI_FIXED_INTF(0x413c, 0x81b1, 8)}, /* Dell Wireless 5809e Gobi(TM) 4G LTE Mobile Broadband Card */ {QMI_FIXED_INTF(0x413c, 0x81b3, 8)}, /* Dell Wireless 5809e Gobi(TM) 4G LTE Mobile Broadband Card (rev3) */ {QMI_FIXED_INTF(0x413c, 0x81b6, 8)}, /* Dell Wireless 5811e */ {QMI_FIXED_INTF(0x413c, 0x81b6, 10)}, /* Dell Wireless 5811e */ {QMI_FIXED_INTF(0x413c, 0x81c2, 8)}, /* Dell Wireless 5811e */ {QMI_FIXED_INTF(0x413c, 0x81cc, 8)}, /* Dell Wireless 5816e */ {QMI_FIXED_INTF(0x413c, 0x81d7, 0)}, /* Dell Wireless 5821e */ {QMI_FIXED_INTF(0x413c, 0x81d7, 1)}, /* Dell Wireless 5821e preproduction config */ {QMI_FIXED_INTF(0x413c, 0x81e0, 0)}, /* Dell Wireless 5821e with eSIM support*/ {QMI_FIXED_INTF(0x413c, 0x81e4, 0)}, /* Dell Wireless 5829e with eSIM support*/ {QMI_FIXED_INTF(0x413c, 0x81e6, 0)}, /* Dell Wireless 5829e */ {QMI_FIXED_INTF(0x03f0, 0x4e1d, 8)}, /* HP lt4111 LTE/EV-DO/HSPA+ Gobi 4G Module */ {QMI_FIXED_INTF(0x03f0, 0x9d1d, 1)}, /* HP lt4120 Snapdragon X5 LTE */ {QMI_QUIRK_SET_DTR(0x22de, 0x9051, 2)}, /* Hucom Wireless HM-211S/K */ {QMI_FIXED_INTF(0x22de, 0x9061, 3)}, /* WeTelecom WPD-600N */ {QMI_QUIRK_SET_DTR(0x1e0e, 0x9001, 5)}, /* SIMCom 7100E, 7230E, 7600E ++ */ {QMI_QUIRK_SET_DTR(0x2c7c, 0x0121, 4)}, /* Quectel EC21 Mini PCIe */ {QMI_QUIRK_SET_DTR(0x2c7c, 0x0191, 4)}, /* Quectel EG91 */ {QMI_QUIRK_SET_DTR(0x2c7c, 0x0195, 4)}, /* Quectel EG95 */ {QMI_FIXED_INTF(0x2c7c, 0x0296, 4)}, /* Quectel BG96 */ {QMI_QUIRK_SET_DTR(0x2c7c, 0x030e, 4)}, /* Quectel EM05GV2 */ {QMI_QUIRK_SET_DTR(0x2c7c, 0x0316, 3)}, /* Quectel RG255C */ {QMI_QUIRK_SET_DTR(0x2cb7, 0x0104, 4)}, /* Fibocom NL678 series */ {QMI_QUIRK_SET_DTR(0x2cb7, 0x0112, 0)}, /* Fibocom FG132 */ 
{QMI_FIXED_INTF(0x0489, 0xe0b4, 0)}, /* Foxconn T77W968 LTE */ {QMI_FIXED_INTF(0x0489, 0xe0b5, 0)}, /* Foxconn T77W968 LTE with eSIM support*/ {QMI_FIXED_INTF(0x2692, 0x9025, 4)}, /* Cellient MPL200 (rebranded Qualcomm 05c6:9025) */ {QMI_QUIRK_SET_DTR(0x1546, 0x1312, 4)}, /* u-blox LARA-R6 01B */ {QMI_QUIRK_SET_DTR(0x1546, 0x1342, 4)}, /* u-blox LARA-L6 */ {QMI_QUIRK_SET_DTR(0x33f8, 0x0104, 4)}, /* Rolling RW101 RMNET */ {QMI_FIXED_INTF(0x2dee, 0x4d22, 5)}, /* MeiG Smart SRM825L */ /* 4. Gobi 1000 devices */ {QMI_GOBI1K_DEVICE(0x05c6, 0x9212)}, /* Acer Gobi Modem Device */ {QMI_GOBI1K_DEVICE(0x03f0, 0x1f1d)}, /* HP un2400 Gobi Modem Device */ {QMI_GOBI1K_DEVICE(0x04da, 0x250d)}, /* Panasonic Gobi Modem device */ {QMI_GOBI1K_DEVICE(0x413c, 0x8172)}, /* Dell Gobi Modem device */ {QMI_GOBI1K_DEVICE(0x1410, 0xa001)}, /* Novatel/Verizon USB-1000 */ {QMI_GOBI1K_DEVICE(0x1410, 0xa002)}, /* Novatel Gobi Modem device */ {QMI_GOBI1K_DEVICE(0x1410, 0xa003)}, /* Novatel Gobi Modem device */ {QMI_GOBI1K_DEVICE(0x1410, 0xa004)}, /* Novatel Gobi Modem device */ {QMI_GOBI1K_DEVICE(0x1410, 0xa005)}, /* Novatel Gobi Modem device */ {QMI_GOBI1K_DEVICE(0x1410, 0xa006)}, /* Novatel Gobi Modem device */ {QMI_GOBI1K_DEVICE(0x1410, 0xa007)}, /* Novatel Gobi Modem device */ {QMI_GOBI1K_DEVICE(0x0b05, 0x1776)}, /* Asus Gobi Modem device */ {QMI_GOBI1K_DEVICE(0x19d2, 0xfff3)}, /* ONDA Gobi Modem device */ {QMI_GOBI1K_DEVICE(0x05c6, 0x9001)}, /* Generic Gobi Modem device */ {QMI_GOBI1K_DEVICE(0x05c6, 0x9002)}, /* Generic Gobi Modem device */ {QMI_GOBI1K_DEVICE(0x05c6, 0x9202)}, /* Generic Gobi Modem device */ {QMI_GOBI1K_DEVICE(0x05c6, 0x9203)}, /* Generic Gobi Modem device */ {QMI_GOBI1K_DEVICE(0x05c6, 0x9222)}, /* Generic Gobi Modem device */ {QMI_GOBI1K_DEVICE(0x05c6, 0x9009)}, /* Generic Gobi Modem device */ /* 5. 
Gobi 2000 and 3000 devices */ {QMI_GOBI_DEVICE(0x413c, 0x8186)}, /* Dell Gobi 2000 Modem device (N0218, VU936) */ {QMI_GOBI_DEVICE(0x413c, 0x8194)}, /* Dell Gobi 3000 Composite */ {QMI_GOBI_DEVICE(0x05c6, 0x920b)}, /* Generic Gobi 2000 Modem device */ {QMI_GOBI_DEVICE(0x05c6, 0x9225)}, /* Sony Gobi 2000 Modem device (N0279, VU730) */ {QMI_GOBI_DEVICE(0x05c6, 0x9245)}, /* Samsung Gobi 2000 Modem device (VL176) */ {QMI_GOBI_DEVICE(0x03f0, 0x251d)}, /* HP Gobi 2000 Modem device (VP412) */ {QMI_GOBI_DEVICE(0x05c6, 0x9215)}, /* Acer Gobi 2000 Modem device (VP413) */ {QMI_FIXED_INTF(0x05c6, 0x9215, 4)}, /* Quectel EC20 Mini PCIe */ {QMI_GOBI_DEVICE(0x05c6, 0x9265)}, /* Asus Gobi 2000 Modem device (VR305) */ {QMI_GOBI_DEVICE(0x05c6, 0x9235)}, /* Top Global Gobi 2000 Modem device (VR306) */ {QMI_GOBI_DEVICE(0x05c6, 0x9275)}, /* iRex Technologies Gobi 2000 Modem device (VR307) */ {QMI_GOBI_DEVICE(0x0af0, 0x8120)}, /* Option GTM681W */ {QMI_GOBI_DEVICE(0x1199, 0x68a5)}, /* Sierra Wireless Modem */ {QMI_GOBI_DEVICE(0x1199, 0x68a9)}, /* Sierra Wireless Modem */ {QMI_GOBI_DEVICE(0x1199, 0x9001)}, /* Sierra Wireless Gobi 2000 Modem device (VT773) */ {QMI_GOBI_DEVICE(0x1199, 0x9002)}, /* Sierra Wireless Gobi 2000 Modem device (VT773) */ {QMI_GOBI_DEVICE(0x1199, 0x9003)}, /* Sierra Wireless Gobi 2000 Modem device (VT773) */ {QMI_GOBI_DEVICE(0x1199, 0x9004)}, /* Sierra Wireless Gobi 2000 Modem device (VT773) */ {QMI_GOBI_DEVICE(0x1199, 0x9005)}, /* Sierra Wireless Gobi 2000 Modem device (VT773) */ {QMI_GOBI_DEVICE(0x1199, 0x9006)}, /* Sierra Wireless Gobi 2000 Modem device (VT773) */ {QMI_GOBI_DEVICE(0x1199, 0x9007)}, /* Sierra Wireless Gobi 2000 Modem device (VT773) */ {QMI_GOBI_DEVICE(0x1199, 0x9008)}, /* Sierra Wireless Gobi 2000 Modem device (VT773) */ {QMI_GOBI_DEVICE(0x1199, 0x9009)}, /* Sierra Wireless Gobi 2000 Modem device (VT773) */ {QMI_GOBI_DEVICE(0x1199, 0x900a)}, /* Sierra Wireless Gobi 2000 Modem device (VT773) */ {QMI_GOBI_DEVICE(0x1199, 0x9011)}, /* Sierra Wireless 
Gobi 2000 Modem device (MC8305) */ {QMI_GOBI_DEVICE(0x16d8, 0x8002)}, /* CMDTech Gobi 2000 Modem device (VU922) */ {QMI_GOBI_DEVICE(0x05c6, 0x9205)}, /* Gobi 2000 Modem device */ {QMI_GOBI_DEVICE(0x1199, 0x9013)}, /* Sierra Wireless Gobi 3000 Modem device (MC8355) */ {QMI_GOBI_DEVICE(0x03f0, 0x371d)}, /* HP un2430 Mobile Broadband Module */ {QMI_GOBI_DEVICE(0x1199, 0x9015)}, /* Sierra Wireless Gobi 3000 Modem device */ {QMI_GOBI_DEVICE(0x1199, 0x9019)}, /* Sierra Wireless Gobi 3000 Modem device */ {QMI_GOBI_DEVICE(0x1199, 0x901b)}, /* Sierra Wireless MC7770 */ {QMI_GOBI_DEVICE(0x12d1, 0x14f1)}, /* Sony Gobi 3000 Composite */ {QMI_GOBI_DEVICE(0x1410, 0xa021)}, /* Foxconn Gobi 3000 Modem device (Novatel E396) */ { } /* END */ }; MODULE_DEVICE_TABLE(usb, products); static bool quectel_ec20_detected(struct usb_interface *intf) { struct usb_device *dev = interface_to_usbdev(intf); if (dev->actconfig && le16_to_cpu(dev->descriptor.idVendor) == 0x05c6 && le16_to_cpu(dev->descriptor.idProduct) == 0x9215 && dev->actconfig->desc.bNumInterfaces == 5) return true; return false; } static int qmi_wwan_probe(struct usb_interface *intf, const struct usb_device_id *prod) { struct usb_device_id *id = (struct usb_device_id *)prod; struct usb_interface_descriptor *desc = &intf->cur_altsetting->desc; /* Workaround to enable dynamic IDs. This disables usbnet * blacklisting functionality. Which, if required, can be * reimplemented here by using a magic "blacklist" value * instead of 0 in the static device id table */ if (!id->driver_info) { dev_dbg(&intf->dev, "setting defaults for dynamic device id\n"); id->driver_info = (unsigned long)&qmi_wwan_info; } /* There are devices where the same interface number can be * configured as different functions. 
We should only bind to * vendor specific functions when matching on interface number */ if (id->match_flags & USB_DEVICE_ID_MATCH_INT_NUMBER && desc->bInterfaceClass != USB_CLASS_VENDOR_SPEC) { dev_dbg(&intf->dev, "Rejecting interface number match for class %02x\n", desc->bInterfaceClass); return -ENODEV; } /* Quectel EC20 quirk where we've QMI on interface 4 instead of 0 */ if (quectel_ec20_detected(intf) && desc->bInterfaceNumber == 0) { dev_dbg(&intf->dev, "Quectel EC20 quirk, skipping interface 0\n"); return -ENODEV; } /* Several Quectel modems supports dynamic interface configuration, so * we need to match on class/subclass/protocol. These values are * identical for the diagnostic- and QMI-interface, but bNumEndpoints is * different. Ignore the current interface if the number of endpoints * equals the number for the diag interface (two). */ if (desc->bNumEndpoints == 2) return -ENODEV; return usbnet_probe(intf, id); } static void qmi_wwan_disconnect(struct usb_interface *intf) { struct usbnet *dev = usb_get_intfdata(intf); struct qmi_wwan_state *info; struct list_head *iter; struct net_device *ldev; LIST_HEAD(list); /* called twice if separate control and data intf */ if (!dev) return; info = (void *)&dev->data; if (info->flags & QMI_WWAN_FLAG_MUX) { if (!rtnl_trylock()) { restart_syscall(); return; } rcu_read_lock(); netdev_for_each_upper_dev_rcu(dev->net, ldev, iter) qmimux_unregister_device(ldev, &list); rcu_read_unlock(); unregister_netdevice_many(&list); rtnl_unlock(); info->flags &= ~QMI_WWAN_FLAG_MUX; } usbnet_disconnect(intf); } static struct usb_driver qmi_wwan_driver = { .name = "qmi_wwan", .id_table = products, .probe = qmi_wwan_probe, .disconnect = qmi_wwan_disconnect, .suspend = qmi_wwan_suspend, .resume = qmi_wwan_resume, .reset_resume = qmi_wwan_resume, .supports_autosuspend = 1, .disable_hub_initiated_lpm = 1, }; module_usb_driver(qmi_wwan_driver); MODULE_AUTHOR("Bjørn Mork <bjorn@mork.no>"); MODULE_DESCRIPTION("Qualcomm MSM Interface (QMI) 
WWAN driver"); MODULE_LICENSE("GPL");
6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Tracepoint definitions for the "sync_trace" trace system.
 *
 * TRACE_SYSTEM is reset and redefined here, and TRACE_INCLUDE_PATH is set
 * before the include guard, so all three are (re)established on every
 * inclusion of this header.
 */
#undef TRACE_SYSTEM
#define TRACE_INCLUDE_PATH ../../drivers/dma-buf
#define TRACE_SYSTEM sync_trace

/*
 * The guard is deliberately skipped when TRACE_HEADER_MULTI_READ is
 * defined, so the body below can be processed more than once.
 */
#if !defined(_TRACE_SYNC_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_SYNC_H

#include "sync_debug.h"
#include <linux/tracepoint.h>

/*
 * sync_timeline - trace event describing one sync timeline.
 * @timeline: timeline to record; its ->name and ->value are captured.
 *
 * The entry stores a string copy of timeline->name and timeline->value in
 * a u32 field, and prints them as "name=%s value=%d".
 *
 * NOTE(review): the field is declared u32 but formatted with %d, so a
 * stored value above INT_MAX would be shown as negative - confirm whether
 * that is intended.
 */
TRACE_EVENT(sync_timeline,
	TP_PROTO(struct sync_timeline *timeline),

	TP_ARGS(timeline),

	TP_STRUCT__entry(
			__string(name, timeline->name)
			__field(u32, value)
	),

	TP_fast_assign(
			__assign_str(name);
			__entry->value = timeline->value;
	),

	TP_printk("name=%s value=%d", __get_str(name), __entry->value)
);

#endif /* if !defined(_TRACE_SYNC_H) || defined(TRACE_HEADER_MULTI_READ) */

/* This part must be outside protection */
#include <trace/define_trace.h>
6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 /* SPDX-License-Identifier: GPL-2.0-only */ /* * This file is part of the Linux kernel. * * Copyright (c) 2011-2014, Intel Corporation * Authors: Fenghua Yu <fenghua.yu@intel.com>, * H. Peter Anvin <hpa@linux.intel.com> */ #ifndef ASM_X86_ARCHRANDOM_H #define ASM_X86_ARCHRANDOM_H #include <asm/processor.h> #include <asm/cpufeature.h> #define RDRAND_RETRY_LOOPS 10 /* Unconditional execution of RDRAND and RDSEED */ static inline bool __must_check rdrand_long(unsigned long *v) { bool ok; unsigned int retry = RDRAND_RETRY_LOOPS; do { asm volatile("rdrand %[out]" CC_SET(c) : CC_OUT(c) (ok), [out] "=r" (*v)); if (ok) return true; } while (--retry); return false; } static inline bool __must_check rdseed_long(unsigned long *v) { bool ok; asm volatile("rdseed %[out]" CC_SET(c) : CC_OUT(c) (ok), [out] "=r" (*v)); return ok; } /* * These are the generic interfaces; they must not be declared if the * stubs in <linux/random.h> are to be invoked. */ static inline size_t __must_check arch_get_random_longs(unsigned long *v, size_t max_longs) { return max_longs && static_cpu_has(X86_FEATURE_RDRAND) && rdrand_long(v) ? 1 : 0; } static inline size_t __must_check arch_get_random_seed_longs(unsigned long *v, size_t max_longs) { return max_longs && static_cpu_has(X86_FEATURE_RDSEED) && rdseed_long(v) ? 1 : 0; } #ifndef CONFIG_UML void x86_init_rdrand(struct cpuinfo_x86 *c); #endif #endif /* ASM_X86_ARCHRANDOM_H */
58 44 11 55 2 6 6 1 1 6 6 2 2 20 8 54 6 49 48 47 7 49 45 20 4 4 3 3 2 1 3 3 2 1 63 8 5 2 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 // SPDX-License-Identifier: GPL-2.0-or-later /* * tcp_diag.c Module for monitoring TCP transport protocols sockets. 
* * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> */ #include <linux/module.h> #include <linux/net.h> #include <linux/sock_diag.h> #include <linux/inet_diag.h> #include <linux/tcp.h> #include <net/netlink.h> #include <net/tcp.h> static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, void *_info) { struct tcp_info *info = _info; if (inet_sk_state_load(sk) == TCP_LISTEN) { r->idiag_rqueue = READ_ONCE(sk->sk_ack_backlog); r->idiag_wqueue = READ_ONCE(sk->sk_max_ack_backlog); } else if (sk->sk_type == SOCK_STREAM) { const struct tcp_sock *tp = tcp_sk(sk); r->idiag_rqueue = max_t(int, READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->copied_seq), 0); r->idiag_wqueue = READ_ONCE(tp->write_seq) - tp->snd_una; } if (info) tcp_get_info(sk, info); } #ifdef CONFIG_TCP_MD5SIG static void tcp_diag_md5sig_fill(struct tcp_diag_md5sig *info, const struct tcp_md5sig_key *key) { info->tcpm_family = key->family; info->tcpm_prefixlen = key->prefixlen; info->tcpm_keylen = key->keylen; memcpy(info->tcpm_key, key->key, key->keylen); if (key->family == AF_INET) info->tcpm_addr[0] = key->addr.a4.s_addr; #if IS_ENABLED(CONFIG_IPV6) else if (key->family == AF_INET6) memcpy(&info->tcpm_addr, &key->addr.a6, sizeof(info->tcpm_addr)); #endif } static int tcp_diag_put_md5sig(struct sk_buff *skb, const struct tcp_md5sig_info *md5sig) { const struct tcp_md5sig_key *key; struct tcp_diag_md5sig *info; struct nlattr *attr; int md5sig_count = 0; hlist_for_each_entry_rcu(key, &md5sig->head, node) md5sig_count++; if (md5sig_count == 0) return 0; attr = nla_reserve(skb, INET_DIAG_MD5SIG, md5sig_count * sizeof(struct tcp_diag_md5sig)); if (!attr) return -EMSGSIZE; info = nla_data(attr); memset(info, 0, md5sig_count * sizeof(struct tcp_diag_md5sig)); hlist_for_each_entry_rcu(key, &md5sig->head, node) { tcp_diag_md5sig_fill(info++, key); if (--md5sig_count == 0) break; } return 0; } #endif static int tcp_diag_put_ulp(struct sk_buff *skb, struct sock *sk, const struct tcp_ulp_ops *ulp_ops) { struct 
nlattr *nest; int err; nest = nla_nest_start_noflag(skb, INET_DIAG_ULP_INFO); if (!nest) return -EMSGSIZE; err = nla_put_string(skb, INET_ULP_INFO_NAME, ulp_ops->name); if (err) goto nla_failure; if (ulp_ops->get_info) err = ulp_ops->get_info(sk, skb); if (err) goto nla_failure; nla_nest_end(skb, nest); return 0; nla_failure: nla_nest_cancel(skb, nest); return err; } static int tcp_diag_get_aux(struct sock *sk, bool net_admin, struct sk_buff *skb) { struct inet_connection_sock *icsk = inet_csk(sk); int err = 0; #ifdef CONFIG_TCP_MD5SIG if (net_admin) { struct tcp_md5sig_info *md5sig; rcu_read_lock(); md5sig = rcu_dereference(tcp_sk(sk)->md5sig_info); if (md5sig) err = tcp_diag_put_md5sig(skb, md5sig); rcu_read_unlock(); if (err < 0) return err; } #endif if (net_admin) { const struct tcp_ulp_ops *ulp_ops; ulp_ops = icsk->icsk_ulp_ops; if (ulp_ops) err = tcp_diag_put_ulp(skb, sk, ulp_ops); if (err) return err; } return 0; } static size_t tcp_diag_get_aux_size(struct sock *sk, bool net_admin) { struct inet_connection_sock *icsk = inet_csk(sk); size_t size = 0; #ifdef CONFIG_TCP_MD5SIG if (net_admin && sk_fullsock(sk)) { const struct tcp_md5sig_info *md5sig; const struct tcp_md5sig_key *key; size_t md5sig_count = 0; rcu_read_lock(); md5sig = rcu_dereference(tcp_sk(sk)->md5sig_info); if (md5sig) { hlist_for_each_entry_rcu(key, &md5sig->head, node) md5sig_count++; } rcu_read_unlock(); size += nla_total_size(md5sig_count * sizeof(struct tcp_diag_md5sig)); } #endif if (net_admin && sk_fullsock(sk)) { const struct tcp_ulp_ops *ulp_ops; ulp_ops = icsk->icsk_ulp_ops; if (ulp_ops) { size += nla_total_size(0) + nla_total_size(TCP_ULP_NAME_MAX); if (ulp_ops->get_info_size) size += ulp_ops->get_info_size(sk); } } return size; } static void tcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { struct inet_hashinfo *hinfo; hinfo = sock_net(cb->skb->sk)->ipv4.tcp_death_row.hashinfo; inet_diag_dump_icsk(hinfo, skb, cb, r); } static int 
tcp_diag_dump_one(struct netlink_callback *cb, const struct inet_diag_req_v2 *req) { struct inet_hashinfo *hinfo; hinfo = sock_net(cb->skb->sk)->ipv4.tcp_death_row.hashinfo; return inet_diag_dump_one_icsk(hinfo, cb, req); } #ifdef CONFIG_INET_DIAG_DESTROY static int tcp_diag_destroy(struct sk_buff *in_skb, const struct inet_diag_req_v2 *req) { struct net *net = sock_net(in_skb->sk); struct inet_hashinfo *hinfo; struct sock *sk; int err; hinfo = net->ipv4.tcp_death_row.hashinfo; sk = inet_diag_find_one_icsk(net, hinfo, req); if (IS_ERR(sk)) return PTR_ERR(sk); err = sock_diag_destroy(sk, ECONNABORTED); sock_gen_put(sk); return err; } #endif static const struct inet_diag_handler tcp_diag_handler = { .owner = THIS_MODULE, .dump = tcp_diag_dump, .dump_one = tcp_diag_dump_one, .idiag_get_info = tcp_diag_get_info, .idiag_get_aux = tcp_diag_get_aux, .idiag_get_aux_size = tcp_diag_get_aux_size, .idiag_type = IPPROTO_TCP, .idiag_info_size = sizeof(struct tcp_info), #ifdef CONFIG_INET_DIAG_DESTROY .destroy = tcp_diag_destroy, #endif }; static int __init tcp_diag_init(void) { return inet_diag_register(&tcp_diag_handler); } static void __exit tcp_diag_exit(void) { inet_diag_unregister(&tcp_diag_handler); } module_init(tcp_diag_init); module_exit(tcp_diag_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("TCP socket monitoring via SOCK_DIAG"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-6 /* AF_INET - IPPROTO_TCP */);
410 405 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 // SPDX-License-Identifier: GPL-2.0 /* Copyright(c) 2016-2020 Intel Corporation. All rights reserved. */ #include <linux/jump_label.h> #include <linux/uaccess.h> #include <linux/export.h> #include <linux/instrumented.h> #include <linux/string.h> #include <linux/types.h> #include <asm/mce.h> #ifdef CONFIG_X86_MCE static DEFINE_STATIC_KEY_FALSE(copy_mc_fragile_key); void enable_copy_mc_fragile(void) { static_branch_inc(&copy_mc_fragile_key); } #define copy_mc_fragile_enabled (static_branch_unlikely(&copy_mc_fragile_key)) /* * Similar to copy_user_handle_tail, probe for the write fault point, or * source exception point. */ __visible notrace unsigned long copy_mc_fragile_handle_tail(char *to, char *from, unsigned len) { for (; len; --len, to++, from++) if (copy_mc_fragile(to, from, 1)) break; return len; } #else /* * No point in doing careful copying, or consulting a static key when * there is no #MC handler in the CONFIG_X86_MCE=n case. */ void enable_copy_mc_fragile(void) { } #define copy_mc_fragile_enabled (0) #endif unsigned long copy_mc_enhanced_fast_string(void *dst, const void *src, unsigned len); /** * copy_mc_to_kernel - memory copy that handles source exceptions * * @dst: destination address * @src: source address * @len: number of bytes to copy * * Call into the 'fragile' version on systems that benefit from avoiding * corner case poison consumption scenarios, For example, accessing * poison across 2 cachelines with a single instruction. Almost all * other uses case can use copy_mc_enhanced_fast_string() for a fast * recoverable copy, or fallback to plain memcpy. 
* * Return 0 for success, or number of bytes not copied if there was an * exception. */ unsigned long __must_check copy_mc_to_kernel(void *dst, const void *src, unsigned len) { unsigned long ret; if (copy_mc_fragile_enabled) { instrument_memcpy_before(dst, src, len); ret = copy_mc_fragile(dst, src, len); instrument_memcpy_after(dst, src, len, ret); return ret; } if (static_cpu_has(X86_FEATURE_ERMS)) { instrument_memcpy_before(dst, src, len); ret = copy_mc_enhanced_fast_string(dst, src, len); instrument_memcpy_after(dst, src, len, ret); return ret; } memcpy(dst, src, len); return 0; } EXPORT_SYMBOL_GPL(copy_mc_to_kernel); unsigned long __must_check copy_mc_to_user(void __user *dst, const void *src, unsigned len) { unsigned long ret; if (copy_mc_fragile_enabled) { instrument_copy_to_user(dst, src, len); __uaccess_begin(); ret = copy_mc_fragile((__force void *)dst, src, len); __uaccess_end(); return ret; } if (static_cpu_has(X86_FEATURE_ERMS)) { instrument_copy_to_user(dst, src, len); __uaccess_begin(); ret = copy_mc_enhanced_fast_string((__force void *)dst, src, len); __uaccess_end(); return ret; } return copy_user_generic((__force void *)dst, src, len); }
168 43 165 170 43 72 62 33 49 61 9 2 50 68 12 1 3 8 60 17 51 19 19 1 18 2 1 1 1 1 2 2 204 227 203 58 51 2 52 2 5 45 2 24 48 48 5 43 76 19 53 1 77 77 1 77 51 1 77 35 42 69 19 1 214 161 1 77 166 6 98 1 9 97 5 96 163 161 2 162 76 4 140 3 139 100 163 163 90 96 31 25 9 64 8 8 8 1 7 2 1 5 2 2 8 3 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 
454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 // SPDX-License-Identifier: GPL-2.0-only /* * vfsv0 quota IO operations on file */ #include <linux/errno.h> #include <linux/fs.h> #include <linux/mount.h> #include <linux/dqblk_v2.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/quotaops.h> #include <asm/byteorder.h> #include "quota_tree.h" MODULE_AUTHOR("Jan Kara"); 
MODULE_DESCRIPTION("Quota trie support"); MODULE_LICENSE("GPL"); /* * Maximum quota tree depth we support. Only to limit recursion when working * with the tree. */ #define MAX_QTREE_DEPTH 6 #define __QUOTA_QT_PARANOIA static int __get_index(struct qtree_mem_dqinfo *info, qid_t id, int depth) { unsigned int epb = info->dqi_usable_bs >> 2; depth = info->dqi_qtree_depth - depth - 1; while (depth--) id /= epb; return id % epb; } static int get_index(struct qtree_mem_dqinfo *info, struct kqid qid, int depth) { qid_t id = from_kqid(&init_user_ns, qid); return __get_index(info, id, depth); } /* Number of entries in one blocks */ static int qtree_dqstr_in_blk(struct qtree_mem_dqinfo *info) { return (info->dqi_usable_bs - sizeof(struct qt_disk_dqdbheader)) / info->dqi_entry_size; } static ssize_t read_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf) { struct super_block *sb = info->dqi_sb; memset(buf, 0, info->dqi_usable_bs); return sb->s_op->quota_read(sb, info->dqi_type, buf, info->dqi_usable_bs, (loff_t)blk << info->dqi_blocksize_bits); } static ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf) { struct super_block *sb = info->dqi_sb; ssize_t ret; ret = sb->s_op->quota_write(sb, info->dqi_type, buf, info->dqi_usable_bs, (loff_t)blk << info->dqi_blocksize_bits); if (ret != info->dqi_usable_bs) { quota_error(sb, "dquota write failed"); if (ret >= 0) ret = -EIO; } return ret; } static inline int do_check_range(struct super_block *sb, const char *val_name, uint val, uint min_val, uint max_val) { if (val < min_val || val > max_val) { quota_error(sb, "Getting %s %u out of range %u-%u", val_name, val, min_val, max_val); return -EUCLEAN; } return 0; } static int check_dquot_block_header(struct qtree_mem_dqinfo *info, struct qt_disk_dqdbheader *dh) { int err = 0; err = do_check_range(info->dqi_sb, "dqdh_next_free", le32_to_cpu(dh->dqdh_next_free), 0, info->dqi_blocks - 1); if (err) return err; err = do_check_range(info->dqi_sb, "dqdh_prev_free", 
le32_to_cpu(dh->dqdh_prev_free), 0, info->dqi_blocks - 1); if (err) return err; err = do_check_range(info->dqi_sb, "dqdh_entries", le16_to_cpu(dh->dqdh_entries), 0, qtree_dqstr_in_blk(info)); return err; } /* Remove empty block from list and return it */ static int get_free_dqblk(struct qtree_mem_dqinfo *info) { char *buf = kmalloc(info->dqi_usable_bs, GFP_KERNEL); struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf; int ret, blk; if (!buf) return -ENOMEM; if (info->dqi_free_blk) { blk = info->dqi_free_blk; ret = read_blk(info, blk, buf); if (ret < 0) goto out_buf; ret = check_dquot_block_header(info, dh); if (ret) goto out_buf; info->dqi_free_blk = le32_to_cpu(dh->dqdh_next_free); } else { memset(buf, 0, info->dqi_usable_bs); /* Assure block allocation... */ ret = write_blk(info, info->dqi_blocks, buf); if (ret < 0) goto out_buf; blk = info->dqi_blocks++; } mark_info_dirty(info->dqi_sb, info->dqi_type); ret = blk; out_buf: kfree(buf); return ret; } /* Insert empty block to the list */ static int put_free_dqblk(struct qtree_mem_dqinfo *info, char *buf, uint blk) { struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf; int err; dh->dqdh_next_free = cpu_to_le32(info->dqi_free_blk); dh->dqdh_prev_free = cpu_to_le32(0); dh->dqdh_entries = cpu_to_le16(0); err = write_blk(info, blk, buf); if (err < 0) return err; info->dqi_free_blk = blk; mark_info_dirty(info->dqi_sb, info->dqi_type); return 0; } /* Remove given block from the list of blocks with free entries */ static int remove_free_dqentry(struct qtree_mem_dqinfo *info, char *buf, uint blk) { char *tmpbuf = kmalloc(info->dqi_usable_bs, GFP_KERNEL); struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf; uint nextblk = le32_to_cpu(dh->dqdh_next_free); uint prevblk = le32_to_cpu(dh->dqdh_prev_free); int err; if (!tmpbuf) return -ENOMEM; if (nextblk) { err = read_blk(info, nextblk, tmpbuf); if (err < 0) goto out_buf; ((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_prev_free = 
dh->dqdh_prev_free; err = write_blk(info, nextblk, tmpbuf); if (err < 0) goto out_buf; } if (prevblk) { err = read_blk(info, prevblk, tmpbuf); if (err < 0) goto out_buf; ((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_next_free = dh->dqdh_next_free; err = write_blk(info, prevblk, tmpbuf); if (err < 0) goto out_buf; } else { info->dqi_free_entry = nextblk; mark_info_dirty(info->dqi_sb, info->dqi_type); } kfree(tmpbuf); dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0); /* No matter whether write succeeds block is out of list */ if (write_blk(info, blk, buf) < 0) quota_error(info->dqi_sb, "Can't write block (%u) " "with free entries", blk); return 0; out_buf: kfree(tmpbuf); return err; } /* Insert given block to the beginning of list with free entries */ static int insert_free_dqentry(struct qtree_mem_dqinfo *info, char *buf, uint blk) { char *tmpbuf = kmalloc(info->dqi_usable_bs, GFP_KERNEL); struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf; int err; if (!tmpbuf) return -ENOMEM; dh->dqdh_next_free = cpu_to_le32(info->dqi_free_entry); dh->dqdh_prev_free = cpu_to_le32(0); err = write_blk(info, blk, buf); if (err < 0) goto out_buf; if (info->dqi_free_entry) { err = read_blk(info, info->dqi_free_entry, tmpbuf); if (err < 0) goto out_buf; ((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_prev_free = cpu_to_le32(blk); err = write_blk(info, info->dqi_free_entry, tmpbuf); if (err < 0) goto out_buf; } kfree(tmpbuf); info->dqi_free_entry = blk; mark_info_dirty(info->dqi_sb, info->dqi_type); return 0; out_buf: kfree(tmpbuf); return err; } /* Is the entry in the block free? 
*/ int qtree_entry_unused(struct qtree_mem_dqinfo *info, char *disk) { int i; for (i = 0; i < info->dqi_entry_size; i++) if (disk[i]) return 0; return 1; } EXPORT_SYMBOL(qtree_entry_unused); /* Find space for dquot */ static uint find_free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot, int *err) { uint blk, i; struct qt_disk_dqdbheader *dh; char *buf = kmalloc(info->dqi_usable_bs, GFP_KERNEL); char *ddquot; *err = 0; if (!buf) { *err = -ENOMEM; return 0; } dh = (struct qt_disk_dqdbheader *)buf; if (info->dqi_free_entry) { blk = info->dqi_free_entry; *err = read_blk(info, blk, buf); if (*err < 0) goto out_buf; *err = check_dquot_block_header(info, dh); if (*err) goto out_buf; } else { blk = get_free_dqblk(info); if ((int)blk < 0) { *err = blk; kfree(buf); return 0; } memset(buf, 0, info->dqi_usable_bs); /* This is enough as the block is already zeroed and the entry * list is empty... */ info->dqi_free_entry = blk; mark_info_dirty(dquot->dq_sb, dquot->dq_id.type); } /* Block will be full? 
*/ if (le16_to_cpu(dh->dqdh_entries) + 1 >= qtree_dqstr_in_blk(info)) { *err = remove_free_dqentry(info, buf, blk); if (*err < 0) { quota_error(dquot->dq_sb, "Can't remove block (%u) " "from entry free list", blk); goto out_buf; } } le16_add_cpu(&dh->dqdh_entries, 1); /* Find free structure in block */ ddquot = buf + sizeof(struct qt_disk_dqdbheader); for (i = 0; i < qtree_dqstr_in_blk(info); i++) { if (qtree_entry_unused(info, ddquot)) break; ddquot += info->dqi_entry_size; } #ifdef __QUOTA_QT_PARANOIA if (i == qtree_dqstr_in_blk(info)) { quota_error(dquot->dq_sb, "Data block full but it shouldn't"); *err = -EIO; goto out_buf; } #endif *err = write_blk(info, blk, buf); if (*err < 0) { quota_error(dquot->dq_sb, "Can't write quota data block %u", blk); goto out_buf; } dquot->dq_off = ((loff_t)blk << info->dqi_blocksize_bits) + sizeof(struct qt_disk_dqdbheader) + i * info->dqi_entry_size; kfree(buf); return blk; out_buf: kfree(buf); return 0; } /* Insert reference to structure into the trie */ static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, uint *blks, int depth) { char *buf = kmalloc(info->dqi_usable_bs, GFP_KERNEL); int ret = 0, newson = 0, newact = 0; __le32 *ref; uint newblk; int i; if (!buf) return -ENOMEM; if (!blks[depth]) { ret = get_free_dqblk(info); if (ret < 0) goto out_buf; for (i = 0; i < depth; i++) if (ret == blks[i]) { quota_error(dquot->dq_sb, "Free block already used in tree: block %u", ret); ret = -EIO; goto out_buf; } blks[depth] = ret; memset(buf, 0, info->dqi_usable_bs); newact = 1; } else { ret = read_blk(info, blks[depth], buf); if (ret < 0) { quota_error(dquot->dq_sb, "Can't read tree quota " "block %u", blks[depth]); goto out_buf; } } ref = (__le32 *)buf; newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); ret = do_check_range(dquot->dq_sb, "block", newblk, 0, info->dqi_blocks - 1); if (ret) goto out_buf; if (!newblk) { newson = 1; } else { for (i = 0; i <= depth; i++) if (newblk == blks[i]) { 
quota_error(dquot->dq_sb, "Cycle in quota tree detected: block %u index %u", blks[depth], get_index(info, dquot->dq_id, depth)); ret = -EIO; goto out_buf; } } blks[depth + 1] = newblk; if (depth == info->dqi_qtree_depth - 1) { #ifdef __QUOTA_QT_PARANOIA if (newblk) { quota_error(dquot->dq_sb, "Inserting already present " "quota entry (block %u)", le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)])); ret = -EIO; goto out_buf; } #endif blks[depth + 1] = find_free_dqentry(info, dquot, &ret); } else { ret = do_insert_tree(info, dquot, blks, depth + 1); } if (newson && ret >= 0) { ref[get_index(info, dquot->dq_id, depth)] = cpu_to_le32(blks[depth + 1]); ret = write_blk(info, blks[depth], buf); } else if (newact && ret < 0) { put_free_dqblk(info, buf, blks[depth]); } out_buf: kfree(buf); return ret; } /* Wrapper for inserting quota structure into tree */ static inline int dq_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot) { uint blks[MAX_QTREE_DEPTH] = { QT_TREEOFF }; #ifdef __QUOTA_QT_PARANOIA if (info->dqi_blocks <= QT_TREEOFF) { quota_error(dquot->dq_sb, "Quota tree root isn't allocated!"); return -EIO; } #endif if (info->dqi_qtree_depth >= MAX_QTREE_DEPTH) { quota_error(dquot->dq_sb, "Quota tree depth too big!"); return -EIO; } return do_insert_tree(info, dquot, blks, 0); } /* * We don't have to be afraid of deadlocks as we never have quotas on quota * files... 
*/ int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) { int type = dquot->dq_id.type; struct super_block *sb = dquot->dq_sb; ssize_t ret; char *ddquot = kmalloc(info->dqi_entry_size, GFP_KERNEL); if (!ddquot) return -ENOMEM; /* dq_off is guarded by dqio_sem */ if (!dquot->dq_off) { ret = dq_insert_tree(info, dquot); if (ret < 0) { quota_error(sb, "Error %zd occurred while creating " "quota", ret); kfree(ddquot); return ret; } } spin_lock(&dquot->dq_dqb_lock); info->dqi_ops->mem2disk_dqblk(ddquot, dquot); spin_unlock(&dquot->dq_dqb_lock); ret = sb->s_op->quota_write(sb, type, ddquot, info->dqi_entry_size, dquot->dq_off); if (ret != info->dqi_entry_size) { quota_error(sb, "dquota write failed"); if (ret >= 0) ret = -ENOSPC; } else { ret = 0; } dqstats_inc(DQST_WRITES); kfree(ddquot); return ret; } EXPORT_SYMBOL(qtree_write_dquot); /* Free dquot entry in data block */ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot, uint blk) { struct qt_disk_dqdbheader *dh; char *buf = kmalloc(info->dqi_usable_bs, GFP_KERNEL); int ret = 0; if (!buf) return -ENOMEM; if (dquot->dq_off >> info->dqi_blocksize_bits != blk) { quota_error(dquot->dq_sb, "Quota structure has offset to " "other block (%u) than it should (%u)", blk, (uint)(dquot->dq_off >> info->dqi_blocksize_bits)); ret = -EIO; goto out_buf; } ret = read_blk(info, blk, buf); if (ret < 0) { quota_error(dquot->dq_sb, "Can't read quota data block %u", blk); goto out_buf; } dh = (struct qt_disk_dqdbheader *)buf; ret = check_dquot_block_header(info, dh); if (ret) goto out_buf; le16_add_cpu(&dh->dqdh_entries, -1); if (!le16_to_cpu(dh->dqdh_entries)) { /* Block got free? 
*/ ret = remove_free_dqentry(info, buf, blk); if (ret >= 0) ret = put_free_dqblk(info, buf, blk); if (ret < 0) { quota_error(dquot->dq_sb, "Can't move quota data block " "(%u) to free list", blk); goto out_buf; } } else { memset(buf + (dquot->dq_off & ((1 << info->dqi_blocksize_bits) - 1)), 0, info->dqi_entry_size); if (le16_to_cpu(dh->dqdh_entries) == qtree_dqstr_in_blk(info) - 1) { /* Insert will write block itself */ ret = insert_free_dqentry(info, buf, blk); if (ret < 0) { quota_error(dquot->dq_sb, "Can't insert quota " "data block (%u) to free entry list", blk); goto out_buf; } } else { ret = write_blk(info, blk, buf); if (ret < 0) { quota_error(dquot->dq_sb, "Can't write quota " "data block %u", blk); goto out_buf; } } } dquot->dq_off = 0; /* Quota is now unattached */ out_buf: kfree(buf); return ret; } /* Remove reference to dquot from tree */ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, uint *blks, int depth) { char *buf = kmalloc(info->dqi_usable_bs, GFP_KERNEL); int ret = 0; uint newblk; __le32 *ref = (__le32 *)buf; int i; if (!buf) return -ENOMEM; ret = read_blk(info, blks[depth], buf); if (ret < 0) { quota_error(dquot->dq_sb, "Can't read quota data block %u", blks[depth]); goto out_buf; } newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); ret = do_check_range(dquot->dq_sb, "block", newblk, QT_TREEOFF, info->dqi_blocks - 1); if (ret) goto out_buf; for (i = 0; i <= depth; i++) if (newblk == blks[i]) { quota_error(dquot->dq_sb, "Cycle in quota tree detected: block %u index %u", blks[depth], get_index(info, dquot->dq_id, depth)); ret = -EIO; goto out_buf; } if (depth == info->dqi_qtree_depth - 1) { ret = free_dqentry(info, dquot, newblk); blks[depth + 1] = 0; } else { blks[depth + 1] = newblk; ret = remove_tree(info, dquot, blks, depth + 1); } if (ret >= 0 && !blks[depth + 1]) { ref[get_index(info, dquot->dq_id, depth)] = cpu_to_le32(0); /* Block got empty? 
*/ for (i = 0; i < (info->dqi_usable_bs >> 2) && !ref[i]; i++) ; /* Don't put the root block into the free block list */ if (i == (info->dqi_usable_bs >> 2) && blks[depth] != QT_TREEOFF) { put_free_dqblk(info, buf, blks[depth]); blks[depth] = 0; } else { ret = write_blk(info, blks[depth], buf); if (ret < 0) quota_error(dquot->dq_sb, "Can't write quota tree block %u", blks[depth]); } } out_buf: kfree(buf); return ret; } /* Delete dquot from tree */ int qtree_delete_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) { uint blks[MAX_QTREE_DEPTH] = { QT_TREEOFF }; if (!dquot->dq_off) /* Even not allocated? */ return 0; if (info->dqi_qtree_depth >= MAX_QTREE_DEPTH) { quota_error(dquot->dq_sb, "Quota tree depth too big!"); return -EIO; } return remove_tree(info, dquot, blks, 0); } EXPORT_SYMBOL(qtree_delete_dquot); /* Find entry in block */ static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot, uint blk) { char *buf = kmalloc(info->dqi_usable_bs, GFP_KERNEL); loff_t ret = 0; int i; char *ddquot; if (!buf) return -ENOMEM; ret = read_blk(info, blk, buf); if (ret < 0) { quota_error(dquot->dq_sb, "Can't read quota tree " "block %u", blk); goto out_buf; } ddquot = buf + sizeof(struct qt_disk_dqdbheader); for (i = 0; i < qtree_dqstr_in_blk(info); i++) { if (info->dqi_ops->is_id(ddquot, dquot)) break; ddquot += info->dqi_entry_size; } if (i == qtree_dqstr_in_blk(info)) { quota_error(dquot->dq_sb, "Quota for id %u referenced but not present", from_kqid(&init_user_ns, dquot->dq_id)); ret = -EIO; goto out_buf; } else { ret = ((loff_t)blk << info->dqi_blocksize_bits) + sizeof(struct qt_disk_dqdbheader) + i * info->dqi_entry_size; } out_buf: kfree(buf); return ret; } /* Find entry for given id in the tree */ static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot, uint *blks, int depth) { char *buf = kmalloc(info->dqi_usable_bs, GFP_KERNEL); loff_t ret = 0; __le32 *ref = (__le32 *)buf; uint blk; int i; if (!buf) return 
-ENOMEM;
	/*
	 * NOTE(review): the head of this function lies before the visible
	 * span; the comments below describe only the statements shown here.
	 */
	ret = read_blk(info, blks[depth], buf);
	if (ret < 0) {
		quota_error(dquot->dq_sb, "Can't read quota tree block %u",
			    blks[depth]);
		goto out_buf;
	}
	/* From here on "no reference" is reported as 0, not as an error. */
	ret = 0;
	blk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
	if (!blk)	/* No reference? */
		goto out_buf;
	/* The referenced block must lie in [QT_TREEOFF, dqi_blocks - 1]. */
	ret = do_check_range(dquot->dq_sb, "block", blk, QT_TREEOFF,
			     info->dqi_blocks - 1);
	if (ret)
		goto out_buf;

	/* Check for cycles in the tree */
	for (i = 0; i <= depth; i++)
		if (blk == blks[i]) {
			quota_error(dquot->dq_sb,
				"Cycle in quota tree detected: block %u index %u",
				blks[depth],
				get_index(info, dquot->dq_id, depth));
			ret = -EIO;
			goto out_buf;
		}
	/* Remember the block chosen for the next level before descending. */
	blks[depth + 1] = blk;
	if (depth < info->dqi_qtree_depth - 1)
		ret = find_tree_dqentry(info, dquot, blks, depth + 1);
	else
		ret = find_block_dqentry(info, dquot, blk);
out_buf:
	kfree(buf);
	return ret;
}

/*
 * Find entry for given id in the tree - wrapper function
 *
 * Seeds the per-level block array with QT_TREEOFF in slot 0 and starts the
 * walk at depth 0. Fails with -EIO (after logging) when dqi_qtree_depth is
 * not smaller than MAX_QTREE_DEPTH, i.e. when the array could not hold one
 * entry per level plus the one written for the level below.
 */
static inline loff_t find_dqentry(struct qtree_mem_dqinfo *info,
				  struct dquot *dquot)
{
	uint blks[MAX_QTREE_DEPTH] = { QT_TREEOFF };

	if (info->dqi_qtree_depth >= MAX_QTREE_DEPTH) {
		quota_error(dquot->dq_sb, "Quota tree depth too big!");
		return -EIO;
	}
	return find_tree_dqentry(info, dquot, blks, 0);
}

/*
 * Fill @dquot->dq_dqb from the quota file.
 *
 * If dquot->dq_off is not yet known it is looked up with find_dqentry().
 * When no entry is found (lookup result <= 0) the dquot is marked
 * DQ_FAKE_B, its dq_dqb is zeroed, dq_off is reset to 0 and the lookup
 * result (0 or a negative errno) is returned.
 *
 * Otherwise one entry of info->dqi_entry_size bytes is read through
 * sb->s_op->quota_read() and converted with dqi_ops->disk2mem_dqblk()
 * under dq_dqb_lock. A short read is turned into -EIO. An entry whose four
 * hard/soft limits are all zero is also flagged DQ_FAKE_B.
 *
 * Returns a negative errno on failure, 0 when there is no entry, and on a
 * successful read the byte count returned by ->quota_read() (equal to
 * dqi_entry_size). Note that the -EIO (paranoia) and -ENOMEM early returns
 * do not pass through the DQST_READS accounting at "out".
 */
int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
{
	int type = dquot->dq_id.type;
	struct super_block *sb = dquot->dq_sb;
	loff_t offset;
	char *ddquot;
	int ret = 0;

#ifdef __QUOTA_QT_PARANOIA
	/* Invalidated quota? */
	if (!sb_dqopt(dquot->dq_sb)->files[type]) {
		quota_error(sb, "Quota invalidated while reading!");
		return -EIO;
	}
#endif
	/* Do we know offset of the dquot entry in the quota file? */
	if (!dquot->dq_off) {
		offset = find_dqentry(info, dquot);
		if (offset <= 0) {	/* Entry not present? */
			if (offset < 0)
				quota_error(sb,"Can't read quota structure "
					    "for id %u",
					    from_kqid(&init_user_ns,
						      dquot->dq_id));
			dquot->dq_off = 0;
			set_bit(DQ_FAKE_B, &dquot->dq_flags);
			memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk));
			ret = offset;
			goto out;
		}
		dquot->dq_off = offset;
	}
	/* Bounce buffer for one on-disk entry; freed on every path below. */
	ddquot = kmalloc(info->dqi_entry_size, GFP_KERNEL);
	if (!ddquot)
		return -ENOMEM;
	ret = sb->s_op->quota_read(sb, type, ddquot, info->dqi_entry_size,
				   dquot->dq_off);
	if (ret != info->dqi_entry_size) {
		/* A short but non-negative read is still an I/O error. */
		if (ret >= 0)
			ret = -EIO;
		quota_error(sb, "Error while reading quota structure for id %u",
			    from_kqid(&init_user_ns, dquot->dq_id));
		set_bit(DQ_FAKE_B, &dquot->dq_flags);
		memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk));
		kfree(ddquot);
		goto out;
	}
	spin_lock(&dquot->dq_dqb_lock);
	info->dqi_ops->disk2mem_dqblk(dquot, ddquot);
	/* No limits configured at all: flag the dquot as fake. */
	if (!dquot->dq_dqb.dqb_bhardlimit &&
	    !dquot->dq_dqb.dqb_bsoftlimit &&
	    !dquot->dq_dqb.dqb_ihardlimit &&
	    !dquot->dq_dqb.dqb_isoftlimit)
		set_bit(DQ_FAKE_B, &dquot->dq_flags);
	spin_unlock(&dquot->dq_dqb_lock);
	kfree(ddquot);
out:
	dqstats_inc(DQST_READS);
	return ret;
}
EXPORT_SYMBOL(qtree_read_dquot);

/* Check whether dquot should not be deleted. We know we are
 * the only one operating on dquot (thanks to dq_lock)
 *
 * The entry is deleted only when it is flagged DQ_FAKE_B and both
 * dqb_curinodes and dqb_curspace are zero; in that case the result of
 * qtree_delete_dquot() is returned, otherwise 0. */
int qtree_release_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
{
	if (test_bit(DQ_FAKE_B, &dquot->dq_flags) &&
	    !(dquot->dq_dqb.dqb_curinodes | dquot->dq_dqb.dqb_curspace))
		return qtree_delete_dquot(info, dquot);
	return 0;
}
EXPORT_SYMBOL(qtree_release_dquot);

/*
 * Advance *@id to the first id >= *@id that has a reference in the subtree
 * rooted at block @blk, which sits at level @depth of the tree.
 *
 * The block is read into a temporary buffer and scanned from the slot that
 * __get_index() yields for *@id. Every empty slot skips the whole range of
 * ids below it: *@id grows by level_inc, which is the number of references
 * per block (dqi_usable_bs / 4) raised to the number of levels remaining
 * below @depth. A non-zero slot is range-checked; on the last level it ends
 * the search with 0, on inner levels the search recurses and carries on
 * with the next slot only when the child reports -ENOENT.
 *
 * Returns 0 when an id was found, -ENOENT when the scan ran off the end of
 * the block, -ENOMEM if the buffer could not be allocated, or the error
 * from read_blk() / do_check_range().
 */
static int find_next_id(struct qtree_mem_dqinfo *info, qid_t *id,
			unsigned int blk, int depth)
{
	char *buf = kmalloc(info->dqi_usable_bs, GFP_KERNEL);
	__le32 *ref = (__le32 *)buf;
	ssize_t ret;
	unsigned int epb = info->dqi_usable_bs >> 2;
	unsigned int level_inc = 1;
	int i;

	if (!buf)
		return -ENOMEM;

	/* Ids covered by one slot at this level. */
	for (i = depth; i < info->dqi_qtree_depth - 1; i++)
		level_inc *= epb;

	ret = read_blk(info, blk, buf);
	if (ret < 0) {
		quota_error(info->dqi_sb,
			    "Can't read quota tree block %u", blk);
		goto out_buf;
	}
	for (i = __get_index(info, *id, depth); i < epb; i++) {
		uint blk_no = le32_to_cpu(ref[i]);

		if (blk_no == 0) {
			*id += level_inc;
			continue;
		}
		ret = do_check_range(info->dqi_sb, "block", blk_no, 0,
				     info->dqi_blocks - 1);
		if (ret)
			goto out_buf;
		/* Last level: a non-zero reference means *id exists. */
		if (depth == info->dqi_qtree_depth - 1) {
			ret = 0;
			goto out_buf;
		}
		ret = find_next_id(info, id, blk_no, depth + 1);
		if (ret != -ENOENT)
			break;
	}
	/* Loop ran to completion without a hit or a hard error. */
	if (i == epb) {
		ret = -ENOENT;
		goto out_buf;
	}
out_buf:
	kfree(buf);
	return ret;
}

/*
 * Exported wrapper around find_next_id(): converts @qid to a raw id in
 * init_user_ns, searches from the tree root (QT_TREEOFF, depth 0) and, on
 * success, writes the id found back into *@qid keeping its type.
 * Returns 0 on success or the negative value from find_next_id(); *@qid is
 * left untouched on failure.
 */
int qtree_get_next_id(struct qtree_mem_dqinfo *info, struct kqid *qid)
{
	qid_t id = from_kqid(&init_user_ns, *qid);
	int ret;

	ret = find_next_id(info, &id, QT_TREEOFF, 0);
	if (ret < 0)
		return ret;
	*qid = make_kqid(&init_user_ns, qid->type, id);
	return 0;
}
EXPORT_SYMBOL(qtree_get_next_id);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483
/*
 * NOTE(review): the run of integers above looks like line-number gutter
 * residue from the report this text was extracted from; it is not C.
 */
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * (C) 2001 Clemson University and The University of Chicago
 *
 * See COPYING in top-level directory.
 */

/*
 *  The ORANGEFS Linux kernel support allows ORANGEFS volumes to be mounted and
 *  accessed through the Linux VFS (i.e. using standard I/O system calls).
 *  This support is only needed on clients that wish to mount the file system.
 *
 */

/*
 *  Declarations and macros for the ORANGEFS Linux kernel support.
 */

#ifndef __ORANGEFSKERNEL_H
#define __ORANGEFSKERNEL_H

#include <linux/kernel.h>
#include <linux/moduleparam.h>
#include <linux/statfs.h>
#include <linux/backing-dev.h>
#include <linux/device.h>
#include <linux/mpage.h>
#include <linux/namei.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/fs.h>
#include <linux/vmalloc.h>

#include <linux/aio.h>
#include <linux/posix_acl.h>
#include <linux/posix_acl_xattr.h>
#include <linux/compat.h>
#include <linux/mount.h>
#include <linux/uaccess.h>
#include <linux/atomic.h>
#include <linux/uio.h>
#include <linux/sched/signal.h>
#include <linux/mm.h>
#include <linux/wait.h>
#include <linux/dcache.h>
#include <linux/pagemap.h>
#include <linux/poll.h>
#include <linux/rwsem.h>
#include <linux/xattr.h>
#include <linux/exportfs.h>
#include <linux/hashtable.h>

#include <linux/unaligned.h>

#include "orangefs-dev-proto.h"

/* Timeouts, in seconds as the names say. */
#define ORANGEFS_DEFAULT_OP_TIMEOUT_SECS       20

#define ORANGEFS_BUFMAP_WAIT_TIMEOUT_SECS   30

#define ORANGEFS_DEFAULT_SLOT_TIMEOUT_SECS     900	/* 15 minutes */

#define ORANGEFS_REQDEVICE_NAME          "pvfs2-req"

#define ORANGEFS_DEVREQ_MAGIC             0x20030529
#define ORANGEFS_PURGE_RETRY_COUNT     0x00000005

/*
 * Upper bounds for one device request in each direction: two __s32 values
 * and one __u64 in front of the upcall/downcall structure.
 */
#define MAX_DEV_REQ_UPSIZE (2 * sizeof(__s32) +   \
sizeof(__u64) + sizeof(struct orangefs_upcall_s))
#define MAX_DEV_REQ_DOWNSIZE (2 * sizeof(__s32) + \
sizeof(__u64) + sizeof(struct orangefs_downcall_s))

/*
 * valid orangefs kernel operation states
 *
 * unknown  - op was just initialized
 * waiting  - op is on request_list (upward bound)
 * inprogr  - op is in progress (waiting for downcall)
 * serviced - op has matching downcall; ok
 * purged   - op has to start a timer since client-core
 *            exited uncleanly before servicing op
 * given up - submitter has given up waiting for it
 *
 * The non-zero values are distinct bits; the op_state_*() tests below
 * use bitwise AND and set_op_state_purged() ORs its bit in.
 */
enum orangefs_vfs_op_states {
	OP_VFS_STATE_UNKNOWN = 0,
	OP_VFS_STATE_WAITING = 1,
	OP_VFS_STATE_INPROGR = 2,
	OP_VFS_STATE_SERVICED = 4,
	OP_VFS_STATE_PURGED = 8,
	OP_VFS_STATE_GIVEN_UP = 16,
};

extern const struct xattr_handler * const orangefs_xattr_handlers[];

extern struct posix_acl *orangefs_get_acl(struct inode *inode, int type, bool rcu);
extern int orangefs_set_acl(struct mnt_idmap *idmap,
			    struct dentry *dentry, struct posix_acl *acl,
			    int type);
int __orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type);

/*
 * orangefs data structures
 */
struct orangefs_kernel_op_s {
	enum orangefs_vfs_op_states op_state;
	__u64 tag;

	/*
	 * Set uses_shared_memory to non zero if this operation uses
	 * shared memory. If true, then a retry on the op must also
	 * get a new shared memory buffer and re-populate it.
	 * Cancels don't care - it only matters for service_operation()
	 * retry logic and cancels don't go through it anymore. It
	 * safely stays non-zero when we use it as slot_to_free.
	 */
	union {
		int uses_shared_memory;
		int slot_to_free;
	};

	struct orangefs_upcall_s upcall;
	struct orangefs_downcall_s downcall;

	/* Completed by set_op_state_serviced() / set_op_state_purged(). */
	struct completion waitq;
	/* Taken by set_op_state_purged() around its state change. */
	spinlock_t lock;

	int attempts;

	struct list_head list;
};

/* Plain assignments: these replace the whole state word. */
#define set_op_state_waiting(op)     ((op)->op_state = OP_VFS_STATE_WAITING)
#define set_op_state_inprogress(op)  ((op)->op_state = OP_VFS_STATE_INPROGR)
#define set_op_state_given_up(op)  ((op)->op_state = OP_VFS_STATE_GIVEN_UP)
/* Mark @op serviced and complete its waitq. */
static inline void set_op_state_serviced(struct orangefs_kernel_op_s *op)
{
	op->op_state = OP_VFS_STATE_SERVICED;
	complete(&op->waitq);
}

/* State tests: non-zero when the corresponding bit is set. */
#define op_state_waiting(op)     ((op)->op_state & OP_VFS_STATE_WAITING)
#define op_state_in_progress(op) ((op)->op_state & OP_VFS_STATE_INPROGR)
#define op_state_serviced(op)    ((op)->op_state & OP_VFS_STATE_SERVICED)
#define op_state_purged(op)      ((op)->op_state & OP_VFS_STATE_PURGED)
#define op_state_given_up(op)    ((op)->op_state & OP_VFS_STATE_GIVEN_UP)
#define op_is_cancel(op)         ((op)->upcall.type == ORANGEFS_VFS_OP_CANCEL)

void op_release(struct orangefs_kernel_op_s *op);

extern void orangefs_bufmap_put(int);
/* Hand back the slot recorded in op->slot_to_free, then release the op. */
static inline void put_cancel(struct orangefs_kernel_op_s *op)
{
	orangefs_bufmap_put(op->slot_to_free);
	op_release(op);
}

/*
 * Mark @op purged, under op->lock.
 *
 * A cancel op is not flagged: it is unlinked from its list and disposed of
 * with put_cancel() after the lock is dropped. Any other op gets
 * OP_VFS_STATE_PURGED ORed into its state and its waitq completed before
 * the lock is released.
 */
static inline void set_op_state_purged(struct orangefs_kernel_op_s *op)
{
	spin_lock(&op->lock);
	if (unlikely(op_is_cancel(op))) {
		list_del_init(&op->list);
		spin_unlock(&op->lock);
		put_cancel(op);
	} else {
		op->op_state |= OP_VFS_STATE_PURGED;
		complete(&op->waitq);
		spin_unlock(&op->lock);
	}
}

/* per inode private orangefs info */
struct orangefs_inode_s {
	struct orangefs_object_kref refn;
	char link_target[ORANGEFS_NAME_MAX];
	/*
	 * Reading/Writing Extended attributes need to acquire the appropriate
	 * reader/writer semaphore on the orangefs_inode_s structure.
	 */
	struct rw_semaphore xattr_sem;

	/* Embedded VFS inode; ORANGEFS_I() maps back from it. */
	struct inode vfs_inode;
	sector_t last_failed_block_index_read;

	unsigned long getattr_time;
	unsigned long mapping_time;
	int attr_valid;
	kuid_t attr_uid;
	kgid_t attr_gid;
	unsigned long bitlock;

	DECLARE_HASHTABLE(xattr_cache, 4);
};

/* per superblock private orangefs info */
struct orangefs_sb_info_s {
	struct orangefs_khandle root_khandle;
	__s32 fs_id;
	int id;
	/* Bitmask of the ORANGEFS_OPT_* values defined just below. */
	int flags;
#define ORANGEFS_OPT_INTR	0x01
#define ORANGEFS_OPT_LOCAL_LOCK	0x02
	char devname[ORANGEFS_MAX_SERVER_ADDR_LEN];
	struct super_block *sb;
	int mount_pending;
	int no_list;
	struct list_head list;
};

struct orangefs_stats {
	unsigned long cache_hits;
	unsigned long cache_misses;
	unsigned long reads;
	unsigned long writes;
};

/* One entry of the per-inode xattr_cache hashtable. */
struct orangefs_cached_xattr {
	struct hlist_node node;
	char key[ORANGEFS_MAX_XATTR_NAMELEN];
	char val[ORANGEFS_MAX_XATTR_VALUELEN];
	ssize_t length;
	unsigned long timeout;
};

struct orangefs_write_range {
	loff_t pos;
	size_t len;
	kuid_t uid;
	kgid_t gid;
};

extern struct orangefs_stats orangefs_stats;

/*
 * NOTE: See Documentation/filesystems/porting.rst for information
 * on implementing FOO_I and properly accessing fs private data
 */
static inline struct orangefs_inode_s *ORANGEFS_I(struct inode *inode)
{
	return container_of(inode, struct orangefs_inode_s, vfs_inode);
}

/* Private superblock info, as stored in sb->s_fs_info. */
static inline struct orangefs_sb_info_s *ORANGEFS_SB(struct super_block *sb)
{
	return (struct orangefs_sb_info_s *) sb->s_fs_info;
}

/* ino_t descends from "unsigned long", 8 bytes, 64 bits. */
/*
 * Fold the 16 bytes of @khandle into an 8-byte value by XOR-ing bytes in
 * pairs (0..3 with 4..7, 12..15 with 8..11) and return it as an ino_t.
 * The result is assembled byte-wise through a union with a __u64.
 */
static inline ino_t orangefs_khandle_to_ino(struct orangefs_khandle *khandle)
{
	union {
		unsigned char u[8];
		__u64 ino;
	} ihandle;

	ihandle.u[0] = khandle->u[0] ^ khandle->u[4];
	ihandle.u[1] = khandle->u[1] ^ khandle->u[5];
	ihandle.u[2] = khandle->u[2] ^ khandle->u[6];
	ihandle.u[3] = khandle->u[3] ^ khandle->u[7];
	ihandle.u[4] = khandle->u[12] ^ khandle->u[8];
	ihandle.u[5] = khandle->u[13] ^ khandle->u[9];
	ihandle.u[6] = khandle->u[14] ^ khandle->u[10];
	ihandle.u[7] = khandle->u[15] ^ khandle->u[11];

	return ihandle.ino;
}

/* Address of the khandle stored in the orangefs inode wrapping @inode. */
static inline struct orangefs_khandle *get_khandle_from_ino(struct inode *inode)
{
	return &(ORANGEFS_I(inode)->refn.khandle);
}

/*
 * Returns 1 when ORANGEFS_khandle_cmp() yields 0 for the superblock's
 * root_khandle and @inode's khandle, and 0 otherwise. Both handles are
 * logged at GOSSIP_DCACHE_DEBUG first.
 */
static inline int is_root_handle(struct inode *inode)
{
	gossip_debug(GOSSIP_DCACHE_DEBUG,
		     "%s: root handle: %pU, this handle: %pU:\n",
		     __func__,
		     &ORANGEFS_SB(inode->i_sb)->root_khandle,
		     get_khandle_from_ino(inode));

	if (ORANGEFS_khandle_cmp(&(ORANGEFS_SB(inode->i_sb)->root_khandle),
			     get_khandle_from_ino(inode)))
		return 0;
	else
		return 1;
}

/*
 * Returns 1 when ORANGEFS_khandle_cmp() yields 0 for @resp_handle and
 * @inode's khandle, and 0 otherwise. @resp_handle is passed by value.
 */
static inline int match_handle(struct orangefs_khandle resp_handle,
			       struct inode *inode)
{
	gossip_debug(GOSSIP_DCACHE_DEBUG,
		     "%s: one handle: %pU, another handle:%pU:\n",
		     __func__,
		     &resp_handle,
		     get_khandle_from_ino(inode));

	if (ORANGEFS_khandle_cmp(&resp_handle, get_khandle_from_ino(inode)))
		return 0;
	else
		return 1;
}

/*
 * defined in orangefs-cache.c
 */
int op_cache_initialize(void);
int op_cache_finalize(void);
struct orangefs_kernel_op_s *op_alloc(__s32 type);
void orangefs_new_tag(struct orangefs_kernel_op_s *op);
char *get_opname_string(struct orangefs_kernel_op_s *new_op);

int orangefs_inode_cache_initialize(void);
int orangefs_inode_cache_finalize(void);

/*
 * defined in orangefs-mod.c
 */
void purge_inprogress_ops(void);

/*
 * defined in waitqueue.c
 */
void purge_waiting_ops(void);

/*
 * defined in super.c
 */
extern uint64_t orangefs_features;

struct dentry *orangefs_mount(struct file_system_type *fst,
			   int flags,
			   const char *devname,
			   void *data);

void orangefs_kill_sb(struct super_block *sb);
int orangefs_remount(struct orangefs_sb_info_s *);

int fsid_key_table_initialize(void);
void fsid_key_table_finalize(void);

/*
 * defined in inode.c
 */
vm_fault_t orangefs_page_mkwrite(struct vm_fault *);
struct inode *orangefs_new_inode(struct super_block *sb,
			      struct inode *dir,
			      umode_t mode,
			      dev_t dev,
			      struct orangefs_object_kref *ref);

int __orangefs_setattr(struct inode *, struct iattr *);
int __orangefs_setattr_mode(struct dentry *dentry, struct iattr *iattr);
int orangefs_setattr(struct mnt_idmap *, struct dentry *, struct iattr *);

int orangefs_getattr(struct mnt_idmap *idmap, const struct path *path,
		     struct kstat *stat, u32 request_mask, unsigned int flags);

int orangefs_permission(struct mnt_idmap *idmap,
			struct inode *inode, int mask);

int orangefs_update_time(struct inode *, int);

/*
 * defined in xattr.c
 */
ssize_t orangefs_listxattr(struct dentry *dentry, char *buffer, size_t size);

/*
 * defined in namei.c
 */
struct inode *orangefs_iget(struct super_block *sb,
			 struct orangefs_object_kref *ref);

/*
 * defined in devorangefs-req.c
 */
extern uint32_t orangefs_userspace_version;

int orangefs_dev_init(void);
void orangefs_dev_cleanup(void);
int is_daemon_in_service(void);
bool __is_daemon_in_service(void);

/*
 * defined in file.c
 */
int orangefs_revalidate_mapping(struct inode *);
ssize_t wait_for_direct_io(enum ORANGEFS_io_type, struct inode *, loff_t *,
    struct iov_iter *, size_t, loff_t, struct orangefs_write_range *, int *,
    struct file *);
ssize_t do_readv_writev(enum ORANGEFS_io_type, struct file *, loff_t *,
    struct iov_iter *);

/*
 * defined in orangefs-utils.c
 */
__s32 fsid_of_op(struct orangefs_kernel_op_s *op);

ssize_t orangefs_inode_getxattr(struct inode *inode,
			     const char *name,
			     void *buffer,
			     size_t size);

int orangefs_inode_setxattr(struct inode *inode,
			 const char *name,
			 const void *value,
			 size_t size,
			 int flags);

/* Values for the int argument of orangefs_inode_getattr(). */
#define ORANGEFS_GETATTR_NEW 1
#define ORANGEFS_GETATTR_SIZE 2

int orangefs_inode_getattr(struct inode *, int);

int orangefs_inode_check_changed(struct inode *inode);

int orangefs_inode_setattr(struct inode *inode);

bool orangefs_cancel_op_in_progress(struct orangefs_kernel_op_s *op);

int orangefs_normalize_to_errno(__s32 error_code);

extern struct mutex orangefs_request_mutex;
extern int op_timeout_secs;
extern int slot_timeout_secs;
extern int orangefs_cache_timeout_msecs;
extern int orangefs_dcache_timeout_msecs;
extern int orangefs_getattr_timeout_msecs;
extern struct list_head orangefs_superblocks;
extern spinlock_t orangefs_superblocks_lock;
extern struct list_head orangefs_request_list;
extern spinlock_t orangefs_request_list_lock;
extern wait_queue_head_t orangefs_request_list_waitq;
extern struct list_head *orangefs_htable_ops_in_progress;
extern spinlock_t orangefs_htable_ops_in_progress_lock;
extern int hash_table_size;

extern const struct file_operations orangefs_file_operations;
extern const struct inode_operations orangefs_symlink_inode_operations;
extern const struct inode_operations orangefs_dir_inode_operations;
extern const struct file_operations orangefs_dir_operations;
extern const struct dentry_operations orangefs_dentry_operations;

/*
 * misc convenience macros
 */

/* Bit flags for the "flags" argument of service_operation(). */
#define ORANGEFS_OP_INTERRUPTIBLE 1   /* service_operation() is interruptible */
#define ORANGEFS_OP_PRIORITY      2   /* service_operation() is high priority */
#define ORANGEFS_OP_CANCELLATION  4   /* this is a cancellation */
#define ORANGEFS_OP_NO_MUTEX      8   /* don't acquire request_mutex */
#define ORANGEFS_OP_ASYNC         16  /* Queue it, but don't wait */
#define ORANGEFS_OP_WRITEBACK     32

int service_operation(struct orangefs_kernel_op_s *op,
		      const char *op_name,
		      int flags);

/*
 * ORANGEFS_OP_INTERRUPTIBLE when the superblock of @inode has
 * ORANGEFS_OPT_INTR set, 0 otherwise. The argument is not parenthesized in
 * the expansion, so pass a plain identifier or postfix expression.
 */
#define get_interruptible_flag(inode) \
	((ORANGEFS_SB(inode->i_sb)->flags & ORANGEFS_OPT_INTR) ? \
		ORANGEFS_OP_INTERRUPTIBLE : 0)

/*
 * Fill @sys_attr with defaults for a new object: owner and group from the
 * current fsuid/fsgid mapped into init_user_ns, perms translated from
 * @mode, all three timestamps zero, mask set to "all setable".
 * The @type parameter does not appear in the expansion.
 */
#define fill_default_sys_attrs(sys_attr, type, mode)			\
do {									\
	sys_attr.owner = from_kuid(&init_user_ns, current_fsuid()); \
	sys_attr.group = from_kgid(&init_user_ns, current_fsgid()); \
	sys_attr.perms = ORANGEFS_util_translate_mode(mode);		\
	sys_attr.mtime = 0;						\
	sys_attr.atime = 0;						\
	sys_attr.ctime = 0;						\
	sys_attr.mask = ORANGEFS_ATTR_SYS_ALL_SETABLE;			\
} while (0)

/*
 * Store an expiry time in dentry->d_fsdata: the current jiffies plus
 * orangefs_dcache_timeout_msecs converted to jiffies with integer
 * arithmetic (msecs * HZ / 1000). The value is stashed in the pointer
 * field itself, not in pointed-to memory.
 */
static inline void orangefs_set_timeout(struct dentry *dentry)
{
	unsigned long time = jiffies + orangefs_dcache_timeout_msecs*HZ/1000;

	dentry->d_fsdata = (void *) time;
}

#endif /* __ORANGEFSKERNEL_H */
1 1 1 1 4 1 2 1 1 4 1 1 4 1 1 1 1 2 1 1 1 1 1 1 1 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313
/*
 * NOTE(review): the run of integers above looks like hit-count and
 * line-number gutter residue from the report this text was extracted from;
 * it is not C.
 */
// SPDX-License-Identifier: GPL-2.0-only
/*
 * kernel/ksysfs.c - sysfs attributes in /sys/kernel, which
 * 		     are not related to any other subsystem
 *
 * Copyright (C) 2004 Kay Sievers <kay.sievers@vrfy.org>
 */

#include <asm/byteorder.h>
#include <linux/kobject.h>
#include <linux/string.h>
#include <linux/sysfs.h>
#include <linux/export.h>
#include <linux/init.h>
#include <linux/kexec.h>
#include <linux/profile.h>
#include <linux/stat.h>
#include <linux/sched.h>
#include <linux/capability.h>
#include <linux/compiler.h>

#include <linux/rcupdate.h>	/* rcu_expedited and rcu_normal */

/* Compile-time string reported by the cpu_byteorder attribute. */
#if defined(__LITTLE_ENDIAN)
#define CPU_BYTEORDER_STRING	"little"
#elif defined(__BIG_ENDIAN)
#define CPU_BYTEORDER_STRING	"big"
#else
#error Unknown byteorder
#endif

/*
 * Define a static kobj_attribute called <name>_attr. The _RO form is wired
 * through __ATTR_RO(), the _RW form through __ATTR_RW(); the handlers are
 * the <name>_show / <name>_store functions defined next to each use.
 */
#define KERNEL_ATTR_RO(_name) \
static struct kobj_attribute _name##_attr = __ATTR_RO(_name)

#define KERNEL_ATTR_RW(_name) \
static struct kobj_attribute _name##_attr = __ATTR_RW(_name)

/* current uevent sequence number */
static ssize_t uevent_seqnum_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%llu\n", (u64)atomic64_read(&uevent_seqnum));
}
KERNEL_ATTR_RO(uevent_seqnum);

/* cpu byteorder */
static ssize_t cpu_byteorder_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%s\n", CPU_BYTEORDER_STRING);
}
KERNEL_ATTR_RO(cpu_byteorder);

/* address bits */
static ssize_t address_bits_show(struct kobject *kobj,
				 struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%zu\n", sizeof(void *) * 8 /* CHAR_BIT */);
}
KERNEL_ATTR_RO(address_bits);

#ifdef CONFIG_UEVENT_HELPER
/* uevent helper program, used during early boot */
static ssize_t uevent_helper_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%s\n", uevent_helper);
}
/*
 * Replace the helper path with the written bytes.
 *
 * Input that would not fit together with its terminating NUL in
 * UEVENT_HELPER_PATH_LEN bytes is rejected with -ENOENT. Otherwise the
 * bytes are copied, NUL-terminated, and a single trailing newline (if any)
 * is overwritten with NUL. Returns @count on success.
 */
static ssize_t uevent_helper_store(struct kobject *kobj,
				   struct kobj_attribute *attr,
				   const char *buf, size_t count)
{
	if (count+1 > UEVENT_HELPER_PATH_LEN)
		return -ENOENT;
	memcpy(uevent_helper, buf, count);
	uevent_helper[count] = '\0';
	if (count && uevent_helper[count-1] == '\n')
		uevent_helper[count-1] = '\0';
	return count;
}
KERNEL_ATTR_RW(uevent_helper);
#endif

#ifdef CONFIG_PROFILING
static ssize_t profiling_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%d\n", prof_on);
}
/*
 * Enable profiling from the written option string.
 *
 * Calls are serialized by a function-local static mutex held until return.
 * Returns -EEXIST if prof_on is already set, the error from
 * profile_init() or create_proc_profile() if either fails, and @count on
 * success.
 */
static ssize_t profiling_store(struct kobject *kobj,
				   struct kobj_attribute *attr,
				   const char *buf, size_t count)
{
	int ret;
	static DEFINE_MUTEX(lock);

	/*
	 * We need serialization, for profile_setup() initializes prof_on
	 * value and profile_init() must not reallocate prof_buffer after
	 * once allocated.
	 */
	guard(mutex)(&lock);
	if (prof_on)
		return -EEXIST;
	/*
	 * This eventually calls into get_option() which
	 * has a ton of callers and is not const.  It is
	 * easiest to cast it away here.
	 */
	profile_setup((char *)buf);
	ret = profile_init();
	if (ret)
		return ret;
	ret = create_proc_profile();
	if (ret)
		return ret;
	return count;
}
KERNEL_ATTR_RW(profiling);
#endif

#ifdef CONFIG_KEXEC_CORE
/* Prints 1 when kexec_image is non-NULL, 0 otherwise. */
static ssize_t kexec_loaded_show(struct kobject *kobj,
				 struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%d\n", !!kexec_image);
}
KERNEL_ATTR_RO(kexec_loaded);

#ifdef CONFIG_CRASH_DUMP
static ssize_t kexec_crash_loaded_show(struct kobject *kobj,
				       struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%d\n", kexec_crash_loaded());
}
KERNEL_ATTR_RO(kexec_crash_loaded);

/* Prints crash_get_memory_size(); a negative result is returned as-is. */
static ssize_t kexec_crash_size_show(struct kobject *kobj,
				       struct kobj_attribute *attr, char *buf)
{
	ssize_t size = crash_get_memory_size();

	if (size < 0)
		return size;

	return sysfs_emit(buf, "%zd\n", size);
}
/*
 * Parse an unsigned long (base auto-detected by kstrtoul) and pass it to
 * crash_shrink_memory(). Returns -EINVAL on a parse failure, the negative
 * result of crash_shrink_memory() if it fails, and @count otherwise.
 */
static ssize_t kexec_crash_size_store(struct kobject *kobj,
				   struct kobj_attribute *attr,
				   const char *buf, size_t count)
{
	unsigned long cnt;
	int ret;

	if (kstrtoul(buf, 0, &cnt))
		return -EINVAL;

	ret = crash_shrink_memory(cnt);
	return ret < 0 ? ret : count;
}
KERNEL_ATTR_RW(kexec_crash_size);

#endif /* CONFIG_CRASH_DUMP*/
#endif /* CONFIG_KEXEC_CORE */

#ifdef CONFIG_VMCORE_INFO

/* Prints the vmcoreinfo note's physical address and its size in hex. */
static ssize_t vmcoreinfo_show(struct kobject *kobj,
			       struct kobj_attribute *attr, char *buf)
{
	phys_addr_t vmcore_base = paddr_vmcoreinfo_note();
	return sysfs_emit(buf, "%pa %x\n", &vmcore_base,
			  (unsigned int)VMCOREINFO_NOTE_SIZE);
}
KERNEL_ATTR_RO(vmcoreinfo);

#ifdef CONFIG_CRASH_HOTPLUG
static ssize_t crash_elfcorehdr_size_show(struct kobject *kobj,
			       struct kobj_attribute *attr, char *buf)
{
	unsigned int sz = crash_get_elfcorehdr_size();

	return sysfs_emit(buf, "%u\n", sz);
}
KERNEL_ATTR_RO(crash_elfcorehdr_size);

#endif

#endif /* CONFIG_VMCORE_INFO */

/* whether file capabilities are enabled */
static ssize_t fscaps_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%d\n", file_caps_enabled);
}
KERNEL_ATTR_RO(fscaps);

#ifndef CONFIG_TINY_RCU
/* Backing variable of the rcu_expedited attribute; defined here. */
int rcu_expedited;
static ssize_t rcu_expedited_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%d\n", READ_ONCE(rcu_expedited));
}
/*
 * kstrtoint() parses straight into the global; -EINVAL is returned when
 * it reports failure, @count otherwise.
 */
static ssize_t rcu_expedited_store(struct kobject *kobj,
				   struct kobj_attribute *attr,
				   const char *buf, size_t count)
{
	if (kstrtoint(buf, 0, &rcu_expedited))
		return -EINVAL;

	return count;
}
KERNEL_ATTR_RW(rcu_expedited);

/* Backing variable of the rcu_normal attribute; defined here. */
int rcu_normal;
static ssize_t rcu_normal_show(struct kobject *kobj,
			       struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%d\n", READ_ONCE(rcu_normal));
}
/* Same contract as rcu_expedited_store(), for rcu_normal. */
static ssize_t rcu_normal_store(struct kobject *kobj,
				struct kobj_attribute *attr,
				const char *buf, size_t count)
{
	if (kstrtoint(buf, 0, &rcu_normal))
		return -EINVAL;

	return count;
}
KERNEL_ATTR_RW(rcu_normal);
#endif /* #ifndef CONFIG_TINY_RCU */

/*
 * Make /sys/kernel/notes give the raw contents of our kernel .notes section.
 */
extern const void __start_notes;
extern const void __stop_notes;
/* Distance between the two symbols above; used as the file size below. */
#define	notes_size (&__stop_notes - &__start_notes)

static __ro_after_init BIN_ATTR_SIMPLE_RO(notes);

struct kobject *kernel_kobj;
EXPORT_SYMBOL_GPL(kernel_kobj);

/*
 * Attributes registered as one group on the "kernel" kobject. Each
 * conditional entry uses the same config guard as its definition above;
 * the list is NULL-terminated.
 */
static struct attribute * kernel_attrs[] = {
	&fscaps_attr.attr,
	&uevent_seqnum_attr.attr,
	&cpu_byteorder_attr.attr,
	&address_bits_attr.attr,
#ifdef CONFIG_UEVENT_HELPER
	&uevent_helper_attr.attr,
#endif
#ifdef CONFIG_PROFILING
	&profiling_attr.attr,
#endif
#ifdef CONFIG_KEXEC_CORE
	&kexec_loaded_attr.attr,
#ifdef CONFIG_CRASH_DUMP
	&kexec_crash_loaded_attr.attr,
	&kexec_crash_size_attr.attr,
#endif
#endif
#ifdef CONFIG_VMCORE_INFO
	&vmcoreinfo_attr.attr,
#ifdef CONFIG_CRASH_HOTPLUG
	&crash_elfcorehdr_size_attr.attr,
#endif
#endif
#ifndef CONFIG_TINY_RCU
	&rcu_expedited_attr.attr,
	&rcu_normal_attr.attr,
#endif
	NULL
};

static const struct attribute_group kernel_attr_group = {
	.attrs = kernel_attrs,
};

/*
 * Create the "kernel" kobject (no parent), attach the attribute group and,
 * when the notes section is non-empty, a binary "notes" file pointing at
 * it. On failure everything created so far is undone in reverse order
 * through the labels at the end. Returns 0 or a negative errno.
 */
static int __init ksysfs_init(void)
{
	int error;

	kernel_kobj = kobject_create_and_add("kernel", NULL);
	if (!kernel_kobj) {
		error = -ENOMEM;
		goto exit;
	}
	error = sysfs_create_group(kernel_kobj, &kernel_attr_group);
	if (error)
		goto kset_exit;

	if (notes_size > 0) {
		bin_attr_notes.private = (void *)&__start_notes;
		bin_attr_notes.size = notes_size;
		error = sysfs_create_bin_file(kernel_kobj, &bin_attr_notes);
		if (error)
			goto group_exit;
	}

	return 0;

group_exit:
	sysfs_remove_group(kernel_kobj, &kernel_attr_group);
kset_exit:
	kobject_put(kernel_kobj);
exit:
	return error;
}

core_initcall(ksysfs_init);
1 3 3 4 3 1 4 4 6 1 1 3 1 4 1 2 1 2 10 1 1 19 1 1 2 12 2 1 1 3 3 7 6 2 2 3 1 4 2 2 8 1 10 10 38 38 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440
/*
 * NOTE(review): the run of integers above looks like hit-count and
 * line-number gutter residue from the report this text was extracted from;
 * it is not C.
 */
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (c) 2015 Jiri Pirko <jiri@resnulli.us>
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/filter.h>
#include <linux/bpf.h>

#include <net/netlink.h>
#include <net/sock.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>

#include <linux/tc_act/tc_bpf.h>
#include <net/tc_act/tc_bpf.h>
#include <net/tc_wrapper.h>

/* Maximum length accepted for TCA_ACT_BPF_NAME (see act_bpf_policy). */
#define ACT_BPF_NAME_LEN	256

/*
 * Program configuration staged outside the action: filled from netlink by
 * tcf_bpf_init_from_ops() / tcf_bpf_init_from_efd(), or snapshotted from an
 * existing action by tcf_bpf_prog_fill_cfg(), and released as a unit by
 * tcf_bpf_cfg_cleanup().
 */
struct tcf_bpf_cfg {
	struct bpf_prog *filter;
	struct sock_filter *bpf_ops;
	const char *bpf_name;
	u16 bpf_num_ops;
	bool is_ebpf;
};

static struct tc_action_ops act_bpf_ops;

/*
 * Run the action's BPF program on @skb and map its verdict to a TC action.
 *
 * On ingress the MAC header is pushed back for the duration of the run and
 * pulled again afterwards. A timestamp type left set while tstamp is zero
 * is reset to SKB_CLOCK_REALTIME, and a prefetched socket is orphaned
 * unless the verdict is TC_ACT_OK.
 */
TC_INDIRECT_SCOPE int tcf_bpf_act(struct sk_buff *skb,
				  const struct tc_action *act,
				  struct tcf_result *res)
{
	bool at_ingress = skb_at_tc_ingress(skb);
	struct tcf_bpf *prog = to_bpf(act);
	struct bpf_prog *filter;
	int action, filter_res;

	tcf_lastuse_update(&prog->tcf_tm);
	bstats_update(this_cpu_ptr(prog->common.cpu_bstats), skb);

	filter = rcu_dereference(prog->filter);
	if (at_ingress) {
		__skb_push(skb, skb->mac_len);
		bpf_compute_data_pointers(skb);
		filter_res = bpf_prog_run(filter, skb);
		__skb_pull(skb, skb->mac_len);
	} else {
		bpf_compute_data_pointers(skb);
		filter_res = bpf_prog_run(filter, skb);
	}
	if (unlikely(!skb->tstamp && skb->tstamp_type))
		skb->tstamp_type = SKB_CLOCK_REALTIME;
	if (skb_sk_is_prefetched(skb) && filter_res != TC_ACT_OK)
		skb_orphan(skb);

	/* A BPF program may overwrite the default action opcode.
	 * Similarly as in cls_bpf, if filter_res == -1 we use the
	 * default action specified from tc.
	 *
	 * In case a different well-known TC_ACT opcode has been
	 * returned, it will overwrite the default one.
	 *
	 * For everything else that is unknown, TC_ACT_UNSPEC is
	 * returned.
	 */
	switch (filter_res) {
	case TC_ACT_PIPE:
	case TC_ACT_RECLASSIFY:
	case TC_ACT_OK:
	case TC_ACT_REDIRECT:
		action = filter_res;
		break;
	case TC_ACT_SHOT:
		action = filter_res;
		/* Drops are accounted in the per-CPU queue stats. */
		qstats_drop_inc(this_cpu_ptr(prog->common.cpu_qstats));
		break;
	case TC_ACT_UNSPEC:
		action = prog->tcf_action;
		break;
	default:
		action = TC_ACT_UNSPEC;
		break;
	}

	return action;
}

/* An action without a stored classic-BPF op array is treated as eBPF. */
static bool tcf_bpf_is_ebpf(const struct tcf_bpf *prog)
{
	return !prog->bpf_ops;
}

/*
 * Dump a classic BPF program: the op count as TCA_ACT_BPF_OPS_LEN and the
 * raw op array as TCA_ACT_BPF_OPS. Returns -EMSGSIZE if either attribute
 * does not fit into @skb.
 */
static int tcf_bpf_dump_bpf_info(const struct tcf_bpf *prog,
				 struct sk_buff *skb)
{
	struct nlattr *nla;

	if (nla_put_u16(skb, TCA_ACT_BPF_OPS_LEN, prog->bpf_num_ops))
		return -EMSGSIZE;

	nla = nla_reserve(skb, TCA_ACT_BPF_OPS, prog->bpf_num_ops *
			  sizeof(struct sock_filter));
	if (nla == NULL)
		return -EMSGSIZE;

	memcpy(nla_data(nla), prog->bpf_ops, nla_len(nla));

	return 0;
}

/*
 * Dump an eBPF program: its name (only if one is stored), its id and its
 * tag. Returns -EMSGSIZE if an attribute does not fit into @skb.
 */
static int tcf_bpf_dump_ebpf_info(const struct tcf_bpf *prog,
				  struct sk_buff *skb)
{
	struct nlattr *nla;

	if (prog->bpf_name &&
	    nla_put_string(skb, TCA_ACT_BPF_NAME, prog->bpf_name))
		return -EMSGSIZE;

	if (nla_put_u32(skb, TCA_ACT_BPF_ID, prog->filter->aux->id))
		return -EMSGSIZE;

	nla = nla_reserve(skb, TCA_ACT_BPF_TAG, sizeof(prog->filter->tag));
	if (nla == NULL)
		return -EMSGSIZE;

	memcpy(nla_data(nla), prog->filter->tag, nla_len(nla));

	return 0;
}

/*
 * Dump the action into @skb: parameters, program info (classic or eBPF)
 * and timestamps, all under tcf_lock. Returns skb->len on success; on any
 * failure the message is trimmed back to where it started and -1 is
 * returned.
 */
static int tcf_bpf_dump(struct sk_buff *skb, struct tc_action *act,
			int bind, int ref)
{
	unsigned char *tp = skb_tail_pointer(skb);
	struct tcf_bpf *prog = to_bpf(act);
	struct tc_act_bpf opt = {
		.index   = prog->tcf_index,
		.refcnt  = refcount_read(&prog->tcf_refcnt) - ref,
		.bindcnt = atomic_read(&prog->tcf_bindcnt) - bind,
	};
	struct tcf_t tm;
	int ret;

	spin_lock_bh(&prog->tcf_lock);
	opt.action = prog->tcf_action;
	if (nla_put(skb, TCA_ACT_BPF_PARMS, sizeof(opt), &opt))
		goto nla_put_failure;

	if (tcf_bpf_is_ebpf(prog))
		ret = tcf_bpf_dump_ebpf_info(prog, skb);
	else
		ret = tcf_bpf_dump_bpf_info(prog, skb);
	if (ret)
		goto nla_put_failure;

	tcf_tm_dump(&tm, &prog->tcf_tm);
	if (nla_put_64bit(skb, TCA_ACT_BPF_TM, sizeof(tm), &tm,
			  TCA_ACT_BPF_PAD))
		goto nla_put_failure;

	spin_unlock_bh(&prog->tcf_lock);
	return skb->len;

nla_put_failure:
	spin_unlock_bh(&prog->tcf_lock);
	nlmsg_trim(skb, tp);
	return -1;
}

/* Netlink attribute policy applied when parsing TCA_ACT_BPF_* in init. */
static const struct nla_policy act_bpf_policy[TCA_ACT_BPF_MAX + 1] = {
	[TCA_ACT_BPF_PARMS]	= { .len = sizeof(struct tc_act_bpf) },
	[TCA_ACT_BPF_FD]	= { .type = NLA_U32 },
	[TCA_ACT_BPF_NAME]	= { .type = NLA_NUL_STRING,
				    .len = ACT_BPF_NAME_LEN },
	[TCA_ACT_BPF_OPS_LEN]	= { .type = NLA_U16 },
	[TCA_ACT_BPF_OPS]	= { .type = NLA_BINARY,
				    .len = sizeof(struct sock_filter) * BPF_MAXINSNS },
};

/*
 * Build a classic BPF program from TCA_ACT_BPF_OPS_LEN / TCA_ACT_BPF_OPS.
 *
 * The op count must be non-zero and at most BPF_MAXINSNS, and the OPS
 * attribute must be exactly count * sizeof(struct sock_filter) bytes long;
 * otherwise -EINVAL. The ops are duplicated and a program is created from
 * the copy. On success @cfg owns both the copy and the program.
 */
static int tcf_bpf_init_from_ops(struct nlattr **tb, struct tcf_bpf_cfg *cfg)
{
	struct sock_filter *bpf_ops;
	struct sock_fprog_kern fprog_tmp;
	struct bpf_prog *fp;
	u16 bpf_size, bpf_num_ops;
	int ret;

	bpf_num_ops = nla_get_u16(tb[TCA_ACT_BPF_OPS_LEN]);
	if (bpf_num_ops	> BPF_MAXINSNS || bpf_num_ops == 0)
		return -EINVAL;

	bpf_size = bpf_num_ops * sizeof(*bpf_ops);
	if (bpf_size != nla_len(tb[TCA_ACT_BPF_OPS]))
		return -EINVAL;

	bpf_ops = kmemdup(nla_data(tb[TCA_ACT_BPF_OPS]), bpf_size, GFP_KERNEL);
	if (bpf_ops == NULL)
		return -ENOMEM;

	fprog_tmp.len = bpf_num_ops;
	fprog_tmp.filter = bpf_ops;

	ret = bpf_prog_create(&fp, &fprog_tmp);
	if (ret < 0) {
		kfree(bpf_ops);
		return ret;
	}

	cfg->bpf_ops = bpf_ops;
	cfg->bpf_num_ops = bpf_num_ops;
	cfg->filter = fp;
	cfg->is_ebpf = false;

	return 0;
}

/*
 * Take a reference on the eBPF program behind the fd in TCA_ACT_BPF_FD; it
 * is requested as type BPF_PROG_TYPE_SCHED_ACT. An optional
 * TCA_ACT_BPF_NAME is duplicated; if that allocation fails the program
 * reference is dropped again and -ENOMEM returned.
 */
static int tcf_bpf_init_from_efd(struct nlattr **tb, struct tcf_bpf_cfg *cfg)
{
	struct bpf_prog *fp;
	char *name = NULL;
	u32 bpf_fd;

	bpf_fd = nla_get_u32(tb[TCA_ACT_BPF_FD]);

	fp = bpf_prog_get_type(bpf_fd, BPF_PROG_TYPE_SCHED_ACT);
	if (IS_ERR(fp))
		return PTR_ERR(fp);

	if (tb[TCA_ACT_BPF_NAME]) {
		name = nla_memdup(tb[TCA_ACT_BPF_NAME], GFP_KERNEL);
		if (!name) {
			bpf_prog_put(fp);
			return -ENOMEM;
		}
	}

	cfg->bpf_name = name;
	cfg->filter = fp;
	cfg->is_ebpf = true;

	return 0;
}

/*
 * Release everything @cfg holds: the program (put for eBPF, destroy for
 * classic BPF), the op array and the name. NULL members are tolerated.
 */
static void tcf_bpf_cfg_cleanup(const struct tcf_bpf_cfg *cfg)
{
	struct bpf_prog *filter = cfg->filter;

	if (filter) {
		if (cfg->is_ebpf)
			bpf_prog_put(filter);
		else
			bpf_prog_destroy(filter);
	}

	kfree(cfg->bpf_ops);
	kfree(cfg->bpf_name);
}

/*
 * Snapshot the resources currently attached to @prog into @cfg so they can
 * be handed to tcf_bpf_cfg_cleanup(). cfg->bpf_num_ops is not written.
 */
static void tcf_bpf_prog_fill_cfg(const struct tcf_bpf *prog,
				  struct tcf_bpf_cfg *cfg)
{
	cfg->is_ebpf = tcf_bpf_is_ebpf(prog);
	/* updates to prog->filter are prevented, since it's called either
	 * with tcf lock or during final cleanup in rcu callback
	 */
	cfg->filter = rcu_dereference_protected(prog->filter, 1);

	cfg->bpf_ops = prog->bpf_ops;
	cfg->bpf_name = prog->bpf_name;
}

/*
 * Create a BPF action or replace the program of an existing one.
 *
 * Exactly one program source must be supplied: either both classic
 * attributes (OPS_LEN and OPS) or an eBPF fd; anything else is -EINVAL.
 * The new program is built before tcf_lock is taken and swapped in under
 * the lock. When an existing action was replaced, the old program is
 * released only after synchronize_rcu().
 *
 * Returns ACT_P_CREATED for a new action, ACT_P_BOUND when an existing
 * action was merely bound, 0 after a replace, or a negative errno.
 */
static int tcf_bpf_init(struct net *net, struct nlattr *nla,
			struct nlattr *est, struct tc_action **act,
			struct tcf_proto *tp, u32 flags,
			struct netlink_ext_ack *extack)
{
	struct tc_action_net *tn = net_generic(net, act_bpf_ops.net_id);
	bool bind = flags & TCA_ACT_FLAGS_BIND;
	struct nlattr *tb[TCA_ACT_BPF_MAX + 1];
	struct tcf_chain *goto_ch = NULL;
	struct tcf_bpf_cfg cfg, old;
	struct tc_act_bpf *parm;
	struct tcf_bpf *prog;
	bool is_bpf, is_ebpf;
	int ret, res = 0;
	u32 index;

	if (!nla)
		return -EINVAL;

	ret = nla_parse_nested_deprecated(tb, TCA_ACT_BPF_MAX, nla,
					  act_bpf_policy, NULL);
	if (ret < 0)
		return ret;

	if (!tb[TCA_ACT_BPF_PARMS])
		return -EINVAL;

	parm = nla_data(tb[TCA_ACT_BPF_PARMS]);
	index = parm->index;
	ret = tcf_idr_check_alloc(tn, &index, act, bind);
	if (!ret) {
		ret = tcf_idr_create(tn, index, est, act,
				     &act_bpf_ops, bind, true, flags);
		if (ret < 0) {
			tcf_idr_cleanup(tn, index);
			return ret;
		}

		res = ACT_P_CREATED;
	} else if (ret > 0) {
		/* Don't override defaults. */
		if (bind)
			return ACT_P_BOUND;

		if (!(flags & TCA_ACT_FLAGS_REPLACE)) {
			tcf_idr_release(*act, bind);
			return -EEXIST;
		}
	} else {
		return ret;
	}

	ret = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
	if (ret < 0)
		goto release_idr;

	is_bpf = tb[TCA_ACT_BPF_OPS_LEN] && tb[TCA_ACT_BPF_OPS];
	is_ebpf = tb[TCA_ACT_BPF_FD];

	/* Both or neither program source given: reject. */
	if (is_bpf == is_ebpf) {
		ret = -EINVAL;
		goto put_chain;
	}

	memset(&cfg, 0, sizeof(cfg));

	ret = is_bpf ? tcf_bpf_init_from_ops(tb, &cfg) :
		       tcf_bpf_init_from_efd(tb, &cfg);
	if (ret < 0)
		goto put_chain;

	prog = to_bpf(*act);

	spin_lock_bh(&prog->tcf_lock);
	/* "old" is only filled, and only read later, on the replace path. */
	if (res != ACT_P_CREATED)
		tcf_bpf_prog_fill_cfg(prog, &old);

	prog->bpf_ops = cfg.bpf_ops;
	prog->bpf_name = cfg.bpf_name;

	if (cfg.bpf_num_ops)
		prog->bpf_num_ops = cfg.bpf_num_ops;

	goto_ch = tcf_action_set_ctrlact(*act, parm->action, goto_ch);
	rcu_assign_pointer(prog->filter, cfg.filter);
	spin_unlock_bh(&prog->tcf_lock);

	if (goto_ch)
		tcf_chain_put_by_act(goto_ch);

	if (res != ACT_P_CREATED) {
		/* make sure the program being replaced is no longer executing */
		synchronize_rcu();
		tcf_bpf_cfg_cleanup(&old);
	}

	return res;

put_chain:
	if (goto_ch)
		tcf_chain_put_by_act(goto_ch);

release_idr:
	tcf_idr_release(*act, bind);
	return ret;
}

/* ->cleanup hook: release the program, op array and name of @act. */
static void tcf_bpf_cleanup(struct tc_action *act)
{
	struct tcf_bpf_cfg tmp;

	tcf_bpf_prog_fill_cfg(to_bpf(act), &tmp);
	tcf_bpf_cfg_cleanup(&tmp);
}

static struct tc_action_ops act_bpf_ops __read_mostly = {
	.kind		=	"bpf",
	.id		=	TCA_ID_BPF,
	.owner		=	THIS_MODULE,
	.act		=	tcf_bpf_act,
	.dump		=	tcf_bpf_dump,
	.cleanup	=	tcf_bpf_cleanup,
	.init		=	tcf_bpf_init,
	.size		=	sizeof(struct tcf_bpf),
};
MODULE_ALIAS_NET_ACT("bpf");

/* Per-netns setup: initialise this namespace's tc_action_net. */
static __net_init int bpf_init_net(struct net *net)
{
	struct tc_action_net *tn = net_generic(net, act_bpf_ops.net_id);

	return tc_action_net_init(net, tn, &act_bpf_ops);
}

/* Batched per-netns teardown for every namespace on @net_list. */
static void __net_exit bpf_exit_net(struct list_head *net_list)
{
	tc_action_net_exit(net_list, act_bpf_ops.net_id);
}

static struct pernet_operations bpf_net_ops = {
	.init = bpf_init_net,
	.exit_batch = bpf_exit_net,
	.id   = &act_bpf_ops.net_id,
	.size = sizeof(struct tc_action_net),
};

static int __init bpf_init_module(void)
{
	return tcf_register_action(&act_bpf_ops, &bpf_net_ops);
}

static void __exit bpf_cleanup_module(void)
{
	tcf_unregister_action(&act_bpf_ops, &bpf_net_ops);
}

module_init(bpf_init_module);
module_exit(bpf_cleanup_module);

MODULE_AUTHOR("Jiri Pirko <jiri@resnulli.us>");
MODULE_DESCRIPTION("TC BPF based
action"); MODULE_LICENSE("GPL v2");
407 404 118 148 107 269 5 92 198 92 5 90 175 190 364 363 120 284 7 63 306 304 2 121 285 244 244 242 245 244 243 245 108 43 4 54 46 90 4 126 126 58 5 47 4 126 126 6 121 139 138 2 82 201 163 43 42 42 163 111 104 13 1 8 110 110 105 5 110 104 5 103 264 263 265 266 162 103 266 163 163 15 2 1 1 1 2 2 87 88 88 88 71 17 17 77 12 76 88 33 33 329 327 125 81 75 5 80 80 167 167 4 2 8 1 5 3 145 367 95 46 47 28 20 178 80 326 327 327 93 163 103 103 104 16 95 99 104 93 104 90 15 255 127 19 108 314 154 163 163 1 162 163 163 1 1 162 1 163 163 163 163 161 163 162 208 112 96 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 
386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 
886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 // SPDX-License-Identifier: 
GPL-2.0-or-later
/*
 * INET An implementation of the TCP/IP protocol suite for the LINUX
 * operating system. INET is implemented using the BSD Socket
 * interface as the means of communication with the user level.
 *
 * Generic INET transport hashtables
 *
 * Authors: Lotsa people, from code originally in tcp
 */

#include <linux/module.h>
#include <linux/random.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/wait.h>
#include <linux/vmalloc.h>
#include <linux/memblock.h>

#include <net/addrconf.h>
#include <net/inet_connection_sock.h>
#include <net/inet_hashtables.h>
#if IS_ENABLED(CONFIG_IPV6)
#include <net/inet6_hashtables.h>
#endif
#include <net/secure_seq.h>
#include <net/hotdata.h>
#include <net/ip.h>
#include <net/tcp.h>
#include <net/sock_reuseport.h>

/* Hash an IPv4 4-tuple for the established hash table.
 *
 * inet_ehash_secret is filled in on first use by net_get_random_once() and
 * is combined with the per network namespace value from net_hash_mix().
 */
u32 inet_ehashfn(const struct net *net, const __be32 laddr,
		 const __u16 lport, const __be32 faddr,
		 const __be16 fport)
{
	net_get_random_once(&inet_ehash_secret, sizeof(inet_ehash_secret));

	return __inet_ehashfn(laddr, lport, faddr, fport,
			      inet_ehash_secret + net_hash_mix(net));
}
EXPORT_SYMBOL_GPL(inet_ehashfn);

/* This function handles inet_sock, but also timewait and request sockets
 * for IPv4/IPv6.
 *
 * AF_INET6 sockets whose destination address is not v4-mapped are hashed
 * with inet6_ehashfn(), everything else with inet_ehashfn().
 */
static u32 sk_ehashfn(const struct sock *sk)
{
#if IS_ENABLED(CONFIG_IPV6)
	if (sk->sk_family == AF_INET6 &&
	    !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		return inet6_ehashfn(sock_net(sk),
				     &sk->sk_v6_rcv_saddr, sk->sk_num,
				     &sk->sk_v6_daddr, sk->sk_dport);
#endif
	return inet_ehashfn(sock_net(sk),
			    sk->sk_rcv_saddr, sk->sk_num,
			    sk->sk_daddr, sk->sk_dport);
}

/*
 * Allocate and initialize a new local port bind bucket.
 * The bindhash mutex for snum's hash chain must be held here.
 *
 * The bucket is allocated with GFP_ATOMIC and linked at the head of
 * head->chain. Returns the new bucket, or NULL when the allocation fails.
 */
struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
						 struct net *net,
						 struct inet_bind_hashbucket *head,
						 const unsigned short snum,
						 int l3mdev)
{
	struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);

	if (tb) {
		write_pnet(&tb->ib_net, net);
		tb->l3mdev    = l3mdev;
		tb->port      = snum;
		tb->fastreuse = 0;
		tb->fastreuseport = 0;
		INIT_HLIST_HEAD(&tb->bhash2);
		hlist_add_head(&tb->node, &head->chain);
	}
	return tb;
}

/*
 * Caller must hold hashbucket lock for this tb with local BH disabled
 *
 * The bucket is unlinked and freed only when its bhash2 list is empty;
 * otherwise nothing happens.
 */
void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb)
{
	if (hlist_empty(&tb->bhash2)) {
		__hlist_del(&tb->node);
		kmem_cache_free(cachep, tb);
	}
}

/* True when tb belongs to net and carries the given port and l3mdev. */
bool inet_bind_bucket_match(const struct inet_bind_bucket *tb, const struct net *net,
			    unsigned short port, int l3mdev)
{
	return net_eq(ib_net(tb), net) && tb->port == port &&
		tb->l3mdev == l3mdev;
}

/* Initialise tb2 from its parent bucket tb and the receive address of sk,
 * then link it into head->chain and into the bhash2 list of tb.
 *
 * With IPv6 enabled the address is always kept in IPv6 form: an address of
 * a non AF_INET6 socket is stored v4-mapped with addr_type set to
 * IPV6_ADDR_MAPPED.
 */
static void inet_bind2_bucket_init(struct inet_bind2_bucket *tb2,
				   struct net *net,
				   struct inet_bind_hashbucket *head,
				   struct inet_bind_bucket *tb,
				   const struct sock *sk)
{
	write_pnet(&tb2->ib_net, net);
	tb2->l3mdev = tb->l3mdev;
	tb2->port = tb->port;
#if IS_ENABLED(CONFIG_IPV6)
	/* Build time check that both address type flags fit in USHRT_MAX. */
	BUILD_BUG_ON(USHRT_MAX < (IPV6_ADDR_ANY | IPV6_ADDR_MAPPED));
	if (sk->sk_family == AF_INET6) {
		tb2->addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr);
		tb2->v6_rcv_saddr = sk->sk_v6_rcv_saddr;
	} else {
		tb2->addr_type = IPV6_ADDR_MAPPED;
		ipv6_addr_set_v4mapped(sk->sk_rcv_saddr, &tb2->v6_rcv_saddr);
	}
#else
	tb2->rcv_saddr = sk->sk_rcv_saddr;
#endif
	INIT_HLIST_HEAD(&tb2->owners);
	hlist_add_head(&tb2->node, &head->chain);
	hlist_add_head(&tb2->bhash_node, &tb->bhash2);
}

/* Allocate (GFP_ATOMIC) and initialise a bind2 bucket.
 * Returns the new bucket, or NULL when the allocation fails.
 */
struct inet_bind2_bucket *inet_bind2_bucket_create(struct kmem_cache *cachep,
						   struct net *net,
						   struct inet_bind_hashbucket *head,
						   struct inet_bind_bucket *tb,
						   const struct sock *sk)
{
	struct inet_bind2_bucket *tb2 = kmem_cache_alloc(cachep, GFP_ATOMIC);

	if (tb2)
		inet_bind2_bucket_init(tb2, net, head, tb, sk);

	return tb2;
}

/* Caller must hold
hashbucket lock for this tb with local BH disabled */
/* Unlink tb from both of its lists and free it, but only when it has no
 * owners left; otherwise nothing happens.
 */
void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb)
{
	if (hlist_empty(&tb->owners)) {
		__hlist_del(&tb->node);
		__hlist_del(&tb->bhash_node);
		kmem_cache_free(cachep, tb);
	}
}

/* Compare the address stored in tb2 with the receive address of sk.
 *
 * With IPv6 enabled, AF_INET6 sockets compare the full IPv6 address, and
 * for any other socket a bucket that is not IPV6_ADDR_MAPPED never matches.
 */
static bool inet_bind2_bucket_addr_match(const struct inet_bind2_bucket *tb2,
					 const struct sock *sk)
{
#if IS_ENABLED(CONFIG_IPV6)
	if (sk->sk_family == AF_INET6)
		return ipv6_addr_equal(&tb2->v6_rcv_saddr, &sk->sk_v6_rcv_saddr);

	if (tb2->addr_type != IPV6_ADDR_MAPPED)
		return false;
#endif
	return tb2->rcv_saddr == sk->sk_rcv_saddr;
}

/* Record port and both bind buckets in sk and add sk to the owners list
 * of tb2.
 */
void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
		    struct inet_bind2_bucket *tb2, unsigned short port)
{
	inet_sk(sk)->inet_num = port;
	inet_csk(sk)->icsk_bind_hash = tb;
	inet_csk(sk)->icsk_bind2_hash = tb2;
	sk_add_bind_node(sk, &tb2->owners);
}

/*
 * Get rid of any references to a local port held by the given sock.
 *
 * The bhash bucket lock is taken first and the bhash2 bucket lock nested
 * inside it. Bottom halves are not disabled here; inet_put_port() does
 * that around the call.
 */
static void __inet_put_port(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk);
	struct inet_bind_hashbucket *head, *head2;
	struct net *net = sock_net(sk);
	struct inet_bind_bucket *tb;
	int bhash;

	/* Both buckets are derived from inet_num before it is cleared. */
	bhash = inet_bhashfn(net, inet_sk(sk)->inet_num, hashinfo->bhash_size);
	head = &hashinfo->bhash[bhash];
	head2 = inet_bhashfn_portaddr(hashinfo, sk, net, inet_sk(sk)->inet_num);

	spin_lock(&head->lock);
	tb = inet_csk(sk)->icsk_bind_hash;
	inet_csk(sk)->icsk_bind_hash = NULL;
	inet_sk(sk)->inet_num = 0;

	spin_lock(&head2->lock);
	if (inet_csk(sk)->icsk_bind2_hash) {
		struct inet_bind2_bucket *tb2 = inet_csk(sk)->icsk_bind2_hash;

		__sk_del_bind_node(sk);
		inet_csk(sk)->icsk_bind2_hash = NULL;
		inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2);
	}
	spin_unlock(&head2->lock);

	/* tb is freed only if the bind2 bucket above was its last one. */
	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
	spin_unlock(&head->lock);
}

/* Release the local port of sk with bottom halves disabled. */
void inet_put_port(struct sock *sk)
{
	local_bh_disable();
	__inet_put_port(sk);
	local_bh_enable();
}
EXPORT_SYMBOL(inet_put_port);

/* Attach child to bind buckets for its local port (inet_num of child),
 * reusing the buckets of the listener sk when their port and address fit
 * the child.
 *
 * Returns 0 on success, -ENOENT when the listener has no bind buckets, or
 * -ENOMEM when a bucket cannot be allocated.
 */
int __inet_inherit_port(const struct sock *sk, struct sock *child)
{
	struct inet_hashinfo *table = tcp_or_dccp_get_hashinfo(sk);
	unsigned short port = inet_sk(child)->inet_num;
	struct inet_bind_hashbucket *head, *head2;
	bool created_inet_bind_bucket = false;
	struct net *net = sock_net(sk);
	bool update_fastreuse = false;
	struct inet_bind2_bucket *tb2;
	struct inet_bind_bucket *tb;
	int bhash, l3mdev;

	bhash = inet_bhashfn(net, port, table->bhash_size);
	head = &table->bhash[bhash];
	head2 = inet_bhashfn_portaddr(table, child, net, port);

	spin_lock(&head->lock);
	spin_lock(&head2->lock);
	tb = inet_csk(sk)->icsk_bind_hash;
	tb2 = inet_csk(sk)->icsk_bind2_hash;
	if (unlikely(!tb || !tb2)) {
		spin_unlock(&head2->lock);
		spin_unlock(&head->lock);
		return -ENOENT;
	}
	if (tb->port != port) {
		l3mdev = inet_sk_bound_l3mdev(sk);

		/* NOTE: using tproxy and redirecting skbs to a proxy
		 * on a different listener port breaks the assumption
		 * that the listener socket's icsk_bind_hash is the same
		 * as that of the child socket. We have to look up or
		 * create a new bind bucket for the child here.
		 */
		inet_bind_bucket_for_each(tb, &head->chain) {
			if (inet_bind_bucket_match(tb, net, port, l3mdev))
				break;
		}
		if (!tb) {
			tb = inet_bind_bucket_create(table->bind_bucket_cachep,
						     net, head, port, l3mdev);
			if (!tb) {
				spin_unlock(&head2->lock);
				spin_unlock(&head->lock);
				return -ENOMEM;
			}
			/* Remembered so the error path can undo it. */
			created_inet_bind_bucket = true;
		}
		update_fastreuse = true;

		goto bhash2_find;
	} else if (!inet_bind2_bucket_addr_match(tb2, child)) {
		l3mdev = inet_sk_bound_l3mdev(sk);

bhash2_find:
		/* Find or create the bind2 bucket matching the child. */
		tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, child);
		if (!tb2) {
			tb2 = inet_bind2_bucket_create(table->bind2_bucket_cachep,
						       net, head2, tb, child);
			if (!tb2)
				goto error;
		}
	}
	if (update_fastreuse)
		inet_csk_update_fastreuse(tb, child);
	inet_bind_hash(child, tb, tb2, port);
	spin_unlock(&head2->lock);
	spin_unlock(&head->lock);

	return 0;

error:
	if (created_inet_bind_bucket)
		inet_bind_bucket_destroy(table->bind_bucket_cachep, tb);
	spin_unlock(&head2->lock);
	spin_unlock(&head->lock);
	return -ENOMEM;
}
EXPORT_SYMBOL_GPL(__inet_inherit_port);

/* Select the lhash2 bucket of sk from its local address and local port. */
static struct inet_listen_hashbucket *
inet_lhash2_bucket_sk(struct inet_hashinfo *h, struct sock *sk)
{
	u32 hash;

#if IS_ENABLED(CONFIG_IPV6)
	if (sk->sk_family == AF_INET6)
		hash = ipv6_portaddr_hash(sock_net(sk),
					  &sk->sk_v6_rcv_saddr,
					  inet_sk(sk)->inet_num);
	else
#endif
		hash = ipv4_portaddr_hash(sock_net(sk),
					  inet_sk(sk)->inet_rcv_saddr,
					  inet_sk(sk)->inet_num);
	return inet_lhash2_bucket(h, hash);
}

/* Score sk as a listener candidate for (net, hnum, daddr, dif, sdif).
 *
 * Returns -1 when sk does not match. Otherwise the score starts at 1, or 2
 * when sk is bound to a device, and gains one point each for a PF_INET
 * socket and for sk_incoming_cpu being the current CPU.
 */
static inline int compute_score(struct sock *sk, const struct net *net,
				const unsigned short hnum, const __be32 daddr,
				const int dif, const int sdif)
{
	int score = -1;

	if (net_eq(sock_net(sk), net) && sk->sk_num == hnum &&
			!ipv6_only_sock(sk)) {
		if (sk->sk_rcv_saddr != daddr)
			return -1;

		if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif))
			return -1;
		score =  sk->sk_bound_dev_if ? 2 : 1;

		if (sk->sk_family == PF_INET)
			score++;
		if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id())
			score++;
	}
	return score;
}

/**
 * inet_lookup_reuseport() - execute reuseport logic on AF_INET socket if necessary.
 * @net: network namespace.
 * @sk: AF_INET socket, must be in TCP_LISTEN state for TCP or TCP_CLOSE for UDP.
 * @skb: context for a potential SK_REUSEPORT program.
 * @doff: header offset.
 * @saddr: source address.
 * @sport: source port.
 * @daddr: destination address.
 * @hnum: destination port in host byte order.
 * @ehashfn: hash function used to generate the fallback hash.
 *
 * Return: NULL if sk doesn't have SO_REUSEPORT set, otherwise a pointer to
 *         the selected sock or an error.
 */
struct sock *inet_lookup_reuseport(const struct net *net, struct sock *sk,
				   struct sk_buff *skb, int doff,
				   __be32 saddr, __be16 sport,
				   __be32 daddr, unsigned short hnum,
				   inet_ehashfn_t *ehashfn)
{
	struct sock *reuse_sk = NULL;
	u32 phash;

	if (sk->sk_reuseport) {
		/* The hash is computed with the local (daddr, hnum) pair
		 * first and the remote (saddr, sport) pair second.
		 */
		phash = INDIRECT_CALL_2(ehashfn, udp_ehashfn, inet_ehashfn,
					net, daddr, hnum, saddr, sport);
		reuse_sk = reuseport_select_sock(sk, phash, skb, doff);
	}
	return reuse_sk;
}
EXPORT_SYMBOL_GPL(inet_lookup_reuseport);

/*
 * Here are some nice properties to exploit here. The BSD API
 * does not allow a listening sock to specify the remote port nor the
 * remote address for the connection. So always assume those are both
 * wildcarded during the search since they can never be otherwise.
 */

/* called with rcu_read_lock() : No refcount taken on the socket
 *
 * Walk one lhash2 bucket and return the best scoring socket. hiscore starts
 * at 0, so sockets scored -1 by compute_score() are never picked. When a
 * new best candidate yields a non-NULL result from inet_lookup_reuseport(),
 * that result is returned right away.
 */
static struct sock *inet_lhash2_lookup(const struct net *net,
				struct inet_listen_hashbucket *ilb2,
				struct sk_buff *skb, int doff,
				const __be32 saddr, __be16 sport,
				const __be32 daddr, const unsigned short hnum,
				const int dif, const int sdif)
{
	struct sock *sk, *result = NULL;
	struct hlist_nulls_node *node;
	int score, hiscore = 0;

	sk_nulls_for_each_rcu(sk, node, &ilb2->nulls_head) {
		score = compute_score(sk, net, hnum, daddr, dif, sdif);
		if (score > hiscore) {
			result = inet_lookup_reuseport(net, sk, skb, doff,
						       saddr, sport, daddr, hnum, inet_ehashfn);
			if (result)
				return result;

			result = sk;
			hiscore = score;
		}
	}

	return result;
}

/* Run the BPF sk_lookup hook through bpf_sk_lookup_run_v4().
 *
 * When the hook reports no_reuseport, or sk is NULL or an error pointer, sk
 * is returned unchanged. Otherwise a non-NULL result of
 * inet_lookup_reuseport() replaces it.
 */
struct sock *inet_lookup_run_sk_lookup(const struct net *net,
				       int protocol,
				       struct sk_buff *skb, int doff,
				       __be32 saddr, __be16 sport,
				       __be32 daddr, u16 hnum, const int dif,
				       inet_ehashfn_t *ehashfn)
{
	struct sock *sk, *reuse_sk;
	bool no_reuseport;

	no_reuseport = bpf_sk_lookup_run_v4(net, protocol, saddr, sport,
					    daddr, hnum, dif, &sk);
	if (no_reuseport || IS_ERR_OR_NULL(sk))
		return sk;

	reuse_sk = inet_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum,
					 ehashfn);
	if (reuse_sk)
		sk = reuse_sk;
	return sk;
}

/* Find a listening socket for an incoming packet.
 *
 * Lookup order: the BPF sk_lookup redirect (only when the static key is
 * enabled and hashinfo is the TCP hashinfo of this netns), then the lhash2
 * bucket for (daddr, hnum), then the bucket for (INADDR_ANY, hnum).
 * An error pointer result is turned into NULL.
 */
struct sock *__inet_lookup_listener(const struct net *net,
				    struct inet_hashinfo *hashinfo,
				    struct sk_buff *skb, int doff,
				    const __be32 saddr, __be16 sport,
				    const __be32 daddr, const unsigned short hnum,
				    const int dif, const int sdif)
{
	struct inet_listen_hashbucket *ilb2;
	struct sock *result = NULL;
	unsigned int hash2;

	/* Lookup redirect from BPF */
	if (static_branch_unlikely(&bpf_sk_lookup_enabled) &&
	    hashinfo == net->ipv4.tcp_death_row.hashinfo) {
		result = inet_lookup_run_sk_lookup(net, IPPROTO_TCP, skb, doff,
						   saddr, sport, daddr, hnum, dif,
						   inet_ehashfn);
		if (result)
			goto done;
	}

	hash2 = ipv4_portaddr_hash(net, daddr, hnum);
	ilb2 = inet_lhash2_bucket(hashinfo, hash2);

	result = inet_lhash2_lookup(net, ilb2, skb, doff,
				    saddr, sport, daddr, hnum,
				    dif, sdif);
	if (result)
		goto done;

	/* Lookup lhash2 with INADDR_ANY */
	hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
	ilb2 = inet_lhash2_bucket(hashinfo, hash2);

	result = inet_lhash2_lookup(net, ilb2, skb, doff,
				    saddr, sport, htonl(INADDR_ANY), hnum,
				    dif, sdif);
done:
	if (IS_ERR(result))
		return NULL;
	return result;
}
EXPORT_SYMBOL_GPL(__inet_lookup_listener);

/* All sockets share common refcount, but have different destructors
 *
 * Drops one reference on sk. On the final put the release function is
 * chosen from sk_state: timewait, request sock, or full socket.
 */
void sock_gen_put(struct sock *sk)
{
	if (!refcount_dec_and_test(&sk->sk_refcnt))
		return;

	if (sk->sk_state == TCP_TIME_WAIT)
		inet_twsk_free(inet_twsk(sk));
	else if (sk->sk_state == TCP_NEW_SYN_RECV)
		reqsk_free(inet_reqsk(sk));
	else
		sk_free(sk);
}
EXPORT_SYMBOL_GPL(sock_gen_put);

/* Drop the socket reference held through skb->sk. */
void sock_edemux(struct sk_buff *skb)
{
	sock_gen_put(skb->sk);
}
EXPORT_SYMBOL(sock_edemux);

/* Look up a socket in the established hash table by 4-tuple and device.
 *
 * The chain is walked with the RCU nulls iterator. A candidate is matched,
 * its refcount is taken with refcount_inc_not_zero(), and the match is then
 * checked a second time; if the second check fails the reference is dropped
 * and the walk restarts.
 *
 * Returns the socket with an elevated sk_refcnt, or NULL.
 */
struct sock *__inet_lookup_established(const struct net *net,
				  struct inet_hashinfo *hashinfo,
				  const __be32 saddr, const __be16 sport,
				  const __be32 daddr, const u16 hnum,
				  const int dif, const int sdif)
{
	INET_ADDR_COOKIE(acookie, saddr, daddr);
	const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
	struct sock *sk;
	const struct hlist_nulls_node *node;
	/* Optimize here for direct hit, only listening connections can
	 * have wildcards anyways.
	 */
	unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport);
	unsigned int slot = hash & hashinfo->ehash_mask;
	struct inet_ehash_bucket *head = &hashinfo->ehash[slot];

begin:
	sk_nulls_for_each_rcu(sk, node, &head->chain) {
		if (sk->sk_hash != hash)
			continue;
		if (likely(inet_match(net, sk, acookie, ports, dif, sdif))) {
			if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt)))
				goto out;
			if (unlikely(!inet_match(net, sk, acookie,
						 ports, dif, sdif))) {
				sock_gen_put(sk);
				goto begin;
			}
			goto found;
		}
	}
	/*
	 * if the nulls value we got at the end of this lookup is
	 * not the expected one, we must restart lookup.
	 * We probably met an item that was moved to another chain.
	 */
	if (get_nulls_value(node) != slot)
		goto begin;
out:
	sk = NULL;
found:
	return sk;
}
EXPORT_SYMBOL_GPL(__inet_lookup_established);

/* called with local bh disabled
 *
 * Check that the 4-tuple sk would use with local port lport is not already
 * present in the established hash table and, if it is free, hash sk there.
 *
 * A matching TIME_WAIT socket does not block the port when sk is a TCP
 * socket and tcp_twsk_unique() accepts the reuse. That timewait socket is
 * then unhashed and either handed back through twp or, when twp is NULL,
 * descheduled and put here.
 *
 * Returns 0 on success or -EADDRNOTAVAIL when the tuple is taken.
 */
static int __inet_check_established(struct inet_timewait_death_row *death_row,
				    struct sock *sk, __u16 lport,
				    struct inet_timewait_sock **twp)
{
	struct inet_hashinfo *hinfo = death_row->hashinfo;
	struct inet_sock *inet = inet_sk(sk);
	__be32 daddr = inet->inet_rcv_saddr;
	__be32 saddr = inet->inet_daddr;
	int dif = sk->sk_bound_dev_if;
	struct net *net = sock_net(sk);
	int sdif = l3mdev_master_ifindex_by_index(net, dif);
	INET_ADDR_COOKIE(acookie, saddr, daddr);
	const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
	unsigned int hash = inet_ehashfn(net, daddr, lport,
					 saddr, inet->inet_dport);
	struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
	spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
	struct sock *sk2;
	const struct hlist_nulls_node *node;
	struct inet_timewait_sock *tw = NULL;

	spin_lock(lock);

	sk_nulls_for_each(sk2, node, &head->chain) {
		if (sk2->sk_hash != hash)
			continue;

		if (likely(inet_match(net, sk2, acookie, ports, dif, sdif))) {
			if (sk2->sk_state == TCP_TIME_WAIT) {
				tw = inet_twsk(sk2);
				if (sk->sk_protocol == IPPROTO_TCP &&
				    tcp_twsk_unique(sk, sk2, twp))
					break;
			}
			goto not_unique;
		}
	}

	/* Must record num and sport now. Otherwise we will see
	 * in hash table socket with a funny identity.
	 */
	inet->inet_num = lport;
	inet->inet_sport = htons(lport);
	sk->sk_hash = hash;
	WARN_ON(!sk_unhashed(sk));
	__sk_nulls_add_node_rcu(sk, &head->chain);
	if (tw) {
		sk_nulls_del_node_init_rcu((struct sock *)tw);
		__NET_INC_STATS(net, LINUX_MIB_TIMEWAITRECYCLED);
	}
	spin_unlock(lock);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);

	if (twp) {
		*twp = tw;
	} else if (tw) {
		/* Silly. Should hash-dance instead... */
		inet_twsk_deschedule_put(tw);
	}
	return 0;

not_unique:
	spin_unlock(lock);
	return -EADDRNOTAVAIL;
}

/* Port offset of sk as computed by secure_ipv4_port_ephemeral() from the
 * local address, the remote address and the remote port.
 */
static u64 inet_sk_port_offset(const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);

	return secure_ipv4_port_ephemeral(inet->inet_rcv_saddr,
					  inet->inet_daddr,
					  inet->inet_dport);
}

/* Searches for an existing socket in the ehash bucket list.
 * Returns true if found, false otherwise.
 *
 * Candidates are compared against the addresses, ports and bound device of
 * sk; sk_bound_dev_if is used for both dif and sdif. Sockets of a family
 * other than AF_INET or AF_INET6 never match.
 */
static bool inet_ehash_lookup_by_sk(struct sock *sk,
				    struct hlist_nulls_head *list)
{
	const __portpair ports = INET_COMBINED_PORTS(sk->sk_dport, sk->sk_num);
	const int sdif = sk->sk_bound_dev_if;
	const int dif = sk->sk_bound_dev_if;
	const struct hlist_nulls_node *node;
	struct net *net = sock_net(sk);
	struct sock *esk;

	INET_ADDR_COOKIE(acookie, sk->sk_daddr, sk->sk_rcv_saddr);

	sk_nulls_for_each_rcu(esk, node, list) {
		if (esk->sk_hash != sk->sk_hash)
			continue;
		if (sk->sk_family == AF_INET) {
			if (unlikely(inet_match(net, esk, acookie,
						ports, dif, sdif))) {
				return true;
			}
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (sk->sk_family == AF_INET6) {
			if (unlikely(inet6_match(net, esk,
						 &sk->sk_v6_daddr,
						 &sk->sk_v6_rcv_saddr,
						 ports, dif, sdif))) {
				return true;
			}
		}
#endif
	}
	return false;
}

/* Insert a socket into ehash, and eventually remove another one
 * (The another one can be a SYN_RECV or TIMEWAIT)
 * If an existing socket already exists, socket sk is not inserted,
 * and sets found_dup_sk parameter to true.
 */
/* Returns true when sk was added to its ehash chain. */
bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk)
{
	struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk);
	struct inet_ehash_bucket *head;
	struct hlist_nulls_head *list;
	spinlock_t *lock;
	bool ret = true;

	WARN_ON_ONCE(!sk_unhashed(sk));

	sk->sk_hash = sk_ehashfn(sk);
	head = inet_ehash_bucket(hashinfo, sk->sk_hash);
	list = &head->chain;
	lock = inet_ehash_lockp(hashinfo, sk->sk_hash);

	spin_lock(lock);
	if (osk) {
		/* Replace osk: sk is only added if osk was still hashed. */
		WARN_ON_ONCE(sk->sk_hash != osk->sk_hash);
		ret = sk_nulls_del_node_init_rcu(osk);
	} else if (found_dup_sk) {
		/* Duplicate detection is done only when the caller asks. */
		*found_dup_sk = inet_ehash_lookup_by_sk(sk, list);
		if (*found_dup_sk)
			ret = false;
	}

	if (ret)
		__sk_nulls_add_node_rcu(sk, list);

	spin_unlock(lock);

	return ret;
}

/* Insert sk through inet_ehash_insert().
 *
 * On success the protocol inuse counter is incremented. On failure the
 * orphan count is incremented, sk is moved to TCP_CLOSE, flagged SOCK_DEAD
 * and destroyed. Returns the result of the insert.
 */
bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk)
{
	bool ok = inet_ehash_insert(sk, osk, found_dup_sk);

	if (ok) {
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
	} else {
		this_cpu_inc(*sk->sk_prot->orphan_count);
		inet_sk_set_state(sk, TCP_CLOSE);
		sock_set_flag(sk, SOCK_DEAD);
		inet_csk_destroy_sock(sk);
	}
	return ok;
}
EXPORT_SYMBOL_GPL(inet_ehash_nolisten);

/* Attach sk to a reuseport group.
 *
 * sk joins the group of the first socket in ilb that has reuseport set and
 * shares its family, ipv6only setting, bound device, bind bucket, owner uid
 * and receive address. When there is no such socket a new group is
 * allocated. Returns the result of reuseport_add_sock() or
 * reuseport_alloc().
 */
static int inet_reuseport_add_sock(struct sock *sk,
				   struct inet_listen_hashbucket *ilb)
{
	struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash;
	const struct hlist_nulls_node *node;
	struct sock *sk2;
	kuid_t uid = sock_i_uid(sk);

	sk_nulls_for_each_rcu(sk2, node, &ilb->nulls_head) {
		if (sk2 != sk &&
		    sk2->sk_family == sk->sk_family &&
		    ipv6_only_sock(sk2) == ipv6_only_sock(sk) &&
		    sk2->sk_bound_dev_if == sk->sk_bound_dev_if &&
		    inet_csk(sk2)->icsk_bind_hash == tb &&
		    sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) &&
		    inet_rcv_saddr_equal(sk, sk2, false))
			return reuseport_add_sock(sk, sk2,
						  inet_rcv_saddr_any(sk));
	}

	return reuseport_alloc(sk, inet_rcv_saddr_any(sk));
}

/* Hash sk into the table matching its state.
 *
 * Sockets that are not in TCP_LISTEN go to the established table with
 * bottom halves disabled; 0 is returned whatever the insert result was.
 * Listening sockets are added to their lhash2 bucket under the bucket lock,
 * after joining a reuseport group when sk_reuseport is set; a failure to
 * join is returned and sk stays unhashed.
 */
int __inet_hash(struct sock *sk, struct sock *osk)
{
	struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk);
	struct inet_listen_hashbucket *ilb2;
	int err = 0;

	if (sk->sk_state != TCP_LISTEN) {
		local_bh_disable();
		inet_ehash_nolisten(sk, osk, NULL);
		local_bh_enable();
		return 0;
	}
	WARN_ON(!sk_unhashed(sk));
	ilb2 = inet_lhash2_bucket_sk(hashinfo, sk);

	spin_lock(&ilb2->lock);
	if (sk->sk_reuseport) {
		err = inet_reuseport_add_sock(sk, ilb2);
		if (err)
			goto unlock;
	}
	sock_set_flag(sk, SOCK_RCU_FREE);
	/* IPv6 reuseport listeners go to the tail, all others to the head. */
	if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
		sk->sk_family == AF_INET6)
		__sk_nulls_add_node_tail_rcu(sk, &ilb2->nulls_head);
	else
		__sk_nulls_add_node_rcu(sk, &ilb2->nulls_head);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
unlock:
	spin_unlock(&ilb2->lock);

	return err;
}
EXPORT_SYMBOL(__inet_hash);

/* Hash sk unless it is in TCP_CLOSE. Returns 0 or the __inet_hash() error. */
int inet_hash(struct sock *sk)
{
	int err = 0;

	if (sk->sk_state != TCP_CLOSE)
		err = __inet_hash(sk, NULL);

	return err;
}
EXPORT_SYMBOL_GPL(inet_hash);

/* Remove sk from the listening or the established hash table.
 *
 * sk_unhashed() is tested once without the lock as a fast path and again
 * under the bucket lock before the node is removed.
 */
void inet_unhash(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk);

	if (sk_unhashed(sk))
		return;

	if (sk->sk_state == TCP_LISTEN) {
		struct inet_listen_hashbucket *ilb2;

		ilb2 = inet_lhash2_bucket_sk(hashinfo, sk);
		/* Don't disable bottom halves while acquiring the lock to
		 * avoid circular locking dependency on PREEMPT_RT.
		 */
		spin_lock(&ilb2->lock);
		if (sk_unhashed(sk)) {
			spin_unlock(&ilb2->lock);
			return;
		}

		if (rcu_access_pointer(sk->sk_reuseport_cb))
			reuseport_stop_listen_sock(sk);

		__sk_nulls_del_node_init_rcu(sk);
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
		spin_unlock(&ilb2->lock);
	} else {
		spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);

		spin_lock_bh(lock);
		if (sk_unhashed(sk)) {
			spin_unlock_bh(lock);
			return;
		}
		__sk_nulls_del_node_init_rcu(sk);
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
		spin_unlock_bh(lock);
	}
}
EXPORT_SYMBOL_GPL(inet_unhash);

/* True when tb matches net, port and l3mdev and its address equals the
 * receive address of sk.
 */
static bool inet_bind2_bucket_match(const struct inet_bind2_bucket *tb,
				    const struct net *net, unsigned short port,
				    int l3mdev, const struct sock *sk)
{
	if (!net_eq(ib2_net(tb), net) || tb->port != port ||
	    tb->l3mdev != l3mdev)
		return false;

	return inet_bind2_bucket_addr_match(tb, sk);
}

/* True when tb matches net, port and l3mdev and holds a wildcard address
 * that applies to sk.
 *
 * With IPv6 enabled an IPV6_ADDR_ANY bucket always matches, while a bucket
 * that is neither ANY nor MAPPED never does. A mapped bucket with a zero
 * rcv_saddr matches unless sk is an AF_INET6 socket whose receive address
 * is not v4-mapped.
 */
bool inet_bind2_bucket_match_addr_any(const struct inet_bind2_bucket *tb, const struct net *net,
				      unsigned short port, int l3mdev, const struct sock *sk)
{
	if (!net_eq(ib2_net(tb), net) || tb->port != port ||
	    tb->l3mdev != l3mdev)
		return false;

#if IS_ENABLED(CONFIG_IPV6)
	if (tb->addr_type == IPV6_ADDR_ANY)
		return true;

	if (tb->addr_type != IPV6_ADDR_MAPPED)
		return false;

	if (sk->sk_family == AF_INET6 &&
	    !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr))
		return false;
#endif
	return tb->rcv_saddr == 0;
}

/* The socket's bhash2 hashbucket spinlock must be held when this is called
 *
 * Returns the first bucket in head->chain accepted by
 * inet_bind2_bucket_match(), or NULL when there is none.
 */
struct inet_bind2_bucket *
inet_bind2_bucket_find(const struct inet_bind_hashbucket *head, const struct net *net,
		       unsigned short port, int l3mdev, const struct sock *sk)
{
	struct inet_bind2_bucket *bhash2 = NULL;

	inet_bind_bucket_for_each(bhash2, &head->chain)
		if (inet_bind2_bucket_match(bhash2, net, port, l3mdev, sk))
			break;

	return bhash2;
}

/* Return the bhash2 bucket for the wildcard address of the socket family
 * (in6addr_any for AF_INET6, 0 otherwise) and the given port.
 */
struct inet_bind_hashbucket *
inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, int port)
{
	struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk);
	u32 hash;

#if IS_ENABLED(CONFIG_IPV6)
	if (sk->sk_family == AF_INET6)
		hash = ipv6_portaddr_hash(net, &in6addr_any, port);
	else
#endif
		hash = ipv4_portaddr_hash(net, 0, port);

	return &hinfo->bhash2[hash & (hinfo->bhash_size - 1)];
}

/* Store a new source address in sk. saddr points to a __be32 for AF_INET
 * and, with IPv6 enabled, to a struct in6_addr for any other family.
 */
static void inet_update_saddr(struct sock *sk, void *saddr, int family)
{
	if (family == AF_INET) {
		inet_sk(sk)->inet_saddr = *(__be32 *)saddr;
		sk_rcv_saddr_set(sk, inet_sk(sk)->inet_saddr);
	}
#if IS_ENABLED(CONFIG_IPV6)
	else {
		sk->sk_v6_rcv_saddr = *(struct in6_addr *)saddr;
	}
#endif
}

/* Change (or, when reset is true, clear) the source address of sk and move
 * sk to the bhash2 bucket that matches the new address.
 *
 * A socket without a bind2 bucket only has its address changed.
 * Returns 0 on success or -ENOMEM when the spare bucket cannot be
 * allocated; in the reset case the socket is then also unbound from its
 * port and its address reset.
 */
static int __inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family, bool reset)
{
	struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk);
	struct inet_bind_hashbucket *head, *head2;
	struct inet_bind2_bucket *tb2, *new_tb2;
	int l3mdev = inet_sk_bound_l3mdev(sk);
	int port = inet_sk(sk)->inet_num;
	struct net *net = sock_net(sk);
	int bhash;

	if (!inet_csk(sk)->icsk_bind2_hash) {
		/* Not bind()ed before. */
		if (reset)
			inet_reset_saddr(sk);
		else
			inet_update_saddr(sk, saddr, family);

		return 0;
	}

	/* Allocate a bind2 bucket ahead of time to avoid permanently putting
	 * the bhash2 table in an inconsistent state if a new tb2 bucket
	 * allocation fails.
	 */
	new_tb2 = kmem_cache_alloc(hinfo->bind2_bucket_cachep, GFP_ATOMIC);
	if (!new_tb2) {
		if (reset) {
			/* The (INADDR_ANY, port) bucket might have already
			 * been freed, then we cannot fixup icsk_bind2_hash,
			 * so we give up and unlink sk from bhash/bhash2 not
			 * to leave inconsistency in bhash2.
			 */
			inet_put_port(sk);
			inet_reset_saddr(sk);
		}

		return -ENOMEM;
	}

	bhash = inet_bhashfn(net, port, hinfo->bhash_size);
	head = &hinfo->bhash[bhash];
	head2 = inet_bhashfn_portaddr(hinfo, sk, net, port);

	/* If we change saddr locklessly, another thread
	 * iterating over bhash might see corrupted address.
	 */
	spin_lock_bh(&head->lock);

	/* Leave the bucket of the old address. */
	spin_lock(&head2->lock);
	__sk_del_bind_node(sk);
	inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, inet_csk(sk)->icsk_bind2_hash);
	spin_unlock(&head2->lock);

	if (reset)
		inet_reset_saddr(sk);
	else
		inet_update_saddr(sk, saddr, family);

	/* The address changed, so the bhash2 bucket is computed again. */
	head2 = inet_bhashfn_portaddr(hinfo, sk, net, port);

	spin_lock(&head2->lock);
	tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk);
	if (!tb2) {
		tb2 = new_tb2;
		inet_bind2_bucket_init(tb2, net, head2, inet_csk(sk)->icsk_bind_hash, sk);
	}
	inet_csk(sk)->icsk_bind2_hash = tb2;
	sk_add_bind_node(sk, &tb2->owners);
	spin_unlock(&head2->lock);

	spin_unlock_bh(&head->lock);

	/* The spare bucket was not needed. */
	if (tb2 != new_tb2)
		kmem_cache_free(hinfo->bind2_bucket_cachep, new_tb2);

	return 0;
}

/* Set a new source address on sk and rehash it in bhash2. */
int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family)
{
	return __inet_bhash2_update_saddr(sk, saddr, family, false);
}
EXPORT_SYMBOL_GPL(inet_bhash2_update_saddr);

/* Reset the source address of sk and rehash it in bhash2, unless the
 * SOCK_BINDADDR_LOCK bit is set in sk_userlocks.
 */
void inet_bhash2_reset_saddr(struct sock *sk)
{
	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
		__inet_bhash2_update_saddr(sk, NULL, 0, true);
}
EXPORT_SYMBOL_GPL(inet_bhash2_reset_saddr);

/* RFC 6056 3.3.4. Algorithm 4: Double-Hash Port Selection Algorithm
 * Note that we use 32bit integers (vs RFC 'short integers')
 * because 2^16 is not a multiple of num_ephemeral and this
 * property might be used by clever attacker.
 *
 * RFC claims using TABLE_LENGTH=10 buckets gives an improvement, though
 * attacks were since demonstrated, thus we use 65536 by default instead
 * to really give more isolation and privacy, at the expense of 256kB
 * of kernel memory.
*/ #define INET_TABLE_PERTURB_SIZE (1 << CONFIG_INET_TABLE_PERTURB_ORDER) static u32 *table_perturb; int __inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk, u64 port_offset, int (*check_established)(struct inet_timewait_death_row *, struct sock *, __u16, struct inet_timewait_sock **)) { struct inet_hashinfo *hinfo = death_row->hashinfo; struct inet_bind_hashbucket *head, *head2; struct inet_timewait_sock *tw = NULL; int port = inet_sk(sk)->inet_num; struct net *net = sock_net(sk); struct inet_bind2_bucket *tb2; struct inet_bind_bucket *tb; bool tb_created = false; u32 remaining, offset; int ret, i, low, high; bool local_ports; int step, l3mdev; u32 index; if (port) { local_bh_disable(); ret = check_established(death_row, sk, port, NULL); local_bh_enable(); return ret; } l3mdev = inet_sk_bound_l3mdev(sk); local_ports = inet_sk_get_local_port_range(sk, &low, &high); step = local_ports ? 1 : 2; high++; /* [32768, 60999] -> [32768, 61000[ */ remaining = high - low; if (!local_ports && remaining > 1) remaining &= ~1U; get_random_sleepable_once(table_perturb, INET_TABLE_PERTURB_SIZE * sizeof(*table_perturb)); index = port_offset & (INET_TABLE_PERTURB_SIZE - 1); offset = READ_ONCE(table_perturb[index]) + (port_offset >> 32); offset %= remaining; /* In first pass we try ports of @low parity. * inet_csk_get_port() does the opposite choice. */ if (!local_ports) offset &= ~1U; other_parity_scan: port = low + offset; for (i = 0; i < remaining; i += step, port += step) { if (unlikely(port >= high)) port -= remaining; if (inet_is_local_reserved_port(net, port)) continue; head = &hinfo->bhash[inet_bhashfn(net, port, hinfo->bhash_size)]; spin_lock_bh(&head->lock); /* Does not bother with rcv_saddr checks, because * the established check is already unique enough. 
*/ inet_bind_bucket_for_each(tb, &head->chain) { if (inet_bind_bucket_match(tb, net, port, l3mdev)) { if (tb->fastreuse >= 0 || tb->fastreuseport >= 0) goto next_port; WARN_ON(hlist_empty(&tb->bhash2)); if (!check_established(death_row, sk, port, &tw)) goto ok; goto next_port; } } tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, net, head, port, l3mdev); if (!tb) { spin_unlock_bh(&head->lock); return -ENOMEM; } tb_created = true; tb->fastreuse = -1; tb->fastreuseport = -1; goto ok; next_port: spin_unlock_bh(&head->lock); cond_resched(); } if (!local_ports) { offset++; if ((offset & 1) && remaining > 1) goto other_parity_scan; } return -EADDRNOTAVAIL; ok: /* Find the corresponding tb2 bucket since we need to * add the socket to the bhash2 table as well */ head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); spin_lock(&head2->lock); tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk); if (!tb2) { tb2 = inet_bind2_bucket_create(hinfo->bind2_bucket_cachep, net, head2, tb, sk); if (!tb2) goto error; } /* Here we want to add a little bit of randomness to the next source * port that will be chosen. We use a max() with a random here so that * on low contention the randomness is maximal and on high contention * it may be inexistent. 
*/ i = max_t(int, i, get_random_u32_below(8) * step); WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + step); /* Head lock still held and bh's disabled */ inet_bind_hash(sk, tb, tb2, port); if (sk_unhashed(sk)) { inet_sk(sk)->inet_sport = htons(port); inet_ehash_nolisten(sk, (struct sock *)tw, NULL); } if (tw) inet_twsk_bind_unhash(tw, hinfo); spin_unlock(&head2->lock); spin_unlock(&head->lock); if (tw) inet_twsk_deschedule_put(tw); local_bh_enable(); return 0; error: if (sk_hashed(sk)) { spinlock_t *lock = inet_ehash_lockp(hinfo, sk->sk_hash); sock_prot_inuse_add(net, sk->sk_prot, -1); spin_lock(lock); __sk_nulls_del_node_init_rcu(sk); spin_unlock(lock); sk->sk_hash = 0; inet_sk(sk)->inet_sport = 0; inet_sk(sk)->inet_num = 0; if (tw) inet_twsk_bind_unhash(tw, hinfo); } spin_unlock(&head2->lock); if (tb_created) inet_bind_bucket_destroy(hinfo->bind_bucket_cachep, tb); spin_unlock(&head->lock); if (tw) inet_twsk_deschedule_put(tw); local_bh_enable(); return -ENOMEM; } /* * Bind a port for a connect operation and hash it. 
*/ int inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk) { u64 port_offset = 0; if (!inet_sk(sk)->inet_num) port_offset = inet_sk_port_offset(sk); return __inet_hash_connect(death_row, sk, port_offset, __inet_check_established); } EXPORT_SYMBOL_GPL(inet_hash_connect); static void init_hashinfo_lhash2(struct inet_hashinfo *h) { int i; for (i = 0; i <= h->lhash2_mask; i++) { spin_lock_init(&h->lhash2[i].lock); INIT_HLIST_NULLS_HEAD(&h->lhash2[i].nulls_head, i + LISTENING_NULLS_BASE); } } void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name, unsigned long numentries, int scale, unsigned long low_limit, unsigned long high_limit) { h->lhash2 = alloc_large_system_hash(name, sizeof(*h->lhash2), numentries, scale, 0, NULL, &h->lhash2_mask, low_limit, high_limit); init_hashinfo_lhash2(h); /* this one is used for source ports of outgoing connections */ table_perturb = alloc_large_system_hash("Table-perturb", sizeof(*table_perturb), INET_TABLE_PERTURB_SIZE, 0, 0, NULL, NULL, INET_TABLE_PERTURB_SIZE, INET_TABLE_PERTURB_SIZE); } int inet_hashinfo2_init_mod(struct inet_hashinfo *h) { h->lhash2 = kmalloc_array(INET_LHTABLE_SIZE, sizeof(*h->lhash2), GFP_KERNEL); if (!h->lhash2) return -ENOMEM; h->lhash2_mask = INET_LHTABLE_SIZE - 1; /* INET_LHTABLE_SIZE must be a power of 2 */ BUG_ON(INET_LHTABLE_SIZE & h->lhash2_mask); init_hashinfo_lhash2(h); return 0; } EXPORT_SYMBOL_GPL(inet_hashinfo2_init_mod); int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo) { unsigned int locksz = sizeof(spinlock_t); unsigned int i, nblocks = 1; if (locksz != 0) { /* allocate 2 cache lines or at least one spinlock per cpu */ nblocks = max(2U * L1_CACHE_BYTES / locksz, 1U); nblocks = roundup_pow_of_two(nblocks * num_possible_cpus()); /* no more locks than number of hash buckets */ nblocks = min(nblocks, hashinfo->ehash_mask + 1); hashinfo->ehash_locks = kvmalloc_array(nblocks, locksz, GFP_KERNEL); if (!hashinfo->ehash_locks) return -ENOMEM; for (i = 
0; i < nblocks; i++) spin_lock_init(&hashinfo->ehash_locks[i]); } hashinfo->ehash_locks_mask = nblocks - 1; return 0; } EXPORT_SYMBOL_GPL(inet_ehash_locks_alloc); struct inet_hashinfo *inet_pernet_hashinfo_alloc(struct inet_hashinfo *hashinfo, unsigned int ehash_entries) { struct inet_hashinfo *new_hashinfo; int i; new_hashinfo = kmemdup(hashinfo, sizeof(*hashinfo), GFP_KERNEL); if (!new_hashinfo) goto err; new_hashinfo->ehash = vmalloc_huge(ehash_entries * sizeof(struct inet_ehash_bucket), GFP_KERNEL_ACCOUNT); if (!new_hashinfo->ehash) goto free_hashinfo; new_hashinfo->ehash_mask = ehash_entries - 1; if (inet_ehash_locks_alloc(new_hashinfo)) goto free_ehash; for (i = 0; i < ehash_entries; i++) INIT_HLIST_NULLS_HEAD(&new_hashinfo->ehash[i].chain, i); new_hashinfo->pernet = true; return new_hashinfo; free_ehash: vfree(new_hashinfo->ehash); free_hashinfo: kfree(new_hashinfo); err: return NULL; } EXPORT_SYMBOL_GPL(inet_pernet_hashinfo_alloc); void inet_pernet_hashinfo_free(struct inet_hashinfo *hashinfo) { if (!hashinfo->pernet) return; inet_ehash_locks_free(hashinfo); vfree(hashinfo->ehash); kfree(hashinfo); } EXPORT_SYMBOL_GPL(inet_pernet_hashinfo_free);
26 31 7 42 8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner */ #ifndef _NET_BATMAN_ADV_HASH_H_ #define _NET_BATMAN_ADV_HASH_H_ #include "main.h" #include <linux/atomic.h> #include <linux/compiler.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/rculist.h> #include <linux/spinlock.h> #include <linux/stddef.h> #include <linux/types.h> /* callback to a compare function. should compare 2 element data for their * keys * * Return: true if same and false if not same */ typedef bool (*batadv_hashdata_compare_cb)(const struct hlist_node *, const void *); /* the hashfunction * * Return: an index based on the key in the data of the first argument and the * size the second */ typedef u32 (*batadv_hashdata_choose_cb)(const void *, u32); typedef void (*batadv_hashdata_free_cb)(struct hlist_node *, void *); /** * struct batadv_hashtable - Wrapper of simple hlist based hashtable */ struct batadv_hashtable { /** @table: the hashtable itself with the buckets */ struct hlist_head *table; /** @list_locks: spinlock for each hash list entry */ spinlock_t *list_locks; /** @size: size of hashtable */ u32 size; /** @generation: current (generation) sequence number */ atomic_t generation; }; /* allocates and clears the hash */ struct batadv_hashtable *batadv_hash_new(u32 size); /* set class key for all locks */ void batadv_hash_set_lock_class(struct batadv_hashtable *hash, struct 
lock_class_key *key); /* free only the hashtable and the hash itself. */ void batadv_hash_destroy(struct batadv_hashtable *hash); /** * batadv_hash_add() - adds data to the hashtable * @hash: storage hash table * @compare: callback to determine if 2 hash elements are identical * @choose: callback calculating the hash index * @data: data passed to the aforementioned callbacks as argument * @data_node: to be added element * * Return: 0 on success, 1 if the element already is in the hash * and -1 on error. */ static inline int batadv_hash_add(struct batadv_hashtable *hash, batadv_hashdata_compare_cb compare, batadv_hashdata_choose_cb choose, const void *data, struct hlist_node *data_node) { u32 index; int ret = -1; struct hlist_head *head; struct hlist_node *node; spinlock_t *list_lock; /* spinlock to protect write access */ if (!hash) goto out; index = choose(data, hash->size); head = &hash->table[index]; list_lock = &hash->list_locks[index]; spin_lock_bh(list_lock); hlist_for_each(node, head) { if (!compare(node, data)) continue; ret = 1; goto unlock; } /* no duplicate found in list, add new element */ hlist_add_head_rcu(data_node, head); atomic_inc(&hash->generation); ret = 0; unlock: spin_unlock_bh(list_lock); out: return ret; } /** * batadv_hash_remove() - Removes data from hash, if found * @hash: hash table * @compare: callback to determine if 2 hash elements are identical * @choose: callback calculating the hash index * @data: data passed to the aforementioned callbacks as argument * * ata could be the structure you use with just the key filled, we just need * the key for comparing. 
* * Return: returns pointer do data on success, so you can remove the used * structure yourself, or NULL on error */ static inline void *batadv_hash_remove(struct batadv_hashtable *hash, batadv_hashdata_compare_cb compare, batadv_hashdata_choose_cb choose, void *data) { u32 index; struct hlist_node *node; struct hlist_head *head; void *data_save = NULL; index = choose(data, hash->size); head = &hash->table[index]; spin_lock_bh(&hash->list_locks[index]); hlist_for_each(node, head) { if (!compare(node, data)) continue; data_save = node; hlist_del_rcu(node); atomic_inc(&hash->generation); break; } spin_unlock_bh(&hash->list_locks[index]); return data_save; } #endif /* _NET_BATMAN_ADV_HASH_H_ */
12 1 1 2 2 1 2 1 17 17 17 8 3 6 13 5 1 12 2 3 8 7 1 1 4 1 15 10 15 10 2 2 17 11 11 5 1 4 1 5 5 1 5 3 1 2 1 1 1 9 3 1 2 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 
496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2017 Red Hat, Inc.
 */

#include "fuse_i.h"

#include <linux/uio.h>
#include <linux/compat.h>
#include <linux/fileattr.h>
#include <linux/fsverity.h>

/* Upper bound, in pages, for the salt and signature buffers of
 * FS_IOC_ENABLE_VERITY (see fuse_setup_enable_verity()).
 */
#define FUSE_VERITY_ENABLE_ARG_MAX_PAGES 256

/*
 * Send an ioctl request with @outarg installed as the first output argument.
 *
 * -ENOSYS is translated to -ENOTTY, both in the request's return value and
 * in outarg->result of a request that itself succeeded.
 *
 * Returns whatever fuse_simple_request() returned (after that translation).
 */
static ssize_t fuse_send_ioctl(struct fuse_mount *fm, struct fuse_args *args,
			       struct fuse_ioctl_out *outarg)
{
	ssize_t ret;

	args->out_args[0].size = sizeof(*outarg);
	args->out_args[0].value = outarg;

	ret = fuse_simple_request(fm, args);

	/* Translate ENOSYS, which shouldn't be returned from fs */
	if (ret == -ENOSYS)
		ret = -ENOTTY;

	if (ret >= 0 && outarg->result == -ENOSYS)
		outarg->result = -ENOTTY;

	return ret;
}

/*
 * CUSE servers compiled on 32bit broke on 64bit kernels because the
 * ABI was defined to be 'struct iovec' which is different on 32bit
 * and 64bit.  Fortunately we can determine which structure the server
 * used from the size of the reply.
 *
 * Returns 0 on success, -EINVAL if a compat-sized reply arrives for a
 * non-compat request, -EIO if the reply size matches neither layout.
 */
static int fuse_copy_ioctl_iovec_old(struct iovec *dst, void *src,
				     size_t transferred, unsigned count,
				     bool is_compat)
{
#ifdef CONFIG_COMPAT
	if (count * sizeof(struct compat_iovec) == transferred) {
		struct compat_iovec *ciov = src;
		unsigned i;

		/*
		 * With this interface a 32bit server cannot support
		 * non-compat (i.e. ones coming from 64bit apps) ioctl
		 * requests
		 */
		if (!is_compat)
			return -EINVAL;

		for (i = 0; i < count; i++) {
			dst[i].iov_base = compat_ptr(ciov[i].iov_base);
			dst[i].iov_len = ciov[i].iov_len;
		}
		return 0;
	}
#endif

	if (count * sizeof(struct iovec) != transferred)
		return -EIO;

	memcpy(dst, src, transferred);
	return 0;
}

/*
 * Make sure iov_length() won't overflow
 *
 * The lengths of the @count entries are charged against a budget of
 * fc->max_pages pages; -ENOMEM is returned as soon as an entry exceeds what
 * is left of it, 0 otherwise.
 */
static int fuse_verify_ioctl_iov(struct fuse_conn *fc, struct iovec *iov,
				 size_t count)
{
	size_t n;
	u32 max = fc->max_pages << PAGE_SHIFT;

	for (n = 0; n < count; n++, iov++) {
		if (iov->iov_len > (size_t) max)
			return -ENOMEM;
		max -= iov->iov_len;
	}
	return 0;
}

/*
 * Convert the @count iovec descriptors the server sent back in @src into
 * struct iovec entries in @dst.
 *
 * Connections with protocol minor < 16 use the legacy layouts handled by
 * fuse_copy_ioctl_iovec_old(); newer ones send struct fuse_ioctl_iovec.
 *
 * Returns 0 on success, -EIO if @transferred does not match @count entries
 * or a value does not survive conversion to the native (or, for compat
 * callers, the compat) type.
 */
static int fuse_copy_ioctl_iovec(struct fuse_conn *fc, struct iovec *dst,
				 void *src, size_t transferred, unsigned count,
				 bool is_compat)
{
	unsigned i;
	struct fuse_ioctl_iovec *fiov = src;

	if (fc->minor < 16) {
		return fuse_copy_ioctl_iovec_old(dst, src, transferred,
						 count, is_compat);
	}

	if (count * sizeof(struct fuse_ioctl_iovec) != transferred)
		return -EIO;

	for (i = 0; i < count; i++) {
		/* Did the server supply an inappropriate value? */
		if (fiov[i].base != (unsigned long) fiov[i].base ||
		    fiov[i].len != (unsigned long) fiov[i].len)
			return -EIO;

		dst[i].iov_base = (void __user *) (unsigned long) fiov[i].base;
		dst[i].iov_len = (size_t) fiov[i].len;

#ifdef CONFIG_COMPAT
		if (is_compat &&
		    (ptr_to_compat(dst[i].iov_base) != fiov[i].base ||
		     (compat_size_t) dst[i].iov_len != fiov[i].len))
			return -EIO;
#endif
	}

	return 0;
}

/*
 * For fs-verity, determine iov lengths from input
 *
 * Reads digest_size from the user's struct fsverity_digest at @arg and sets
 * iov->iov_len to the size of the structure plus that many digest bytes.
 *
 * Returns 0 on success, -EFAULT if the field cannot be read, -EINVAL if the
 * sum would overflow.
 */
static int fuse_setup_measure_verity(unsigned long arg, struct iovec *iov)
{
	__u16 digest_size;
	struct fsverity_digest __user *uarg = (void __user *)arg;

	if (copy_from_user(&digest_size, &uarg->digest_size, sizeof(digest_size)))
		return -EFAULT;

	if (digest_size > SIZE_MAX - sizeof(struct fsverity_digest))
		return -EINVAL;

	iov->iov_len = sizeof(struct fsverity_digest) + digest_size;

	return 0;
}

/*
 * For FS_IOC_ENABLE_VERITY, append input iovecs for the salt and signature
 * buffers referenced by the user's struct fsverity_enable_arg at @arg.
 *
 * @iov points at the entry describing the argument structure itself; one
 * further entry is filled in (and *@in_iovs incremented) for each of salt
 * and signature that has a non-zero size.
 *
 * Returns 0 on success, -EFAULT if the structure cannot be read, -ENOMEM if
 * either size exceeds FUSE_VERITY_ENABLE_ARG_MAX_PAGES pages.
 */
static int fuse_setup_enable_verity(unsigned long arg, struct iovec *iov,
				    unsigned int *in_iovs)
{
	struct fsverity_enable_arg enable;
	struct fsverity_enable_arg __user *uarg = (void __user *)arg;
	const __u32 max_buffer_len = FUSE_VERITY_ENABLE_ARG_MAX_PAGES * PAGE_SIZE;

	if (copy_from_user(&enable, uarg, sizeof(enable)))
		return -EFAULT;

	if (enable.salt_size > max_buffer_len || enable.sig_size > max_buffer_len)
		return -ENOMEM;

	if (enable.salt_size > 0) {
		iov++;
		(*in_iovs)++;

		iov->iov_base = u64_to_user_ptr(enable.salt_ptr);
		iov->iov_len = enable.salt_size;
	}

	if (enable.sig_size > 0) {
		iov++;
		(*in_iovs)++;

		iov->iov_base = u64_to_user_ptr(enable.sig_ptr);
		iov->iov_len = enable.sig_size;
	}
	return 0;
}

/*
 * For ioctls, there is no generic way to determine how much memory
 * needs to be read and/or written.  Furthermore, ioctls are allowed
 * to dereference the passed pointer, so the parameter requires deep
 * copying but FUSE has no idea whatsoever about what to copy in or
 * out.
 *
 * This is solved by allowing FUSE server to retry ioctl with
 * necessary in/out iovecs.  Let's assume the ioctl implementation
 * needs to read in the following structure.
 *
 * struct a {
 *	char	*buf;
 *	size_t	buflen;
 * }
 *
 * On the first callout to FUSE server, inarg->in_size and
 * inarg->out_size will be zero; then, the server completes the ioctl
 * with FUSE_IOCTL_RETRY set in out->flags, out->in_iovs set to 1 and
 * the actual iov array to
 *
 * { { .iov_base = inarg.arg,	.iov_len = sizeof(struct a) } }
 *
 * which tells FUSE to copy in the requested area and retry the ioctl.
 * On the second round, the server has access to the structure and
 * from that it can tell what to look for next, so on the invocation,
 * it sets FUSE_IOCTL_RETRY, out->in_iovs to 2 and iov array to
 *
 * { { .iov_base = inarg.arg,	.iov_len = sizeof(struct a)	},
 *   { .iov_base = a.buf,	.iov_len = a.buflen		} }
 *
 * FUSE will copy both struct a and the pointed buffer from the
 * process doing the ioctl and retry ioctl with both struct a and the
 * buffer.
 *
 * This time, FUSE server has everything it needs and completes ioctl
 * without FUSE_IOCTL_RETRY which finishes the ioctl call.
 *
 * Copying data out works the same way.
 *
 * Note that if FUSE_IOCTL_UNRESTRICTED is clear, the kernel
 * automatically initializes in and out iovs by decoding @cmd with
 * _IOC_* macros and the server is not allowed to request RETRY.  This
 * limits ioctl data transfers to well-formed ioctls and is the forced
 * behavior for all FUSE servers.
 *
 * Returns a negative errno on failure, otherwise the result value reported
 * by the server in the ioctl reply.
 */
long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
		   unsigned int flags)
{
	struct fuse_file *ff = file->private_data;
	struct fuse_mount *fm = ff->fm;
	struct fuse_ioctl_in inarg = {
		.fh = ff->fh,
		.cmd = cmd,
		.arg = arg,
		.flags = flags
	};
	struct fuse_ioctl_out outarg;
	struct iovec *iov_page = NULL;
	struct iovec *in_iov = NULL, *out_iov = NULL;
	unsigned int in_iovs = 0, out_iovs = 0, max_pages;
	size_t in_size, out_size, c;
	ssize_t transferred;
	int err, i;
	struct iov_iter ii;
	struct fuse_args_pages ap = {};

#if BITS_PER_LONG == 32
	inarg.flags |= FUSE_IOCTL_32BIT;
#else
	if (flags & FUSE_IOCTL_COMPAT) {
		inarg.flags |= FUSE_IOCTL_32BIT;
#ifdef CONFIG_X86_X32_ABI
		if (in_x32_syscall())
			inarg.flags |= FUSE_IOCTL_COMPAT_X32;
#endif
	}
#endif

	/* assume all the iovs returned by client always fits in a page */
	BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE);

	err = -ENOMEM;
	ap.folios = fuse_folios_alloc(fm->fc->max_pages, GFP_KERNEL, &ap.descs);
	iov_page = (struct iovec *) __get_free_page(GFP_KERNEL);
	if (!ap.folios || !iov_page)
		goto out;

	fuse_folio_descs_length_init(ap.descs, 0, fm->fc->max_pages);

	/*
	 * If restricted, initialize IO parameters as encoded in @cmd.
	 * RETRY from server is not allowed.
	 */
	if (!(flags & FUSE_IOCTL_UNRESTRICTED)) {
		struct iovec *iov = iov_page;

		iov->iov_base = (void __user *)arg;
		iov->iov_len = _IOC_SIZE(cmd);

		if (_IOC_DIR(cmd) & _IOC_WRITE) {
			in_iov = iov;
			in_iovs = 1;
		}

		if (_IOC_DIR(cmd) & _IOC_READ) {
			out_iov = iov;
			out_iovs = 1;
		}

		/* The fs-verity ioctls need iovecs derived from their argument. */
		err = 0;
		switch (cmd) {
		case FS_IOC_MEASURE_VERITY:
			err = fuse_setup_measure_verity(arg, iov);
			break;
		case FS_IOC_ENABLE_VERITY:
			err = fuse_setup_enable_verity(arg, iov, &in_iovs);
			break;
		}
		if (err)
			goto out;
	}

 retry:
	inarg.in_size = in_size = iov_length(in_iov, in_iovs);
	inarg.out_size = out_size = iov_length(out_iov, out_iovs);

	/*
	 * Out data can be used either for actual out data or iovs,
	 * make sure there always is at least one page.
	 */
	out_size = max_t(size_t, out_size, PAGE_SIZE);
	max_pages = DIV_ROUND_UP(max(in_size, out_size), PAGE_SIZE);

	/* make sure there are enough buffer pages and init request with them */
	err = -ENOMEM;
	if (max_pages > fm->fc->max_pages)
		goto out;
	while (ap.num_folios < max_pages) {
		ap.folios[ap.num_folios] = folio_alloc(GFP_KERNEL | __GFP_HIGHMEM, 0);
		if (!ap.folios[ap.num_folios])
			goto out;
		ap.num_folios++;
	}

	/* okay, let's send it to the client */
	ap.args.opcode = FUSE_IOCTL;
	ap.args.nodeid = ff->nodeid;
	ap.args.in_numargs = 1;
	ap.args.in_args[0].size = sizeof(inarg);
	ap.args.in_args[0].value = &inarg;
	if (in_size) {
		ap.args.in_numargs++;
		ap.args.in_args[1].size = in_size;
		ap.args.in_pages = true;

		/* Gather the caller's input data into the request folios. */
		err = -EFAULT;
		iov_iter_init(&ii, ITER_SOURCE, in_iov, in_iovs, in_size);
		for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_folios); i++) {
			c = copy_folio_from_iter(ap.folios[i], 0, PAGE_SIZE, &ii);
			/* A short copy with data still pending means a fault. */
			if (c != PAGE_SIZE && iov_iter_count(&ii))
				goto out;
		}
	}

	ap.args.out_numargs = 2;
	ap.args.out_args[1].size = out_size;
	ap.args.out_pages = true;
	ap.args.out_argvar = true;

	transferred = fuse_send_ioctl(fm, &ap.args, &outarg);
	err = transferred;
	if (transferred < 0)
		goto out;

	/* did it ask for retry? */
	if (outarg.flags & FUSE_IOCTL_RETRY) {
		void *vaddr;

		/* no retry if in restricted mode */
		err = -EIO;
		if (!(flags & FUSE_IOCTL_UNRESTRICTED))
			goto out;

		in_iovs = outarg.in_iovs;
		out_iovs = outarg.out_iovs;

		/*
		 * Make sure things are in boundary, separate checks
		 * are to protect against overflow.
		 */
		err = -ENOMEM;
		if (in_iovs > FUSE_IOCTL_MAX_IOV ||
		    out_iovs > FUSE_IOCTL_MAX_IOV ||
		    in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV)
			goto out;

		/* The reply's first folio holds the requested iovec array. */
		vaddr = kmap_local_folio(ap.folios[0], 0);
		err = fuse_copy_ioctl_iovec(fm->fc, iov_page, vaddr,
					    transferred, in_iovs + out_iovs,
					    (flags & FUSE_IOCTL_COMPAT) != 0);
		kunmap_local(vaddr);
		if (err)
			goto out;

		/* Input iovecs come first, output iovecs directly after. */
		in_iov = iov_page;
		out_iov = in_iov + in_iovs;

		err = fuse_verify_ioctl_iov(fm->fc, in_iov, in_iovs);
		if (err)
			goto out;

		err = fuse_verify_ioctl_iov(fm->fc, out_iov, out_iovs);
		if (err)
			goto out;

		goto retry;
	}

	/* The server must not return more than was asked for. */
	err = -EIO;
	if (transferred > inarg.out_size)
		goto out;

	/* Scatter the reply data out to the caller's output iovecs. */
	err = -EFAULT;
	iov_iter_init(&ii, ITER_DEST, out_iov, out_iovs, transferred);
	for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_folios); i++) {
		c = copy_folio_to_iter(ap.folios[i], 0, PAGE_SIZE, &ii);
		if (c != PAGE_SIZE && iov_iter_count(&ii))
			goto out;
	}
	err = 0;
 out:
	free_page((unsigned long) iov_page);
	while (ap.num_folios)
		folio_put(ap.folios[--ap.num_folios]);
	kfree(ap.folios);

	return err ? err : outarg.result;
}
EXPORT_SYMBOL_GPL(fuse_do_ioctl);

/*
 * Common entry point for the file ioctl handlers: refuses callers rejected
 * by fuse_allow_current_process() (-EACCES) and bad inodes (-EIO), then
 * hands over to fuse_do_ioctl().
 */
long fuse_ioctl_common(struct file *file, unsigned int cmd,
		       unsigned long arg, unsigned int flags)
{
	struct inode *inode = file_inode(file);
	struct fuse_conn *fc = get_fuse_conn(inode);

	if (!fuse_allow_current_process(fc))
		return -EACCES;

	if (fuse_is_bad(inode))
		return -EIO;

	return fuse_do_ioctl(file, cmd, arg, flags);
}

/* ioctl handler: fuse_ioctl_common() with no extra flags. */
long fuse_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	return fuse_ioctl_common(file, cmd, arg, 0);
}

/* Compat ioctl handler: fuse_ioctl_common() with FUSE_IOCTL_COMPAT set. */
long fuse_file_compat_ioctl(struct file *file, unsigned int cmd,
			    unsigned long arg)
{
	return fuse_ioctl_common(file, cmd, arg, FUSE_IOCTL_COMPAT);
}

/*
 * Issue a FUSE_IOCTL whose argument lives in the kernel buffer @ptr of
 * @size bytes. The direction bits of @cmd decide whether @ptr is sent as
 * input, filled as output, or both.
 *
 * Returns 0 on success, the request error, a negative result reported by
 * the server, or -EIO if the server asks for a retry.
 */
static int fuse_priv_ioctl(struct inode *inode, struct fuse_file *ff,
			   unsigned int cmd, void *ptr, size_t size)
{
	struct fuse_mount *fm = ff->fm;
	struct fuse_ioctl_in inarg;
	struct fuse_ioctl_out outarg;
	FUSE_ARGS(args);
	int err;

	memset(&inarg, 0, sizeof(inarg));
	inarg.fh = ff->fh;
	inarg.cmd = cmd;

#if BITS_PER_LONG == 32
	inarg.flags |= FUSE_IOCTL_32BIT;
#endif
	if (S_ISDIR(inode->i_mode))
		inarg.flags |= FUSE_IOCTL_DIR;

	if (_IOC_DIR(cmd) & _IOC_READ)
		inarg.out_size = size;
	if (_IOC_DIR(cmd) & _IOC_WRITE)
		inarg.in_size = size;

	args.opcode = FUSE_IOCTL;
	args.nodeid = ff->nodeid;
	args.in_numargs = 2;
	args.in_args[0].size = sizeof(inarg);
	args.in_args[0].value = &inarg;
	args.in_args[1].size = inarg.in_size;
	args.in_args[1].value = ptr;
	args.out_numargs = 2;
	args.out_args[1].size = inarg.out_size;
	args.out_args[1].value = ptr;

	err = fuse_send_ioctl(fm, &args, &outarg);
	if (!err) {
		if (outarg.result < 0)
			err = outarg.result;
		else if (outarg.flags & FUSE_IOCTL_RETRY)
			err = -EIO;
	}
	return err;
}

/*
 * Open a temporary fuse_file on @inode for use with fuse_priv_ioctl().
 *
 * Returns the result of fuse_file_open(), or ERR_PTR(-EACCES) when
 * fuse_allow_current_process() rejects the caller, ERR_PTR(-EIO) for a bad
 * inode, ERR_PTR(-ENOTTY) for anything but a regular file or directory.
 */
static struct fuse_file *fuse_priv_ioctl_prepare(struct inode *inode)
{
	struct fuse_mount *fm = get_fuse_mount(inode);
	bool isdir = S_ISDIR(inode->i_mode);

	if (!fuse_allow_current_process(fm->fc))
		return ERR_PTR(-EACCES);

	if (fuse_is_bad(inode))
		return ERR_PTR(-EIO);

	if (!S_ISREG(inode->i_mode) && !isdir)
		return ERR_PTR(-ENOTTY);

	return fuse_file_open(fm, get_node_id(inode), O_RDONLY, isdir);
}

/* Release the fuse_file obtained from fuse_priv_ioctl_prepare(). */
static void fuse_priv_ioctl_cleanup(struct inode *inode, struct fuse_file *ff)
{
	fuse_file_release(inode, ff, O_RDONLY, NULL, S_ISDIR(inode->i_mode));
}

/*
 * Fetch file attributes from the server into @fa: via FS_IOC_GETFLAGS when
 * fa->flags_valid is set, via FS_IOC_FSGETXATTR otherwise.
 *
 * Returns 0 on success or a negative errno.
 */
int fuse_fileattr_get(struct dentry *dentry, struct fileattr *fa)
{
	struct inode *inode = d_inode(dentry);
	struct fuse_file *ff;
	unsigned int flags;
	struct fsxattr xfa;
	int err;

	ff = fuse_priv_ioctl_prepare(inode);
	if (IS_ERR(ff))
		return PTR_ERR(ff);

	if (fa->flags_valid) {
		err = fuse_priv_ioctl(inode, ff, FS_IOC_GETFLAGS,
				      &flags, sizeof(flags));
		if (err)
			goto cleanup;

		fileattr_fill_flags(fa, flags);
	} else {
		err = fuse_priv_ioctl(inode, ff, FS_IOC_FSGETXATTR,
				      &xfa, sizeof(xfa));
		if (err)
			goto cleanup;

		fileattr_fill_xflags(fa, xfa.fsx_xflags);
		fa->fsx_extsize = xfa.fsx_extsize;
		fa->fsx_nextents = xfa.fsx_nextents;
		fa->fsx_projid = xfa.fsx_projid;
		fa->fsx_cowextsize = xfa.fsx_cowextsize;
	}
cleanup:
	fuse_priv_ioctl_cleanup(inode, ff);

	return err;
}

/*
 * Push the file attributes in @fa to the server: via FS_IOC_SETFLAGS when
 * fa->flags_valid is set, via FS_IOC_FSSETXATTR otherwise.
 *
 * Returns 0 on success or a negative errno.
 */
int fuse_fileattr_set(struct mnt_idmap *idmap,
		      struct dentry *dentry, struct fileattr *fa)
{
	struct inode *inode = d_inode(dentry);
	struct fuse_file *ff;
	unsigned int flags = fa->flags;
	struct fsxattr xfa;
	int err;

	ff = fuse_priv_ioctl_prepare(inode);
	if (IS_ERR(ff))
		return PTR_ERR(ff);

	if (fa->flags_valid) {
		err = fuse_priv_ioctl(inode, ff, FS_IOC_SETFLAGS,
				      &flags, sizeof(flags));
		if (err)
			goto cleanup;
	} else {
		memset(&xfa, 0, sizeof(xfa));
		xfa.fsx_xflags = fa->fsx_xflags;
		xfa.fsx_extsize = fa->fsx_extsize;
		xfa.fsx_nextents = fa->fsx_nextents;
		xfa.fsx_projid = fa->fsx_projid;
		xfa.fsx_cowextsize = fa->fsx_cowextsize;

		err = fuse_priv_ioctl(inode, ff, FS_IOC_FSSETXATTR,
				      &xfa, sizeof(xfa));
	}

cleanup:
	fuse_priv_ioctl_cleanup(inode, ff);

	return err;
}
1 1 1 1 1 1 1 2 5 13 13 6 1 5 2 2 2 7 7 1 1 1 2 1 1 7 5 5 5 5 5 5 5 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 
509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 // SPDX-License-Identifier: GPL-2.0-only /* * * Copyright (C) Hans Alblas PE1AYX <hans@esrac.ele.tue.nl> * 
Copyright (C) 2004, 05 Ralf Baechle DL5RB <ralf@linux-mips.org> * Copyright (C) 2004, 05 Thomas Osterried DL9SAU <thomas@x-berg.in-berlin.de> */ #include <linux/module.h> #include <linux/bitops.h> #include <linux/uaccess.h> #include <linux/crc16.h> #include <linux/string.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/slab.h> #include <linux/tty.h> #include <linux/errno.h> #include <linux/netdevice.h> #include <linux/major.h> #include <linux/init.h> #include <linux/rtnetlink.h> #include <linux/etherdevice.h> #include <linux/skbuff.h> #include <linux/if_arp.h> #include <linux/jiffies.h> #include <linux/refcount.h> #include <net/ax25.h> #define AX_MTU 236 /* some arch define END as assembly function ending, just undef it */ #undef END /* SLIP/KISS protocol characters. */ #define END 0300 /* indicates end of frame */ #define ESC 0333 /* indicates byte stuffing */ #define ESC_END 0334 /* ESC ESC_END means END 'data' */ #define ESC_ESC 0335 /* ESC ESC_ESC means ESC 'data' */ struct mkiss { struct tty_struct *tty; /* ptr to TTY structure */ struct net_device *dev; /* easy for intr handling */ /* These are pointers to the malloc()ed frame buffers. */ spinlock_t buflock;/* lock for rbuf and xbuf */ unsigned char *rbuff; /* receiver buffer */ int rcount; /* received chars counter */ unsigned char *xbuff; /* transmitter buffer */ unsigned char *xhead; /* pointer to next byte to XMIT */ int xleft; /* bytes left in XMIT queue */ /* Detailed SLIP statistics. */ int mtu; /* Our mtu (to spot changes!) */ int buffsize; /* Max buffers sizes */ unsigned long flags; /* Flag values/ mode etc */ /* long req'd: used by set_bit --RR */ #define AXF_INUSE 0 /* Channel in use */ #define AXF_ESCAPE 1 /* ESC received */ #define AXF_ERROR 2 /* Parity, etc. error */ #define AXF_KEEPTEST 3 /* Keepalive test flag */ #define AXF_OUTWAIT 4 /* is outpacket was flag */ int mode; int crcmode; /* MW: for FlexNet, SMACK etc. 
*/ int crcauto; /* CRC auto mode */ #define CRC_MODE_NONE 0 #define CRC_MODE_FLEX 1 #define CRC_MODE_SMACK 2 #define CRC_MODE_FLEX_TEST 3 #define CRC_MODE_SMACK_TEST 4 refcount_t refcnt; struct completion dead; }; /*---------------------------------------------------------------------------*/ static const unsigned short crc_flex_table[] = { 0x0f87, 0x1e0e, 0x2c95, 0x3d1c, 0x49a3, 0x582a, 0x6ab1, 0x7b38, 0x83cf, 0x9246, 0xa0dd, 0xb154, 0xc5eb, 0xd462, 0xe6f9, 0xf770, 0x1f06, 0x0e8f, 0x3c14, 0x2d9d, 0x5922, 0x48ab, 0x7a30, 0x6bb9, 0x934e, 0x82c7, 0xb05c, 0xa1d5, 0xd56a, 0xc4e3, 0xf678, 0xe7f1, 0x2e85, 0x3f0c, 0x0d97, 0x1c1e, 0x68a1, 0x7928, 0x4bb3, 0x5a3a, 0xa2cd, 0xb344, 0x81df, 0x9056, 0xe4e9, 0xf560, 0xc7fb, 0xd672, 0x3e04, 0x2f8d, 0x1d16, 0x0c9f, 0x7820, 0x69a9, 0x5b32, 0x4abb, 0xb24c, 0xa3c5, 0x915e, 0x80d7, 0xf468, 0xe5e1, 0xd77a, 0xc6f3, 0x4d83, 0x5c0a, 0x6e91, 0x7f18, 0x0ba7, 0x1a2e, 0x28b5, 0x393c, 0xc1cb, 0xd042, 0xe2d9, 0xf350, 0x87ef, 0x9666, 0xa4fd, 0xb574, 0x5d02, 0x4c8b, 0x7e10, 0x6f99, 0x1b26, 0x0aaf, 0x3834, 0x29bd, 0xd14a, 0xc0c3, 0xf258, 0xe3d1, 0x976e, 0x86e7, 0xb47c, 0xa5f5, 0x6c81, 0x7d08, 0x4f93, 0x5e1a, 0x2aa5, 0x3b2c, 0x09b7, 0x183e, 0xe0c9, 0xf140, 0xc3db, 0xd252, 0xa6ed, 0xb764, 0x85ff, 0x9476, 0x7c00, 0x6d89, 0x5f12, 0x4e9b, 0x3a24, 0x2bad, 0x1936, 0x08bf, 0xf048, 0xe1c1, 0xd35a, 0xc2d3, 0xb66c, 0xa7e5, 0x957e, 0x84f7, 0x8b8f, 0x9a06, 0xa89d, 0xb914, 0xcdab, 0xdc22, 0xeeb9, 0xff30, 0x07c7, 0x164e, 0x24d5, 0x355c, 0x41e3, 0x506a, 0x62f1, 0x7378, 0x9b0e, 0x8a87, 0xb81c, 0xa995, 0xdd2a, 0xcca3, 0xfe38, 0xefb1, 0x1746, 0x06cf, 0x3454, 0x25dd, 0x5162, 0x40eb, 0x7270, 0x63f9, 0xaa8d, 0xbb04, 0x899f, 0x9816, 0xeca9, 0xfd20, 0xcfbb, 0xde32, 0x26c5, 0x374c, 0x05d7, 0x145e, 0x60e1, 0x7168, 0x43f3, 0x527a, 0xba0c, 0xab85, 0x991e, 0x8897, 0xfc28, 0xeda1, 0xdf3a, 0xceb3, 0x3644, 0x27cd, 0x1556, 0x04df, 0x7060, 0x61e9, 0x5372, 0x42fb, 0xc98b, 0xd802, 0xea99, 0xfb10, 0x8faf, 0x9e26, 0xacbd, 0xbd34, 0x45c3, 0x544a, 0x66d1, 0x7758, 0x03e7, 0x126e, 0x20f5, 
0x317c,
	0xd90a, 0xc883, 0xfa18, 0xeb91, 0x9f2e, 0x8ea7, 0xbc3c, 0xadb5,
	0x5542, 0x44cb, 0x7650, 0x67d9, 0x1366, 0x02ef, 0x3074, 0x21fd,
	0xe889, 0xf900, 0xcb9b, 0xda12, 0xaead, 0xbf24, 0x8dbf, 0x9c36,
	0x64c1, 0x7548, 0x47d3, 0x565a, 0x22e5, 0x336c, 0x01f7, 0x107e,
	0xf808, 0xe981, 0xdb1a, 0xca93, 0xbe2c, 0xafa5, 0x9d3e, 0x8cb7,
	0x7440, 0x65c9, 0x5752, 0x46db, 0x3264, 0x23ed, 0x1176, 0x00ff
};

/*
 * Compute the table-driven 16-bit CRC of a buffer.
 *
 * cp:   first byte to checksum
 * size: number of bytes to process
 *
 * The running value starts at 0xffff and every input byte is folded in
 * through crc_flex_table.  Returns the final CRC.
 *
 * NOTE(review): the loop is "while (size--)", so it relies on size being
 * non-negative -- confirm against the callers.
 */
static unsigned short calc_crc_flex(unsigned char *cp, int size)
{
	unsigned short crc = 0xffff;

	/* Table index is the high CRC byte XORed with the next input byte. */
	while (size--)
		crc = (crc << 8) ^ crc_flex_table[((crc >> 8) ^ *cp++) & 0xff];

	return crc;
}

/*
 * Verify a buffer against the crc_flex_table CRC.
 *
 * cp:   first byte of the data to check
 * size: number of bytes to check
 *
 * Runs the same computation as calc_crc_flex() over all size bytes and
 * accepts the buffer only if the result equals 0x7070.
 * Returns 0 on a match, -1 if size is below 3 or the CRC does not match.
 *
 * NOTE(review): presumably the buffer includes the trailing CRC bytes and
 * 0x7070 is the resulting residue -- confirm against the sender side.
 */
static int check_crc_flex(unsigned char *cp, int size)
{
	unsigned short crc = 0xffff;

	/* Buffers shorter than three bytes are rejected outright. */
	if (size < 3)
		return -1;

	while (size--)
		crc = (crc << 8) ^ crc_flex_table[((crc >> 8) ^ *cp++) & 0xff];

	if ((crc & 0xffff) != 0x7070)
		return -1;

	return 0;
}

/*
 * Verify a buffer with the crc16() helper.
 *
 * cp:   first byte of the data to check
 * size: number of bytes to check
 *
 * Returns 0 if crc16(0, cp, size) is zero, -1 if size is below 3 or the
 * result is non-zero.
 *
 * NOTE(review): the first crc16() argument is presumably the initial CRC
 * value -- confirm in <linux/crc16.h>.
 */
static int check_crc_16(unsigned char *cp, int size)
{
	/* This initial value is overwritten by the crc16() result below. */
	unsigned short crc = 0x0000;

	/* Buffers shorter than three bytes are rejected outright. */
	if (size < 3)
		return -1;

	crc = crc16(0, cp, size);

	if (crc != 0x0000)
		return -1;

	return 0;
}

/*
 * Standard encapsulation
 *
 * Copies len bytes from s to d, framing them with an END byte on each
 * side and replacing every END in the data with ESC ESC_END and every
 * ESC with ESC ESC_ESC.
 *
 * s:   source bytes
 * d:   destination buffer
 * len: number of source bytes
 *
 * Returns the number of bytes written to d.  Each source byte produces
 * at most two output bytes and two END bytes are added, so the output is
 * at most 2 * len + 2 bytes; nothing here checks the capacity of d.
 */
static int kiss_esc(unsigned char *s, unsigned char *d, int len)
{
	unsigned char *ptr = d;
	unsigned char c;

	/*
	 * Send an initial END character to flush out any data that may have
	 * accumulated in the receiver due to line noise.
	 */

	*ptr++ = END;

	while (len-- > 0) {
		/* Fetch the next source byte and escape it if needed. */
		switch (c = *s++) {
		case END:
			*ptr++ = ESC;
			*ptr++ = ESC_END;
			break;
		case ESC:
			*ptr++ = ESC;
			*ptr++ = ESC_ESC;
			break;
		default:
			*ptr++ = c;
			break;
		}
	}

	/* Closing frame delimiter. */
	*ptr++ = END;

	return ptr - d;
}

/*
 * MW:
 * OK its ugly, but tell me a better solution without copying the
 * packet to a temporary buffer :-)
 */
static int kiss_esc_crc(unsigned char *s, unsigned char *d, unsigned short crc,
	int len)
{
	unsigned char *ptr = d;
	unsigned char c=0;