Total coverage: 102224 (6%)of 1815826
3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 /* Copyright 2011, Siemens AG * written by Alexander Smirnov <alex.bluesman.smirnov@gmail.com> */ /* Based on patches from Jon Smirl <jonsmirl@gmail.com> * Copyright (c) 2011 Jon Smirl <jonsmirl@gmail.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 * as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ /* Jon's code is based on 6lowpan implementation for Contiki which is: * Copyright (c) 2008, Swedish Institute of Computer Science. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the Institute nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include <linux/module.h> #include <linux/netdevice.h> #include <linux/ieee802154.h> #include <linux/if_arp.h> #include <net/ipv6.h> #include <net/netdev_lock.h> #include "6lowpan_i.h" static int open_count; static const struct header_ops lowpan_header_ops = { .create = lowpan_header_create, }; static int lowpan_dev_init(struct net_device *ldev) { netdev_lockdep_set_classes(ldev); return 0; } static int lowpan_open(struct net_device *dev) { if (!open_count) lowpan_rx_init(); open_count++; return 0; } static int lowpan_stop(struct net_device *dev) { open_count--; if (!open_count) lowpan_rx_exit(); return 0; } static int lowpan_neigh_construct(struct net_device *dev, struct neighbour *n) { struct lowpan_802154_neigh *neigh = lowpan_802154_neigh(neighbour_priv(n)); /* default no short_addr is available for a neighbour */ neigh->short_addr = cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC); return 0; } static int lowpan_get_iflink(const struct net_device *dev) { return READ_ONCE(lowpan_802154_dev(dev)->wdev->ifindex); } static const struct net_device_ops lowpan_netdev_ops = { .ndo_init = lowpan_dev_init, .ndo_start_xmit = lowpan_xmit, .ndo_open = lowpan_open, .ndo_stop = lowpan_stop, .ndo_neigh_construct = lowpan_neigh_construct, .ndo_get_iflink = lowpan_get_iflink, }; static void lowpan_setup(struct net_device *ldev) { memset(ldev->broadcast, 0xff, IEEE802154_ADDR_LEN); /* We need an ipv6hdr as minimum len when calling xmit */ ldev->hard_header_len = sizeof(struct ipv6hdr); ldev->flags = IFF_BROADCAST | IFF_MULTICAST; ldev->priv_flags |= IFF_NO_QUEUE; ldev->netdev_ops = &lowpan_netdev_ops; ldev->header_ops = &lowpan_header_ops; ldev->needs_free_netdev = true; ldev->netns_immutable = true; } static int lowpan_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { if (tb[IFLA_ADDRESS]) { if (nla_len(tb[IFLA_ADDRESS]) != IEEE802154_ADDR_LEN) return -EINVAL; } return 0; } static int lowpan_newlink(struct net_device *ldev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct nlattr **tb = params->tb; struct net_device *wdev; int ret; ASSERT_RTNL(); pr_debug("adding new link\n"); if (!tb[IFLA_LINK]) return -EINVAL; if (params->link_net && !net_eq(params->link_net, dev_net(ldev))) return -EINVAL; /* find and hold wpan device */ wdev = dev_get_by_index(dev_net(ldev), nla_get_u32(tb[IFLA_LINK])); if (!wdev) return -ENODEV; if (wdev->type != ARPHRD_IEEE802154) { dev_put(wdev); return -EINVAL; } if (wdev->ieee802154_ptr->lowpan_dev) { dev_put(wdev); return -EBUSY; } lowpan_802154_dev(ldev)->wdev = wdev; /* Set the lowpan hardware address to the wpan hardware address. */ __dev_addr_set(ldev, wdev->dev_addr, IEEE802154_ADDR_LEN); /* We need headroom for possible wpan_dev_hard_header call and tailroom * for encryption/fcs handling. The lowpan interface will replace * the IPv6 header with 6LoWPAN header. At worst case the 6LoWPAN * header has LOWPAN_IPHC_MAX_HEADER_LEN more bytes than the IPv6 * header. */ ldev->needed_headroom = LOWPAN_IPHC_MAX_HEADER_LEN + wdev->needed_headroom; ldev->needed_tailroom = wdev->needed_tailroom; ldev->neigh_priv_len = sizeof(struct lowpan_802154_neigh); ret = lowpan_register_netdevice(ldev, LOWPAN_LLTYPE_IEEE802154); if (ret < 0) { dev_put(wdev); return ret; } wdev->ieee802154_ptr->lowpan_dev = ldev; return 0; } static void lowpan_dellink(struct net_device *ldev, struct list_head *head) { struct net_device *wdev = lowpan_802154_dev(ldev)->wdev; ASSERT_RTNL(); wdev->ieee802154_ptr->lowpan_dev = NULL; lowpan_unregister_netdevice(ldev); dev_put(wdev); } static struct rtnl_link_ops lowpan_link_ops __read_mostly = { .kind = "lowpan", .priv_size = LOWPAN_PRIV_SIZE(sizeof(struct lowpan_802154_dev)), .setup = lowpan_setup, .newlink = lowpan_newlink, .dellink = lowpan_dellink, .validate = lowpan_validate, }; static inline int __init lowpan_netlink_init(void) { return rtnl_link_register(&lowpan_link_ops); } static inline void lowpan_netlink_fini(void) { rtnl_link_unregister(&lowpan_link_ops); } static int lowpan_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *ndev = netdev_notifier_info_to_dev(ptr); struct wpan_dev *wpan_dev; if (ndev->type != ARPHRD_IEEE802154) return NOTIFY_DONE; wpan_dev = ndev->ieee802154_ptr; if (!wpan_dev) return NOTIFY_DONE; switch (event) { case NETDEV_UNREGISTER: /* Check if wpan interface is unregistered that we * also delete possible lowpan interfaces which belongs * to the wpan interface. */ if (wpan_dev->lowpan_dev) lowpan_dellink(wpan_dev->lowpan_dev, NULL); break; default: return NOTIFY_DONE; } return NOTIFY_OK; } static struct notifier_block lowpan_dev_notifier = { .notifier_call = lowpan_device_event, }; static int __init lowpan_init_module(void) { int err = 0; err = lowpan_net_frag_init(); if (err < 0) goto out; err = lowpan_netlink_init(); if (err < 0) goto out_frag; err = register_netdevice_notifier(&lowpan_dev_notifier); if (err < 0) goto out_pack; return 0; out_pack: lowpan_netlink_fini(); out_frag: lowpan_net_frag_exit(); out: return err; } static void __exit lowpan_cleanup_module(void) { lowpan_netlink_fini(); lowpan_net_frag_exit(); unregister_netdevice_notifier(&lowpan_dev_notifier); } module_init(lowpan_init_module); module_exit(lowpan_cleanup_module); MODULE_DESCRIPTION("IPv6 over Low power Wireless Personal Area Network IEEE 802.15.4 core"); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("lowpan");
6263 6454 172 6645 6660 1563 1559 569 168 6963 2371 5445 6565 5490 5445 168 172 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_FIND_H_ #define __LINUX_FIND_H_ #ifndef __LINUX_BITMAP_H #error only <linux/bitmap.h> can be included directly #endif #include <linux/bitops.h> unsigned long _find_next_bit(const unsigned long *addr1, unsigned long nbits, unsigned long start); unsigned long _find_next_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long nbits, unsigned long start); unsigned long _find_next_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long nbits, unsigned long start); unsigned long _find_next_or_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long nbits, unsigned long start); unsigned long _find_next_zero_bit(const unsigned long *addr, unsigned long nbits, unsigned long start); extern unsigned long _find_first_bit(const unsigned long *addr, unsigned long size); unsigned long __find_nth_bit(const unsigned long *addr, unsigned long size, unsigned long n); unsigned long __find_nth_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long n); unsigned long __find_nth_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long n); unsigned long __find_nth_and_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, const unsigned long *addr3, unsigned long size, unsigned long n); extern unsigned long _find_first_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size); unsigned long _find_first_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size); unsigned long _find_first_and_and_bit(const unsigned long *addr1, const unsigned long *addr2, const unsigned long *addr3, unsigned long size); extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size); extern unsigned long _find_last_bit(const unsigned long *addr, unsigned long size); #ifdef __BIG_ENDIAN unsigned long _find_first_zero_bit_le(const unsigned long *addr, unsigned long size); unsigned long _find_next_zero_bit_le(const unsigned long *addr, unsigned long size, unsigned long offset); unsigned long _find_next_bit_le(const unsigned long *addr, unsigned long size, unsigned long offset); #endif unsigned long find_random_bit(const unsigned long *addr, unsigned long size); #ifndef find_next_bit /** * find_next_bit - find the next set bit in a memory region * @addr: The address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number for the next set bit * If no bits are set, returns @size. */ static __always_inline unsigned long find_next_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val; if (unlikely(offset >= size)) return size; val = *addr & GENMASK(size - 1, offset); return val ? __ffs(val) : size; } return _find_next_bit(addr, size, offset); } #endif #ifndef find_next_and_bit /** * find_next_and_bit - find the next set bit in both memory regions * @addr1: The first address to base the search on * @addr2: The second address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number for the next set bit * If no bits are set, returns @size. */ static __always_inline unsigned long find_next_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val; if (unlikely(offset >= size)) return size; val = *addr1 & *addr2 & GENMASK(size - 1, offset); return val ? __ffs(val) : size; } return _find_next_and_bit(addr1, addr2, size, offset); } #endif #ifndef find_next_andnot_bit /** * find_next_andnot_bit - find the next set bit in *addr1 excluding all the bits * in *addr2 * @addr1: The first address to base the search on * @addr2: The second address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number for the next set bit * If no bits are set, returns @size. */ static __always_inline unsigned long find_next_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val; if (unlikely(offset >= size)) return size; val = *addr1 & ~*addr2 & GENMASK(size - 1, offset); return val ? __ffs(val) : size; } return _find_next_andnot_bit(addr1, addr2, size, offset); } #endif #ifndef find_next_or_bit /** * find_next_or_bit - find the next set bit in either memory regions * @addr1: The first address to base the search on * @addr2: The second address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number for the next set bit * If no bits are set, returns @size. */ static __always_inline unsigned long find_next_or_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val; if (unlikely(offset >= size)) return size; val = (*addr1 | *addr2) & GENMASK(size - 1, offset); return val ? __ffs(val) : size; } return _find_next_or_bit(addr1, addr2, size, offset); } #endif #ifndef find_next_zero_bit /** * find_next_zero_bit - find the next cleared bit in a memory region * @addr: The address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number of the next zero bit * If no bits are zero, returns @size. */ static __always_inline unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val; if (unlikely(offset >= size)) return size; val = *addr | ~GENMASK(size - 1, offset); return val == ~0UL ? size : ffz(val); } return _find_next_zero_bit(addr, size, offset); } #endif #ifndef find_first_bit /** * find_first_bit - find the first set bit in a memory region * @addr: The address to start the search at * @size: The maximum number of bits to search * * Returns the bit number of the first set bit. * If no bits are set, returns @size. */ static __always_inline unsigned long find_first_bit(const unsigned long *addr, unsigned long size) { if (small_const_nbits(size)) { unsigned long val = *addr & GENMASK(size - 1, 0); return val ? __ffs(val) : size; } return _find_first_bit(addr, size); } #endif /** * find_nth_bit - find N'th set bit in a memory region * @addr: The address to start the search at * @size: The maximum number of bits to search * @n: The number of set bit, which position is needed, counting from 0 * * The following is semantically equivalent: * idx = find_nth_bit(addr, size, 0); * idx = find_first_bit(addr, size); * * Returns the bit number of the N'th set bit. * If no such, returns >= @size. */ static __always_inline unsigned long find_nth_bit(const unsigned long *addr, unsigned long size, unsigned long n) { if (n >= size) return size; if (small_const_nbits(size)) { unsigned long val = *addr & GENMASK(size - 1, 0); return val ? fns(val, n) : size; } return __find_nth_bit(addr, size, n); } /** * find_nth_and_bit - find N'th set bit in 2 memory regions * @addr1: The 1st address to start the search at * @addr2: The 2nd address to start the search at * @size: The maximum number of bits to search * @n: The number of set bit, which position is needed, counting from 0 * * Returns the bit number of the N'th set bit. * If no such, returns @size. */ static __always_inline unsigned long find_nth_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long n) { if (n >= size) return size; if (small_const_nbits(size)) { unsigned long val = *addr1 & *addr2 & GENMASK(size - 1, 0); return val ? fns(val, n) : size; } return __find_nth_and_bit(addr1, addr2, size, n); } /** * find_nth_and_andnot_bit - find N'th set bit in 2 memory regions, * excluding those set in 3rd region * @addr1: The 1st address to start the search at * @addr2: The 2nd address to start the search at * @addr3: The 3rd address to start the search at * @size: The maximum number of bits to search * @n: The number of set bit, which position is needed, counting from 0 * * Returns the bit number of the N'th set bit. * If no such, returns @size. */ static __always_inline unsigned long find_nth_and_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, const unsigned long *addr3, unsigned long size, unsigned long n) { if (n >= size) return size; if (small_const_nbits(size)) { unsigned long val = *addr1 & *addr2 & (~*addr3) & GENMASK(size - 1, 0); return val ? fns(val, n) : size; } return __find_nth_and_andnot_bit(addr1, addr2, addr3, size, n); } #ifndef find_first_and_bit /** * find_first_and_bit - find the first set bit in both memory regions * @addr1: The first address to base the search on * @addr2: The second address to base the search on * @size: The bitmap size in bits * * Returns the bit number for the next set bit * If no bits are set, returns @size. */ static __always_inline unsigned long find_first_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size) { if (small_const_nbits(size)) { unsigned long val = *addr1 & *addr2 & GENMASK(size - 1, 0); return val ? __ffs(val) : size; } return _find_first_and_bit(addr1, addr2, size); } #endif /** * find_first_andnot_bit - find the first bit set in 1st memory region and unset in 2nd * @addr1: The first address to base the search on * @addr2: The second address to base the search on * @size: The bitmap size in bits * * Returns the bit number for the first set bit * If no bits are set, returns >= @size. */ static __always_inline unsigned long find_first_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size) { if (small_const_nbits(size)) { unsigned long val = *addr1 & (~*addr2) & GENMASK(size - 1, 0); return val ? __ffs(val) : size; } return _find_first_andnot_bit(addr1, addr2, size); } /** * find_first_and_and_bit - find the first set bit in 3 memory regions * @addr1: The first address to base the search on * @addr2: The second address to base the search on * @addr3: The third address to base the search on * @size: The bitmap size in bits * * Returns the bit number for the first set bit * If no bits are set, returns @size. */ static __always_inline unsigned long find_first_and_and_bit(const unsigned long *addr1, const unsigned long *addr2, const unsigned long *addr3, unsigned long size) { if (small_const_nbits(size)) { unsigned long val = *addr1 & *addr2 & *addr3 & GENMASK(size - 1, 0); return val ? __ffs(val) : size; } return _find_first_and_and_bit(addr1, addr2, addr3, size); } #ifndef find_first_zero_bit /** * find_first_zero_bit - find the first cleared bit in a memory region * @addr: The address to start the search at * @size: The maximum number of bits to search * * Returns the bit number of the first cleared bit. * If no bits are zero, returns @size. */ static __always_inline unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size) { if (small_const_nbits(size)) { unsigned long val = *addr | ~GENMASK(size - 1, 0); return val == ~0UL ? size : ffz(val); } return _find_first_zero_bit(addr, size); } #endif #ifndef find_last_bit /** * find_last_bit - find the last set bit in a memory region * @addr: The address to start the search at * @size: The number of bits to search * * Returns the bit number of the last set bit, or size. */ static __always_inline unsigned long find_last_bit(const unsigned long *addr, unsigned long size) { if (small_const_nbits(size)) { unsigned long val = *addr & GENMASK(size - 1, 0); return val ? __fls(val) : size; } return _find_last_bit(addr, size); } #endif /** * find_next_and_bit_wrap - find the next set bit in both memory regions * @addr1: The first address to base the search on * @addr2: The second address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number for the next set bit, or first set bit up to @offset * If no bits are set, returns @size. */ static __always_inline unsigned long find_next_and_bit_wrap(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long offset) { unsigned long bit = find_next_and_bit(addr1, addr2, size, offset); if (bit < size || offset == 0) return bit; bit = find_first_and_bit(addr1, addr2, offset); return bit < offset ? bit : size; } /** * find_next_bit_wrap - find the next set bit in a memory region * @addr: The address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number for the next set bit, or first set bit up to @offset * If no bits are set, returns @size. */ static __always_inline unsigned long find_next_bit_wrap(const unsigned long *addr, unsigned long size, unsigned long offset) { unsigned long bit = find_next_bit(addr, size, offset); if (bit < size || offset == 0) return bit; bit = find_first_bit(addr, offset); return bit < offset ? bit : size; } /* * Helper for for_each_set_bit_wrap(). Make sure you're doing right thing * before using it alone. */ static __always_inline unsigned long __for_each_wrap(const unsigned long *bitmap, unsigned long size, unsigned long start, unsigned long n) { unsigned long bit; /* If not wrapped around */ if (n > start) { /* and have a bit, just return it. */ bit = find_next_bit(bitmap, size, n); if (bit < size) return bit; /* Otherwise, wrap around and ... */ n = 0; } /* Search the other part. */ bit = find_next_bit(bitmap, start, n); return bit < start ? bit : size; } /** * find_next_clump8 - find next 8-bit clump with set bits in a memory region * @clump: location to store copy of found clump * @addr: address to base the search on * @size: bitmap size in number of bits * @offset: bit offset at which to start searching * * Returns the bit offset for the next set clump; the found clump value is * copied to the location pointed by @clump. If no bits are set, returns @size. */ extern unsigned long find_next_clump8(unsigned long *clump, const unsigned long *addr, unsigned long size, unsigned long offset); #define find_first_clump8(clump, bits, size) \ find_next_clump8((clump), (bits), (size), 0) #if defined(__LITTLE_ENDIAN) static __always_inline unsigned long find_next_zero_bit_le(const void *addr, unsigned long size, unsigned long offset) { return find_next_zero_bit(addr, size, offset); } static __always_inline unsigned long find_next_bit_le(const void *addr, unsigned long size, unsigned long offset) { return find_next_bit(addr, size, offset); } static __always_inline unsigned long find_first_zero_bit_le(const void *addr, unsigned long size) { return find_first_zero_bit(addr, size); } #elif defined(__BIG_ENDIAN) #ifndef find_next_zero_bit_le static __always_inline unsigned long find_next_zero_bit_le(const void *addr, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val = *(const unsigned long *)addr; if (unlikely(offset >= size)) return size; val = swab(val) | ~GENMASK(size - 1, offset); return val == ~0UL ? size : ffz(val); } return _find_next_zero_bit_le(addr, size, offset); } #endif #ifndef find_first_zero_bit_le static __always_inline unsigned long find_first_zero_bit_le(const void *addr, unsigned long size) { if (small_const_nbits(size)) { unsigned long val = swab(*(const unsigned long *)addr) | ~GENMASK(size - 1, 0); return val == ~0UL ? size : ffz(val); } return _find_first_zero_bit_le(addr, size); } #endif #ifndef find_next_bit_le static __always_inline unsigned long find_next_bit_le(const void *addr, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val = *(const unsigned long *)addr; if (unlikely(offset >= size)) return size; val = swab(val) & GENMASK(size - 1, offset); return val ? __ffs(val) : size; } return _find_next_bit_le(addr, size, offset); } #endif #else #error "Please fix <asm/byteorder.h>" #endif #define for_each_set_bit(bit, addr, size) \ for ((bit) = 0; (bit) = find_next_bit((addr), (size), (bit)), (bit) < (size); (bit)++) #define for_each_and_bit(bit, addr1, addr2, size) \ for ((bit) = 0; \ (bit) = find_next_and_bit((addr1), (addr2), (size), (bit)), (bit) < (size);\ (bit)++) #define for_each_andnot_bit(bit, addr1, addr2, size) \ for ((bit) = 0; \ (bit) = find_next_andnot_bit((addr1), (addr2), (size), (bit)), (bit) < (size);\ (bit)++) #define for_each_or_bit(bit, addr1, addr2, size) \ for ((bit) = 0; \ (bit) = find_next_or_bit((addr1), (addr2), (size), (bit)), (bit) < (size);\ (bit)++) /* same as for_each_set_bit() but use bit as value to start with */ #define for_each_set_bit_from(bit, addr, size) \ for (; (bit) = find_next_bit((addr), (size), (bit)), (bit) < (size); (bit)++) #define for_each_clear_bit(bit, addr, size) \ for ((bit) = 0; \ (bit) = find_next_zero_bit((addr), (size), (bit)), (bit) < (size); \ (bit)++) /* same as for_each_clear_bit() but use bit as value to start with */ #define for_each_clear_bit_from(bit, addr, size) \ for (; (bit) = find_next_zero_bit((addr), (size), (bit)), (bit) < (size); (bit)++) /** * for_each_set_bitrange - iterate over all set bit ranges [b; e) * @b: bit offset of start of current bitrange (first set bit) * @e: bit offset of end of current bitrange (first unset bit) * @addr: bitmap address to base the search on * @size: bitmap size in number of bits */ #define for_each_set_bitrange(b, e, addr, size) \ for ((b) = 0; \ (b) = find_next_bit((addr), (size), b), \ (e) = find_next_zero_bit((addr), (size), (b) + 1), \ (b) < (size); \ (b) = (e) + 1) /** * for_each_set_bitrange_from - iterate over all set bit ranges [b; e) * @b: bit offset of start of current bitrange (first set bit); must be initialized * @e: bit offset of end of current bitrange (first unset bit) * @addr: bitmap address to base the search on * @size: bitmap size in number of bits */ #define for_each_set_bitrange_from(b, e, addr, size) \ for (; \ (b) = find_next_bit((addr), (size), (b)), \ (e) = find_next_zero_bit((addr), (size), (b) + 1), \ (b) < (size); \ (b) = (e) + 1) /** * for_each_clear_bitrange - iterate over all unset bit ranges [b; e) * @b: bit offset of start of current bitrange (first unset bit) * @e: bit offset of end of current bitrange (first set bit) * @addr: bitmap address to base the search on * @size: bitmap size in number of bits */ #define for_each_clear_bitrange(b, e, addr, size) \ for ((b) = 0; \ (b) = find_next_zero_bit((addr), (size), (b)), \ (e) = find_next_bit((addr), (size), (b) + 1), \ (b) < (size); \ (b) = (e) + 1) /** * for_each_clear_bitrange_from - iterate over all unset bit ranges [b; e) * @b: bit offset of start of current bitrange (first set bit); must be initialized * @e: bit offset of end of current bitrange (first unset bit) * @addr: bitmap address to base the search on * @size: bitmap size in number of bits */ #define for_each_clear_bitrange_from(b, e, addr, size) \ for (; \ (b) = find_next_zero_bit((addr), (size), (b)), \ (e) = find_next_bit((addr), (size), (b) + 1), \ (b) < (size); \ (b) = (e) + 1) /** * for_each_set_bit_wrap - iterate over all set bits starting from @start, and * wrapping around the end of bitmap. * @bit: offset for current iteration * @addr: bitmap address to base the search on * @size: bitmap size in number of bits * @start: Starting bit for bitmap traversing, wrapping around the bitmap end */ #define for_each_set_bit_wrap(bit, addr, size, start) \ for ((bit) = find_next_bit_wrap((addr), (size), (start)); \ (bit) < (size); \ (bit) = __for_each_wrap((addr), (size), (start), (bit) + 1)) /** * for_each_set_clump8 - iterate over bitmap for each 8-bit clump with set bits * @start: bit offset to start search and to store the current iteration offset * @clump: location to store copy of current 8-bit clump * @bits: bitmap address to base the search on * @size: bitmap size in number of bits */ #define for_each_set_clump8(start, clump, bits, size) \ for ((start) = find_first_clump8(&(clump), (bits), (size)); \ (start) < (size); \ (start) = find_next_clump8(&(clump), (bits), (size), (start) + 8)) #endif /*__LINUX_FIND_H_ */
18 16 62 87 76 31 31 29 10 20 20 1 51 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. */ #ifndef __XFS_DQUOT_H__ #define __XFS_DQUOT_H__ /* * Dquots are structures that hold quota information about a user or a group, * much like inodes are for files. In fact, dquots share many characteristics * with inodes. However, dquots can also be a centralized resource, relative * to a collection of inodes. In this respect, dquots share some characteristics * of the superblock. * XFS dquots exploit both those in its algorithms. They make every attempt * to not be a bottleneck when quotas are on and have minimal impact, if any, * when quotas are off. */ struct xfs_mount; struct xfs_trans; enum { XFS_QLOWSP_1_PCNT = 0, XFS_QLOWSP_3_PCNT, XFS_QLOWSP_5_PCNT, XFS_QLOWSP_MAX }; struct xfs_dquot_res { /* Total resources allocated and reserved. */ xfs_qcnt_t reserved; /* Total resources allocated. */ xfs_qcnt_t count; /* Absolute and preferred limits. */ xfs_qcnt_t hardlimit; xfs_qcnt_t softlimit; /* * For root dquots, this is the default grace period, in seconds. * Otherwise, this is when the quota grace period expires, * in seconds since the Unix epoch. */ time64_t timer; }; static inline bool xfs_dquot_res_over_limits( const struct xfs_dquot_res *qres) { if ((qres->softlimit && qres->softlimit < qres->reserved) || (qres->hardlimit && qres->hardlimit < qres->reserved)) return true; return false; } struct xfs_dquot_pre { xfs_qcnt_t q_prealloc_lo_wmark; xfs_qcnt_t q_prealloc_hi_wmark; int64_t q_low_space[XFS_QLOWSP_MAX]; }; /* * The incore dquot structure */ struct xfs_dquot { struct list_head q_lru; struct xfs_mount *q_mount; xfs_dqtype_t q_type; uint16_t q_flags; xfs_dqid_t q_id; struct lockref q_lockref; int q_bufoffset; xfs_daddr_t q_blkno; xfs_fileoff_t q_fileoffset; struct xfs_dquot_res q_blk; /* regular blocks */ struct xfs_dquot_res q_ino; /* inodes */ struct xfs_dquot_res q_rtb; /* realtime blocks */ struct xfs_dq_logitem q_logitem; struct xfs_dquot_pre q_blk_prealloc; struct xfs_dquot_pre q_rtb_prealloc; struct mutex q_qlock; struct completion q_flush; atomic_t q_pincount; struct wait_queue_head q_pinwait; }; /* * Lock hierarchy for q_qlock: * XFS_QLOCK_NORMAL is the implicit default, * XFS_QLOCK_NESTED is the dquot with the higher id in xfs_dqlock2 */ enum { XFS_QLOCK_NORMAL = 0, XFS_QLOCK_NESTED, }; /* * Manage the q_flush completion queue embedded in the dquot. This completion * queue synchronizes processes attempting to flush the in-core dquot back to * disk. */ static inline void xfs_dqflock(struct xfs_dquot *dqp) { wait_for_completion(&dqp->q_flush); } static inline bool xfs_dqflock_nowait(struct xfs_dquot *dqp) { return try_wait_for_completion(&dqp->q_flush); } static inline void xfs_dqfunlock(struct xfs_dquot *dqp) { complete(&dqp->q_flush); } static inline int xfs_dquot_type(const struct xfs_dquot *dqp) { return dqp->q_type & XFS_DQTYPE_REC_MASK; } static inline int xfs_this_quota_on(struct xfs_mount *mp, xfs_dqtype_t type) { switch (type) { case XFS_DQTYPE_USER: return XFS_IS_UQUOTA_ON(mp); case XFS_DQTYPE_GROUP: return XFS_IS_GQUOTA_ON(mp); case XFS_DQTYPE_PROJ: return XFS_IS_PQUOTA_ON(mp); default: return 0; } } static inline struct xfs_dquot *xfs_inode_dquot( struct xfs_inode *ip, xfs_dqtype_t type) { if (xfs_is_metadir_inode(ip)) return NULL; switch (type) { case XFS_DQTYPE_USER: return ip->i_udquot; case XFS_DQTYPE_GROUP: return ip->i_gdquot; case XFS_DQTYPE_PROJ: return ip->i_pdquot; default: return NULL; } } /* Decide if the dquot's limits are actually being enforced. */ static inline bool xfs_dquot_is_enforced( const struct xfs_dquot *dqp) { switch (xfs_dquot_type(dqp)) { case XFS_DQTYPE_USER: return XFS_IS_UQUOTA_ENFORCED(dqp->q_mount); case XFS_DQTYPE_GROUP: return XFS_IS_GQUOTA_ENFORCED(dqp->q_mount); case XFS_DQTYPE_PROJ: return XFS_IS_PQUOTA_ENFORCED(dqp->q_mount); } ASSERT(0); return false; } /* * Check whether a dquot is under low free space conditions. We assume the quota * is enabled and enforced. */ static inline bool xfs_dquot_lowsp(struct xfs_dquot *dqp) { int64_t freesp; freesp = dqp->q_blk.hardlimit - dqp->q_blk.reserved; if (freesp < dqp->q_blk_prealloc.q_low_space[XFS_QLOWSP_1_PCNT]) return true; freesp = dqp->q_rtb.hardlimit - dqp->q_rtb.reserved; if (freesp < dqp->q_rtb_prealloc.q_low_space[XFS_QLOWSP_1_PCNT]) return true; return false; } void xfs_dquot_to_disk(struct xfs_disk_dquot *ddqp, struct xfs_dquot *dqp); #define XFS_DQ_IS_LOCKED(dqp) (mutex_is_locked(&((dqp)->q_qlock))) #define XFS_DQ_IS_DIRTY(dqp) ((dqp)->q_flags & XFS_DQFLAG_DIRTY) void xfs_qm_dqdestroy(struct xfs_dquot *dqp); int xfs_qm_dqflush(struct xfs_dquot *dqp, struct xfs_buf *bp); void xfs_qm_dqunpin_wait(struct xfs_dquot *dqp); void xfs_qm_adjust_dqtimers(struct xfs_dquot *d); void xfs_qm_adjust_dqlimits(struct xfs_dquot *d); xfs_dqid_t xfs_qm_id_for_quotatype(struct xfs_inode *ip, xfs_dqtype_t type); int xfs_qm_dqget(struct xfs_mount *mp, xfs_dqid_t id, xfs_dqtype_t type, bool can_alloc, struct xfs_dquot **dqpp); int xfs_qm_dqget_inode(struct xfs_inode *ip, xfs_dqtype_t type, bool can_alloc, struct xfs_dquot **dqpp); int xfs_qm_dqget_next(struct xfs_mount *mp, xfs_dqid_t id, xfs_dqtype_t type, struct xfs_dquot **dqpp); int xfs_qm_dqget_uncached(struct xfs_mount *mp, xfs_dqid_t id, xfs_dqtype_t type, struct xfs_dquot **dqpp); void xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *); void xfs_dqlockn(struct xfs_dqtrx *q); void xfs_dquot_set_prealloc_limits(struct xfs_dquot *); int xfs_dquot_attach_buf(struct xfs_trans *tp, struct xfs_dquot *dqp); int xfs_dquot_use_attached_buf(struct xfs_dquot *dqp, struct xfs_buf **bpp); void xfs_dquot_detach_buf(struct xfs_dquot *dqp); static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp) { lockref_get(&dqp->q_lockref); return dqp; } time64_t xfs_dquot_set_timeout(struct xfs_mount *mp, time64_t timeout); time64_t xfs_dquot_set_grace_period(time64_t grace); void xfs_qm_init_dquot_blk(struct xfs_trans *tp, xfs_dqid_t id, xfs_dqtype_t type, struct xfs_buf *bp); #endif /* __XFS_DQUOT_H__ */
16 15 1430 47 47 13 1 1207 1054 1058 231 228 847 846 1207 1205 1182 26 1208 5984 2 5 1287 2 4 988 962 17 979 983 981 3 982 1178 4 1204 1117 1055 1055 1058 1052 1054 1053 1055 1055 2 2 1278 1274 15 15 1465 1465 1 1466 1397 1395 1390 1394 1266 15 15 15 1277 1274 2 2 3 1273 1275 15 137 136 136 2 138 136 2 127 11 176 2 177 124 22 47 34 23 12 20 15 23 12 178 91 178 143 23 115 87 21 125 45 1466 91 1431 1240 1242 1277 1282 1 1578 1 1057 1056 1058 1057 1055 1055 1133 1135 1132 1090 1089 102 36 74 93 9 4 1054 1058 1052 48 1014 31 31 30 30 28 79 80 80 52 31 22 22 22 3 1057 19 19 531 1 5 13 18 18 467 27 1792 998 962 1 1783 2 119 1430 312 140 1791 1792 1787 992 994 993 996 992 99 958 67 992 178 180 992 995 47 34 18 6 18 6 6 6 6 12 20 45 1002 6 993 4 14 14 21 13 14 3 3 14 6 21 21 12 3 3 6 2 80 37 3 19 22 3 19 2 20 20 20 20 3 18 20 10 10 40 121 2 119 2 119 812 811 2 810 810 75 74 73 1 297 1 296 517 38 16 27 38 141 137 4 141 25 2 23 133 10 2 124 69 63 6 9 82 32 141 1 140 107 33 20 20 20 20 63 55 55 16 5 13 951 952 952 1117 73 52 72 71 58 72 58 19 7 7 7 55 13 35 7 76 2 22 71 29 3 1 1 3 1 1 2 1 3 3 1 2 3 1 1 1 1 2 1 1 2 1 1 1 30 20 24 22 32 7 7 6 1 6 6 6 6 6 5 5 5 5 4 1 4 1 5 166 165 164 165 165 167 165 166 165 167 1 167 1 165 8 166 7 165 167 1 167 6 166 2 2 2 17 1 16 2 15 1 16 17 17 12 5 17 17 35 35 1782 1279 96 1237 1252 27 28 40 40 982 982 983 982 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 // SPDX-License-Identifier: GPL-2.0 /* * Resizable virtual memory filesystem for Linux. * * Copyright (C) 2000 Linus Torvalds. * 2000 Transmeta Corp. * 2000-2001 Christoph Rohland * 2000-2001 SAP AG * 2002 Red Hat Inc. * Copyright (C) 2002-2011 Hugh Dickins. * Copyright (C) 2011 Google Inc. * Copyright (C) 2002-2005 VERITAS Software Corporation. * Copyright (C) 2004 Andi Kleen, SuSE Labs * * Extended attribute support for tmpfs: * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net> * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com> * * tiny-shmem: * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com> */ #include <linux/fs.h> #include <linux/init.h> #include <linux/vfs.h> #include <linux/mount.h> #include <linux/ramfs.h> #include <linux/pagemap.h> #include <linux/file.h> #include <linux/fileattr.h> #include <linux/filelock.h> #include <linux/mm.h> #include <linux/random.h> #include <linux/sched/signal.h> #include <linux/export.h> #include <linux/shmem_fs.h> #include <linux/swap.h> #include <linux/uio.h> #include <linux/hugetlb.h> #include <linux/fs_parser.h> #include <linux/swapfile.h> #include <linux/iversion.h> #include <linux/unicode.h> #include "swap.h" static struct vfsmount *shm_mnt __ro_after_init; #ifdef CONFIG_SHMEM /* * This virtual memory filesystem is heavily based on the ramfs. It * extends ramfs by the ability to use swap and honor resource limits * which makes it a completely usable filesystem. */ #include <linux/xattr.h> #include <linux/exportfs.h> #include <linux/posix_acl.h> #include <linux/posix_acl_xattr.h> #include <linux/mman.h> #include <linux/string.h> #include <linux/slab.h> #include <linux/backing-dev.h> #include <linux/writeback.h> #include <linux/pagevec.h> #include <linux/percpu_counter.h> #include <linux/falloc.h> #include <linux/splice.h> #include <linux/security.h> #include <linux/leafops.h> #include <linux/mempolicy.h> #include <linux/namei.h> #include <linux/ctype.h> #include <linux/migrate.h> #include <linux/highmem.h> #include <linux/seq_file.h> #include <linux/magic.h> #include <linux/syscalls.h> #include <linux/fcntl.h> #include <uapi/linux/memfd.h> #include <linux/rmap.h> #include <linux/uuid.h> #include <linux/quotaops.h> #include <linux/rcupdate_wait.h> #include <linux/uaccess.h> #include "internal.h" #define VM_ACCT(size) (PAGE_ALIGN(size) >> PAGE_SHIFT) /* Pretend that each entry is of this size in directory's i_size */ #define BOGO_DIRENT_SIZE 20 /* Pretend that one inode + its dentry occupy this much memory */ #define BOGO_INODE_SIZE 1024 /* Symlink up to this size is kmalloc'ed instead of using a swappable page */ #define SHORT_SYMLINK_LEN 128 /* * shmem_fallocate communicates with shmem_fault or shmem_writeout via * inode->i_private (with i_rwsem making sure that it has only one user at * a time): we would prefer not to enlarge the shmem inode just for that. */ struct shmem_falloc { wait_queue_head_t *waitq; /* faults into hole wait for punch to end */ pgoff_t start; /* start of range currently being fallocated */ pgoff_t next; /* the next page offset to be fallocated */ pgoff_t nr_falloced; /* how many new pages have been fallocated */ pgoff_t nr_unswapped; /* how often writeout refused to swap out */ }; struct shmem_options { unsigned long long blocks; unsigned long long inodes; struct mempolicy *mpol; kuid_t uid; kgid_t gid; umode_t mode; bool full_inums; int huge; int seen; bool noswap; unsigned short quota_types; struct shmem_quota_limits qlimits; #if IS_ENABLED(CONFIG_UNICODE) struct unicode_map *encoding; bool strict_encoding; #endif #define SHMEM_SEEN_BLOCKS 1 #define SHMEM_SEEN_INODES 2 #define SHMEM_SEEN_HUGE 4 #define SHMEM_SEEN_INUMS 8 #define SHMEM_SEEN_QUOTA 16 }; #ifdef CONFIG_TRANSPARENT_HUGEPAGE static unsigned long huge_shmem_orders_always __read_mostly; static unsigned long huge_shmem_orders_madvise __read_mostly; static unsigned long huge_shmem_orders_inherit __read_mostly; static unsigned long huge_shmem_orders_within_size __read_mostly; static bool shmem_orders_configured __initdata; #endif #ifdef CONFIG_TMPFS static unsigned long shmem_default_max_blocks(void) { return totalram_pages() / 2; } static unsigned long shmem_default_max_inodes(void) { unsigned long nr_pages = totalram_pages(); return min3(nr_pages - totalhigh_pages(), nr_pages / 2, ULONG_MAX / BOGO_INODE_SIZE); } #endif static int shmem_swapin_folio(struct inode *inode, pgoff_t index, struct folio **foliop, enum sgp_type sgp, gfp_t gfp, struct vm_area_struct *vma, vm_fault_t *fault_type); static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) { return sb->s_fs_info; } /* * shmem_file_setup pre-accounts the whole fixed size of a VM object, * for shared memory and for shared anonymous (/dev/zero) mappings * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1), * consistent with the pre-accounting of private mappings ... */ static inline int shmem_acct_size(unsigned long flags, loff_t size) { return (flags & SHMEM_F_NORESERVE) ? 0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size)); } static inline void shmem_unacct_size(unsigned long flags, loff_t size) { if (!(flags & SHMEM_F_NORESERVE)) vm_unacct_memory(VM_ACCT(size)); } static inline int shmem_reacct_size(unsigned long flags, loff_t oldsize, loff_t newsize) { if (!(flags & SHMEM_F_NORESERVE)) { if (VM_ACCT(newsize) > VM_ACCT(oldsize)) return security_vm_enough_memory_mm(current->mm, VM_ACCT(newsize) - VM_ACCT(oldsize)); else if (VM_ACCT(newsize) < VM_ACCT(oldsize)) vm_unacct_memory(VM_ACCT(oldsize) - VM_ACCT(newsize)); } return 0; } /* * ... whereas tmpfs objects are accounted incrementally as * pages are allocated, in order to allow large sparse files. * shmem_get_folio reports shmem_acct_blocks failure as -ENOSPC not -ENOMEM, * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM. */ static inline int shmem_acct_blocks(unsigned long flags, long pages) { if (!(flags & SHMEM_F_NORESERVE)) return 0; return security_vm_enough_memory_mm(current->mm, pages * VM_ACCT(PAGE_SIZE)); } static inline void shmem_unacct_blocks(unsigned long flags, long pages) { if (flags & SHMEM_F_NORESERVE) vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE)); } int shmem_inode_acct_blocks(struct inode *inode, long pages) { struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); int err = -ENOSPC; if (shmem_acct_blocks(info->flags, pages)) return err; might_sleep(); /* when quotas */ if (sbinfo->max_blocks) { if (!percpu_counter_limited_add(&sbinfo->used_blocks, sbinfo->max_blocks, pages)) goto unacct; err = dquot_alloc_block_nodirty(inode, pages); if (err) { percpu_counter_sub(&sbinfo->used_blocks, pages); goto unacct; } } else { err = dquot_alloc_block_nodirty(inode, pages); if (err) goto unacct; } return 0; unacct: shmem_unacct_blocks(info->flags, pages); return err; } static void shmem_inode_unacct_blocks(struct inode *inode, long pages) { struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); might_sleep(); /* when quotas */ dquot_free_block_nodirty(inode, pages); if (sbinfo->max_blocks) percpu_counter_sub(&sbinfo->used_blocks, pages); shmem_unacct_blocks(info->flags, pages); } static const struct super_operations shmem_ops; static const struct address_space_operations shmem_aops; static const struct file_operations shmem_file_operations; static const struct inode_operations shmem_inode_operations; static const struct inode_operations shmem_dir_inode_operations; static const struct inode_operations shmem_special_inode_operations; static const struct vm_operations_struct shmem_vm_ops; static const struct vm_operations_struct shmem_anon_vm_ops; static struct file_system_type shmem_fs_type; bool shmem_mapping(const struct address_space *mapping) { return mapping->a_ops == &shmem_aops; } EXPORT_SYMBOL_GPL(shmem_mapping); bool vma_is_anon_shmem(const struct vm_area_struct *vma) { return vma->vm_ops == &shmem_anon_vm_ops; } bool vma_is_shmem(const struct vm_area_struct *vma) { return vma_is_anon_shmem(vma) || vma->vm_ops == &shmem_vm_ops; } static LIST_HEAD(shmem_swaplist); static DEFINE_SPINLOCK(shmem_swaplist_lock); #ifdef CONFIG_TMPFS_QUOTA static int shmem_enable_quotas(struct super_block *sb, unsigned short quota_types) { int type, err = 0; sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NOLIST_DIRTY; for (type = 0; type < SHMEM_MAXQUOTAS; type++) { if (!(quota_types & (1 << type))) continue; err = dquot_load_quota_sb(sb, type, QFMT_SHMEM, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); if (err) goto out_err; } return 0; out_err: pr_warn("tmpfs: failed to enable quota tracking (type=%d, err=%d)\n", type, err); for (type--; type >= 0; type--) dquot_quota_off(sb, type); return err; } static void shmem_disable_quotas(struct super_block *sb) { int type; for (type = 0; type < SHMEM_MAXQUOTAS; type++) dquot_quota_off(sb, type); } static struct dquot __rcu **shmem_get_dquots(struct inode *inode) { return SHMEM_I(inode)->i_dquot; } #endif /* CONFIG_TMPFS_QUOTA */ /* * shmem_reserve_inode() performs bookkeeping to reserve a shmem inode, and * produces a novel ino for the newly allocated inode. * * It may also be called when making a hard link to permit the space needed by * each dentry. However, in that case, no new inode number is needed since that * internally draws from another pool of inode numbers (currently global * get_next_ino()). This case is indicated by passing NULL as inop. */ #define SHMEM_INO_BATCH 1024 static int shmem_reserve_inode(struct super_block *sb, ino_t *inop) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); ino_t ino; if (!(sb->s_flags & SB_KERNMOUNT)) { raw_spin_lock(&sbinfo->stat_lock); if (sbinfo->max_inodes) { if (sbinfo->free_ispace < BOGO_INODE_SIZE) { raw_spin_unlock(&sbinfo->stat_lock); return -ENOSPC; } sbinfo->free_ispace -= BOGO_INODE_SIZE; } if (inop) { ino = sbinfo->next_ino++; if (unlikely(is_zero_ino(ino))) ino = sbinfo->next_ino++; if (unlikely(!sbinfo->full_inums && ino > UINT_MAX)) { /* * Emulate get_next_ino uint wraparound for * compatibility */ if (IS_ENABLED(CONFIG_64BIT)) pr_warn("%s: inode number overflow on device %d, consider using inode64 mount option\n", __func__, MINOR(sb->s_dev)); sbinfo->next_ino = 1; ino = sbinfo->next_ino++; } *inop = ino; } raw_spin_unlock(&sbinfo->stat_lock); } else if (inop) { /* * __shmem_file_setup, one of our callers, is lock-free: it * doesn't hold stat_lock in shmem_reserve_inode since * max_inodes is always 0, and is called from potentially * unknown contexts. As such, use a per-cpu batched allocator * which doesn't require the per-sb stat_lock unless we are at * the batch boundary. * * We don't need to worry about inode{32,64} since SB_KERNMOUNT * shmem mounts are not exposed to userspace, so we don't need * to worry about things like glibc compatibility. */ ino_t *next_ino; next_ino = per_cpu_ptr(sbinfo->ino_batch, get_cpu()); ino = *next_ino; if (unlikely(ino % SHMEM_INO_BATCH == 0)) { raw_spin_lock(&sbinfo->stat_lock); ino = sbinfo->next_ino; sbinfo->next_ino += SHMEM_INO_BATCH; raw_spin_unlock(&sbinfo->stat_lock); if (unlikely(is_zero_ino(ino))) ino++; } *inop = ino; *next_ino = ++ino; put_cpu(); } return 0; } static void shmem_free_inode(struct super_block *sb, size_t freed_ispace) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); if (sbinfo->max_inodes) { raw_spin_lock(&sbinfo->stat_lock); sbinfo->free_ispace += BOGO_INODE_SIZE + freed_ispace; raw_spin_unlock(&sbinfo->stat_lock); } } /** * shmem_recalc_inode - recalculate the block usage of an inode * @inode: inode to recalc * @alloced: the change in number of pages allocated to inode * @swapped: the change in number of pages swapped from inode * * We have to calculate the free blocks since the mm can drop * undirtied hole pages behind our back. * * But normally info->alloced == inode->i_mapping->nrpages + info->swapped * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped) * * Return: true if swapped was incremented from 0, for shmem_writeout(). */ bool shmem_recalc_inode(struct inode *inode, long alloced, long swapped) { struct shmem_inode_info *info = SHMEM_I(inode); bool first_swapped = false; long freed; spin_lock(&info->lock); info->alloced += alloced; info->swapped += swapped; freed = info->alloced - info->swapped - READ_ONCE(inode->i_mapping->nrpages); /* * Special case: whereas normally shmem_recalc_inode() is called * after i_mapping->nrpages has already been adjusted (up or down), * shmem_writeout() has to raise swapped before nrpages is lowered - * to stop a racing shmem_recalc_inode() from thinking that a page has * been freed. Compensate here, to avoid the need for a followup call. */ if (swapped > 0) { if (info->swapped == swapped) first_swapped = true; freed += swapped; } if (freed > 0) info->alloced -= freed; spin_unlock(&info->lock); /* The quota case may block */ if (freed > 0) shmem_inode_unacct_blocks(inode, freed); return first_swapped; } bool shmem_charge(struct inode *inode, long pages) { struct address_space *mapping = inode->i_mapping; if (shmem_inode_acct_blocks(inode, pages)) return false; /* nrpages adjustment first, then shmem_recalc_inode() when balanced */ xa_lock_irq(&mapping->i_pages); mapping->nrpages += pages; xa_unlock_irq(&mapping->i_pages); shmem_recalc_inode(inode, pages, 0); return true; } void shmem_uncharge(struct inode *inode, long pages) { /* pages argument is currently unused: keep it to help debugging */ /* nrpages adjustment done by __filemap_remove_folio() or caller */ shmem_recalc_inode(inode, 0, 0); } /* * Replace item expected in xarray by a new item, while holding xa_lock. */ static int shmem_replace_entry(struct address_space *mapping, pgoff_t index, void *expected, void *replacement) { XA_STATE(xas, &mapping->i_pages, index); void *item; VM_BUG_ON(!expected); VM_BUG_ON(!replacement); item = xas_load(&xas); if (item != expected) return -ENOENT; xas_store(&xas, replacement); return 0; } /* * Sometimes, before we decide whether to proceed or to fail, we must check * that an entry was not already brought back or split by a racing thread. * * Checking folio is not enough: by the time a swapcache folio is locked, it * might be reused, and again be swapcache, using the same swap as before. * Returns the swap entry's order if it still presents, else returns -1. */ static int shmem_confirm_swap(struct address_space *mapping, pgoff_t index, swp_entry_t swap) { XA_STATE(xas, &mapping->i_pages, index); int ret = -1; void *entry; rcu_read_lock(); do { entry = xas_load(&xas); if (entry == swp_to_radix_entry(swap)) ret = xas_get_order(&xas); } while (xas_retry(&xas, entry)); rcu_read_unlock(); return ret; } /* * Definitions for "huge tmpfs": tmpfs mounted with the huge= option * * SHMEM_HUGE_NEVER: * disables huge pages for the mount; * SHMEM_HUGE_ALWAYS: * enables huge pages for the mount; * SHMEM_HUGE_WITHIN_SIZE: * only allocate huge pages if the page will be fully within i_size, * also respect madvise() hints; * SHMEM_HUGE_ADVISE: * only allocate huge pages if requested with madvise(); */ #define SHMEM_HUGE_NEVER 0 #define SHMEM_HUGE_ALWAYS 1 #define SHMEM_HUGE_WITHIN_SIZE 2 #define SHMEM_HUGE_ADVISE 3 /* * Special values. * Only can be set via /sys/kernel/mm/transparent_hugepage/shmem_enabled: * * SHMEM_HUGE_DENY: * disables huge on shm_mnt and all mounts, for emergency use; * SHMEM_HUGE_FORCE: * enables huge on shm_mnt and all mounts, w/o needing option, for testing; * */ #define SHMEM_HUGE_DENY (-1) #define SHMEM_HUGE_FORCE (-2) #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* ifdef here to avoid bloating shmem.o when not necessary */ #if defined(CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_NEVER) #define SHMEM_HUGE_DEFAULT SHMEM_HUGE_NEVER #elif defined(CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_ALWAYS) #define SHMEM_HUGE_DEFAULT SHMEM_HUGE_ALWAYS #elif defined(CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_WITHIN_SIZE) #define SHMEM_HUGE_DEFAULT SHMEM_HUGE_WITHIN_SIZE #elif defined(CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_ADVISE) #define SHMEM_HUGE_DEFAULT SHMEM_HUGE_ADVISE #else #define SHMEM_HUGE_DEFAULT SHMEM_HUGE_NEVER #endif static int shmem_huge __read_mostly = SHMEM_HUGE_DEFAULT; #undef SHMEM_HUGE_DEFAULT #if defined(CONFIG_TRANSPARENT_HUGEPAGE_TMPFS_HUGE_NEVER) #define TMPFS_HUGE_DEFAULT SHMEM_HUGE_NEVER #elif defined(CONFIG_TRANSPARENT_HUGEPAGE_TMPFS_HUGE_ALWAYS) #define TMPFS_HUGE_DEFAULT SHMEM_HUGE_ALWAYS #elif defined(CONFIG_TRANSPARENT_HUGEPAGE_TMPFS_HUGE_WITHIN_SIZE) #define TMPFS_HUGE_DEFAULT SHMEM_HUGE_WITHIN_SIZE #elif defined(CONFIG_TRANSPARENT_HUGEPAGE_TMPFS_HUGE_ADVISE) #define TMPFS_HUGE_DEFAULT SHMEM_HUGE_ADVISE #else #define TMPFS_HUGE_DEFAULT SHMEM_HUGE_NEVER #endif static int tmpfs_huge __read_mostly = TMPFS_HUGE_DEFAULT; #undef TMPFS_HUGE_DEFAULT static unsigned int shmem_get_orders_within_size(struct inode *inode, unsigned long within_size_orders, pgoff_t index, loff_t write_end) { pgoff_t aligned_index; unsigned long order; loff_t i_size; order = highest_order(within_size_orders); while (within_size_orders) { aligned_index = round_up(index + 1, 1 << order); i_size = max(write_end, i_size_read(inode)); i_size = round_up(i_size, PAGE_SIZE); if (i_size >> PAGE_SHIFT >= aligned_index) return within_size_orders; order = next_order(&within_size_orders, order); } return 0; } static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index, loff_t write_end, bool shmem_huge_force, struct vm_area_struct *vma, vm_flags_t vm_flags) { unsigned int maybe_pmd_order = HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER ? 0 : BIT(HPAGE_PMD_ORDER); unsigned long within_size_orders; if (!S_ISREG(inode->i_mode)) return 0; if (shmem_huge == SHMEM_HUGE_DENY) return 0; if (shmem_huge_force || shmem_huge == SHMEM_HUGE_FORCE) return maybe_pmd_order; /* * The huge order allocation for anon shmem is controlled through * the mTHP interface, so we still use PMD-sized huge order to * check whether global control is enabled. * * For tmpfs with 'huge=always' or 'huge=within_size' mount option, * we will always try PMD-sized order first. If that failed, it will * fall back to small large folios. */ switch (SHMEM_SB(inode->i_sb)->huge) { case SHMEM_HUGE_ALWAYS: return THP_ORDERS_ALL_FILE_DEFAULT; case SHMEM_HUGE_WITHIN_SIZE: within_size_orders = shmem_get_orders_within_size(inode, THP_ORDERS_ALL_FILE_DEFAULT, index, write_end); if (within_size_orders > 0) return within_size_orders; fallthrough; case SHMEM_HUGE_ADVISE: if (vm_flags & VM_HUGEPAGE) return THP_ORDERS_ALL_FILE_DEFAULT; fallthrough; default: return 0; } } static int shmem_parse_huge(const char *str) { int huge; if (!str) return -EINVAL; if (!strcmp(str, "never")) huge = SHMEM_HUGE_NEVER; else if (!strcmp(str, "always")) huge = SHMEM_HUGE_ALWAYS; else if (!strcmp(str, "within_size")) huge = SHMEM_HUGE_WITHIN_SIZE; else if (!strcmp(str, "advise")) huge = SHMEM_HUGE_ADVISE; else if (!strcmp(str, "deny")) huge = SHMEM_HUGE_DENY; else if (!strcmp(str, "force")) huge = SHMEM_HUGE_FORCE; else return -EINVAL; if (!has_transparent_hugepage() && huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY) return -EINVAL; /* Do not override huge allocation policy with non-PMD sized mTHP */ if (huge == SHMEM_HUGE_FORCE && huge_shmem_orders_inherit != BIT(HPAGE_PMD_ORDER)) return -EINVAL; return huge; } #if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS) static const char *shmem_format_huge(int huge) { switch (huge) { case SHMEM_HUGE_NEVER: return "never"; case SHMEM_HUGE_ALWAYS: return "always"; case SHMEM_HUGE_WITHIN_SIZE: return "within_size"; case SHMEM_HUGE_ADVISE: return "advise"; case SHMEM_HUGE_DENY: return "deny"; case SHMEM_HUGE_FORCE: return "force"; default: VM_BUG_ON(1); return "bad_val"; } } #endif static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, struct shrink_control *sc, unsigned long nr_to_free) { LIST_HEAD(list), *pos, *next; struct inode *inode; struct shmem_inode_info *info; struct folio *folio; unsigned long batch = sc ? sc->nr_to_scan : 128; unsigned long split = 0, freed = 0; if (list_empty(&sbinfo->shrinklist)) return SHRINK_STOP; spin_lock(&sbinfo->shrinklist_lock); list_for_each_safe(pos, next, &sbinfo->shrinklist) { info = list_entry(pos, struct shmem_inode_info, shrinklist); /* pin the inode */ inode = igrab(&info->vfs_inode); /* inode is about to be evicted */ if (!inode) { list_del_init(&info->shrinklist); goto next; } list_move(&info->shrinklist, &list); next: sbinfo->shrinklist_len--; if (!--batch) break; } spin_unlock(&sbinfo->shrinklist_lock); list_for_each_safe(pos, next, &list) { pgoff_t next, end; loff_t i_size; int ret; info = list_entry(pos, struct shmem_inode_info, shrinklist); inode = &info->vfs_inode; if (nr_to_free && freed >= nr_to_free) goto move_back; i_size = i_size_read(inode); folio = filemap_get_entry(inode->i_mapping, i_size / PAGE_SIZE); if (!folio || xa_is_value(folio)) goto drop; /* No large folio at the end of the file: nothing to split */ if (!folio_test_large(folio)) { folio_put(folio); goto drop; } /* Check if there is anything to gain from splitting */ next = folio_next_index(folio); end = shmem_fallocend(inode, DIV_ROUND_UP(i_size, PAGE_SIZE)); if (end <= folio->index || end >= next) { folio_put(folio); goto drop; } /* * Move the inode on the list back to shrinklist if we failed * to lock the page at this time. * * Waiting for the lock may lead to deadlock in the * reclaim path. */ if (!folio_trylock(folio)) { folio_put(folio); goto move_back; } ret = split_folio(folio); folio_unlock(folio); folio_put(folio); /* If split failed move the inode on the list back to shrinklist */ if (ret) goto move_back; freed += next - end; split++; drop: list_del_init(&info->shrinklist); goto put; move_back: /* * Make sure the inode is either on the global list or deleted * from any local list before iput() since it could be deleted * in another thread once we put the inode (then the local list * is corrupted). */ spin_lock(&sbinfo->shrinklist_lock); list_move(&info->shrinklist, &sbinfo->shrinklist); sbinfo->shrinklist_len++; spin_unlock(&sbinfo->shrinklist_lock); put: iput(inode); } return split; } static long shmem_unused_huge_scan(struct super_block *sb, struct shrink_control *sc) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); if (!READ_ONCE(sbinfo->shrinklist_len)) return SHRINK_STOP; return shmem_unused_huge_shrink(sbinfo, sc, 0); } static long shmem_unused_huge_count(struct super_block *sb, struct shrink_control *sc) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); return READ_ONCE(sbinfo->shrinklist_len); } #else /* !CONFIG_TRANSPARENT_HUGEPAGE */ #define shmem_huge SHMEM_HUGE_DENY static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, struct shrink_control *sc, unsigned long nr_to_free) { return 0; } static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index, loff_t write_end, bool shmem_huge_force, struct vm_area_struct *vma, vm_flags_t vm_flags) { return 0; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static void shmem_update_stats(struct folio *folio, int nr_pages) { if (folio_test_pmd_mappable(folio)) lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr_pages); lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages); lruvec_stat_mod_folio(folio, NR_SHMEM, nr_pages); } /* * Somewhat like filemap_add_folio, but error if expected item has gone. */ int shmem_add_to_page_cache(struct folio *folio, struct address_space *mapping, pgoff_t index, void *expected, gfp_t gfp) { XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio)); unsigned long nr = folio_nr_pages(folio); swp_entry_t iter, swap; void *entry; VM_BUG_ON_FOLIO(index != round_down(index, nr), folio); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio); folio_ref_add(folio, nr); folio->mapping = mapping; folio->index = index; gfp &= GFP_RECLAIM_MASK; folio_throttle_swaprate(folio, gfp); swap = radix_to_swp_entry(expected); do { iter = swap; xas_lock_irq(&xas); xas_for_each_conflict(&xas, entry) { /* * The range must either be empty, or filled with * expected swap entries. Shmem swap entries are never * partially freed without split of both entry and * folio, so there shouldn't be any holes. */ if (!expected || entry != swp_to_radix_entry(iter)) { xas_set_err(&xas, -EEXIST); goto unlock; } iter.val += 1 << xas_get_order(&xas); } if (expected && iter.val - nr != swap.val) { xas_set_err(&xas, -EEXIST); goto unlock; } xas_store(&xas, folio); if (xas_error(&xas)) goto unlock; shmem_update_stats(folio, nr); mapping->nrpages += nr; unlock: xas_unlock_irq(&xas); } while (xas_nomem(&xas, gfp)); if (xas_error(&xas)) { folio->mapping = NULL; folio_ref_sub(folio, nr); return xas_error(&xas); } return 0; } /* * Somewhat like filemap_remove_folio, but substitutes swap for @folio. */ static void shmem_delete_from_page_cache(struct folio *folio, void *radswap) { struct address_space *mapping = folio->mapping; long nr = folio_nr_pages(folio); int error; xa_lock_irq(&mapping->i_pages); error = shmem_replace_entry(mapping, folio->index, folio, radswap); folio->mapping = NULL; mapping->nrpages -= nr; shmem_update_stats(folio, -nr); xa_unlock_irq(&mapping->i_pages); folio_put_refs(folio, nr); BUG_ON(error); } /* * Remove swap entry from page cache, free the swap and its page cache. Returns * the number of pages being freed. 0 means entry not found in XArray (0 pages * being freed). */ static long shmem_free_swap(struct address_space *mapping, pgoff_t index, pgoff_t end, void *radswap) { XA_STATE(xas, &mapping->i_pages, index); unsigned int nr_pages = 0; pgoff_t base; void *entry; xas_lock_irq(&xas); entry = xas_load(&xas); if (entry == radswap) { nr_pages = 1 << xas_get_order(&xas); base = round_down(xas.xa_index, nr_pages); if (base < index || base + nr_pages - 1 > end) nr_pages = 0; else xas_store(&xas, NULL); } xas_unlock_irq(&xas); if (nr_pages) swap_put_entries_direct(radix_to_swp_entry(radswap), nr_pages); return nr_pages; } /* * Determine (in bytes) how many of the shmem object's pages mapped by the * given offsets are swapped out. * * This is safe to call without i_rwsem or the i_pages lock thanks to RCU, * as long as the inode doesn't go away and racy results are not a problem. */ unsigned long shmem_partial_swap_usage(struct address_space *mapping, pgoff_t start, pgoff_t end) { XA_STATE(xas, &mapping->i_pages, start); struct folio *folio; unsigned long swapped = 0; unsigned long max = end - 1; rcu_read_lock(); xas_for_each(&xas, folio, max) { if (xas_retry(&xas, folio)) continue; if (xa_is_value(folio)) swapped += 1 << xas_get_order(&xas); if (xas.xa_index == max) break; if (need_resched()) { xas_pause(&xas); cond_resched_rcu(); } } rcu_read_unlock(); return swapped << PAGE_SHIFT; } /* * Determine (in bytes) how many of the shmem object's pages mapped by the * given vma is swapped out. * * This is safe to call without i_rwsem or the i_pages lock thanks to RCU, * as long as the inode doesn't go away and racy results are not a problem. */ unsigned long shmem_swap_usage(struct vm_area_struct *vma) { struct inode *inode = file_inode(vma->vm_file); struct shmem_inode_info *info = SHMEM_I(inode); struct address_space *mapping = inode->i_mapping; unsigned long swapped; /* Be careful as we don't hold info->lock */ swapped = READ_ONCE(info->swapped); /* * The easier cases are when the shmem object has nothing in swap, or * the vma maps it whole. Then we can simply use the stats that we * already track. */ if (!swapped) return 0; if (!vma->vm_pgoff && vma->vm_end - vma->vm_start >= inode->i_size) return swapped << PAGE_SHIFT; /* Here comes the more involved part */ return shmem_partial_swap_usage(mapping, vma->vm_pgoff, vma->vm_pgoff + vma_pages(vma)); } /* * SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists. */ void shmem_unlock_mapping(struct address_space *mapping) { struct folio_batch fbatch; pgoff_t index = 0; folio_batch_init(&fbatch); /* * Minor point, but we might as well stop if someone else SHM_LOCKs it. */ while (!mapping_unevictable(mapping) && filemap_get_folios(mapping, &index, ~0UL, &fbatch)) { check_move_unevictable_folios(&fbatch); folio_batch_release(&fbatch); cond_resched(); } } static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index) { struct folio *folio; /* * At first avoid shmem_get_folio(,,,SGP_READ): that fails * beyond i_size, and reports fallocated folios as holes. */ folio = filemap_get_entry(inode->i_mapping, index); if (!folio) return folio; if (!xa_is_value(folio)) { folio_lock(folio); if (folio->mapping == inode->i_mapping) return folio; /* The folio has been swapped out */ folio_unlock(folio); folio_put(folio); } /* * But read a folio back from swap if any of it is within i_size * (although in some cases this is just a waste of time). */ folio = NULL; shmem_get_folio(inode, index, 0, &folio, SGP_READ); return folio; } /* * Remove range of pages and swap entries from page cache, and free them. * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate. */ static void shmem_undo_range(struct inode *inode, loff_t lstart, uoff_t lend, bool unfalloc) { struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT; pgoff_t end = (lend + 1) >> PAGE_SHIFT; struct folio_batch fbatch; pgoff_t indices[PAGEVEC_SIZE]; struct folio *folio; bool same_folio; long nr_swaps_freed = 0; pgoff_t index; int i; if (lend == -1) end = -1; /* unsigned, so actually very big */ if (info->fallocend > start && info->fallocend <= end && !unfalloc) info->fallocend = start; folio_batch_init(&fbatch); index = start; while (index < end && find_lock_entries(mapping, &index, end - 1, &fbatch, indices)) { for (i = 0; i < folio_batch_count(&fbatch); i++) { folio = fbatch.folios[i]; if (xa_is_value(folio)) { if (unfalloc) continue; nr_swaps_freed += shmem_free_swap(mapping, indices[i], end - 1, folio); continue; } if (!unfalloc || !folio_test_uptodate(folio)) truncate_inode_folio(mapping, folio); folio_unlock(folio); } folio_batch_remove_exceptionals(&fbatch); folio_batch_release(&fbatch); cond_resched(); } /* * When undoing a failed fallocate, we want none of the partial folio * zeroing and splitting below, but shall want to truncate the whole * folio when !uptodate indicates that it was added by this fallocate, * even when [lstart, lend] covers only a part of the folio. */ if (unfalloc) goto whole_folios; same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT); folio = shmem_get_partial_folio(inode, lstart >> PAGE_SHIFT); if (folio) { same_folio = lend < folio_next_pos(folio); folio_mark_dirty(folio); if (!truncate_inode_partial_folio(folio, lstart, lend)) { start = folio_next_index(folio); if (same_folio) end = folio->index; } folio_unlock(folio); folio_put(folio); folio = NULL; } if (!same_folio) folio = shmem_get_partial_folio(inode, lend >> PAGE_SHIFT); if (folio) { folio_mark_dirty(folio); if (!truncate_inode_partial_folio(folio, lstart, lend)) end = folio->index; folio_unlock(folio); folio_put(folio); } whole_folios: index = start; while (index < end) { cond_resched(); if (!find_get_entries(mapping, &index, end - 1, &fbatch, indices)) { /* If all gone or hole-punch or unfalloc, we're done */ if (index == start || end != -1) break; /* But if truncating, restart to make sure all gone */ index = start; continue; } for (i = 0; i < folio_batch_count(&fbatch); i++) { folio = fbatch.folios[i]; if (xa_is_value(folio)) { int order; long swaps_freed; if (unfalloc) continue; swaps_freed = shmem_free_swap(mapping, indices[i], end - 1, folio); if (!swaps_freed) { pgoff_t base = indices[i]; order = shmem_confirm_swap(mapping, indices[i], radix_to_swp_entry(folio)); /* * If found a large swap entry cross the end or start * border, skip it as the truncate_inode_partial_folio * above should have at least zerod its content once. */ if (order > 0) { base = round_down(base, 1 << order); if (base < start || base + (1 << order) > end) continue; } /* Swap was replaced by page or extended, retry */ index = base; break; } nr_swaps_freed += swaps_freed; continue; } folio_lock(folio); if (!unfalloc || !folio_test_uptodate(folio)) { if (folio_mapping(folio) != mapping) { /* Page was replaced by swap: retry */ folio_unlock(folio); index = indices[i]; break; } VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio); if (!folio_test_large(folio)) { truncate_inode_folio(mapping, folio); } else if (truncate_inode_partial_folio(folio, lstart, lend)) { /* * If we split a page, reset the loop so * that we pick up the new sub pages. * Otherwise the THP was entirely * dropped or the target range was * zeroed, so just continue the loop as * is. */ if (!folio_test_large(folio)) { folio_unlock(folio); index = start; break; } } } folio_unlock(folio); } folio_batch_remove_exceptionals(&fbatch); folio_batch_release(&fbatch); } shmem_recalc_inode(inode, 0, -nr_swaps_freed); } void shmem_truncate_range(struct inode *inode, loff_t lstart, uoff_t lend) { shmem_undo_range(inode, lstart, lend, false); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); inode_inc_iversion(inode); } EXPORT_SYMBOL_GPL(shmem_truncate_range); static int shmem_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = path->dentry->d_inode; struct shmem_inode_info *info = SHMEM_I(inode); if (info->alloced - info->swapped != inode->i_mapping->nrpages) shmem_recalc_inode(inode, 0, 0); if (info->fsflags & FS_APPEND_FL) stat->attributes |= STATX_ATTR_APPEND; if (info->fsflags & FS_IMMUTABLE_FL) stat->attributes |= STATX_ATTR_IMMUTABLE; if (info->fsflags & FS_NODUMP_FL) stat->attributes |= STATX_ATTR_NODUMP; stat->attributes_mask |= (STATX_ATTR_APPEND | STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP); generic_fillattr(idmap, request_mask, inode, stat); if (shmem_huge_global_enabled(inode, 0, 0, false, NULL, 0)) stat->blksize = HPAGE_PMD_SIZE; if (request_mask & STATX_BTIME) { stat->result_mask |= STATX_BTIME; stat->btime.tv_sec = info->i_crtime.tv_sec; stat->btime.tv_nsec = info->i_crtime.tv_nsec; } return 0; } static int shmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); struct shmem_inode_info *info = SHMEM_I(inode); int error; bool update_mtime = false; bool update_ctime = true; error = setattr_prepare(idmap, dentry, attr); if (error) return error; if ((info->seals & F_SEAL_EXEC) && (attr->ia_valid & ATTR_MODE)) { if ((inode->i_mode ^ attr->ia_mode) & 0111) { return -EPERM; } } if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { loff_t oldsize = inode->i_size; loff_t newsize = attr->ia_size; /* protected by i_rwsem */ if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) || (newsize > oldsize && (info->seals & F_SEAL_GROW))) return -EPERM; if (newsize != oldsize) { if (info->flags & SHMEM_F_MAPPING_FROZEN) return -EPERM; error = shmem_reacct_size(SHMEM_I(inode)->flags, oldsize, newsize); if (error) return error; i_size_write(inode, newsize); update_mtime = true; } else { update_ctime = false; } if (newsize <= oldsize) { loff_t holebegin = round_up(newsize, PAGE_SIZE); if (oldsize > holebegin) unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); if (info->alloced) shmem_truncate_range(inode, newsize, (loff_t)-1); /* unmap again to remove racily COWed private pages */ if (oldsize > holebegin) unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); } } if (is_quota_modification(idmap, inode, attr)) { error = dquot_initialize(inode); if (error) return error; } /* Transfer quota accounting */ if (i_uid_needs_update(idmap, attr, inode) || i_gid_needs_update(idmap, attr, inode)) { error = dquot_transfer(idmap, inode, attr); if (error) return error; } setattr_copy(idmap, inode, attr); if (attr->ia_valid & ATTR_MODE) error = posix_acl_chmod(idmap, dentry, inode->i_mode); if (!error && update_ctime) { inode_set_ctime_current(inode); if (update_mtime) inode_set_mtime_to_ts(inode, inode_get_ctime(inode)); inode_inc_iversion(inode); } return error; } static void shmem_evict_inode(struct inode *inode) { struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); size_t freed = 0; if (shmem_mapping(inode->i_mapping)) { shmem_unacct_size(info->flags, inode->i_size); inode->i_size = 0; mapping_set_exiting(inode->i_mapping); shmem_truncate_range(inode, 0, (loff_t)-1); if (!list_empty(&info->shrinklist)) { spin_lock(&sbinfo->shrinklist_lock); if (!list_empty(&info->shrinklist)) { list_del_init(&info->shrinklist); sbinfo->shrinklist_len--; } spin_unlock(&sbinfo->shrinklist_lock); } while (!list_empty(&info->swaplist)) { /* Wait while shmem_unuse() is scanning this inode... */ wait_var_event(&info->stop_eviction, !atomic_read(&info->stop_eviction)); spin_lock(&shmem_swaplist_lock); /* ...but beware of the race if we peeked too early */ if (!atomic_read(&info->stop_eviction)) list_del_init(&info->swaplist); spin_unlock(&shmem_swaplist_lock); } } simple_xattrs_free(&info->xattrs, sbinfo->max_inodes ? &freed : NULL); shmem_free_inode(inode->i_sb, freed); WARN_ON(inode->i_blocks); clear_inode(inode); #ifdef CONFIG_TMPFS_QUOTA dquot_free_inode(inode); dquot_drop(inode); #endif } static unsigned int shmem_find_swap_entries(struct address_space *mapping, pgoff_t start, struct folio_batch *fbatch, pgoff_t *indices, unsigned int type) { XA_STATE(xas, &mapping->i_pages, start); struct folio *folio; swp_entry_t entry; rcu_read_lock(); xas_for_each(&xas, folio, ULONG_MAX) { if (xas_retry(&xas, folio)) continue; if (!xa_is_value(folio)) continue; entry = radix_to_swp_entry(folio); /* * swapin error entries can be found in the mapping. But they're * deliberately ignored here as we've done everything we can do. */ if (swp_type(entry) != type) continue; indices[folio_batch_count(fbatch)] = xas.xa_index; if (!folio_batch_add(fbatch, folio)) break; if (need_resched()) { xas_pause(&xas); cond_resched_rcu(); } } rcu_read_unlock(); return folio_batch_count(fbatch); } /* * Move the swapped pages for an inode to page cache. Returns the count * of pages swapped in, or the error in case of failure. */ static int shmem_unuse_swap_entries(struct inode *inode, struct folio_batch *fbatch, pgoff_t *indices) { int i = 0; int ret = 0; int error = 0; struct address_space *mapping = inode->i_mapping; for (i = 0; i < folio_batch_count(fbatch); i++) { struct folio *folio = fbatch->folios[i]; error = shmem_swapin_folio(inode, indices[i], &folio, SGP_CACHE, mapping_gfp_mask(mapping), NULL, NULL); if (error == 0) { folio_unlock(folio); folio_put(folio); ret++; } if (error == -ENOMEM) break; error = 0; } return error ? error : ret; } /* * If swap found in inode, free it and move page from swapcache to filecache. */ static int shmem_unuse_inode(struct inode *inode, unsigned int type) { struct address_space *mapping = inode->i_mapping; pgoff_t start = 0; struct folio_batch fbatch; pgoff_t indices[PAGEVEC_SIZE]; int ret = 0; do { folio_batch_init(&fbatch); if (!shmem_find_swap_entries(mapping, start, &fbatch, indices, type)) { ret = 0; break; } ret = shmem_unuse_swap_entries(inode, &fbatch, indices); if (ret < 0) break; start = indices[folio_batch_count(&fbatch) - 1]; } while (true); return ret; } /* * Read all the shared memory data that resides in the swap * device 'type' back into memory, so the swap device can be * unused. */ int shmem_unuse(unsigned int type) { struct shmem_inode_info *info, *next; int error = 0; if (list_empty(&shmem_swaplist)) return 0; spin_lock(&shmem_swaplist_lock); start_over: list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) { if (!info->swapped) { list_del_init(&info->swaplist); continue; } /* * Drop the swaplist mutex while searching the inode for swap; * but before doing so, make sure shmem_evict_inode() will not * remove placeholder inode from swaplist, nor let it be freed * (igrab() would protect from unlink, but not from unmount). */ atomic_inc(&info->stop_eviction); spin_unlock(&shmem_swaplist_lock); error = shmem_unuse_inode(&info->vfs_inode, type); cond_resched(); spin_lock(&shmem_swaplist_lock); if (atomic_dec_and_test(&info->stop_eviction)) wake_up_var(&info->stop_eviction); if (error) break; if (list_empty(&info->swaplist)) goto start_over; next = list_next_entry(info, swaplist); if (!info->swapped) list_del_init(&info->swaplist); } spin_unlock(&shmem_swaplist_lock); return error; } /** * shmem_writeout - Write the folio to swap * @folio: The folio to write * @plug: swap plug * @folio_list: list to put back folios on split * * Move the folio from the page cache to the swap cache. */ int shmem_writeout(struct folio *folio, struct swap_iocb **plug, struct list_head *folio_list) { struct address_space *mapping = folio->mapping; struct inode *inode = mapping->host; struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); pgoff_t index; int nr_pages; bool split = false; if ((info->flags & SHMEM_F_LOCKED) || sbinfo->noswap) goto redirty; if (!total_swap_pages) goto redirty; /* * If CONFIG_THP_SWAP is not enabled, the large folio should be * split when swapping. * * And shrinkage of pages beyond i_size does not split swap, so * swapout of a large folio crossing i_size needs to split too * (unless fallocate has been used to preallocate beyond EOF). */ if (folio_test_large(folio)) { index = shmem_fallocend(inode, DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE)); if ((index > folio->index && index < folio_next_index(folio)) || !IS_ENABLED(CONFIG_THP_SWAP)) split = true; } if (split) { int order; try_split: order = folio_order(folio); /* Ensure the subpages are still dirty */ folio_test_set_dirty(folio); if (split_folio_to_list(folio, folio_list)) goto redirty; #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (order >= HPAGE_PMD_ORDER) { count_memcg_folio_events(folio, THP_SWPOUT_FALLBACK, 1); count_vm_event(THP_SWPOUT_FALLBACK); } #endif count_mthp_stat(order, MTHP_STAT_SWPOUT_FALLBACK); folio_clear_dirty(folio); } index = folio->index; nr_pages = folio_nr_pages(folio); /* * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC * value into swapfile.c, the only way we can correctly account for a * fallocated folio arriving here is now to initialize it and write it. * * That's okay for a folio already fallocated earlier, but if we have * not yet completed the fallocation, then (a) we want to keep track * of this folio in case we have to undo it, and (b) it may not be a * good idea to continue anyway, once we're pushing into swap. So * reactivate the folio, and let shmem_fallocate() quit when too many. */ if (!folio_test_uptodate(folio)) { if (inode->i_private) { struct shmem_falloc *shmem_falloc; spin_lock(&inode->i_lock); shmem_falloc = inode->i_private; if (shmem_falloc && !shmem_falloc->waitq && index >= shmem_falloc->start && index < shmem_falloc->next) shmem_falloc->nr_unswapped += nr_pages; else shmem_falloc = NULL; spin_unlock(&inode->i_lock); if (shmem_falloc) goto redirty; } folio_zero_range(folio, 0, folio_size(folio)); flush_dcache_folio(folio); folio_mark_uptodate(folio); } if (!folio_alloc_swap(folio)) { bool first_swapped = shmem_recalc_inode(inode, 0, nr_pages); int error; /* * Add inode to shmem_unuse()'s list of swapped-out inodes, * if it's not already there. Do it now before the folio is * removed from page cache, when its pagelock no longer * protects the inode from eviction. And do it now, after * we've incremented swapped, because shmem_unuse() will * prune a !swapped inode from the swaplist. */ if (first_swapped) { spin_lock(&shmem_swaplist_lock); if (list_empty(&info->swaplist)) list_add(&info->swaplist, &shmem_swaplist); spin_unlock(&shmem_swaplist_lock); } folio_dup_swap(folio, NULL); shmem_delete_from_page_cache(folio, swp_to_radix_entry(folio->swap)); BUG_ON(folio_mapped(folio)); error = swap_writeout(folio, plug); if (error != AOP_WRITEPAGE_ACTIVATE) { /* folio has been unlocked */ return error; } /* * The intention here is to avoid holding on to the swap when * zswap was unable to compress and unable to writeback; but * it will be appropriate if other reactivate cases are added. */ error = shmem_add_to_page_cache(folio, mapping, index, swp_to_radix_entry(folio->swap), __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); /* Swap entry might be erased by racing shmem_free_swap() */ if (!error) { shmem_recalc_inode(inode, 0, -nr_pages); folio_put_swap(folio, NULL); } /* * The swap_cache_del_folio() below could be left for * shrink_folio_list()'s folio_free_swap() to dispose of; * but I'm a little nervous about letting this folio out of * shmem_writeout() in a hybrid half-tmpfs-half-swap state * e.g. folio_mapping(folio) might give an unexpected answer. */ swap_cache_del_folio(folio); goto redirty; } if (nr_pages > 1) goto try_split; redirty: folio_mark_dirty(folio); return AOP_WRITEPAGE_ACTIVATE; /* Return with folio locked */ } EXPORT_SYMBOL_GPL(shmem_writeout); #if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS) static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) { char buffer[64]; if (!mpol || mpol->mode == MPOL_DEFAULT) return; /* show nothing */ mpol_to_str(buffer, sizeof(buffer), mpol); seq_printf(seq, ",mpol=%s", buffer); } static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) { struct mempolicy *mpol = NULL; if (sbinfo->mpol) { raw_spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */ mpol = sbinfo->mpol; mpol_get(mpol); raw_spin_unlock(&sbinfo->stat_lock); } return mpol; } #else /* !CONFIG_NUMA || !CONFIG_TMPFS */ static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) { } static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) { return NULL; } #endif /* CONFIG_NUMA && CONFIG_TMPFS */ static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info, pgoff_t index, unsigned int order, pgoff_t *ilx); static struct folio *shmem_swapin_cluster(swp_entry_t swap, gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) { struct mempolicy *mpol; pgoff_t ilx; struct folio *folio; mpol = shmem_get_pgoff_policy(info, index, 0, &ilx); folio = swap_cluster_readahead(swap, gfp, mpol, ilx); mpol_cond_put(mpol); return folio; } /* * Make sure huge_gfp is always more limited than limit_gfp. * Some of the flags set permissions, while others set limitations. */ static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp) { gfp_t allowflags = __GFP_IO | __GFP_FS | __GFP_RECLAIM; gfp_t denyflags = __GFP_NOWARN | __GFP_NORETRY; gfp_t zoneflags = limit_gfp & GFP_ZONEMASK; gfp_t result = huge_gfp & ~(allowflags | GFP_ZONEMASK); /* Allow allocations only from the originally specified zones. */ result |= zoneflags; /* * Minimize the result gfp by taking the union with the deny flags, * and the intersection of the allow flags. */ result |= (limit_gfp & denyflags); result |= (huge_gfp & limit_gfp) & allowflags; return result; } #ifdef CONFIG_TRANSPARENT_HUGEPAGE bool shmem_hpage_pmd_enabled(void) { if (shmem_huge == SHMEM_HUGE_DENY) return false; if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_always)) return true; if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_madvise)) return true; if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_within_size)) return true; if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_inherit) && shmem_huge != SHMEM_HUGE_NEVER) return true; return false; } unsigned long shmem_allowable_huge_orders(struct inode *inode, struct vm_area_struct *vma, pgoff_t index, loff_t write_end, bool shmem_huge_force) { unsigned long mask = READ_ONCE(huge_shmem_orders_always); unsigned long within_size_orders = READ_ONCE(huge_shmem_orders_within_size); vm_flags_t vm_flags = vma ? vma->vm_flags : 0; unsigned int global_orders; if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags, shmem_huge_force))) return 0; global_orders = shmem_huge_global_enabled(inode, index, write_end, shmem_huge_force, vma, vm_flags); /* Tmpfs huge pages allocation */ if (!vma || !vma_is_anon_shmem(vma)) return global_orders; /* * Following the 'deny' semantics of the top level, force the huge * option off from all mounts. */ if (shmem_huge == SHMEM_HUGE_DENY) return 0; /* * Only allow inherit orders if the top-level value is 'force', which * means non-PMD sized THP can not override 'huge' mount option now. */ if (shmem_huge == SHMEM_HUGE_FORCE) return READ_ONCE(huge_shmem_orders_inherit); /* Allow mTHP that will be fully within i_size. */ mask |= shmem_get_orders_within_size(inode, within_size_orders, index, 0); if (vm_flags & VM_HUGEPAGE) mask |= READ_ONCE(huge_shmem_orders_madvise); if (global_orders > 0) mask |= READ_ONCE(huge_shmem_orders_inherit); return THP_ORDERS_ALL_FILE_DEFAULT & mask; } static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault *vmf, struct address_space *mapping, pgoff_t index, unsigned long orders) { struct vm_area_struct *vma = vmf ? vmf->vma : NULL; pgoff_t aligned_index; unsigned long pages; int order; if (vma) { orders = thp_vma_suitable_orders(vma, vmf->address, orders); if (!orders) return 0; } /* Find the highest order that can add into the page cache */ order = highest_order(orders); while (orders) { pages = 1UL << order; aligned_index = round_down(index, pages); /* * Check for conflict before waiting on a huge allocation. * Conflict might be that a huge page has just been allocated * and added to page cache by a racing thread, or that there * is already at least one small page in the huge extent. * Be careful to retry when appropriate, but not forever! * Elsewhere -EEXIST would be the right code, but not here. */ if (!xa_find(&mapping->i_pages, &aligned_index, aligned_index + pages - 1, XA_PRESENT)) break; order = next_order(&orders, order); } return orders; } #else static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault *vmf, struct address_space *mapping, pgoff_t index, unsigned long orders) { return 0; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static struct folio *shmem_alloc_folio(gfp_t gfp, int order, struct shmem_inode_info *info, pgoff_t index) { struct mempolicy *mpol; pgoff_t ilx; struct folio *folio; mpol = shmem_get_pgoff_policy(info, index, order, &ilx); folio = folio_alloc_mpol(gfp, order, mpol, ilx, numa_node_id()); mpol_cond_put(mpol); return folio; } static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf, gfp_t gfp, struct inode *inode, pgoff_t index, struct mm_struct *fault_mm, unsigned long orders) { struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); unsigned long suitable_orders = 0; struct folio *folio = NULL; pgoff_t aligned_index; long pages; int error, order; if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) orders = 0; if (orders > 0) { suitable_orders = shmem_suitable_orders(inode, vmf, mapping, index, orders); order = highest_order(suitable_orders); while (suitable_orders) { pages = 1UL << order; aligned_index = round_down(index, pages); folio = shmem_alloc_folio(gfp, order, info, aligned_index); if (folio) { index = aligned_index; goto allocated; } if (pages == HPAGE_PMD_NR) count_vm_event(THP_FILE_FALLBACK); count_mthp_stat(order, MTHP_STAT_SHMEM_FALLBACK); order = next_order(&suitable_orders, order); } } else { pages = 1; folio = shmem_alloc_folio(gfp, 0, info, index); } if (!folio) return ERR_PTR(-ENOMEM); allocated: __folio_set_locked(folio); __folio_set_swapbacked(folio); gfp &= GFP_RECLAIM_MASK; error = mem_cgroup_charge(folio, fault_mm, gfp); if (error) { if (xa_find(&mapping->i_pages, &index, index + pages - 1, XA_PRESENT)) { error = -EEXIST; } else if (pages > 1) { if (pages == HPAGE_PMD_NR) { count_vm_event(THP_FILE_FALLBACK); count_vm_event(THP_FILE_FALLBACK_CHARGE); } count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_FALLBACK); count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_FALLBACK_CHARGE); } goto unlock; } error = shmem_add_to_page_cache(folio, mapping, index, NULL, gfp); if (error) goto unlock; error = shmem_inode_acct_blocks(inode, pages); if (error) { struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); long freed; /* * Try to reclaim some space by splitting a few * large folios beyond i_size on the filesystem. */ shmem_unused_huge_shrink(sbinfo, NULL, pages); /* * And do a shmem_recalc_inode() to account for freed pages: * except our folio is there in cache, so not quite balanced. */ spin_lock(&info->lock); freed = pages + info->alloced - info->swapped - READ_ONCE(mapping->nrpages); if (freed > 0) info->alloced -= freed; spin_unlock(&info->lock); if (freed > 0) shmem_inode_unacct_blocks(inode, freed); error = shmem_inode_acct_blocks(inode, pages); if (error) { filemap_remove_folio(folio); goto unlock; } } shmem_recalc_inode(inode, pages, 0); folio_add_lru(folio); return folio; unlock: folio_unlock(folio); folio_put(folio); return ERR_PTR(error); } static struct folio *shmem_swap_alloc_folio(struct inode *inode, struct vm_area_struct *vma, pgoff_t index, swp_entry_t entry, int order, gfp_t gfp) { struct shmem_inode_info *info = SHMEM_I(inode); struct folio *new, *swapcache; int nr_pages = 1 << order; gfp_t alloc_gfp; /* * We have arrived here because our zones are constrained, so don't * limit chance of success with further cpuset and node constraints. */ gfp &= ~GFP_CONSTRAINT_MASK; alloc_gfp = gfp; if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { if (WARN_ON_ONCE(order)) return ERR_PTR(-EINVAL); } else if (order) { /* * If uffd is active for the vma, we need per-page fault * fidelity to maintain the uffd semantics, then fallback * to swapin order-0 folio, as well as for zswap case. * Any existing sub folio in the swap cache also blocks * mTHP swapin. */ if ((vma && unlikely(userfaultfd_armed(vma))) || !zswap_never_enabled() || non_swapcache_batch(entry, nr_pages) != nr_pages) goto fallback; alloc_gfp = limit_gfp_mask(vma_thp_gfp_mask(vma), gfp); } retry: new = shmem_alloc_folio(alloc_gfp, order, info, index); if (!new) { new = ERR_PTR(-ENOMEM); goto fallback; } if (mem_cgroup_swapin_charge_folio(new, vma ? vma->vm_mm : NULL, alloc_gfp, entry)) { folio_put(new); new = ERR_PTR(-ENOMEM); goto fallback; } swapcache = swapin_folio(entry, new); if (swapcache != new) { folio_put(new); if (!swapcache) { /* * The new folio is charged already, swapin can * only fail due to another raced swapin. */ new = ERR_PTR(-EEXIST); goto fallback; } } return swapcache; fallback: /* Order 0 swapin failed, nothing to fallback to, abort */ if (!order) return new; entry.val += index - round_down(index, nr_pages); alloc_gfp = gfp; nr_pages = 1; order = 0; goto retry; } /* * When a page is moved from swapcache to shmem filecache (either by the * usual swapin of shmem_get_folio_gfp(), or by the less common swapoff of * shmem_unuse_inode()), it may have been read in earlier from swap, in * ignorance of the mapping it belongs to. If that mapping has special * constraints (like the gma500 GEM driver, which requires RAM below 4GB), * we may need to copy to a suitable page before moving to filecache. * * In a future release, this may well be extended to respect cpuset and * NUMA mempolicy, and applied also to anonymous pages in do_swap_page(); * but for now it is a simple matter of zone. */ static bool shmem_should_replace_folio(struct folio *folio, gfp_t gfp) { return folio_zonenum(folio) > gfp_zone(gfp); } static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, struct shmem_inode_info *info, pgoff_t index, struct vm_area_struct *vma) { struct swap_cluster_info *ci; struct folio *new, *old = *foliop; swp_entry_t entry = old->swap; int nr_pages = folio_nr_pages(old); int error = 0; /* * We have arrived here because our zones are constrained, so don't * limit chance of success by further cpuset and node constraints. */ gfp &= ~GFP_CONSTRAINT_MASK; #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (nr_pages > 1) { gfp_t huge_gfp = vma_thp_gfp_mask(vma); gfp = limit_gfp_mask(huge_gfp, gfp); } #endif new = shmem_alloc_folio(gfp, folio_order(old), info, index); if (!new) return -ENOMEM; folio_ref_add(new, nr_pages); folio_copy(new, old); flush_dcache_folio(new); __folio_set_locked(new); __folio_set_swapbacked(new); folio_mark_uptodate(new); new->swap = entry; folio_set_swapcache(new); ci = swap_cluster_get_and_lock_irq(old); __swap_cache_replace_folio(ci, old, new); mem_cgroup_replace_folio(old, new); shmem_update_stats(new, nr_pages); shmem_update_stats(old, -nr_pages); swap_cluster_unlock_irq(ci); folio_add_lru(new); *foliop = new; folio_clear_swapcache(old); old->private = NULL; folio_unlock(old); /* * The old folio are removed from swap cache, drop the 'nr_pages' * reference, as well as one temporary reference getting from swap * cache. */ folio_put_refs(old, nr_pages + 1); return error; } static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, struct folio *folio, swp_entry_t swap) { struct address_space *mapping = inode->i_mapping; swp_entry_t swapin_error; void *old; int nr_pages; swapin_error = make_poisoned_swp_entry(); old = xa_cmpxchg_irq(&mapping->i_pages, index, swp_to_radix_entry(swap), swp_to_radix_entry(swapin_error), 0); if (old != swp_to_radix_entry(swap)) return; nr_pages = folio_nr_pages(folio); folio_wait_writeback(folio); folio_put_swap(folio, NULL); swap_cache_del_folio(folio); /* * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks * won't be 0 when inode is released and thus trigger WARN_ON(i_blocks) * in shmem_evict_inode(). */ shmem_recalc_inode(inode, -nr_pages, -nr_pages); } static int shmem_split_large_entry(struct inode *inode, pgoff_t index, swp_entry_t swap, gfp_t gfp) { struct address_space *mapping = inode->i_mapping; XA_STATE_ORDER(xas, &mapping->i_pages, index, 0); int split_order = 0; int i; /* Convert user data gfp flags to xarray node gfp flags */ gfp &= GFP_RECLAIM_MASK; for (;;) { void *old = NULL; int cur_order; pgoff_t swap_index; xas_lock_irq(&xas); old = xas_load(&xas); if (!xa_is_value(old) || swp_to_radix_entry(swap) != old) { xas_set_err(&xas, -EEXIST); goto unlock; } cur_order = xas_get_order(&xas); if (!cur_order) goto unlock; /* Try to split large swap entry in pagecache */ swap_index = round_down(index, 1 << cur_order); split_order = xas_try_split_min_order(cur_order); while (cur_order > 0) { pgoff_t aligned_index = round_down(index, 1 << cur_order); pgoff_t swap_offset = aligned_index - swap_index; xas_set_order(&xas, index, split_order); xas_try_split(&xas, old, cur_order); if (xas_error(&xas)) goto unlock; /* * Re-set the swap entry after splitting, and the swap * offset of the original large entry must be continuous. */ for (i = 0; i < 1 << cur_order; i += (1 << split_order)) { swp_entry_t tmp; tmp = swp_entry(swp_type(swap), swp_offset(swap) + swap_offset + i); __xa_store(&mapping->i_pages, aligned_index + i, swp_to_radix_entry(tmp), 0); } cur_order = split_order; split_order = xas_try_split_min_order(split_order); } unlock: xas_unlock_irq(&xas); if (!xas_nomem(&xas, gfp)) break; } if (xas_error(&xas)) return xas_error(&xas); return 0; } /* * Swap in the folio pointed to by *foliop. * Caller has to make sure that *foliop contains a valid swapped folio. * Returns 0 and the folio in foliop if success. On failure, returns the * error code and NULL in *foliop. */ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, struct folio **foliop, enum sgp_type sgp, gfp_t gfp, struct vm_area_struct *vma, vm_fault_t *fault_type) { struct address_space *mapping = inode->i_mapping; struct mm_struct *fault_mm = vma ? vma->vm_mm : NULL; struct shmem_inode_info *info = SHMEM_I(inode); swp_entry_t swap; softleaf_t index_entry; struct swap_info_struct *si; struct folio *folio = NULL; int error, nr_pages, order; pgoff_t offset; VM_BUG_ON(!*foliop || !xa_is_value(*foliop)); index_entry = radix_to_swp_entry(*foliop); swap = index_entry; *foliop = NULL; if (softleaf_is_poison_marker(index_entry)) return -EIO; si = get_swap_device(index_entry); order = shmem_confirm_swap(mapping, index, index_entry); if (unlikely(!si)) { if (order < 0) return -EEXIST; else return -EINVAL; } if (unlikely(order < 0)) { put_swap_device(si); return -EEXIST; } /* index may point to the middle of a large entry, get the sub entry */ if (order) { offset = index - round_down(index, 1 << order); swap = swp_entry(swp_type(swap), swp_offset(swap) + offset); } /* Look it up and read it in.. */ folio = swap_cache_get_folio(swap); if (!folio) { if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) { /* Direct swapin skipping swap cache & readahead */ folio = shmem_swap_alloc_folio(inode, vma, index, index_entry, order, gfp); if (IS_ERR(folio)) { error = PTR_ERR(folio); folio = NULL; goto failed; } } else { /* Cached swapin only supports order 0 folio */ folio = shmem_swapin_cluster(swap, gfp, info, index); if (!folio) { error = -ENOMEM; goto failed; } } if (fault_type) { *fault_type |= VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); count_memcg_event_mm(fault_mm, PGMAJFAULT); } } else { swap_update_readahead(folio, NULL, 0); } if (order > folio_order(folio)) { /* * Swapin may get smaller folios due to various reasons: * It may fallback to order 0 due to memory pressure or race, * swap readahead may swap in order 0 folios into swapcache * asynchronously, while the shmem mapping can still stores * large swap entries. In such cases, we should split the * large swap entry to prevent possible data corruption. */ error = shmem_split_large_entry(inode, index, index_entry, gfp); if (error) goto failed_nolock; } /* * If the folio is large, round down swap and index by folio size. * No matter what race occurs, the swap layer ensures we either get * a valid folio that has its swap entry aligned by size, or a * temporarily invalid one which we'll abort very soon and retry. * * shmem_add_to_page_cache ensures the whole range contains expected * entries and prevents any corruption, so any race split is fine * too, it will succeed as long as the entries are still there. */ nr_pages = folio_nr_pages(folio); if (nr_pages > 1) { swap.val = round_down(swap.val, nr_pages); index = round_down(index, nr_pages); } /* * We have to do this with the folio locked to prevent races. * The shmem_confirm_swap below only checks if the first swap * entry matches the folio, that's enough to ensure the folio * is not used outside of shmem, as shmem swap entries * and swap cache folios are never partially freed. */ folio_lock(folio); if (!folio_matches_swap_entry(folio, swap) || shmem_confirm_swap(mapping, index, swap) < 0) { error = -EEXIST; goto unlock; } if (!folio_test_uptodate(folio)) { error = -EIO; goto failed; } folio_wait_writeback(folio); /* * Some architectures may have to restore extra metadata to the * folio after reading from swap. */ arch_swap_restore(folio_swap(swap, folio), folio); if (shmem_should_replace_folio(folio, gfp)) { error = shmem_replace_folio(&folio, gfp, info, index, vma); if (error) goto failed; } error = shmem_add_to_page_cache(folio, mapping, index, swp_to_radix_entry(swap), gfp); if (error) goto failed; shmem_recalc_inode(inode, 0, -nr_pages); if (sgp == SGP_WRITE) folio_mark_accessed(folio); folio_put_swap(folio, NULL); swap_cache_del_folio(folio); folio_mark_dirty(folio); put_swap_device(si); *foliop = folio; return 0; failed: if (shmem_confirm_swap(mapping, index, swap) < 0) error = -EEXIST; if (error == -EIO) shmem_set_folio_swapin_error(inode, index, folio, swap); unlock: if (folio) folio_unlock(folio); failed_nolock: if (folio) folio_put(folio); put_swap_device(si); return error; } /* * shmem_get_folio_gfp - find page in cache, or get from swap, or allocate * * If we allocate a new one we do not mark it dirty. That's up to the * vm. If we swap it in we mark it dirty since we also free the swap * entry since a page cannot live in both the swap and page cache. * * vmf and fault_type are only supplied by shmem_fault: otherwise they are NULL. */ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, loff_t write_end, struct folio **foliop, enum sgp_type sgp, gfp_t gfp, struct vm_fault *vmf, vm_fault_t *fault_type) { struct vm_area_struct *vma = vmf ? vmf->vma : NULL; struct mm_struct *fault_mm; struct folio *folio; int error; bool alloced; unsigned long orders = 0; if (WARN_ON_ONCE(!shmem_mapping(inode->i_mapping))) return -EINVAL; if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT)) return -EFBIG; repeat: if (sgp <= SGP_CACHE && ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) return -EINVAL; alloced = false; fault_mm = vma ? vma->vm_mm : NULL; folio = filemap_get_entry(inode->i_mapping, index); if (folio && vma && userfaultfd_minor(vma)) { if (!xa_is_value(folio)) folio_put(folio); *fault_type = handle_userfault(vmf, VM_UFFD_MINOR); return 0; } if (xa_is_value(folio)) { error = shmem_swapin_folio(inode, index, &folio, sgp, gfp, vma, fault_type); if (error == -EEXIST) goto repeat; *foliop = folio; return error; } if (folio) { folio_lock(folio); /* Has the folio been truncated or swapped out? */ if (unlikely(folio->mapping != inode->i_mapping)) { folio_unlock(folio); folio_put(folio); goto repeat; } if (sgp == SGP_WRITE) folio_mark_accessed(folio); if (folio_test_uptodate(folio)) goto out; /* fallocated folio */ if (sgp != SGP_READ) goto clear; folio_unlock(folio); folio_put(folio); } /* * SGP_READ: succeed on hole, with NULL folio, letting caller zero. * SGP_NOALLOC: fail on hole, with NULL folio, letting caller fail. */ *foliop = NULL; if (sgp == SGP_READ) return 0; if (sgp == SGP_NOALLOC) return -ENOENT; /* * Fast cache lookup and swap lookup did not find it: allocate. */ if (vma && userfaultfd_missing(vma)) { *fault_type = handle_userfault(vmf, VM_UFFD_MISSING); return 0; } /* Find hugepage orders that are allowed for anonymous shmem and tmpfs. */ orders = shmem_allowable_huge_orders(inode, vma, index, write_end, false); if (orders > 0) { gfp_t huge_gfp; huge_gfp = vma_thp_gfp_mask(vma); huge_gfp = limit_gfp_mask(huge_gfp, gfp); folio = shmem_alloc_and_add_folio(vmf, huge_gfp, inode, index, fault_mm, orders); if (!IS_ERR(folio)) { if (folio_test_pmd_mappable(folio)) count_vm_event(THP_FILE_ALLOC); count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_ALLOC); goto alloced; } if (PTR_ERR(folio) == -EEXIST) goto repeat; } folio = shmem_alloc_and_add_folio(vmf, gfp, inode, index, fault_mm, 0); if (IS_ERR(folio)) { error = PTR_ERR(folio); if (error == -EEXIST) goto repeat; folio = NULL; goto unlock; } alloced: alloced = true; if (folio_test_large(folio) && DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) < folio_next_index(folio)) { struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); struct shmem_inode_info *info = SHMEM_I(inode); /* * Part of the large folio is beyond i_size: subject * to shrink under memory pressure. */ spin_lock(&sbinfo->shrinklist_lock); /* * _careful to defend against unlocked access to * ->shrink_list in shmem_unused_huge_shrink() */ if (list_empty_careful(&info->shrinklist)) { list_add_tail(&info->shrinklist, &sbinfo->shrinklist); sbinfo->shrinklist_len++; } spin_unlock(&sbinfo->shrinklist_lock); } if (sgp == SGP_WRITE) folio_set_referenced(folio); /* * Let SGP_FALLOC use the SGP_WRITE optimization on a new folio. */ if (sgp == SGP_FALLOC) sgp = SGP_WRITE; clear: /* * Let SGP_WRITE caller clear ends if write does not fill folio; * but SGP_FALLOC on a folio fallocated earlier must initialize * it now, lest undo on failure cancel our earlier guarantee. */ if (sgp != SGP_WRITE && !folio_test_uptodate(folio)) { long i, n = folio_nr_pages(folio); for (i = 0; i < n; i++) clear_highpage(folio_page(folio, i)); flush_dcache_folio(folio); folio_mark_uptodate(folio); } /* Perhaps the file has been truncated since we checked */ if (sgp <= SGP_CACHE && ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { error = -EINVAL; goto unlock; } out: *foliop = folio; return 0; /* * Error recovery. */ unlock: if (alloced) filemap_remove_folio(folio); shmem_recalc_inode(inode, 0, 0); if (folio) { folio_unlock(folio); folio_put(folio); } return error; } /** * shmem_get_folio - find, and lock a shmem folio. * @inode: inode to search * @index: the page index. * @write_end: end of a write, could extend inode size * @foliop: pointer to the folio if found * @sgp: SGP_* flags to control behavior * * Looks up the page cache entry at @inode & @index. If a folio is * present, it is returned locked with an increased refcount. * * If the caller modifies data in the folio, it must call folio_mark_dirty() * before unlocking the folio to ensure that the folio is not reclaimed. * There is no need to reserve space before calling folio_mark_dirty(). * * When no folio is found, the behavior depends on @sgp: * - for SGP_READ, *@foliop is %NULL and 0 is returned * - for SGP_NOALLOC, *@foliop is %NULL and -ENOENT is returned * - for all other flags a new folio is allocated, inserted into the * page cache and returned locked in @foliop. * * Context: May sleep. * Return: 0 if successful, else a negative error code. */ int shmem_get_folio(struct inode *inode, pgoff_t index, loff_t write_end, struct folio **foliop, enum sgp_type sgp) { return shmem_get_folio_gfp(inode, index, write_end, foliop, sgp, mapping_gfp_mask(inode->i_mapping), NULL, NULL); } EXPORT_SYMBOL_GPL(shmem_get_folio); /* * This is like autoremove_wake_function, but it removes the wait queue * entry unconditionally - even if something else had already woken the * target. */ static int synchronous_wake_function(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key) { int ret = default_wake_function(wait, mode, sync, key); list_del_init(&wait->entry); return ret; } /* * Trinity finds that probing a hole which tmpfs is punching can * prevent the hole-punch from ever completing: which in turn * locks writers out with its hold on i_rwsem. So refrain from * faulting pages into the hole while it's being punched. Although * shmem_undo_range() does remove the additions, it may be unable to * keep up, as each new page needs its own unmap_mapping_range() call, * and the i_mmap tree grows ever slower to scan if new vmas are added. * * It does not matter if we sometimes reach this check just before the * hole-punch begins, so that one fault then races with the punch: * we just need to make racing faults a rare case. * * The implementation below would be much simpler if we just used a * standard mutex or completion: but we cannot take i_rwsem in fault, * and bloating every shmem inode for this unlikely case would be sad. */ static vm_fault_t shmem_falloc_wait(struct vm_fault *vmf, struct inode *inode) { struct shmem_falloc *shmem_falloc; struct file *fpin = NULL; vm_fault_t ret = 0; spin_lock(&inode->i_lock); shmem_falloc = inode->i_private; if (shmem_falloc && shmem_falloc->waitq && vmf->pgoff >= shmem_falloc->start && vmf->pgoff < shmem_falloc->next) { wait_queue_head_t *shmem_falloc_waitq; DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function); ret = VM_FAULT_NOPAGE; fpin = maybe_unlock_mmap_for_io(vmf, NULL); shmem_falloc_waitq = shmem_falloc->waitq; prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait, TASK_UNINTERRUPTIBLE); spin_unlock(&inode->i_lock); schedule(); /* * shmem_falloc_waitq points into the shmem_fallocate() * stack of the hole-punching task: shmem_falloc_waitq * is usually invalid by the time we reach here, but * finish_wait() does not dereference it in that case; * though i_lock needed lest racing with wake_up_all(). */ spin_lock(&inode->i_lock); finish_wait(shmem_falloc_waitq, &shmem_fault_wait); } spin_unlock(&inode->i_lock); if (fpin) { fput(fpin); ret = VM_FAULT_RETRY; } return ret; } static vm_fault_t shmem_fault(struct vm_fault *vmf) { struct inode *inode = file_inode(vmf->vma->vm_file); gfp_t gfp = mapping_gfp_mask(inode->i_mapping); struct folio *folio = NULL; vm_fault_t ret = 0; int err; /* * Trinity finds that probing a hole which tmpfs is punching can * prevent the hole-punch from ever completing: noted in i_private. */ if (unlikely(inode->i_private)) { ret = shmem_falloc_wait(vmf, inode); if (ret) return ret; } WARN_ON_ONCE(vmf->page != NULL); err = shmem_get_folio_gfp(inode, vmf->pgoff, 0, &folio, SGP_CACHE, gfp, vmf, &ret); if (err) return vmf_error(err); if (folio) { vmf->page = folio_file_page(folio, vmf->pgoff); ret |= VM_FAULT_LOCKED; } return ret; } unsigned long shmem_get_unmapped_area(struct file *file, unsigned long uaddr, unsigned long len, unsigned long pgoff, unsigned long flags) { unsigned long addr; unsigned long offset; unsigned long inflated_len; unsigned long inflated_addr; unsigned long inflated_offset; unsigned long hpage_size; if (len > TASK_SIZE) return -ENOMEM; addr = mm_get_unmapped_area(file, uaddr, len, pgoff, flags); if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) return addr; if (IS_ERR_VALUE(addr)) return addr; if (addr & ~PAGE_MASK) return addr; if (addr > TASK_SIZE - len) return addr; if (shmem_huge == SHMEM_HUGE_DENY) return addr; if (flags & MAP_FIXED) return addr; /* * Our priority is to support MAP_SHARED mapped hugely; * and support MAP_PRIVATE mapped hugely too, until it is COWed. * But if caller specified an address hint and we allocated area there * successfully, respect that as before. */ if (uaddr == addr) return addr; hpage_size = HPAGE_PMD_SIZE; if (shmem_huge != SHMEM_HUGE_FORCE) { struct super_block *sb; unsigned long __maybe_unused hpage_orders; int order = 0; if (file) { VM_BUG_ON(file->f_op != &shmem_file_operations); sb = file_inode(file)->i_sb; } else { /* * Called directly from mm/mmap.c, or drivers/char/mem.c * for "/dev/zero", to create a shared anonymous object. */ if (IS_ERR(shm_mnt)) return addr; sb = shm_mnt->mnt_sb; /* * Find the highest mTHP order used for anonymous shmem to * provide a suitable alignment address. */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE hpage_orders = READ_ONCE(huge_shmem_orders_always); hpage_orders |= READ_ONCE(huge_shmem_orders_within_size); hpage_orders |= READ_ONCE(huge_shmem_orders_madvise); if (SHMEM_SB(sb)->huge != SHMEM_HUGE_NEVER) hpage_orders |= READ_ONCE(huge_shmem_orders_inherit); if (hpage_orders > 0) { order = highest_order(hpage_orders); hpage_size = PAGE_SIZE << order; } #endif } if (SHMEM_SB(sb)->huge == SHMEM_HUGE_NEVER && !order) return addr; } if (len < hpage_size) return addr; offset = (pgoff << PAGE_SHIFT) & (hpage_size - 1); if (offset && offset + len < 2 * hpage_size) return addr; if ((addr & (hpage_size - 1)) == offset) return addr; inflated_len = len + hpage_size - PAGE_SIZE; if (inflated_len > TASK_SIZE) return addr; if (inflated_len < len) return addr; inflated_addr = mm_get_unmapped_area(NULL, uaddr, inflated_len, 0, flags); if (IS_ERR_VALUE(inflated_addr)) return addr; if (inflated_addr & ~PAGE_MASK) return addr; inflated_offset = inflated_addr & (hpage_size - 1); inflated_addr += offset - inflated_offset; if (inflated_offset > offset) inflated_addr += hpage_size; if (inflated_addr > TASK_SIZE - len) return addr; return inflated_addr; } #ifdef CONFIG_NUMA static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) { struct inode *inode = file_inode(vma->vm_file); return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol); } static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, unsigned long addr, pgoff_t *ilx) { struct inode *inode = file_inode(vma->vm_file); pgoff_t index; /* * Bias interleave by inode number to distribute better across nodes; * but this interface is independent of which page order is used, so * supplies only that bias, letting caller apply the offset (adjusted * by page order, as in shmem_get_pgoff_policy() and get_vma_policy()). */ *ilx = inode->i_ino; index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index); } static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info, pgoff_t index, unsigned int order, pgoff_t *ilx) { struct mempolicy *mpol; /* Bias interleave by inode number to distribute better across nodes */ *ilx = info->vfs_inode.i_ino + (index >> order); mpol = mpol_shared_policy_lookup(&info->policy, index); return mpol ? mpol : get_task_policy(current); } #else static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info, pgoff_t index, unsigned int order, pgoff_t *ilx) { *ilx = 0; return NULL; } #endif /* CONFIG_NUMA */ int shmem_lock(struct file *file, int lock, struct ucounts *ucounts) { struct inode *inode = file_inode(file); struct shmem_inode_info *info = SHMEM_I(inode); int retval = -ENOMEM; /* * What serializes the accesses to info->flags? * ipc_lock_object() when called from shmctl_do_lock(), * no serialization needed when called from shm_destroy(). */ if (lock && !(info->flags & SHMEM_F_LOCKED)) { if (!user_shm_lock(inode->i_size, ucounts)) goto out_nomem; info->flags |= SHMEM_F_LOCKED; mapping_set_unevictable(file->f_mapping); } if (!lock && (info->flags & SHMEM_F_LOCKED) && ucounts) { user_shm_unlock(inode->i_size, ucounts); info->flags &= ~SHMEM_F_LOCKED; mapping_clear_unevictable(file->f_mapping); } retval = 0; out_nomem: return retval; } static int shmem_mmap_prepare(struct vm_area_desc *desc) { struct file *file = desc->file; struct inode *inode = file_inode(file); file_accessed(file); /* This is anonymous shared memory if it is unlinked at the time of mmap */ if (inode->i_nlink) desc->vm_ops = &shmem_vm_ops; else desc->vm_ops = &shmem_anon_vm_ops; return 0; } static int shmem_file_open(struct inode *inode, struct file *file) { file->f_mode |= FMODE_CAN_ODIRECT; return generic_file_open(inode, file); } #ifdef CONFIG_TMPFS_XATTR static int shmem_initxattrs(struct inode *, const struct xattr *, void *); #if IS_ENABLED(CONFIG_UNICODE) /* * shmem_inode_casefold_flags - Deal with casefold file attribute flag * * The casefold file attribute needs some special checks. I can just be added to * an empty dir, and can't be removed from a non-empty dir. */ static int shmem_inode_casefold_flags(struct inode *inode, unsigned int fsflags, struct dentry *dentry, unsigned int *i_flags) { unsigned int old = inode->i_flags; struct super_block *sb = inode->i_sb; if (fsflags & FS_CASEFOLD_FL) { if (!(old & S_CASEFOLD)) { if (!sb->s_encoding) return -EOPNOTSUPP; if (!S_ISDIR(inode->i_mode)) return -ENOTDIR; if (dentry && !simple_empty(dentry)) return -ENOTEMPTY; } *i_flags = *i_flags | S_CASEFOLD; } else if (old & S_CASEFOLD) { if (dentry && !simple_empty(dentry)) return -ENOTEMPTY; } return 0; } #else static int shmem_inode_casefold_flags(struct inode *inode, unsigned int fsflags, struct dentry *dentry, unsigned int *i_flags) { if (fsflags & FS_CASEFOLD_FL) return -EOPNOTSUPP; return 0; } #endif /* * chattr's fsflags are unrelated to extended attributes, * but tmpfs has chosen to enable them under the same config option. */ static int shmem_set_inode_flags(struct inode *inode, unsigned int fsflags, struct dentry *dentry) { unsigned int i_flags = 0; int ret; ret = shmem_inode_casefold_flags(inode, fsflags, dentry, &i_flags); if (ret) return ret; if (fsflags & FS_NOATIME_FL) i_flags |= S_NOATIME; if (fsflags & FS_APPEND_FL) i_flags |= S_APPEND; if (fsflags & FS_IMMUTABLE_FL) i_flags |= S_IMMUTABLE; /* * But FS_NODUMP_FL does not require any action in i_flags. */ inode_set_flags(inode, i_flags, S_NOATIME | S_APPEND | S_IMMUTABLE | S_CASEFOLD); return 0; } #else static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags, struct dentry *dentry) { } #define shmem_initxattrs NULL #endif static struct offset_ctx *shmem_get_offset_ctx(struct inode *inode) { return &SHMEM_I(inode)->dir_offsets; } static struct inode *__shmem_get_inode(struct mnt_idmap *idmap, struct super_block *sb, struct inode *dir, umode_t mode, dev_t dev, vma_flags_t flags) { struct inode *inode; struct shmem_inode_info *info; struct shmem_sb_info *sbinfo = SHMEM_SB(sb); ino_t ino; int err; err = shmem_reserve_inode(sb, &ino); if (err) return ERR_PTR(err); inode = new_inode(sb); if (!inode) { shmem_free_inode(sb, 0); return ERR_PTR(-ENOSPC); } inode->i_ino = ino; inode_init_owner(idmap, inode, dir, mode); inode->i_blocks = 0; simple_inode_init_ts(inode); inode->i_generation = get_random_u32(); info = SHMEM_I(inode); memset(info, 0, (char *)inode - (char *)info); spin_lock_init(&info->lock); atomic_set(&info->stop_eviction, 0); info->seals = F_SEAL_SEAL; info->flags = vma_flags_test(&flags, VMA_NORESERVE_BIT) ? SHMEM_F_NORESERVE : 0; info->i_crtime = inode_get_mtime(inode); info->fsflags = (dir == NULL) ? 0 : SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED; if (info->fsflags) shmem_set_inode_flags(inode, info->fsflags, NULL); INIT_LIST_HEAD(&info->shrinklist); INIT_LIST_HEAD(&info->swaplist); simple_xattrs_init(&info->xattrs); cache_no_acl(inode); if (sbinfo->noswap) mapping_set_unevictable(inode->i_mapping); /* Don't consider 'deny' for emergencies and 'force' for testing */ if (sbinfo->huge) mapping_set_large_folios(inode->i_mapping); switch (mode & S_IFMT) { default: inode->i_op = &shmem_special_inode_operations; init_special_inode(inode, mode, dev); break; case S_IFREG: inode->i_mapping->a_ops = &shmem_aops; inode->i_op = &shmem_inode_operations; inode->i_fop = &shmem_file_operations; mpol_shared_policy_init(&info->policy, shmem_get_sbmpol(sbinfo)); break; case S_IFDIR: inc_nlink(inode); /* Some things misbehave if size == 0 on a directory */ inode->i_size = 2 * BOGO_DIRENT_SIZE; inode->i_op = &shmem_dir_inode_operations; inode->i_fop = &simple_offset_dir_operations; simple_offset_init(shmem_get_offset_ctx(inode)); break; case S_IFLNK: /* * Must not load anything in the rbtree, * mpol_free_shared_policy will not be called. */ mpol_shared_policy_init(&info->policy, NULL); break; } lockdep_annotate_inode_mutex_key(inode); return inode; } #ifdef CONFIG_TMPFS_QUOTA static struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block *sb, struct inode *dir, umode_t mode, dev_t dev, vma_flags_t flags) { int err; struct inode *inode; inode = __shmem_get_inode(idmap, sb, dir, mode, dev, flags); if (IS_ERR(inode)) return inode; err = dquot_initialize(inode); if (err) goto errout; err = dquot_alloc_inode(inode); if (err) { dquot_drop(inode); goto errout; } return inode; errout: inode->i_flags |= S_NOQUOTA; iput(inode); return ERR_PTR(err); } #else static struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block *sb, struct inode *dir, umode_t mode, dev_t dev, vma_flags_t flags) { return __shmem_get_inode(idmap, sb, dir, mode, dev, flags); } #endif /* CONFIG_TMPFS_QUOTA */ #ifdef CONFIG_USERFAULTFD int shmem_mfill_atomic_pte(pmd_t *dst_pmd, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, uffd_flags_t flags, struct folio **foliop) { struct inode *inode = file_inode(dst_vma->vm_file); struct shmem_inode_info *info = SHMEM_I(inode); struct address_space *mapping = inode->i_mapping; gfp_t gfp = mapping_gfp_mask(mapping); pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); void *page_kaddr; struct folio *folio; int ret; pgoff_t max_off; if (shmem_inode_acct_blocks(inode, 1)) { /* * We may have got a page, returned -ENOENT triggering a retry, * and now we find ourselves with -ENOMEM. Release the page, to * avoid a BUG_ON in our caller. */ if (unlikely(*foliop)) { folio_put(*foliop); *foliop = NULL; } return -ENOMEM; } if (!*foliop) { ret = -ENOMEM; folio = shmem_alloc_folio(gfp, 0, info, pgoff); if (!folio) goto out_unacct_blocks; if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) { page_kaddr = kmap_local_folio(folio, 0); /* * The read mmap_lock is held here. Despite the * mmap_lock being read recursive a deadlock is still * possible if a writer has taken a lock. For example: * * process A thread 1 takes read lock on own mmap_lock * process A thread 2 calls mmap, blocks taking write lock * process B thread 1 takes page fault, read lock on own mmap lock * process B thread 2 calls mmap, blocks taking write lock * process A thread 1 blocks taking read lock on process B * process B thread 1 blocks taking read lock on process A * * Disable page faults to prevent potential deadlock * and retry the copy outside the mmap_lock. */ pagefault_disable(); ret = copy_from_user(page_kaddr, (const void __user *)src_addr, PAGE_SIZE); pagefault_enable(); kunmap_local(page_kaddr); /* fallback to copy_from_user outside mmap_lock */ if (unlikely(ret)) { *foliop = folio; ret = -ENOENT; /* don't free the page */ goto out_unacct_blocks; } flush_dcache_folio(folio); } else { /* ZEROPAGE */ clear_user_highpage(&folio->page, dst_addr); } } else { folio = *foliop; VM_BUG_ON_FOLIO(folio_test_large(folio), folio); *foliop = NULL; } VM_BUG_ON(folio_test_locked(folio)); VM_BUG_ON(folio_test_swapbacked(folio)); __folio_set_locked(folio); __folio_set_swapbacked(folio); __folio_mark_uptodate(folio); ret = -EFAULT; max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); if (unlikely(pgoff >= max_off)) goto out_release; ret = mem_cgroup_charge(folio, dst_vma->vm_mm, gfp); if (ret) goto out_release; ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL, gfp); if (ret) goto out_release; ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr, &folio->page, true, flags); if (ret) goto out_delete_from_cache; shmem_recalc_inode(inode, 1, 0); folio_unlock(folio); return 0; out_delete_from_cache: filemap_remove_folio(folio); out_release: folio_unlock(folio); folio_put(folio); out_unacct_blocks: shmem_inode_unacct_blocks(inode, 1); return ret; } #endif /* CONFIG_USERFAULTFD */ #ifdef CONFIG_TMPFS static const struct inode_operations shmem_symlink_inode_operations; static const struct inode_operations shmem_short_symlink_operations; static int shmem_write_begin(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { struct inode *inode = mapping->host; struct shmem_inode_info *info = SHMEM_I(inode); pgoff_t index = pos >> PAGE_SHIFT; struct folio *folio; int ret = 0; /* i_rwsem is held by caller */ if (unlikely(info->seals & (F_SEAL_GROW | F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) { if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) return -EPERM; if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size) return -EPERM; } if (unlikely((info->flags & SHMEM_F_MAPPING_FROZEN) && pos + len > inode->i_size)) return -EPERM; ret = shmem_get_folio(inode, index, pos + len, &folio, SGP_WRITE); if (ret) return ret; if (folio_contain_hwpoisoned_page(folio)) { folio_unlock(folio); folio_put(folio); return -EIO; } *foliop = folio; return 0; } static int shmem_write_end(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; if (pos + copied > inode->i_size) i_size_write(inode, pos + copied); if (!folio_test_uptodate(folio)) { if (copied < folio_size(folio)) { size_t from = offset_in_folio(folio, pos); folio_zero_segments(folio, 0, from, from + copied, folio_size(folio)); } folio_mark_uptodate(folio); } folio_mark_dirty(folio); folio_unlock(folio); folio_put(folio); return copied; } static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct address_space *mapping = inode->i_mapping; pgoff_t index; unsigned long offset; int error = 0; ssize_t retval = 0; for (;;) { struct folio *folio = NULL; struct page *page = NULL; unsigned long nr, ret; loff_t end_offset, i_size = i_size_read(inode); bool fallback_page_copy = false; size_t fsize; if (unlikely(iocb->ki_pos >= i_size)) break; index = iocb->ki_pos >> PAGE_SHIFT; error = shmem_get_folio(inode, index, 0, &folio, SGP_READ); if (error) { if (error == -EINVAL) error = 0; break; } if (folio) { folio_unlock(folio); page = folio_file_page(folio, index); if (PageHWPoison(page)) { folio_put(folio); error = -EIO; break; } if (folio_test_large(folio) && folio_test_has_hwpoisoned(folio)) fallback_page_copy = true; } /* * We must evaluate after, since reads (unlike writes) * are called without i_rwsem protection against truncate */ i_size = i_size_read(inode); if (unlikely(iocb->ki_pos >= i_size)) { if (folio) folio_put(folio); break; } end_offset = min_t(loff_t, i_size, iocb->ki_pos + to->count); if (folio && likely(!fallback_page_copy)) fsize = folio_size(folio); else fsize = PAGE_SIZE; offset = iocb->ki_pos & (fsize - 1); nr = min_t(loff_t, end_offset - iocb->ki_pos, fsize - offset); if (folio) { /* * If users can be writing to this page using arbitrary * virtual addresses, take care about potential aliasing * before reading the page on the kernel side. */ if (mapping_writably_mapped(mapping)) { if (likely(!fallback_page_copy)) flush_dcache_folio(folio); else flush_dcache_page(page); } /* * Mark the folio accessed if we read the beginning. */ if (!offset) folio_mark_accessed(folio); /* * Ok, we have the page, and it's up-to-date, so * now we can copy it to user space... */ if (likely(!fallback_page_copy)) ret = copy_folio_to_iter(folio, offset, nr, to); else ret = copy_page_to_iter(page, offset, nr, to); folio_put(folio); } else if (user_backed_iter(to)) { /* * Copy to user tends to be so well optimized, but * clear_user() not so much, that it is noticeably * faster to copy the zero page instead of clearing. */ ret = copy_page_to_iter(ZERO_PAGE(0), offset, nr, to); } else { /* * But submitting the same page twice in a row to * splice() - or others? - can result in confusion: * so don't attempt that optimization on pipes etc. */ ret = iov_iter_zero(nr, to); } retval += ret; iocb->ki_pos += ret; if (!iov_iter_count(to)) break; if (ret < nr) { error = -EFAULT; break; } cond_resched(); } file_accessed(file); return retval ? retval : error; } static ssize_t shmem_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; ssize_t ret; inode_lock(inode); ret = generic_write_checks(iocb, from); if (ret <= 0) goto unlock; ret = file_remove_privs(file); if (ret) goto unlock; ret = file_update_time(file); if (ret) goto unlock; ret = generic_perform_write(iocb, from); unlock: inode_unlock(inode); return ret; } static bool zero_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { return true; } static void zero_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { } static bool zero_pipe_buf_try_steal(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { return false; } static const struct pipe_buf_operations zero_pipe_buf_ops = { .release = zero_pipe_buf_release, .try_steal = zero_pipe_buf_try_steal, .get = zero_pipe_buf_get, }; static size_t splice_zeropage_into_pipe(struct pipe_inode_info *pipe, loff_t fpos, size_t size) { size_t offset = fpos & ~PAGE_MASK; size = min_t(size_t, size, PAGE_SIZE - offset); if (!pipe_is_full(pipe)) { struct pipe_buffer *buf = pipe_head_buf(pipe); *buf = (struct pipe_buffer) { .ops = &zero_pipe_buf_ops, .page = ZERO_PAGE(0), .offset = offset, .len = size, }; pipe->head++; } return size; } static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { struct inode *inode = file_inode(in); struct address_space *mapping = inode->i_mapping; struct folio *folio = NULL; size_t total_spliced = 0, used, npages, n, part; loff_t isize; int error = 0; /* Work out how much data we can actually add into the pipe */ used = pipe_buf_usage(pipe); npages = max_t(ssize_t, pipe->max_usage - used, 0); len = min_t(size_t, len, npages * PAGE_SIZE); do { bool fallback_page_splice = false; struct page *page = NULL; pgoff_t index; size_t size; if (*ppos >= i_size_read(inode)) break; index = *ppos >> PAGE_SHIFT; error = shmem_get_folio(inode, index, 0, &folio, SGP_READ); if (error) { if (error == -EINVAL) error = 0; break; } if (folio) { folio_unlock(folio); page = folio_file_page(folio, index); if (PageHWPoison(page)) { error = -EIO; break; } if (folio_test_large(folio) && folio_test_has_hwpoisoned(folio)) fallback_page_splice = true; } /* * i_size must be checked after we know the pages are Uptodate. * * Checking i_size after the check allows us to calculate * the correct value for "nr", which means the zero-filled * part of the page is not copied back to userspace (unless * another truncate extends the file - this is desired though). */ isize = i_size_read(inode); if (unlikely(*ppos >= isize)) break; /* * Fallback to PAGE_SIZE splice if the large folio has hwpoisoned * pages. */ size = len; if (unlikely(fallback_page_splice)) { size_t offset = *ppos & ~PAGE_MASK; size = umin(size, PAGE_SIZE - offset); } part = min_t(loff_t, isize - *ppos, size); if (folio) { /* * If users can be writing to this page using arbitrary * virtual addresses, take care about potential aliasing * before reading the page on the kernel side. */ if (mapping_writably_mapped(mapping)) { if (likely(!fallback_page_splice)) flush_dcache_folio(folio); else flush_dcache_page(page); } folio_mark_accessed(folio); /* * Ok, we have the page, and it's up-to-date, so we can * now splice it into the pipe. */ n = splice_folio_into_pipe(pipe, folio, *ppos, part); folio_put(folio); folio = NULL; } else { n = splice_zeropage_into_pipe(pipe, *ppos, part); } if (!n) break; len -= n; total_spliced += n; *ppos += n; in->f_ra.prev_pos = *ppos; if (pipe_is_full(pipe)) break; cond_resched(); } while (len); if (folio) folio_put(folio); file_accessed(in); return total_spliced ? total_spliced : error; } static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) { struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; if (whence != SEEK_DATA && whence != SEEK_HOLE) return generic_file_llseek_size(file, offset, whence, MAX_LFS_FILESIZE, i_size_read(inode)); if (offset < 0) return -ENXIO; inode_lock(inode); /* We're holding i_rwsem so we can access i_size directly */ offset = mapping_seek_hole_data(mapping, offset, inode->i_size, whence); if (offset >= 0) offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE); inode_unlock(inode); return offset; } static long shmem_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_falloc shmem_falloc; pgoff_t start, index, end, undo_fallocend; int error; if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) return -EOPNOTSUPP; inode_lock(inode); if (info->flags & SHMEM_F_MAPPING_FROZEN) { error = -EPERM; goto out; } if (mode & FALLOC_FL_PUNCH_HOLE) { struct address_space *mapping = file->f_mapping; loff_t unmap_start = round_up(offset, PAGE_SIZE); loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1; DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq); /* protected by i_rwsem */ if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) { error = -EPERM; goto out; } shmem_falloc.waitq = &shmem_falloc_waitq; shmem_falloc.start = (u64)unmap_start >> PAGE_SHIFT; shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT; spin_lock(&inode->i_lock); inode->i_private = &shmem_falloc; spin_unlock(&inode->i_lock); if ((u64)unmap_end > (u64)unmap_start) unmap_mapping_range(mapping, unmap_start, 1 + unmap_end - unmap_start, 0); shmem_truncate_range(inode, offset, offset + len - 1); /* No need to unmap again: hole-punching leaves COWed pages */ spin_lock(&inode->i_lock); inode->i_private = NULL; wake_up_all(&shmem_falloc_waitq); WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.head)); spin_unlock(&inode->i_lock); error = 0; goto out; } /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */ error = inode_newsize_ok(inode, offset + len); if (error) goto out; if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) { error = -EPERM; goto out; } start = offset >> PAGE_SHIFT; end = (offset + len + PAGE_SIZE - 1) >> PAGE_SHIFT; /* Try to avoid a swapstorm if len is impossible to satisfy */ if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) { error = -ENOSPC; goto out; } shmem_falloc.waitq = NULL; shmem_falloc.start = start; shmem_falloc.next = start; shmem_falloc.nr_falloced = 0; shmem_falloc.nr_unswapped = 0; spin_lock(&inode->i_lock); inode->i_private = &shmem_falloc; spin_unlock(&inode->i_lock); /* * info->fallocend is only relevant when huge pages might be * involved: to prevent split_huge_page() freeing fallocated * pages when FALLOC_FL_KEEP_SIZE committed beyond i_size. */ undo_fallocend = info->fallocend; if (info->fallocend < end) info->fallocend = end; for (index = start; index < end; ) { struct folio *folio; /* * Check for fatal signal so that we abort early in OOM * situations. We don't want to abort in case of non-fatal * signals as large fallocate can take noticeable time and * e.g. periodic timers may result in fallocate constantly * restarting. */ if (fatal_signal_pending(current)) error = -EINTR; else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced) error = -ENOMEM; else error = shmem_get_folio(inode, index, offset + len, &folio, SGP_FALLOC); if (error) { info->fallocend = undo_fallocend; /* Remove the !uptodate folios we added */ if (index > start) { shmem_undo_range(inode, (loff_t)start << PAGE_SHIFT, ((loff_t)index << PAGE_SHIFT) - 1, true); } goto undone; } /* * Here is a more important optimization than it appears: * a second SGP_FALLOC on the same large folio will clear it, * making it uptodate and un-undoable if we fail later. */ index = folio_next_index(folio); /* Beware 32-bit wraparound */ if (!index) index--; /* * Inform shmem_writeout() how far we have reached. * No need for lock or barrier: we have the page lock. */ if (!folio_test_uptodate(folio)) shmem_falloc.nr_falloced += index - shmem_falloc.next; shmem_falloc.next = index; /* * If !uptodate, leave it that way so that freeable folios * can be recognized if we need to rollback on error later. * But mark it dirty so that memory pressure will swap rather * than free the folios we are allocating (and SGP_CACHE folios * might still be clean: we now need to mark those dirty too). */ folio_mark_dirty(folio); folio_unlock(folio); folio_put(folio); cond_resched(); } if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) i_size_write(inode, offset + len); undone: spin_lock(&inode->i_lock); inode->i_private = NULL; spin_unlock(&inode->i_lock); out: if (!error) file_modified(file); inode_unlock(inode); return error; } static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) { struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); buf->f_type = TMPFS_MAGIC; buf->f_bsize = PAGE_SIZE; buf->f_namelen = NAME_MAX; if (sbinfo->max_blocks) { buf->f_blocks = sbinfo->max_blocks; buf->f_bavail = buf->f_bfree = sbinfo->max_blocks - percpu_counter_sum(&sbinfo->used_blocks); } if (sbinfo->max_inodes) { buf->f_files = sbinfo->max_inodes; buf->f_ffree = sbinfo->free_ispace / BOGO_INODE_SIZE; } /* else leave those fields 0 like simple_statfs */ buf->f_fsid = uuid_to_fsid(dentry->d_sb->s_uuid.b); return 0; } /* * File creation. Allocate an inode, and we're done.. */ static int shmem_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { struct inode *inode; int error; if (!generic_ci_validate_strict_name(dir, &dentry->d_name)) return -EINVAL; inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, dev, mk_vma_flags(VMA_NORESERVE_BIT)); if (IS_ERR(inode)) return PTR_ERR(inode); error = simple_acl_create(dir, inode); if (error) goto out_iput; error = security_inode_init_security(inode, dir, &dentry->d_name, shmem_initxattrs, NULL); if (error && error != -EOPNOTSUPP) goto out_iput; error = simple_offset_add(shmem_get_offset_ctx(dir), dentry); if (error) goto out_iput; dir->i_size += BOGO_DIRENT_SIZE; inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); inode_inc_iversion(dir); d_make_persistent(dentry, inode); return error; out_iput: iput(inode); return error; } static int shmem_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { struct inode *inode; int error; inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, 0, mk_vma_flags(VMA_NORESERVE_BIT)); if (IS_ERR(inode)) { error = PTR_ERR(inode); goto err_out; } error = security_inode_init_security(inode, dir, NULL, shmem_initxattrs, NULL); if (error && error != -EOPNOTSUPP) goto out_iput; error = simple_acl_create(dir, inode); if (error) goto out_iput; d_tmpfile(file, inode); err_out: return finish_open_simple(file, error); out_iput: iput(inode); return error; } static struct dentry *shmem_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { int error; error = shmem_mknod(idmap, dir, dentry, mode | S_IFDIR, 0); if (error) return ERR_PTR(error); inc_nlink(dir); return NULL; } static int shmem_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { return shmem_mknod(idmap, dir, dentry, mode | S_IFREG, 0); } /* * Link a file.. */ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(old_dentry); int ret; /* * No ordinary (disk based) filesystem counts links as inodes; * but each new link needs a new dentry, pinning lowmem, and * tmpfs dentries cannot be pruned until they are unlinked. * But if an O_TMPFILE file is linked into the tmpfs, the * first link must skip that, to get the accounting right. */ if (inode->i_nlink) { ret = shmem_reserve_inode(inode->i_sb, NULL); if (ret) return ret; } ret = simple_offset_add(shmem_get_offset_ctx(dir), dentry); if (ret) { if (inode->i_nlink) shmem_free_inode(inode->i_sb, 0); return ret; } dir->i_size += BOGO_DIRENT_SIZE; inode_inc_iversion(dir); return simple_link(old_dentry, dir, dentry); } static int shmem_unlink(struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(dentry); if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) shmem_free_inode(inode->i_sb, 0); simple_offset_remove(shmem_get_offset_ctx(dir), dentry); dir->i_size -= BOGO_DIRENT_SIZE; inode_inc_iversion(dir); simple_unlink(dir, dentry); /* * For now, VFS can't deal with case-insensitive negative dentries, so * we invalidate them */ if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir)) d_invalidate(dentry); return 0; } static int shmem_rmdir(struct inode *dir, struct dentry *dentry) { if (!simple_empty(dentry)) return -ENOTEMPTY; drop_nlink(d_inode(dentry)); drop_nlink(dir); return shmem_unlink(dir, dentry); } static int shmem_whiteout(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry) { struct dentry *whiteout; int error; whiteout = d_alloc(old_dentry->d_parent, &old_dentry->d_name); if (!whiteout) return -ENOMEM; error = shmem_mknod(idmap, old_dir, whiteout, S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV); dput(whiteout); return error; } /* * The VFS layer already does all the dentry stuff for rename, * we just have to decrement the usage count for the target if * it exists so that the VFS layer correctly free's it when it * gets overwritten. */ static int shmem_rename2(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { struct inode *inode = d_inode(old_dentry); int they_are_dirs = S_ISDIR(inode->i_mode); bool had_offset = false; int error; if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; if (flags & RENAME_EXCHANGE) return simple_offset_rename_exchange(old_dir, old_dentry, new_dir, new_dentry); if (!simple_empty(new_dentry)) return -ENOTEMPTY; error = simple_offset_add(shmem_get_offset_ctx(new_dir), new_dentry); if (error == -EBUSY) had_offset = true; else if (unlikely(error)) return error; if (flags & RENAME_WHITEOUT) { error = shmem_whiteout(idmap, old_dir, old_dentry); if (error) { if (!had_offset) simple_offset_remove(shmem_get_offset_ctx(new_dir), new_dentry); return error; } } simple_offset_rename(old_dir, old_dentry, new_dir, new_dentry); if (d_really_is_positive(new_dentry)) { (void) shmem_unlink(new_dir, new_dentry); if (they_are_dirs) { drop_nlink(d_inode(new_dentry)); drop_nlink(old_dir); } } else if (they_are_dirs) { drop_nlink(old_dir); inc_nlink(new_dir); } old_dir->i_size -= BOGO_DIRENT_SIZE; new_dir->i_size += BOGO_DIRENT_SIZE; simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); inode_inc_iversion(old_dir); inode_inc_iversion(new_dir); return 0; } static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { int error; int len; struct inode *inode; struct folio *folio; char *link; len = strlen(symname) + 1; if (len > PAGE_SIZE) return -ENAMETOOLONG; inode = shmem_get_inode(idmap, dir->i_sb, dir, S_IFLNK | 0777, 0, mk_vma_flags(VMA_NORESERVE_BIT)); if (IS_ERR(inode)) return PTR_ERR(inode); error = security_inode_init_security(inode, dir, &dentry->d_name, shmem_initxattrs, NULL); if (error && error != -EOPNOTSUPP) goto out_iput; error = simple_offset_add(shmem_get_offset_ctx(dir), dentry); if (error) goto out_iput; inode->i_size = len-1; if (len <= SHORT_SYMLINK_LEN) { link = kmemdup(symname, len, GFP_KERNEL); if (!link) { error = -ENOMEM; goto out_remove_offset; } inode->i_op = &shmem_short_symlink_operations; inode_set_cached_link(inode, link, len - 1); } else { inode_nohighmem(inode); inode->i_mapping->a_ops = &shmem_aops; error = shmem_get_folio(inode, 0, 0, &folio, SGP_WRITE); if (error) goto out_remove_offset; inode->i_op = &shmem_symlink_inode_operations; memcpy(folio_address(folio), symname, len); folio_mark_uptodate(folio); folio_mark_dirty(folio); folio_unlock(folio); folio_put(folio); } dir->i_size += BOGO_DIRENT_SIZE; inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); inode_inc_iversion(dir); d_make_persistent(dentry, inode); return 0; out_remove_offset: simple_offset_remove(shmem_get_offset_ctx(dir), dentry); out_iput: iput(inode); return error; } static void shmem_put_link(void *arg) { folio_mark_accessed(arg); folio_put(arg); } static const char *shmem_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { struct folio *folio = NULL; int error; if (!dentry) { folio = filemap_get_folio(inode->i_mapping, 0); if (IS_ERR(folio)) return ERR_PTR(-ECHILD); if (PageHWPoison(folio_page(folio, 0)) || !folio_test_uptodate(folio)) { folio_put(folio); return ERR_PTR(-ECHILD); } } else { error = shmem_get_folio(inode, 0, 0, &folio, SGP_READ); if (error) return ERR_PTR(error); if (!folio) return ERR_PTR(-ECHILD); if (PageHWPoison(folio_page(folio, 0))) { folio_unlock(folio); folio_put(folio); return ERR_PTR(-ECHILD); } folio_unlock(folio); } set_delayed_call(done, shmem_put_link, folio); return folio_address(folio); } #ifdef CONFIG_TMPFS_XATTR static int shmem_fileattr_get(struct dentry *dentry, struct file_kattr *fa) { struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); fileattr_fill_flags(fa, info->fsflags & SHMEM_FL_USER_VISIBLE); return 0; } static int shmem_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); struct shmem_inode_info *info = SHMEM_I(inode); int ret, flags; if (fileattr_has_fsx(fa)) return -EOPNOTSUPP; if (fa->flags & ~SHMEM_FL_USER_MODIFIABLE) return -EOPNOTSUPP; flags = (info->fsflags & ~SHMEM_FL_USER_MODIFIABLE) | (fa->flags & SHMEM_FL_USER_MODIFIABLE); ret = shmem_set_inode_flags(inode, flags, dentry); if (ret) return ret; info->fsflags = flags; inode_set_ctime_current(inode); inode_inc_iversion(inode); return 0; } /* * Superblocks without xattr inode operations may get some security.* xattr * support from the LSM "for free". As soon as we have any other xattrs * like ACLs, we also need to implement the security.* handlers at * filesystem level, though. */ /* * Callback for security_inode_init_security() for acquiring xattrs. */ static int shmem_initxattrs(struct inode *inode, const struct xattr *xattr_array, void *fs_info) { struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); const struct xattr *xattr; struct simple_xattr *new_xattr; size_t ispace = 0; size_t len; if (sbinfo->max_inodes) { for (xattr = xattr_array; xattr->name != NULL; xattr++) { ispace += simple_xattr_space(xattr->name, xattr->value_len + XATTR_SECURITY_PREFIX_LEN); } if (ispace) { raw_spin_lock(&sbinfo->stat_lock); if (sbinfo->free_ispace < ispace) ispace = 0; else sbinfo->free_ispace -= ispace; raw_spin_unlock(&sbinfo->stat_lock); if (!ispace) return -ENOSPC; } } for (xattr = xattr_array; xattr->name != NULL; xattr++) { new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len); if (!new_xattr) break; len = strlen(xattr->name) + 1; new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len, GFP_KERNEL_ACCOUNT); if (!new_xattr->name) { kvfree(new_xattr); break; } memcpy(new_xattr->name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN); memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN, xattr->name, len); simple_xattr_add(&info->xattrs, new_xattr); } if (xattr->name != NULL) { if (ispace) { raw_spin_lock(&sbinfo->stat_lock); sbinfo->free_ispace += ispace; raw_spin_unlock(&sbinfo->stat_lock); } simple_xattrs_free(&info->xattrs, NULL); return -ENOMEM; } return 0; } static int shmem_xattr_handler_get(const struct xattr_handler *handler, struct dentry *unused, struct inode *inode, const char *name, void *buffer, size_t size) { struct shmem_inode_info *info = SHMEM_I(inode); name = xattr_full_name(handler, name); return simple_xattr_get(&info->xattrs, name, buffer, size); } static int shmem_xattr_handler_set(const struct xattr_handler *handler, struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) { struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); struct simple_xattr *old_xattr; size_t ispace = 0; name = xattr_full_name(handler, name); if (value && sbinfo->max_inodes) { ispace = simple_xattr_space(name, size); raw_spin_lock(&sbinfo->stat_lock); if (sbinfo->free_ispace < ispace) ispace = 0; else sbinfo->free_ispace -= ispace; raw_spin_unlock(&sbinfo->stat_lock); if (!ispace) return -ENOSPC; } old_xattr = simple_xattr_set(&info->xattrs, name, value, size, flags); if (!IS_ERR(old_xattr)) { ispace = 0; if (old_xattr && sbinfo->max_inodes) ispace = simple_xattr_space(old_xattr->name, old_xattr->size); simple_xattr_free(old_xattr); old_xattr = NULL; inode_set_ctime_current(inode); inode_inc_iversion(inode); } if (ispace) { raw_spin_lock(&sbinfo->stat_lock); sbinfo->free_ispace += ispace; raw_spin_unlock(&sbinfo->stat_lock); } return PTR_ERR(old_xattr); } static const struct xattr_handler shmem_security_xattr_handler = { .prefix = XATTR_SECURITY_PREFIX, .get = shmem_xattr_handler_get, .set = shmem_xattr_handler_set, }; static const struct xattr_handler shmem_trusted_xattr_handler = { .prefix = XATTR_TRUSTED_PREFIX, .get = shmem_xattr_handler_get, .set = shmem_xattr_handler_set, }; static const struct xattr_handler shmem_user_xattr_handler = { .prefix = XATTR_USER_PREFIX, .get = shmem_xattr_handler_get, .set = shmem_xattr_handler_set, }; static const struct xattr_handler * const shmem_xattr_handlers[] = { &shmem_security_xattr_handler, &shmem_trusted_xattr_handler, &shmem_user_xattr_handler, NULL }; static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) { struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); return simple_xattr_list(d_inode(dentry), &info->xattrs, buffer, size); } #endif /* CONFIG_TMPFS_XATTR */ static const struct inode_operations shmem_short_symlink_operations = { .getattr = shmem_getattr, .setattr = shmem_setattr, .get_link = simple_get_link, #ifdef CONFIG_TMPFS_XATTR .listxattr = shmem_listxattr, #endif }; static const struct inode_operations shmem_symlink_inode_operations = { .getattr = shmem_getattr, .setattr = shmem_setattr, .get_link = shmem_get_link, #ifdef CONFIG_TMPFS_XATTR .listxattr = shmem_listxattr, #endif }; static struct dentry *shmem_get_parent(struct dentry *child) { return ERR_PTR(-ESTALE); } static int shmem_match(struct inode *ino, void *vfh) { __u32 *fh = vfh; __u64 inum = fh[2]; inum = (inum << 32) | fh[1]; return ino->i_ino == inum && fh[0] == ino->i_generation; } /* Find any alias of inode, but prefer a hashed alias */ static struct dentry *shmem_find_alias(struct inode *inode) { struct dentry *alias = d_find_alias(inode); return alias ?: d_find_any_alias(inode); } static struct dentry *shmem_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { struct inode *inode; struct dentry *dentry = NULL; u64 inum; if (fh_len < 3) return NULL; inum = fid->raw[2]; inum = (inum << 32) | fid->raw[1]; inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]), shmem_match, fid->raw); if (inode) { dentry = shmem_find_alias(inode); iput(inode); } return dentry; } static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len, struct inode *parent) { if (*len < 3) { *len = 3; return FILEID_INVALID; } if (inode_unhashed(inode)) { /* Unfortunately insert_inode_hash is not idempotent, * so as we hash inodes here rather than at creation * time, we need a lock to ensure we only try * to do it once */ static DEFINE_SPINLOCK(lock); spin_lock(&lock); if (inode_unhashed(inode)) __insert_inode_hash(inode, inode->i_ino + inode->i_generation); spin_unlock(&lock); } fh[0] = inode->i_generation; fh[1] = inode->i_ino; fh[2] = ((__u64)inode->i_ino) >> 32; *len = 3; return 1; } static const struct export_operations shmem_export_ops = { .get_parent = shmem_get_parent, .encode_fh = shmem_encode_fh, .fh_to_dentry = shmem_fh_to_dentry, }; enum shmem_param { Opt_gid, Opt_huge, Opt_mode, Opt_mpol, Opt_nr_blocks, Opt_nr_inodes, Opt_size, Opt_uid, Opt_inode32, Opt_inode64, Opt_noswap, Opt_quota, Opt_usrquota, Opt_grpquota, Opt_usrquota_block_hardlimit, Opt_usrquota_inode_hardlimit, Opt_grpquota_block_hardlimit, Opt_grpquota_inode_hardlimit, Opt_casefold_version, Opt_casefold, Opt_strict_encoding, }; static const struct constant_table shmem_param_enums_huge[] = { {"never", SHMEM_HUGE_NEVER }, {"always", SHMEM_HUGE_ALWAYS }, {"within_size", SHMEM_HUGE_WITHIN_SIZE }, {"advise", SHMEM_HUGE_ADVISE }, {} }; const struct fs_parameter_spec shmem_fs_parameters[] = { fsparam_gid ("gid", Opt_gid), fsparam_enum ("huge", Opt_huge, shmem_param_enums_huge), fsparam_u32oct("mode", Opt_mode), fsparam_string("mpol", Opt_mpol), fsparam_string("nr_blocks", Opt_nr_blocks), fsparam_string("nr_inodes", Opt_nr_inodes), fsparam_string("size", Opt_size), fsparam_uid ("uid", Opt_uid), fsparam_flag ("inode32", Opt_inode32), fsparam_flag ("inode64", Opt_inode64), fsparam_flag ("noswap", Opt_noswap), #ifdef CONFIG_TMPFS_QUOTA fsparam_flag ("quota", Opt_quota), fsparam_flag ("usrquota", Opt_usrquota), fsparam_flag ("grpquota", Opt_grpquota), fsparam_string("usrquota_block_hardlimit", Opt_usrquota_block_hardlimit), fsparam_string("usrquota_inode_hardlimit", Opt_usrquota_inode_hardlimit), fsparam_string("grpquota_block_hardlimit", Opt_grpquota_block_hardlimit), fsparam_string("grpquota_inode_hardlimit", Opt_grpquota_inode_hardlimit), #endif fsparam_string("casefold", Opt_casefold_version), fsparam_flag ("casefold", Opt_casefold), fsparam_flag ("strict_encoding", Opt_strict_encoding), {} }; #if IS_ENABLED(CONFIG_UNICODE) static int shmem_parse_opt_casefold(struct fs_context *fc, struct fs_parameter *param, bool latest_version) { struct shmem_options *ctx = fc->fs_private; int version = UTF8_LATEST; struct unicode_map *encoding; char *version_str = param->string + 5; if (!latest_version) { if (strncmp(param->string, "utf8-", 5)) return invalfc(fc, "Only UTF-8 encodings are supported " "in the format: utf8-<version number>"); version = utf8_parse_version(version_str); if (version < 0) return invalfc(fc, "Invalid UTF-8 version: %s", version_str); } encoding = utf8_load(version); if (IS_ERR(encoding)) { return invalfc(fc, "Failed loading UTF-8 version: utf8-%u.%u.%u\n", unicode_major(version), unicode_minor(version), unicode_rev(version)); } pr_info("tmpfs: Using encoding : utf8-%u.%u.%u\n", unicode_major(version), unicode_minor(version), unicode_rev(version)); ctx->encoding = encoding; return 0; } #else static int shmem_parse_opt_casefold(struct fs_context *fc, struct fs_parameter *param, bool latest_version) { return invalfc(fc, "tmpfs: Kernel not built with CONFIG_UNICODE\n"); } #endif static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param) { struct shmem_options *ctx = fc->fs_private; struct fs_parse_result result; unsigned long long size; char *rest; int opt; kuid_t kuid; kgid_t kgid; opt = fs_parse(fc, shmem_fs_parameters, param, &result); if (opt < 0) return opt; switch (opt) { case Opt_size: size = memparse(param->string, &rest); if (*rest == '%') { size <<= PAGE_SHIFT; size *= totalram_pages(); do_div(size, 100); rest++; } if (*rest) goto bad_value; ctx->blocks = DIV_ROUND_UP(size, PAGE_SIZE); ctx->seen |= SHMEM_SEEN_BLOCKS; break; case Opt_nr_blocks: ctx->blocks = memparse(param->string, &rest); if (*rest || ctx->blocks > LONG_MAX) goto bad_value; ctx->seen |= SHMEM_SEEN_BLOCKS; break; case Opt_nr_inodes: ctx->inodes = memparse(param->string, &rest); if (*rest || ctx->inodes > ULONG_MAX / BOGO_INODE_SIZE) goto bad_value; ctx->seen |= SHMEM_SEEN_INODES; break; case Opt_mode: ctx->mode = result.uint_32 & 07777; break; case Opt_uid: kuid = result.uid; /* * The requested uid must be representable in the * filesystem's idmapping. */ if (!kuid_has_mapping(fc->user_ns, kuid)) goto bad_value; ctx->uid = kuid; break; case Opt_gid: kgid = result.gid; /* * The requested gid must be representable in the * filesystem's idmapping. */ if (!kgid_has_mapping(fc->user_ns, kgid)) goto bad_value; ctx->gid = kgid; break; case Opt_huge: ctx->huge = result.uint_32; if (ctx->huge != SHMEM_HUGE_NEVER && !(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && has_transparent_hugepage())) goto unsupported_parameter; ctx->seen |= SHMEM_SEEN_HUGE; break; case Opt_mpol: if (IS_ENABLED(CONFIG_NUMA)) { mpol_put(ctx->mpol); ctx->mpol = NULL; if (mpol_parse_str(param->string, &ctx->mpol)) goto bad_value; break; } goto unsupported_parameter; case Opt_inode32: ctx->full_inums = false; ctx->seen |= SHMEM_SEEN_INUMS; break; case Opt_inode64: if (sizeof(ino_t) < 8) { return invalfc(fc, "Cannot use inode64 with <64bit inums in kernel\n"); } ctx->full_inums = true; ctx->seen |= SHMEM_SEEN_INUMS; break; case Opt_noswap: if ((fc->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN)) { return invalfc(fc, "Turning off swap in unprivileged tmpfs mounts unsupported"); } ctx->noswap = true; break; case Opt_quota: if (fc->user_ns != &init_user_ns) return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported"); ctx->seen |= SHMEM_SEEN_QUOTA; ctx->quota_types |= (QTYPE_MASK_USR | QTYPE_MASK_GRP); break; case Opt_usrquota: if (fc->user_ns != &init_user_ns) return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported"); ctx->seen |= SHMEM_SEEN_QUOTA; ctx->quota_types |= QTYPE_MASK_USR; break; case Opt_grpquota: if (fc->user_ns != &init_user_ns) return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported"); ctx->seen |= SHMEM_SEEN_QUOTA; ctx->quota_types |= QTYPE_MASK_GRP; break; case Opt_usrquota_block_hardlimit: size = memparse(param->string, &rest); if (*rest || !size) goto bad_value; if (size > SHMEM_QUOTA_MAX_SPC_LIMIT) return invalfc(fc, "User quota block hardlimit too large."); ctx->qlimits.usrquota_bhardlimit = size; break; case Opt_grpquota_block_hardlimit: size = memparse(param->string, &rest); if (*rest || !size) goto bad_value; if (size > SHMEM_QUOTA_MAX_SPC_LIMIT) return invalfc(fc, "Group quota block hardlimit too large."); ctx->qlimits.grpquota_bhardlimit = size; break; case Opt_usrquota_inode_hardlimit: size = memparse(param->string, &rest); if (*rest || !size) goto bad_value; if (size > SHMEM_QUOTA_MAX_INO_LIMIT) return invalfc(fc, "User quota inode hardlimit too large."); ctx->qlimits.usrquota_ihardlimit = size; break; case Opt_grpquota_inode_hardlimit: size = memparse(param->string, &rest); if (*rest || !size) goto bad_value; if (size > SHMEM_QUOTA_MAX_INO_LIMIT) return invalfc(fc, "Group quota inode hardlimit too large."); ctx->qlimits.grpquota_ihardlimit = size; break; case Opt_casefold_version: return shmem_parse_opt_casefold(fc, param, false); case Opt_casefold: return shmem_parse_opt_casefold(fc, param, true); case Opt_strict_encoding: #if IS_ENABLED(CONFIG_UNICODE) ctx->strict_encoding = true; break; #else return invalfc(fc, "tmpfs: Kernel not built with CONFIG_UNICODE\n"); #endif } return 0; unsupported_parameter: return invalfc(fc, "Unsupported parameter '%s'", param->key); bad_value: return invalfc(fc, "Bad value for '%s'", param->key); } static char *shmem_next_opt(char **s) { char *sbegin = *s; char *p; if (sbegin == NULL) return NULL; /* * NUL-terminate this option: unfortunately, * mount options form a comma-separated list, * but mpol's nodelist may also contain commas. */ for (;;) { p = strchr(*s, ','); if (p == NULL) break; *s = p + 1; if (!isdigit(*(p+1))) { *p = '\0'; return sbegin; } } *s = NULL; return sbegin; } static int shmem_parse_monolithic(struct fs_context *fc, void *data) { return vfs_parse_monolithic_sep(fc, data, shmem_next_opt); } /* * Reconfigure a shmem filesystem. */ static int shmem_reconfigure(struct fs_context *fc) { struct shmem_options *ctx = fc->fs_private; struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb); unsigned long used_isp; struct mempolicy *mpol = NULL; const char *err; raw_spin_lock(&sbinfo->stat_lock); used_isp = sbinfo->max_inodes * BOGO_INODE_SIZE - sbinfo->free_ispace; if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) { if (!sbinfo->max_blocks) { err = "Cannot retroactively limit size"; goto out; } if (percpu_counter_compare(&sbinfo->used_blocks, ctx->blocks) > 0) { err = "Too small a size for current use"; goto out; } } if ((ctx->seen & SHMEM_SEEN_INODES) && ctx->inodes) { if (!sbinfo->max_inodes) { err = "Cannot retroactively limit inodes"; goto out; } if (ctx->inodes * BOGO_INODE_SIZE < used_isp) { err = "Too few inodes for current use"; goto out; } } if ((ctx->seen & SHMEM_SEEN_INUMS) && !ctx->full_inums && sbinfo->next_ino > UINT_MAX) { err = "Current inum too high to switch to 32-bit inums"; goto out; } /* * "noswap" doesn't use fsparam_flag_no, i.e. there's no "swap" * counterpart for (re-)enabling swap. */ if (ctx->noswap && !sbinfo->noswap) { err = "Cannot disable swap on remount"; goto out; } if (ctx->seen & SHMEM_SEEN_QUOTA && !sb_any_quota_loaded(fc->root->d_sb)) { err = "Cannot enable quota on remount"; goto out; } #ifdef CONFIG_TMPFS_QUOTA #define CHANGED_LIMIT(name) \ (ctx->qlimits.name## hardlimit && \ (ctx->qlimits.name## hardlimit != sbinfo->qlimits.name## hardlimit)) if (CHANGED_LIMIT(usrquota_b) || CHANGED_LIMIT(usrquota_i) || CHANGED_LIMIT(grpquota_b) || CHANGED_LIMIT(grpquota_i)) { err = "Cannot change global quota limit on remount"; goto out; } #endif /* CONFIG_TMPFS_QUOTA */ if (ctx->seen & SHMEM_SEEN_HUGE) sbinfo->huge = ctx->huge; if (ctx->seen & SHMEM_SEEN_INUMS) sbinfo->full_inums = ctx->full_inums; if (ctx->seen & SHMEM_SEEN_BLOCKS) sbinfo->max_blocks = ctx->blocks; if (ctx->seen & SHMEM_SEEN_INODES) { sbinfo->max_inodes = ctx->inodes; sbinfo->free_ispace = ctx->inodes * BOGO_INODE_SIZE - used_isp; } /* * Preserve previous mempolicy unless mpol remount option was specified. */ if (ctx->mpol) { mpol = sbinfo->mpol; sbinfo->mpol = ctx->mpol; /* transfers initial ref */ ctx->mpol = NULL; } if (ctx->noswap) sbinfo->noswap = true; raw_spin_unlock(&sbinfo->stat_lock); mpol_put(mpol); return 0; out: raw_spin_unlock(&sbinfo->stat_lock); return invalfc(fc, "%s", err); } static int shmem_show_options(struct seq_file *seq, struct dentry *root) { struct shmem_sb_info *sbinfo = SHMEM_SB(root->d_sb); struct mempolicy *mpol; if (sbinfo->max_blocks != shmem_default_max_blocks()) seq_printf(seq, ",size=%luk", K(sbinfo->max_blocks)); if (sbinfo->max_inodes != shmem_default_max_inodes()) seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes); if (sbinfo->mode != (0777 | S_ISVTX)) seq_printf(seq, ",mode=%03ho", sbinfo->mode); if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID)) seq_printf(seq, ",uid=%u", from_kuid_munged(&init_user_ns, sbinfo->uid)); if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID)) seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, sbinfo->gid)); /* * Showing inode{64,32} might be useful even if it's the system default, * since then people don't have to resort to checking both here and * /proc/config.gz to confirm 64-bit inums were successfully applied * (which may not even exist if IKCONFIG_PROC isn't enabled). * * We hide it when inode64 isn't the default and we are using 32-bit * inodes, since that probably just means the feature isn't even under * consideration. * * As such: * * +-----------------+-----------------+ * | TMPFS_INODE64=y | TMPFS_INODE64=n | * +------------------+-----------------+-----------------+ * | full_inums=true | show | show | * | full_inums=false | show | hide | * +------------------+-----------------+-----------------+ * */ if (IS_ENABLED(CONFIG_TMPFS_INODE64) || sbinfo->full_inums) seq_printf(seq, ",inode%d", (sbinfo->full_inums ? 64 : 32)); #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* Rightly or wrongly, show huge mount option unmasked by shmem_huge */ if (sbinfo->huge) seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge)); #endif mpol = shmem_get_sbmpol(sbinfo); shmem_show_mpol(seq, mpol); mpol_put(mpol); if (sbinfo->noswap) seq_printf(seq, ",noswap"); #ifdef CONFIG_TMPFS_QUOTA if (sb_has_quota_active(root->d_sb, USRQUOTA)) seq_printf(seq, ",usrquota"); if (sb_has_quota_active(root->d_sb, GRPQUOTA)) seq_printf(seq, ",grpquota"); if (sbinfo->qlimits.usrquota_bhardlimit) seq_printf(seq, ",usrquota_block_hardlimit=%lld", sbinfo->qlimits.usrquota_bhardlimit); if (sbinfo->qlimits.grpquota_bhardlimit) seq_printf(seq, ",grpquota_block_hardlimit=%lld", sbinfo->qlimits.grpquota_bhardlimit); if (sbinfo->qlimits.usrquota_ihardlimit) seq_printf(seq, ",usrquota_inode_hardlimit=%lld", sbinfo->qlimits.usrquota_ihardlimit); if (sbinfo->qlimits.grpquota_ihardlimit) seq_printf(seq, ",grpquota_inode_hardlimit=%lld", sbinfo->qlimits.grpquota_ihardlimit); #endif return 0; } #endif /* CONFIG_TMPFS */ static void shmem_put_super(struct super_block *sb) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); #if IS_ENABLED(CONFIG_UNICODE) if (sb->s_encoding) utf8_unload(sb->s_encoding); #endif #ifdef CONFIG_TMPFS_QUOTA shmem_disable_quotas(sb); #endif free_percpu(sbinfo->ino_batch); percpu_counter_destroy(&sbinfo->used_blocks); mpol_put(sbinfo->mpol); kfree(sbinfo); sb->s_fs_info = NULL; } #if IS_ENABLED(CONFIG_UNICODE) && defined(CONFIG_TMPFS) static const struct dentry_operations shmem_ci_dentry_ops = { .d_hash = generic_ci_d_hash, .d_compare = generic_ci_d_compare, }; #endif static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) { struct shmem_options *ctx = fc->fs_private; struct inode *inode; struct shmem_sb_info *sbinfo; int error = -ENOMEM; /* Round up to L1_CACHE_BYTES to resist false sharing */ sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info), L1_CACHE_BYTES), GFP_KERNEL); if (!sbinfo) return error; sb->s_fs_info = sbinfo; #ifdef CONFIG_TMPFS /* * Per default we only allow half of the physical ram per * tmpfs instance, limiting inodes to one per page of lowmem; * but the internal instance is left unlimited. */ if (!(sb->s_flags & SB_KERNMOUNT)) { if (!(ctx->seen & SHMEM_SEEN_BLOCKS)) ctx->blocks = shmem_default_max_blocks(); if (!(ctx->seen & SHMEM_SEEN_INODES)) ctx->inodes = shmem_default_max_inodes(); if (!(ctx->seen & SHMEM_SEEN_INUMS)) ctx->full_inums = IS_ENABLED(CONFIG_TMPFS_INODE64); sbinfo->noswap = ctx->noswap; } else { sb->s_flags |= SB_NOUSER; } sb->s_export_op = &shmem_export_ops; sb->s_flags |= SB_NOSEC; #if IS_ENABLED(CONFIG_UNICODE) if (!ctx->encoding && ctx->strict_encoding) { pr_err("tmpfs: strict_encoding option without encoding is forbidden\n"); error = -EINVAL; goto failed; } if (ctx->encoding) { sb->s_encoding = ctx->encoding; set_default_d_op(sb, &shmem_ci_dentry_ops); if (ctx->strict_encoding) sb->s_encoding_flags = SB_ENC_STRICT_MODE_FL; } #endif #else sb->s_flags |= SB_NOUSER; #endif /* CONFIG_TMPFS */ sb->s_d_flags |= DCACHE_DONTCACHE; sbinfo->max_blocks = ctx->blocks; sbinfo->max_inodes = ctx->inodes; sbinfo->free_ispace = sbinfo->max_inodes * BOGO_INODE_SIZE; if (sb->s_flags & SB_KERNMOUNT) { sbinfo->ino_batch = alloc_percpu(ino_t); if (!sbinfo->ino_batch) goto failed; } sbinfo->uid = ctx->uid; sbinfo->gid = ctx->gid; sbinfo->full_inums = ctx->full_inums; sbinfo->mode = ctx->mode; #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (ctx->seen & SHMEM_SEEN_HUGE) sbinfo->huge = ctx->huge; else sbinfo->huge = tmpfs_huge; #endif sbinfo->mpol = ctx->mpol; ctx->mpol = NULL; raw_spin_lock_init(&sbinfo->stat_lock); if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL)) goto failed; spin_lock_init(&sbinfo->shrinklist_lock); INIT_LIST_HEAD(&sbinfo->shrinklist); sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = TMPFS_MAGIC; sb->s_op = &shmem_ops; sb->s_time_gran = 1; #ifdef CONFIG_TMPFS_XATTR sb->s_xattr = shmem_xattr_handlers; #endif #ifdef CONFIG_TMPFS_POSIX_ACL sb->s_flags |= SB_POSIXACL; #endif uuid_t uuid; uuid_gen(&uuid); super_set_uuid(sb, uuid.b, sizeof(uuid)); #ifdef CONFIG_TMPFS_QUOTA if (ctx->seen & SHMEM_SEEN_QUOTA) { sb->dq_op = &shmem_quota_operations; sb->s_qcop = &dquot_quotactl_sysfile_ops; sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; /* Copy the default limits from ctx into sbinfo */ memcpy(&sbinfo->qlimits, &ctx->qlimits, sizeof(struct shmem_quota_limits)); if (shmem_enable_quotas(sb, ctx->quota_types)) goto failed; } #endif /* CONFIG_TMPFS_QUOTA */ inode = shmem_get_inode(&nop_mnt_idmap, sb, NULL, S_IFDIR | sbinfo->mode, 0, mk_vma_flags(VMA_NORESERVE_BIT)); if (IS_ERR(inode)) { error = PTR_ERR(inode); goto failed; } inode->i_uid = sbinfo->uid; inode->i_gid = sbinfo->gid; sb->s_root = d_make_root(inode); if (!sb->s_root) goto failed; return 0; failed: shmem_put_super(sb); return error; } static int shmem_get_tree(struct fs_context *fc) { return get_tree_nodev(fc, shmem_fill_super); } static void shmem_free_fc(struct fs_context *fc) { struct shmem_options *ctx = fc->fs_private; if (ctx) { mpol_put(ctx->mpol); kfree(ctx); } } static const struct fs_context_operations shmem_fs_context_ops = { .free = shmem_free_fc, .get_tree = shmem_get_tree, #ifdef CONFIG_TMPFS .parse_monolithic = shmem_parse_monolithic, .parse_param = shmem_parse_one, .reconfigure = shmem_reconfigure, #endif }; static struct kmem_cache *shmem_inode_cachep __ro_after_init; static struct inode *shmem_alloc_inode(struct super_block *sb) { struct shmem_inode_info *info; info = alloc_inode_sb(sb, shmem_inode_cachep, GFP_KERNEL); if (!info) return NULL; return &info->vfs_inode; } static void shmem_free_in_core_inode(struct inode *inode) { if (S_ISLNK(inode->i_mode)) kfree(inode->i_link); kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); } static void shmem_destroy_inode(struct inode *inode) { if (S_ISREG(inode->i_mode)) mpol_free_shared_policy(&SHMEM_I(inode)->policy); if (S_ISDIR(inode->i_mode)) simple_offset_destroy(shmem_get_offset_ctx(inode)); } static void shmem_init_inode(void *foo) { struct shmem_inode_info *info = foo; inode_init_once(&info->vfs_inode); } static void __init shmem_init_inodecache(void) { shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", sizeof(struct shmem_inode_info), 0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode); } static void __init shmem_destroy_inodecache(void) { kmem_cache_destroy(shmem_inode_cachep); } /* Keep the page in page cache instead of truncating it */ static int shmem_error_remove_folio(struct address_space *mapping, struct folio *folio) { return 0; } static const struct address_space_operations shmem_aops = { .dirty_folio = noop_dirty_folio, #ifdef CONFIG_TMPFS .write_begin = shmem_write_begin, .write_end = shmem_write_end, #endif #ifdef CONFIG_MIGRATION .migrate_folio = migrate_folio, #endif .error_remove_folio = shmem_error_remove_folio, }; static const struct file_operations shmem_file_operations = { .mmap_prepare = shmem_mmap_prepare, .open = shmem_file_open, .get_unmapped_area = shmem_get_unmapped_area, #ifdef CONFIG_TMPFS .llseek = shmem_file_llseek, .read_iter = shmem_file_read_iter, .write_iter = shmem_file_write_iter, .fsync = noop_fsync, .splice_read = shmem_file_splice_read, .splice_write = iter_file_splice_write, .fallocate = shmem_fallocate, .setlease = generic_setlease, #endif }; static const struct inode_operations shmem_inode_operations = { .getattr = shmem_getattr, .setattr = shmem_setattr, #ifdef CONFIG_TMPFS_XATTR .listxattr = shmem_listxattr, .set_acl = simple_set_acl, .fileattr_get = shmem_fileattr_get, .fileattr_set = shmem_fileattr_set, #endif }; static const struct inode_operations shmem_dir_inode_operations = { #ifdef CONFIG_TMPFS .getattr = shmem_getattr, .create = shmem_create, .lookup = simple_lookup, .link = shmem_link, .unlink = shmem_unlink, .symlink = shmem_symlink, .mkdir = shmem_mkdir, .rmdir = shmem_rmdir, .mknod = shmem_mknod, .rename = shmem_rename2, .tmpfile = shmem_tmpfile, .get_offset_ctx = shmem_get_offset_ctx, #endif #ifdef CONFIG_TMPFS_XATTR .listxattr = shmem_listxattr, .fileattr_get = shmem_fileattr_get, .fileattr_set = shmem_fileattr_set, #endif #ifdef CONFIG_TMPFS_POSIX_ACL .setattr = shmem_setattr, .set_acl = simple_set_acl, #endif }; static const struct inode_operations shmem_special_inode_operations = { .getattr = shmem_getattr, #ifdef CONFIG_TMPFS_XATTR .listxattr = shmem_listxattr, #endif #ifdef CONFIG_TMPFS_POSIX_ACL .setattr = shmem_setattr, .set_acl = simple_set_acl, #endif }; static const struct super_operations shmem_ops = { .alloc_inode = shmem_alloc_inode, .free_inode = shmem_free_in_core_inode, .destroy_inode = shmem_destroy_inode, #ifdef CONFIG_TMPFS .statfs = shmem_statfs, .show_options = shmem_show_options, #endif #ifdef CONFIG_TMPFS_QUOTA .get_dquots = shmem_get_dquots, #endif .evict_inode = shmem_evict_inode, .drop_inode = inode_just_drop, .put_super = shmem_put_super, #ifdef CONFIG_TRANSPARENT_HUGEPAGE .nr_cached_objects = shmem_unused_huge_count, .free_cached_objects = shmem_unused_huge_scan, #endif }; static const struct vm_operations_struct shmem_vm_ops = { .fault = shmem_fault, .map_pages = filemap_map_pages, #ifdef CONFIG_NUMA .set_policy = shmem_set_policy, .get_policy = shmem_get_policy, #endif }; static const struct vm_operations_struct shmem_anon_vm_ops = { .fault = shmem_fault, .map_pages = filemap_map_pages, #ifdef CONFIG_NUMA .set_policy = shmem_set_policy, .get_policy = shmem_get_policy, #endif }; int shmem_init_fs_context(struct fs_context *fc) { struct shmem_options *ctx; ctx = kzalloc_obj(struct shmem_options); if (!ctx) return -ENOMEM; ctx->mode = 0777 | S_ISVTX; ctx->uid = current_fsuid(); ctx->gid = current_fsgid(); #if IS_ENABLED(CONFIG_UNICODE) ctx->encoding = NULL; #endif fc->fs_private = ctx; fc->ops = &shmem_fs_context_ops; #ifdef CONFIG_TMPFS fc->sb_flags |= SB_I_VERSION; #endif return 0; } static struct file_system_type shmem_fs_type = { .owner = THIS_MODULE, .name = "tmpfs", .init_fs_context = shmem_init_fs_context, #ifdef CONFIG_TMPFS .parameters = shmem_fs_parameters, #endif .kill_sb = kill_anon_super, .fs_flags = FS_USERNS_MOUNT | FS_ALLOW_IDMAP | FS_MGTIME, }; #if defined(CONFIG_SYSFS) && defined(CONFIG_TMPFS) #define __INIT_KOBJ_ATTR(_name, _mode, _show, _store) \ { \ .attr = { .name = __stringify(_name), .mode = _mode }, \ .show = _show, \ .store = _store, \ } #define TMPFS_ATTR_W(_name, _store) \ static struct kobj_attribute tmpfs_attr_##_name = \ __INIT_KOBJ_ATTR(_name, 0200, NULL, _store) #define TMPFS_ATTR_RW(_name, _show, _store) \ static struct kobj_attribute tmpfs_attr_##_name = \ __INIT_KOBJ_ATTR(_name, 0644, _show, _store) #define TMPFS_ATTR_RO(_name, _show) \ static struct kobj_attribute tmpfs_attr_##_name = \ __INIT_KOBJ_ATTR(_name, 0444, _show, NULL) #if IS_ENABLED(CONFIG_UNICODE) static ssize_t casefold_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { return sysfs_emit(buf, "supported\n"); } TMPFS_ATTR_RO(casefold, casefold_show); #endif static struct attribute *tmpfs_attributes[] = { #if IS_ENABLED(CONFIG_UNICODE) &tmpfs_attr_casefold.attr, #endif NULL }; static const struct attribute_group tmpfs_attribute_group = { .attrs = tmpfs_attributes, .name = "features" }; static struct kobject *tmpfs_kobj; static int __init tmpfs_sysfs_init(void) { int ret; tmpfs_kobj = kobject_create_and_add("tmpfs", fs_kobj); if (!tmpfs_kobj) return -ENOMEM; ret = sysfs_create_group(tmpfs_kobj, &tmpfs_attribute_group); if (ret) kobject_put(tmpfs_kobj); return ret; } #endif /* CONFIG_SYSFS && CONFIG_TMPFS */ void __init shmem_init(void) { int error; shmem_init_inodecache(); #ifdef CONFIG_TMPFS_QUOTA register_quota_format(&shmem_quota_format); #endif error = register_filesystem(&shmem_fs_type); if (error) { pr_err("Could not register tmpfs\n"); goto out2; } shm_mnt = kern_mount(&shmem_fs_type); if (IS_ERR(shm_mnt)) { error = PTR_ERR(shm_mnt); pr_err("Could not kern_mount tmpfs\n"); goto out1; } #if defined(CONFIG_SYSFS) && defined(CONFIG_TMPFS) error = tmpfs_sysfs_init(); if (error) { pr_err("Could not init tmpfs sysfs\n"); goto out1; } #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY) SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge; else shmem_huge = SHMEM_HUGE_NEVER; /* just in case it was patched */ /* * Default to setting PMD-sized THP to inherit the global setting and * disable all other multi-size THPs. */ if (!shmem_orders_configured) huge_shmem_orders_inherit = BIT(HPAGE_PMD_ORDER); #endif return; out1: unregister_filesystem(&shmem_fs_type); out2: #ifdef CONFIG_TMPFS_QUOTA unregister_quota_format(&shmem_quota_format); #endif shmem_destroy_inodecache(); shm_mnt = ERR_PTR(error); } #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS) static ssize_t shmem_enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { static const int values[] = { SHMEM_HUGE_ALWAYS, SHMEM_HUGE_WITHIN_SIZE, SHMEM_HUGE_ADVISE, SHMEM_HUGE_NEVER, SHMEM_HUGE_DENY, SHMEM_HUGE_FORCE, }; int len = 0; int i; for (i = 0; i < ARRAY_SIZE(values); i++) { len += sysfs_emit_at(buf, len, shmem_huge == values[i] ? "%s[%s]" : "%s%s", i ? " " : "", shmem_format_huge(values[i])); } len += sysfs_emit_at(buf, len, "\n"); return len; } static ssize_t shmem_enabled_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { char tmp[16]; int huge, err; if (count + 1 > sizeof(tmp)) return -EINVAL; memcpy(tmp, buf, count); tmp[count] = '\0'; if (count && tmp[count - 1] == '\n') tmp[count - 1] = '\0'; huge = shmem_parse_huge(tmp); if (huge == -EINVAL) return huge; shmem_huge = huge; if (shmem_huge > SHMEM_HUGE_DENY) SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge; err = start_stop_khugepaged(); return err ? err : count; } struct kobj_attribute shmem_enabled_attr = __ATTR_RW(shmem_enabled); static DEFINE_SPINLOCK(huge_shmem_orders_lock); static ssize_t thpsize_shmem_enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { int order = to_thpsize(kobj)->order; const char *output; if (test_bit(order, &huge_shmem_orders_always)) output = "[always] inherit within_size advise never"; else if (test_bit(order, &huge_shmem_orders_inherit)) output = "always [inherit] within_size advise never"; else if (test_bit(order, &huge_shmem_orders_within_size)) output = "always inherit [within_size] advise never"; else if (test_bit(order, &huge_shmem_orders_madvise)) output = "always inherit within_size [advise] never"; else output = "always inherit within_size advise [never]"; return sysfs_emit(buf, "%s\n", output); } static ssize_t thpsize_shmem_enabled_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int order = to_thpsize(kobj)->order; ssize_t ret = count; if (sysfs_streq(buf, "always")) { spin_lock(&huge_shmem_orders_lock); clear_bit(order, &huge_shmem_orders_inherit); clear_bit(order, &huge_shmem_orders_madvise); clear_bit(order, &huge_shmem_orders_within_size); set_bit(order, &huge_shmem_orders_always); spin_unlock(&huge_shmem_orders_lock); } else if (sysfs_streq(buf, "inherit")) { /* Do not override huge allocation policy with non-PMD sized mTHP */ if (shmem_huge == SHMEM_HUGE_FORCE && order != HPAGE_PMD_ORDER) return -EINVAL; spin_lock(&huge_shmem_orders_lock); clear_bit(order, &huge_shmem_orders_always); clear_bit(order, &huge_shmem_orders_madvise); clear_bit(order, &huge_shmem_orders_within_size); set_bit(order, &huge_shmem_orders_inherit); spin_unlock(&huge_shmem_orders_lock); } else if (sysfs_streq(buf, "within_size")) { spin_lock(&huge_shmem_orders_lock); clear_bit(order, &huge_shmem_orders_always); clear_bit(order, &huge_shmem_orders_inherit); clear_bit(order, &huge_shmem_orders_madvise); set_bit(order, &huge_shmem_orders_within_size); spin_unlock(&huge_shmem_orders_lock); } else if (sysfs_streq(buf, "advise")) { spin_lock(&huge_shmem_orders_lock); clear_bit(order, &huge_shmem_orders_always); clear_bit(order, &huge_shmem_orders_inherit); clear_bit(order, &huge_shmem_orders_within_size); set_bit(order, &huge_shmem_orders_madvise); spin_unlock(&huge_shmem_orders_lock); } else if (sysfs_streq(buf, "never")) { spin_lock(&huge_shmem_orders_lock); clear_bit(order, &huge_shmem_orders_always); clear_bit(order, &huge_shmem_orders_inherit); clear_bit(order, &huge_shmem_orders_within_size); clear_bit(order, &huge_shmem_orders_madvise); spin_unlock(&huge_shmem_orders_lock); } else { ret = -EINVAL; } if (ret > 0) { int err = start_stop_khugepaged(); if (err) ret = err; } return ret; } struct kobj_attribute thpsize_shmem_enabled_attr = __ATTR(shmem_enabled, 0644, thpsize_shmem_enabled_show, thpsize_shmem_enabled_store); #endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */ #if defined(CONFIG_TRANSPARENT_HUGEPAGE) static int __init setup_transparent_hugepage_shmem(char *str) { int huge; huge = shmem_parse_huge(str); if (huge == -EINVAL) { pr_warn("transparent_hugepage_shmem= cannot parse, ignored\n"); return huge; } shmem_huge = huge; return 1; } __setup("transparent_hugepage_shmem=", setup_transparent_hugepage_shmem); static int __init setup_transparent_hugepage_tmpfs(char *str) { int huge; huge = shmem_parse_huge(str); if (huge < 0) { pr_warn("transparent_hugepage_tmpfs= cannot parse, ignored\n"); return huge; } tmpfs_huge = huge; return 1; } __setup("transparent_hugepage_tmpfs=", setup_transparent_hugepage_tmpfs); static char str_dup[PAGE_SIZE] __initdata; static int __init setup_thp_shmem(char *str) { char *token, *range, *policy, *subtoken; unsigned long always, inherit, madvise, within_size; char *start_size, *end_size; int start, end, nr; char *p; if (!str || strlen(str) + 1 > PAGE_SIZE) goto err; strscpy(str_dup, str); always = huge_shmem_orders_always; inherit = huge_shmem_orders_inherit; madvise = huge_shmem_orders_madvise; within_size = huge_shmem_orders_within_size; p = str_dup; while ((token = strsep(&p, ";")) != NULL) { range = strsep(&token, ":"); policy = token; if (!policy) goto err; while ((subtoken = strsep(&range, ",")) != NULL) { if (strchr(subtoken, '-')) { start_size = strsep(&subtoken, "-"); end_size = subtoken; start = get_order_from_str(start_size, THP_ORDERS_ALL_FILE_DEFAULT); end = get_order_from_str(end_size, THP_ORDERS_ALL_FILE_DEFAULT); } else { start_size = end_size = subtoken; start = end = get_order_from_str(subtoken, THP_ORDERS_ALL_FILE_DEFAULT); } if (start < 0) { pr_err("invalid size %s in thp_shmem boot parameter\n", start_size); goto err; } if (end < 0) { pr_err("invalid size %s in thp_shmem boot parameter\n", end_size); goto err; } if (start > end) goto err; nr = end - start + 1; if (!strcmp(policy, "always")) { bitmap_set(&always, start, nr); bitmap_clear(&inherit, start, nr); bitmap_clear(&madvise, start, nr); bitmap_clear(&within_size, start, nr); } else if (!strcmp(policy, "advise")) { bitmap_set(&madvise, start, nr); bitmap_clear(&inherit, start, nr); bitmap_clear(&always, start, nr); bitmap_clear(&within_size, start, nr); } else if (!strcmp(policy, "inherit")) { bitmap_set(&inherit, start, nr); bitmap_clear(&madvise, start, nr); bitmap_clear(&always, start, nr); bitmap_clear(&within_size, start, nr); } else if (!strcmp(policy, "within_size")) { bitmap_set(&within_size, start, nr); bitmap_clear(&inherit, start, nr); bitmap_clear(&madvise, start, nr); bitmap_clear(&always, start, nr); } else if (!strcmp(policy, "never")) { bitmap_clear(&inherit, start, nr); bitmap_clear(&madvise, start, nr); bitmap_clear(&always, start, nr); bitmap_clear(&within_size, start, nr); } else { pr_err("invalid policy %s in thp_shmem boot parameter\n", policy); goto err; } } } huge_shmem_orders_always = always; huge_shmem_orders_madvise = madvise; huge_shmem_orders_inherit = inherit; huge_shmem_orders_within_size = within_size; shmem_orders_configured = true; return 1; err: pr_warn("thp_shmem=%s: error parsing string, ignoring setting\n", str); return 0; } __setup("thp_shmem=", setup_thp_shmem); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #else /* !CONFIG_SHMEM */ /* * tiny-shmem: simple shmemfs and tmpfs using ramfs code * * This is intended for small system where the benefits of the full * shmem code (swap-backed and resource-limited) are outweighed by * their complexity. On systems without swap this code should be * effectively equivalent, but much lighter weight. */ static struct file_system_type shmem_fs_type = { .name = "tmpfs", .init_fs_context = ramfs_init_fs_context, .parameters = ramfs_fs_parameters, .kill_sb = ramfs_kill_sb, .fs_flags = FS_USERNS_MOUNT, }; void __init shmem_init(void) { BUG_ON(register_filesystem(&shmem_fs_type) != 0); shm_mnt = kern_mount(&shmem_fs_type); BUG_ON(IS_ERR(shm_mnt)); } int shmem_unuse(unsigned int type) { return 0; } int shmem_lock(struct file *file, int lock, struct ucounts *ucounts) { return 0; } void shmem_unlock_mapping(struct address_space *mapping) { } #ifdef CONFIG_MMU unsigned long shmem_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { return mm_get_unmapped_area(file, addr, len, pgoff, flags); } #endif void shmem_truncate_range(struct inode *inode, loff_t lstart, uoff_t lend) { truncate_inode_pages_range(inode->i_mapping, lstart, lend); } EXPORT_SYMBOL_GPL(shmem_truncate_range); #define shmem_vm_ops generic_file_vm_ops #define shmem_anon_vm_ops generic_file_vm_ops #define shmem_file_operations ramfs_file_operations static inline int shmem_acct_size(unsigned long flags, loff_t size) { return 0; } static inline void shmem_unacct_size(unsigned long flags, loff_t size) { } static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block *sb, struct inode *dir, umode_t mode, dev_t dev, vma_flags_t flags) { struct inode *inode = ramfs_get_inode(sb, dir, mode, dev); return inode ? inode : ERR_PTR(-ENOSPC); } #endif /* CONFIG_SHMEM */ /* common code */ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, loff_t size, vma_flags_t flags, unsigned int i_flags) { const unsigned long shmem_flags = vma_flags_test(&flags, VMA_NORESERVE_BIT) ? SHMEM_F_NORESERVE : 0; struct inode *inode; struct file *res; if (IS_ERR(mnt)) return ERR_CAST(mnt); if (size < 0 || size > MAX_LFS_FILESIZE) return ERR_PTR(-EINVAL); if (is_idmapped_mnt(mnt)) return ERR_PTR(-EINVAL); if (shmem_acct_size(shmem_flags, size)) return ERR_PTR(-ENOMEM); inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL, S_IFREG | S_IRWXUGO, 0, flags); if (IS_ERR(inode)) { shmem_unacct_size(shmem_flags, size); return ERR_CAST(inode); } inode->i_flags |= i_flags; inode->i_size = size; clear_nlink(inode); /* It is unlinked */ res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size)); if (!IS_ERR(res)) res = alloc_file_pseudo(inode, mnt, name, O_RDWR, &shmem_file_operations); if (IS_ERR(res)) iput(inode); return res; } /** * shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be * kernel internal. There will be NO LSM permission checks against the * underlying inode. So users of this interface must do LSM checks at a * higher layer. The users are the big_key and shm implementations. LSM * checks are provided at the key or shm level rather than the inode. * @name: name for dentry (to be seen in /proc/<pid>/maps) * @size: size to be set for the file * @flags: VMA_NORESERVE_BIT suppresses pre-accounting of the entire object size */ struct file *shmem_kernel_file_setup(const char *name, loff_t size, vma_flags_t flags) { return __shmem_file_setup(shm_mnt, name, size, flags, S_PRIVATE); } EXPORT_SYMBOL_GPL(shmem_kernel_file_setup); /** * shmem_file_setup - get an unlinked file living in tmpfs * @name: name for dentry (to be seen in /proc/<pid>/maps) * @size: size to be set for the file * @flags: VMA_NORESERVE_BIT suppresses pre-accounting of the entire object size */ struct file *shmem_file_setup(const char *name, loff_t size, vma_flags_t flags) { return __shmem_file_setup(shm_mnt, name, size, flags, 0); } EXPORT_SYMBOL_GPL(shmem_file_setup); /** * shmem_file_setup_with_mnt - get an unlinked file living in tmpfs * @mnt: the tmpfs mount where the file will be created * @name: name for dentry (to be seen in /proc/<pid>/maps) * @size: size to be set for the file * @flags: VMA_NORESERVE_BIT suppresses pre-accounting of the entire object size */ struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name, loff_t size, vma_flags_t flags) { return __shmem_file_setup(mnt, name, size, flags, 0); } EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt); static struct file *__shmem_zero_setup(unsigned long start, unsigned long end, vma_flags_t flags) { loff_t size = end - start; /* * Cloning a new file under mmap_lock leads to a lock ordering conflict * between XFS directory reading and selinux: since this file is only * accessible to the user through its mapping, use S_PRIVATE flag to * bypass file security, in the same way as shmem_kernel_file_setup(). */ return shmem_kernel_file_setup("dev/zero", size, flags); } /** * shmem_zero_setup - setup a shared anonymous mapping * @vma: the vma to be mmapped is prepared by do_mmap * Returns: 0 on success, or error */ int shmem_zero_setup(struct vm_area_struct *vma) { struct file *file = __shmem_zero_setup(vma->vm_start, vma->vm_end, vma->flags); if (IS_ERR(file)) return PTR_ERR(file); if (vma->vm_file) fput(vma->vm_file); vma->vm_file = file; vma->vm_ops = &shmem_anon_vm_ops; return 0; } /** * shmem_zero_setup_desc - same as shmem_zero_setup, but determined by VMA * descriptor for convenience. * @desc: Describes VMA * Returns: 0 on success, or error */ int shmem_zero_setup_desc(struct vm_area_desc *desc) { struct file *file = __shmem_zero_setup(desc->start, desc->end, desc->vma_flags); if (IS_ERR(file)) return PTR_ERR(file); desc->vm_file = file; desc->vm_ops = &shmem_anon_vm_ops; return 0; } /** * shmem_read_folio_gfp - read into page cache, using specified page allocation flags. * @mapping: the folio's address_space * @index: the folio index * @gfp: the page allocator flags to use if allocating * * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)", * with any new page allocations done using the specified allocation flags. * But read_cache_page_gfp() uses the ->read_folio() method: which does not * suit tmpfs, since it may have pages in swapcache, and needs to find those * for itself; although drivers/gpu/drm i915 and ttm rely upon this support. * * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily. */ struct folio *shmem_read_folio_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp) { #ifdef CONFIG_SHMEM struct inode *inode = mapping->host; struct folio *folio; int error; error = shmem_get_folio_gfp(inode, index, i_size_read(inode), &folio, SGP_CACHE, gfp, NULL, NULL); if (error) return ERR_PTR(error); folio_unlock(folio); return folio; #else /* * The tiny !SHMEM case uses ramfs without swap */ return mapping_read_folio_gfp(mapping, index, gfp); #endif } EXPORT_SYMBOL_GPL(shmem_read_folio_gfp); struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp) { struct folio *folio = shmem_read_folio_gfp(mapping, index, gfp); struct page *page; if (IS_ERR(folio)) return &folio->page; page = folio_file_page(folio, index); if (PageHWPoison(page)) { folio_put(folio); return ERR_PTR(-EIO); } return page; } EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);
248 249 248 249 249 249 9 9 9 249 249 15 6 9 6 9 15 15 15 9 9 9 9 9 9 3 3 3 3 3 3 242 242 242 9 9 9 249 249 249 249 249 9 9 9 9 249 249 249 249 249 249 249 249 249 249 11 11 9 9 9 10 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. */ #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/completion.h> #include <linux/bug.h> #include <linux/list.h> #include <linux/string_choices.h> #include "messages.h" #include "ctree.h" #include "discard.h" #include "disk-io.h" #include "send.h" #include "transaction.h" #include "sysfs.h" #include "volumes.h" #include "space-info.h" #include "block-group.h" #include "qgroup.h" #include "misc.h" #include "fs.h" #include "accessors.h" /* * Structure name Path * -------------------------------------------------------------------------- * btrfs_supported_static_feature_attrs /sys/fs/btrfs/features * btrfs_supported_feature_attrs /sys/fs/btrfs/features and * /sys/fs/btrfs/<uuid>/features * btrfs_attrs /sys/fs/btrfs/<uuid> * devid_attrs /sys/fs/btrfs/<uuid>/devinfo/<devid> * allocation_attrs /sys/fs/btrfs/<uuid>/allocation * qgroup_attrs /sys/fs/btrfs/<uuid>/qgroups/<level>_<qgroupid> * space_info_attrs /sys/fs/btrfs/<uuid>/allocation/<bg-type> * raid_attrs /sys/fs/btrfs/<uuid>/allocation/<bg-type>/<bg-profile> * discard_attrs /sys/fs/btrfs/<uuid>/discard * * When built with BTRFS_CONFIG_DEBUG: * * btrfs_debug_feature_attrs /sys/fs/btrfs/debug * btrfs_debug_mount_attrs /sys/fs/btrfs/<uuid>/debug */ struct btrfs_feature_attr { struct kobj_attribute kobj_attr; enum btrfs_feature_set feature_set; u64 feature_bit; }; /* For raid type sysfs entries */ struct raid_kobject { u64 flags; struct kobject kobj; }; #define __INIT_KOBJ_ATTR(_name, _mode, _show, _store) \ { \ .attr = { .name = __stringify(_name), .mode = _mode }, \ .show = _show, \ .store = _store, \ } #define BTRFS_ATTR_W(_prefix, _name, _store) \ static struct kobj_attribute btrfs_attr_##_prefix##_##_name = \ __INIT_KOBJ_ATTR(_name, 0200, NULL, _store) #define BTRFS_ATTR_RW(_prefix, _name, _show, _store) \ static struct kobj_attribute btrfs_attr_##_prefix##_##_name = \ __INIT_KOBJ_ATTR(_name, 0644, _show, _store) #define BTRFS_ATTR(_prefix, _name, _show) \ static struct kobj_attribute btrfs_attr_##_prefix##_##_name = \ __INIT_KOBJ_ATTR(_name, 0444, _show, NULL) #define BTRFS_ATTR_PTR(_prefix, _name) \ (&btrfs_attr_##_prefix##_##_name.attr) #define BTRFS_FEAT_ATTR(_name, _feature_set, _feature_prefix, _feature_bit) \ static struct btrfs_feature_attr btrfs_attr_features_##_name = { \ .kobj_attr = __INIT_KOBJ_ATTR(_name, S_IRUGO, \ btrfs_feature_attr_show, \ btrfs_feature_attr_store), \ .feature_set = _feature_set, \ .feature_bit = _feature_prefix ##_## _feature_bit, \ } #define BTRFS_FEAT_ATTR_PTR(_name) \ (&btrfs_attr_features_##_name.kobj_attr.attr) #define BTRFS_FEAT_ATTR_COMPAT(name, feature) \ BTRFS_FEAT_ATTR(name, FEAT_COMPAT, BTRFS_FEATURE_COMPAT, feature) #define BTRFS_FEAT_ATTR_COMPAT_RO(name, feature) \ BTRFS_FEAT_ATTR(name, FEAT_COMPAT_RO, BTRFS_FEATURE_COMPAT_RO, feature) #define BTRFS_FEAT_ATTR_INCOMPAT(name, feature) \ BTRFS_FEAT_ATTR(name, FEAT_INCOMPAT, BTRFS_FEATURE_INCOMPAT, feature) static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj); static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj); static struct kobject *get_btrfs_kobj(struct kobject *kobj); static struct btrfs_feature_attr *to_btrfs_feature_attr(struct kobj_attribute *a) { return container_of(a, struct btrfs_feature_attr, kobj_attr); } static struct kobj_attribute *attr_to_btrfs_attr(struct attribute *attr) { return container_of(attr, struct kobj_attribute, attr); } static struct btrfs_feature_attr *attr_to_btrfs_feature_attr( struct attribute *attr) { return to_btrfs_feature_attr(attr_to_btrfs_attr(attr)); } static u64 get_features(struct btrfs_fs_info *fs_info, enum btrfs_feature_set set) { struct btrfs_super_block *disk_super = fs_info->super_copy; if (set == FEAT_COMPAT) return btrfs_super_compat_flags(disk_super); else if (set == FEAT_COMPAT_RO) return btrfs_super_compat_ro_flags(disk_super); else return btrfs_super_incompat_flags(disk_super); } static void set_features(struct btrfs_fs_info *fs_info, enum btrfs_feature_set set, u64 features) { struct btrfs_super_block *disk_super = fs_info->super_copy; if (set == FEAT_COMPAT) btrfs_set_super_compat_flags(disk_super, features); else if (set == FEAT_COMPAT_RO) btrfs_set_super_compat_ro_flags(disk_super, features); else btrfs_set_super_incompat_flags(disk_super, features); } static int can_modify_feature(struct btrfs_feature_attr *fa) { int val = 0; u64 set, clear; switch (fa->feature_set) { case FEAT_COMPAT: set = BTRFS_FEATURE_COMPAT_SAFE_SET; clear = BTRFS_FEATURE_COMPAT_SAFE_CLEAR; break; case FEAT_COMPAT_RO: set = BTRFS_FEATURE_COMPAT_RO_SAFE_SET; clear = BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR; break; case FEAT_INCOMPAT: set = BTRFS_FEATURE_INCOMPAT_SAFE_SET; clear = BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR; break; default: btrfs_warn(NULL, "sysfs: unknown feature set %d", fa->feature_set); return 0; } if (set & fa->feature_bit) val |= 1; if (clear & fa->feature_bit) val |= 2; return val; } static ssize_t btrfs_feature_attr_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { int val = 0; struct btrfs_fs_info *fs_info = to_fs_info(kobj); struct btrfs_feature_attr *fa = to_btrfs_feature_attr(a); if (fs_info) { u64 features = get_features(fs_info, fa->feature_set); if (features & fa->feature_bit) val = 1; } else val = can_modify_feature(fa); return sysfs_emit(buf, "%d\n", val); } static ssize_t btrfs_feature_attr_store(struct kobject *kobj, struct kobj_attribute *a, const char *buf, size_t count) { struct btrfs_fs_info *fs_info; struct btrfs_feature_attr *fa = to_btrfs_feature_attr(a); u64 features, set, clear; unsigned long val; int ret; fs_info = to_fs_info(kobj); if (!fs_info) return -EPERM; if (sb_rdonly(fs_info->sb)) return -EROFS; ret = kstrtoul(skip_spaces(buf), 0, &val); if (ret) return ret; if (fa->feature_set == FEAT_COMPAT) { set = BTRFS_FEATURE_COMPAT_SAFE_SET; clear = BTRFS_FEATURE_COMPAT_SAFE_CLEAR; } else if (fa->feature_set == FEAT_COMPAT_RO) { set = BTRFS_FEATURE_COMPAT_RO_SAFE_SET; clear = BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR; } else { set = BTRFS_FEATURE_INCOMPAT_SAFE_SET; clear = BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR; } features = get_features(fs_info, fa->feature_set); /* Nothing to do */ if ((val && (features & fa->feature_bit)) || (!val && !(features & fa->feature_bit))) return count; if ((val && !(set & fa->feature_bit)) || (!val && !(clear & fa->feature_bit))) { btrfs_info(fs_info, "%sabling feature %s on mounted fs is not supported.", val ? "En" : "Dis", fa->kobj_attr.attr.name); return -EPERM; } btrfs_info(fs_info, "%s %s feature flag", val ? "Setting" : "Clearing", fa->kobj_attr.attr.name); spin_lock(&fs_info->super_lock); features = get_features(fs_info, fa->feature_set); if (val) features |= fa->feature_bit; else features &= ~fa->feature_bit; set_features(fs_info, fa->feature_set, features); spin_unlock(&fs_info->super_lock); /* * We don't want to do full transaction commit from inside sysfs */ set_bit(BTRFS_FS_NEED_TRANS_COMMIT, &fs_info->flags); wake_up_process(fs_info->transaction_kthread); return count; } static umode_t btrfs_feature_visible(struct kobject *kobj, struct attribute *attr, int unused) { struct btrfs_fs_info *fs_info = to_fs_info(kobj); umode_t mode = attr->mode; if (fs_info) { struct btrfs_feature_attr *fa; u64 features; fa = attr_to_btrfs_feature_attr(attr); features = get_features(fs_info, fa->feature_set); if (can_modify_feature(fa)) mode |= S_IWUSR; else if (!(features & fa->feature_bit)) mode = 0; } return mode; } BTRFS_FEAT_ATTR_INCOMPAT(default_subvol, DEFAULT_SUBVOL); BTRFS_FEAT_ATTR_INCOMPAT(mixed_groups, MIXED_GROUPS); BTRFS_FEAT_ATTR_INCOMPAT(compress_lzo, COMPRESS_LZO); BTRFS_FEAT_ATTR_INCOMPAT(compress_zstd, COMPRESS_ZSTD); BTRFS_FEAT_ATTR_INCOMPAT(extended_iref, EXTENDED_IREF); BTRFS_FEAT_ATTR_INCOMPAT(raid56, RAID56); BTRFS_FEAT_ATTR_INCOMPAT(skinny_metadata, SKINNY_METADATA); BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES); BTRFS_FEAT_ATTR_INCOMPAT(metadata_uuid, METADATA_UUID); BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE); BTRFS_FEAT_ATTR_COMPAT_RO(block_group_tree, BLOCK_GROUP_TREE); BTRFS_FEAT_ATTR_INCOMPAT(raid1c34, RAID1C34); BTRFS_FEAT_ATTR_INCOMPAT(simple_quota, SIMPLE_QUOTA); #ifdef CONFIG_BLK_DEV_ZONED BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED); #endif #ifdef CONFIG_BTRFS_EXPERIMENTAL /* Remove once support for extent tree v2 is feature complete */ BTRFS_FEAT_ATTR_INCOMPAT(extent_tree_v2, EXTENT_TREE_V2); /* Remove once support for raid stripe tree is feature complete. */ BTRFS_FEAT_ATTR_INCOMPAT(raid_stripe_tree, RAID_STRIPE_TREE); /* Remove once support for remap tree is feature complete. */ BTRFS_FEAT_ATTR_INCOMPAT(remap_tree, REMAP_TREE); #endif #ifdef CONFIG_FS_VERITY BTRFS_FEAT_ATTR_COMPAT_RO(verity, VERITY); #endif /* * Features which depend on feature bits and may differ between each fs. * * /sys/fs/btrfs/features - all available features implemented by this version * /sys/fs/btrfs/UUID/features - features of the fs which are enabled or * can be changed on a mounted filesystem. */ static struct attribute *btrfs_supported_feature_attrs[] = { BTRFS_FEAT_ATTR_PTR(default_subvol), BTRFS_FEAT_ATTR_PTR(mixed_groups), BTRFS_FEAT_ATTR_PTR(compress_lzo), BTRFS_FEAT_ATTR_PTR(compress_zstd), BTRFS_FEAT_ATTR_PTR(extended_iref), BTRFS_FEAT_ATTR_PTR(raid56), BTRFS_FEAT_ATTR_PTR(skinny_metadata), BTRFS_FEAT_ATTR_PTR(no_holes), BTRFS_FEAT_ATTR_PTR(metadata_uuid), BTRFS_FEAT_ATTR_PTR(free_space_tree), BTRFS_FEAT_ATTR_PTR(raid1c34), BTRFS_FEAT_ATTR_PTR(block_group_tree), BTRFS_FEAT_ATTR_PTR(simple_quota), #ifdef CONFIG_BLK_DEV_ZONED BTRFS_FEAT_ATTR_PTR(zoned), #endif #ifdef CONFIG_BTRFS_EXPERIMENTAL BTRFS_FEAT_ATTR_PTR(extent_tree_v2), BTRFS_FEAT_ATTR_PTR(raid_stripe_tree), BTRFS_FEAT_ATTR_PTR(remap_tree), #endif #ifdef CONFIG_FS_VERITY BTRFS_FEAT_ATTR_PTR(verity), #endif NULL }; static const struct attribute_group btrfs_feature_attr_group = { .name = "features", .is_visible = btrfs_feature_visible, .attrs = btrfs_supported_feature_attrs, }; static ssize_t rmdir_subvol_show(struct kobject *kobj, struct kobj_attribute *ka, char *buf) { return sysfs_emit(buf, "0\n"); } BTRFS_ATTR(static_feature, rmdir_subvol, rmdir_subvol_show); static ssize_t supported_checksums_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { ssize_t ret = 0; int i; for (i = 0; i < btrfs_get_num_csums(); i++) { /* * This "trick" only works as long as 'enum btrfs_csum_type' has * no holes in it */ ret += sysfs_emit_at(buf, ret, "%s%s", (i == 0 ? "" : " "), btrfs_super_csum_name(i)); } ret += sysfs_emit_at(buf, ret, "\n"); return ret; } BTRFS_ATTR(static_feature, supported_checksums, supported_checksums_show); static ssize_t send_stream_version_show(struct kobject *kobj, struct kobj_attribute *ka, char *buf) { return sysfs_emit(buf, "%d\n", BTRFS_SEND_STREAM_VERSION); } BTRFS_ATTR(static_feature, send_stream_version, send_stream_version_show); static const char *rescue_opts[] = { "usebackuproot", "nologreplay", "ignorebadroots", "ignoredatacsums", "ignoremetacsums", "ignoresuperflags", "all", }; static ssize_t supported_rescue_options_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { ssize_t ret = 0; int i; for (i = 0; i < ARRAY_SIZE(rescue_opts); i++) ret += sysfs_emit_at(buf, ret, "%s%s", (i ? " " : ""), rescue_opts[i]); ret += sysfs_emit_at(buf, ret, "\n"); return ret; } BTRFS_ATTR(static_feature, supported_rescue_options, supported_rescue_options_show); static ssize_t supported_sectorsizes_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { ssize_t ret = 0; bool has_output = false; for (u32 cur = BTRFS_MIN_BLOCKSIZE; cur <= BTRFS_MAX_BLOCKSIZE; cur *= 2) { if (!btrfs_supported_blocksize(cur)) continue; if (has_output) ret += sysfs_emit_at(buf, ret, " "); ret += sysfs_emit_at(buf, ret, "%u", cur); has_output = true; } ret += sysfs_emit_at(buf, ret, "\n"); return ret; } BTRFS_ATTR(static_feature, supported_sectorsizes, supported_sectorsizes_show); static ssize_t acl_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { return sysfs_emit(buf, "%d\n", IS_ENABLED(CONFIG_BTRFS_FS_POSIX_ACL)); } BTRFS_ATTR(static_feature, acl, acl_show); static ssize_t temp_fsid_supported_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { return sysfs_emit(buf, "0\n"); } BTRFS_ATTR(static_feature, temp_fsid, temp_fsid_supported_show); /* * Features which only depend on kernel version. * * These are listed in /sys/fs/btrfs/features along with * btrfs_supported_feature_attrs. */ static struct attribute *btrfs_supported_static_feature_attrs[] = { BTRFS_ATTR_PTR(static_feature, acl), BTRFS_ATTR_PTR(static_feature, rmdir_subvol), BTRFS_ATTR_PTR(static_feature, supported_checksums), BTRFS_ATTR_PTR(static_feature, send_stream_version), BTRFS_ATTR_PTR(static_feature, supported_rescue_options), BTRFS_ATTR_PTR(static_feature, supported_sectorsizes), BTRFS_ATTR_PTR(static_feature, temp_fsid), NULL }; static const struct attribute_group btrfs_static_feature_attr_group = { .name = "features", .attrs = btrfs_supported_static_feature_attrs, }; /* * Discard statistics and tunables */ #define discard_to_fs_info(_kobj) to_fs_info(get_btrfs_kobj(_kobj)) static ssize_t btrfs_discardable_bytes_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); return sysfs_emit(buf, "%lld\n", atomic64_read(&fs_info->discard_ctl.discardable_bytes)); } BTRFS_ATTR(discard, discardable_bytes, btrfs_discardable_bytes_show); static ssize_t btrfs_discardable_extents_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); return sysfs_emit(buf, "%d\n", atomic_read(&fs_info->discard_ctl.discardable_extents)); } BTRFS_ATTR(discard, discardable_extents, btrfs_discardable_extents_show); static ssize_t btrfs_discard_bitmap_bytes_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); return sysfs_emit(buf, "%llu\n", fs_info->discard_ctl.discard_bitmap_bytes); } BTRFS_ATTR(discard, discard_bitmap_bytes, btrfs_discard_bitmap_bytes_show); static ssize_t btrfs_discard_bytes_saved_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); return sysfs_emit(buf, "%lld\n", atomic64_read(&fs_info->discard_ctl.discard_bytes_saved)); } BTRFS_ATTR(discard, discard_bytes_saved, btrfs_discard_bytes_saved_show); static ssize_t btrfs_discard_extent_bytes_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); return sysfs_emit(buf, "%llu\n", fs_info->discard_ctl.discard_extent_bytes); } BTRFS_ATTR(discard, discard_extent_bytes, btrfs_discard_extent_bytes_show); static ssize_t btrfs_discard_iops_limit_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); return sysfs_emit(buf, "%u\n", READ_ONCE(fs_info->discard_ctl.iops_limit)); } static ssize_t btrfs_discard_iops_limit_store(struct kobject *kobj, struct kobj_attribute *a, const char *buf, size_t len) { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl; u32 iops_limit; int ret; ret = kstrtou32(buf, 10, &iops_limit); if (ret) return -EINVAL; WRITE_ONCE(discard_ctl->iops_limit, iops_limit); btrfs_discard_calc_delay(discard_ctl); btrfs_discard_schedule_work(discard_ctl, true); return len; } BTRFS_ATTR_RW(discard, iops_limit, btrfs_discard_iops_limit_show, btrfs_discard_iops_limit_store); static ssize_t btrfs_discard_kbps_limit_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); return sysfs_emit(buf, "%u\n", READ_ONCE(fs_info->discard_ctl.kbps_limit)); } static ssize_t btrfs_discard_kbps_limit_store(struct kobject *kobj, struct kobj_attribute *a, const char *buf, size_t len) { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl; u32 kbps_limit; int ret; ret = kstrtou32(buf, 10, &kbps_limit); if (ret) return -EINVAL; WRITE_ONCE(discard_ctl->kbps_limit, kbps_limit); btrfs_discard_schedule_work(discard_ctl, true); return len; } BTRFS_ATTR_RW(discard, kbps_limit, btrfs_discard_kbps_limit_show, btrfs_discard_kbps_limit_store); static ssize_t btrfs_discard_max_discard_size_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); return sysfs_emit(buf, "%llu\n", READ_ONCE(fs_info->discard_ctl.max_discard_size)); } static ssize_t btrfs_discard_max_discard_size_store(struct kobject *kobj, struct kobj_attribute *a, const char *buf, size_t len) { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl; u64 max_discard_size; int ret; ret = kstrtou64(buf, 10, &max_discard_size); if (ret) return -EINVAL; WRITE_ONCE(discard_ctl->max_discard_size, max_discard_size); return len; } BTRFS_ATTR_RW(discard, max_discard_size, btrfs_discard_max_discard_size_show, btrfs_discard_max_discard_size_store); /* * Per-filesystem stats for discard (when mounted with discard=async). * * Path: /sys/fs/btrfs/<uuid>/discard/ */ static const struct attribute *discard_attrs[] = { BTRFS_ATTR_PTR(discard, discardable_bytes), BTRFS_ATTR_PTR(discard, discardable_extents), BTRFS_ATTR_PTR(discard, discard_bitmap_bytes), BTRFS_ATTR_PTR(discard, discard_bytes_saved), BTRFS_ATTR_PTR(discard, discard_extent_bytes), BTRFS_ATTR_PTR(discard, iops_limit), BTRFS_ATTR_PTR(discard, kbps_limit), BTRFS_ATTR_PTR(discard, max_discard_size), NULL, }; #ifdef CONFIG_BTRFS_DEBUG /* * Per-filesystem runtime debugging exported via sysfs. * * Path: /sys/fs/btrfs/UUID/debug/ */ static const struct attribute *btrfs_debug_mount_attrs[] = { NULL, }; /* * Runtime debugging exported via sysfs, applies to all mounted filesystems. * * Path: /sys/fs/btrfs/debug */ static struct attribute *btrfs_debug_feature_attrs[] = { NULL }; static const struct attribute_group btrfs_debug_feature_attr_group = { .name = "debug", .attrs = btrfs_debug_feature_attrs, }; #endif static ssize_t btrfs_show_u64(u64 *value_ptr, spinlock_t *lock, char *buf) { u64 val; if (lock) spin_lock(lock); val = *value_ptr; if (lock) spin_unlock(lock); return sysfs_emit(buf, "%llu\n", val); } static ssize_t global_rsv_size_show(struct kobject *kobj, struct kobj_attribute *ka, char *buf) { struct btrfs_fs_info *fs_info = to_fs_info(kobj->parent); struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; return btrfs_show_u64(&block_rsv->size, &block_rsv->lock, buf); } BTRFS_ATTR(allocation, global_rsv_size, global_rsv_size_show); static ssize_t global_rsv_reserved_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = to_fs_info(kobj->parent); struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; return btrfs_show_u64(&block_rsv->reserved, &block_rsv->lock, buf); } BTRFS_ATTR(allocation, global_rsv_reserved, global_rsv_reserved_show); #define to_space_info(_kobj) container_of(_kobj, struct btrfs_space_info, kobj) #define to_raid_kobj(_kobj) container_of(_kobj, struct raid_kobject, kobj) static ssize_t raid_bytes_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf); BTRFS_ATTR(raid, total_bytes, raid_bytes_show); BTRFS_ATTR(raid, used_bytes, raid_bytes_show); static ssize_t raid_bytes_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct btrfs_space_info *sinfo = to_space_info(kobj->parent); struct btrfs_block_group *block_group; int index = btrfs_bg_flags_to_raid_index(to_raid_kobj(kobj)->flags); u64 val = 0; down_read(&sinfo->groups_sem); list_for_each_entry(block_group, &sinfo->block_groups[index], list) { if (&attr->attr == BTRFS_ATTR_PTR(raid, total_bytes)) val += block_group->length; else val += block_group->used; } up_read(&sinfo->groups_sem); return sysfs_emit(buf, "%llu\n", val); } /* * Allocation information about block group profiles. * * Path: /sys/fs/btrfs/<uuid>/allocation/<bg-type>/<bg-profile>/ */ static struct attribute *raid_attrs[] = { BTRFS_ATTR_PTR(raid, total_bytes), BTRFS_ATTR_PTR(raid, used_bytes), NULL }; ATTRIBUTE_GROUPS(raid); static void release_raid_kobj(struct kobject *kobj) { kfree(to_raid_kobj(kobj)); } static const struct kobj_type btrfs_raid_ktype = { .sysfs_ops = &kobj_sysfs_ops, .release = release_raid_kobj, .default_groups = raid_groups, }; #define SPACE_INFO_ATTR(field) \ static ssize_t btrfs_space_info_show_##field(struct kobject *kobj, \ struct kobj_attribute *a, \ char *buf) \ { \ struct btrfs_space_info *sinfo = to_space_info(kobj); \ return btrfs_show_u64(&sinfo->field, &sinfo->lock, buf); \ } \ BTRFS_ATTR(space_info, field, btrfs_space_info_show_##field) static ssize_t btrfs_chunk_size_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_space_info *sinfo = to_space_info(kobj); return sysfs_emit(buf, "%llu\n", READ_ONCE(sinfo->chunk_size)); } /* * Store new chunk size in space info. Can be called on a read-only filesystem. * * If the new chunk size value is larger than 10% of free space it is reduced * to match that limit. Alignment must be to 256M and the system chunk size * cannot be set. */ static ssize_t btrfs_chunk_size_store(struct kobject *kobj, struct kobj_attribute *a, const char *buf, size_t len) { struct btrfs_space_info *space_info = to_space_info(kobj); struct btrfs_fs_info *fs_info = to_fs_info(get_btrfs_kobj(kobj)); char *retptr; u64 val; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (!fs_info->fs_devices) return -EINVAL; if (btrfs_is_zoned(fs_info)) return -EINVAL; /* System block type must not be changed. */ if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM) return -EPERM; val = memparse(buf, &retptr); /* There could be trailing '\n', also catch any typos after the value */ retptr = skip_spaces(retptr); if (*retptr != 0 || val == 0) return -EINVAL; val = min(val, BTRFS_MAX_DATA_CHUNK_SIZE); /* Limit stripe size to 10% of available space. */ val = min(mult_perc(fs_info->fs_devices->total_rw_bytes, 10), val); /* Must be multiple of 256M. */ val &= ~((u64)SZ_256M - 1); /* Must be at least 256M. */ if (val < SZ_256M) return -EINVAL; btrfs_update_space_info_chunk_size(space_info, val); return len; } static ssize_t btrfs_size_classes_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_space_info *sinfo = to_space_info(kobj); struct btrfs_block_group *bg; u32 none = 0; u32 small = 0; u32 medium = 0; u32 large = 0; for (int i = 0; i < BTRFS_NR_RAID_TYPES; ++i) { down_read(&sinfo->groups_sem); list_for_each_entry(bg, &sinfo->block_groups[i], list) { if (!btrfs_block_group_should_use_size_class(bg)) continue; switch (bg->size_class) { case BTRFS_BG_SZ_NONE: none++; break; case BTRFS_BG_SZ_SMALL: small++; break; case BTRFS_BG_SZ_MEDIUM: medium++; break; case BTRFS_BG_SZ_LARGE: large++; break; } } up_read(&sinfo->groups_sem); } return sysfs_emit(buf, "none %u\n" "small %u\n" "medium %u\n" "large %u\n", none, small, medium, large); } #ifdef CONFIG_BTRFS_DEBUG /* * Request chunk allocation with current chunk size. */ static ssize_t btrfs_force_chunk_alloc_store(struct kobject *kobj, struct kobj_attribute *a, const char *buf, size_t len) { struct btrfs_space_info *space_info = to_space_info(kobj); struct btrfs_fs_info *fs_info = to_fs_info(get_btrfs_kobj(kobj)); struct btrfs_trans_handle *trans; bool val; int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (sb_rdonly(fs_info->sb)) return -EROFS; ret = kstrtobool(buf, &val); if (ret) return ret; if (!val) return -EINVAL; /* * This is unsafe to be called from sysfs context and may cause * unexpected problems. */ trans = btrfs_start_transaction(fs_info->tree_root, 0); if (IS_ERR(trans)) return PTR_ERR(trans); ret = btrfs_force_chunk_alloc(trans, space_info->flags); btrfs_end_transaction(trans); if (ret == 1) return len; return -ENOSPC; } BTRFS_ATTR_W(space_info, force_chunk_alloc, btrfs_force_chunk_alloc_store); #endif SPACE_INFO_ATTR(flags); SPACE_INFO_ATTR(total_bytes); SPACE_INFO_ATTR(bytes_used); SPACE_INFO_ATTR(bytes_pinned); SPACE_INFO_ATTR(bytes_reserved); SPACE_INFO_ATTR(bytes_may_use); SPACE_INFO_ATTR(bytes_readonly); SPACE_INFO_ATTR(bytes_zone_unusable); SPACE_INFO_ATTR(disk_used); SPACE_INFO_ATTR(disk_total); SPACE_INFO_ATTR(reclaim_count); SPACE_INFO_ATTR(reclaim_bytes); SPACE_INFO_ATTR(reclaim_errors); BTRFS_ATTR_RW(space_info, chunk_size, btrfs_chunk_size_show, btrfs_chunk_size_store); BTRFS_ATTR(space_info, size_classes, btrfs_size_classes_show); static ssize_t btrfs_sinfo_bg_reclaim_threshold_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_space_info *space_info = to_space_info(kobj); ssize_t ret; spin_lock(&space_info->lock); ret = sysfs_emit(buf, "%d\n", btrfs_calc_reclaim_threshold(space_info)); spin_unlock(&space_info->lock); return ret; } static ssize_t btrfs_sinfo_bg_reclaim_threshold_store(struct kobject *kobj, struct kobj_attribute *a, const char *buf, size_t len) { struct btrfs_space_info *space_info = to_space_info(kobj); int thresh; int ret; if (READ_ONCE(space_info->dynamic_reclaim)) return -EINVAL; ret = kstrtoint(buf, 10, &thresh); if (ret) return ret; if (thresh < 0 || thresh > 100) return -EINVAL; WRITE_ONCE(space_info->bg_reclaim_threshold, thresh); return len; } BTRFS_ATTR_RW(space_info, bg_reclaim_threshold, btrfs_sinfo_bg_reclaim_threshold_show, btrfs_sinfo_bg_reclaim_threshold_store); static ssize_t btrfs_sinfo_dynamic_reclaim_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_space_info *space_info = to_space_info(kobj); return sysfs_emit(buf, "%d\n", READ_ONCE(space_info->dynamic_reclaim)); } static ssize_t btrfs_sinfo_dynamic_reclaim_store(struct kobject *kobj, struct kobj_attribute *a, const char *buf, size_t len) { struct btrfs_space_info *space_info = to_space_info(kobj); int dynamic_reclaim; int ret; ret = kstrtoint(buf, 10, &dynamic_reclaim); if (ret) return ret; if (dynamic_reclaim < 0) return -EINVAL; WRITE_ONCE(space_info->dynamic_reclaim, dynamic_reclaim != 0); return len; } BTRFS_ATTR_RW(space_info, dynamic_reclaim, btrfs_sinfo_dynamic_reclaim_show, btrfs_sinfo_dynamic_reclaim_store); static ssize_t btrfs_sinfo_periodic_reclaim_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_space_info *space_info = to_space_info(kobj); return sysfs_emit(buf, "%d\n", READ_ONCE(space_info->periodic_reclaim)); } static ssize_t btrfs_sinfo_periodic_reclaim_store(struct kobject *kobj, struct kobj_attribute *a, const char *buf, size_t len) { struct btrfs_space_info *space_info = to_space_info(kobj); int periodic_reclaim; int ret; ret = kstrtoint(buf, 10, &periodic_reclaim); if (ret) return ret; if (periodic_reclaim < 0) return -EINVAL; WRITE_ONCE(space_info->periodic_reclaim, periodic_reclaim != 0); return len; } BTRFS_ATTR_RW(space_info, periodic_reclaim, btrfs_sinfo_periodic_reclaim_show, btrfs_sinfo_periodic_reclaim_store); /* * Allocation information about block group types. * * Path: /sys/fs/btrfs/<uuid>/allocation/<bg-type>/ */ static struct attribute *space_info_attrs[] = { BTRFS_ATTR_PTR(space_info, flags), BTRFS_ATTR_PTR(space_info, total_bytes), BTRFS_ATTR_PTR(space_info, bytes_used), BTRFS_ATTR_PTR(space_info, bytes_pinned), BTRFS_ATTR_PTR(space_info, bytes_reserved), BTRFS_ATTR_PTR(space_info, bytes_may_use), BTRFS_ATTR_PTR(space_info, bytes_readonly), BTRFS_ATTR_PTR(space_info, bytes_zone_unusable), BTRFS_ATTR_PTR(space_info, disk_used), BTRFS_ATTR_PTR(space_info, disk_total), BTRFS_ATTR_PTR(space_info, bg_reclaim_threshold), BTRFS_ATTR_PTR(space_info, dynamic_reclaim), BTRFS_ATTR_PTR(space_info, chunk_size), BTRFS_ATTR_PTR(space_info, size_classes), BTRFS_ATTR_PTR(space_info, reclaim_count), BTRFS_ATTR_PTR(space_info, reclaim_bytes), BTRFS_ATTR_PTR(space_info, reclaim_errors), BTRFS_ATTR_PTR(space_info, periodic_reclaim), #ifdef CONFIG_BTRFS_DEBUG BTRFS_ATTR_PTR(space_info, force_chunk_alloc), #endif NULL, }; ATTRIBUTE_GROUPS(space_info); static void space_info_release(struct kobject *kobj) { struct btrfs_space_info *sinfo = to_space_info(kobj); kfree(sinfo); } static const struct kobj_type space_info_ktype = { .sysfs_ops = &kobj_sysfs_ops, .release = space_info_release, .default_groups = space_info_groups, }; /* * Allocation information about block groups. * * Path: /sys/fs/btrfs/<uuid>/allocation/ */ static const struct attribute *allocation_attrs[] = { BTRFS_ATTR_PTR(allocation, global_rsv_reserved), BTRFS_ATTR_PTR(allocation, global_rsv_size), NULL, }; static ssize_t btrfs_label_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = to_fs_info(kobj); char *label = fs_info->super_copy->label; ssize_t ret; spin_lock(&fs_info->super_lock); ret = sysfs_emit(buf, label[0] ? "%s\n" : "%s", label); spin_unlock(&fs_info->super_lock); return ret; } static ssize_t btrfs_label_store(struct kobject *kobj, struct kobj_attribute *a, const char *buf, size_t len) { struct btrfs_fs_info *fs_info = to_fs_info(kobj); size_t p_len; if (!fs_info) return -EPERM; if (sb_rdonly(fs_info->sb)) return -EROFS; /* * p_len is the len until the first occurrence of either * '\n' or '\0' */ p_len = strcspn(buf, "\n"); if (p_len >= BTRFS_LABEL_SIZE) return -EINVAL; spin_lock(&fs_info->super_lock); memset(fs_info->super_copy->label, 0, BTRFS_LABEL_SIZE); memcpy(fs_info->super_copy->label, buf, p_len); spin_unlock(&fs_info->super_lock); /* * We don't want to do full transaction commit from inside sysfs */ set_bit(BTRFS_FS_NEED_TRANS_COMMIT, &fs_info->flags); wake_up_process(fs_info->transaction_kthread); return len; } BTRFS_ATTR_RW(, label, btrfs_label_show, btrfs_label_store); static ssize_t btrfs_nodesize_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = to_fs_info(kobj); return sysfs_emit(buf, "%u\n", fs_info->nodesize); } BTRFS_ATTR(, nodesize, btrfs_nodesize_show); static ssize_t btrfs_sectorsize_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = to_fs_info(kobj); return sysfs_emit(buf, "%u\n", fs_info->sectorsize); } BTRFS_ATTR(, sectorsize, btrfs_sectorsize_show); static ssize_t btrfs_commit_stats_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = to_fs_info(kobj); u64 now = ktime_get_ns(); u64 start_time = fs_info->commit_stats.critical_section_start_time; u64 pending = 0; if (start_time) pending = now - start_time; return sysfs_emit(buf, "commits %llu\n" "cur_commit_ms %llu\n" "last_commit_ms %llu\n" "max_commit_ms %llu\n" "total_commit_ms %llu\n", fs_info->commit_stats.commit_count, div_u64(pending, NSEC_PER_MSEC), div_u64(fs_info->commit_stats.last_commit_dur, NSEC_PER_MSEC), div_u64(fs_info->commit_stats.max_commit_dur, NSEC_PER_MSEC), div_u64(fs_info->commit_stats.total_commit_dur, NSEC_PER_MSEC)); } static ssize_t btrfs_commit_stats_store(struct kobject *kobj, struct kobj_attribute *a, const char *buf, size_t len) { struct btrfs_fs_info *fs_info = to_fs_info(kobj); unsigned long val; int ret; if (!fs_info) return -EPERM; if (!capable(CAP_SYS_RESOURCE)) return -EPERM; ret = kstrtoul(buf, 10, &val); if (ret) return ret; if (val) return -EINVAL; WRITE_ONCE(fs_info->commit_stats.max_commit_dur, 0); return len; } BTRFS_ATTR_RW(, commit_stats, btrfs_commit_stats_show, btrfs_commit_stats_store); static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = to_fs_info(kobj); return sysfs_emit(buf, "%u\n", fs_info->sectorsize); } BTRFS_ATTR(, clone_alignment, btrfs_clone_alignment_show); static ssize_t quota_override_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = to_fs_info(kobj); int quota_override; quota_override = test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags); return sysfs_emit(buf, "%d\n", quota_override); } static ssize_t quota_override_store(struct kobject *kobj, struct kobj_attribute *a, const char *buf, size_t len) { struct btrfs_fs_info *fs_info = to_fs_info(kobj); unsigned long knob; int ret; if (!fs_info) return -EPERM; if (!capable(CAP_SYS_RESOURCE)) return -EPERM; ret = kstrtoul(buf, 10, &knob); if (ret) return ret; if (knob > 1) return -EINVAL; if (knob) set_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags); else clear_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags); return len; } BTRFS_ATTR_RW(, quota_override, quota_override_show, quota_override_store); static ssize_t btrfs_metadata_uuid_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = to_fs_info(kobj); return sysfs_emit(buf, "%pU\n", fs_info->fs_devices->metadata_uuid); } BTRFS_ATTR(, metadata_uuid, btrfs_metadata_uuid_show); static ssize_t btrfs_checksum_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = to_fs_info(kobj); u16 csum_type = btrfs_super_csum_type(fs_info->super_copy); const char *csum_name = btrfs_super_csum_name(csum_type); return sysfs_emit(buf, "%s (%s-lib)\n", csum_name, csum_name); } BTRFS_ATTR(, checksum, btrfs_checksum_show); static ssize_t btrfs_exclusive_operation_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = to_fs_info(kobj); const char *str; switch (READ_ONCE(fs_info->exclusive_operation)) { case BTRFS_EXCLOP_NONE: str = "none\n"; break; case BTRFS_EXCLOP_BALANCE: str = "balance\n"; break; case BTRFS_EXCLOP_BALANCE_PAUSED: str = "balance paused\n"; break; case BTRFS_EXCLOP_DEV_ADD: str = "device add\n"; break; case BTRFS_EXCLOP_DEV_REMOVE: str = "device remove\n"; break; case BTRFS_EXCLOP_DEV_REPLACE: str = "device replace\n"; break; case BTRFS_EXCLOP_RESIZE: str = "resize\n"; break; case BTRFS_EXCLOP_SWAP_ACTIVATE: str = "swap activate\n"; break; default: str = "UNKNOWN\n"; break; } return sysfs_emit(buf, "%s", str); } BTRFS_ATTR(, exclusive_operation, btrfs_exclusive_operation_show); static ssize_t btrfs_generation_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = to_fs_info(kobj); return sysfs_emit(buf, "%llu\n", btrfs_get_fs_generation(fs_info)); } BTRFS_ATTR(, generation, btrfs_generation_show); static ssize_t btrfs_temp_fsid_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = to_fs_info(kobj); return sysfs_emit(buf, "%d\n", fs_info->fs_devices->temp_fsid); } BTRFS_ATTR(, temp_fsid, btrfs_temp_fsid_show); static const char *btrfs_read_policy_name[] = { "pid", #ifdef CONFIG_BTRFS_EXPERIMENTAL "round-robin", "devid", #endif }; #ifdef CONFIG_BTRFS_EXPERIMENTAL /* Global module configuration parameters. */ static char *read_policy; char *btrfs_get_mod_read_policy(void) { return read_policy; } /* Set perms to 0, disable /sys/module/btrfs/parameter/read_policy interface. */ module_param(read_policy, charp, 0); MODULE_PARM_DESC(read_policy, "Global read policy: pid (default), round-robin[:<min_contig_read>], devid[:<devid>]"); #endif int btrfs_read_policy_to_enum(const char *str, s64 *value_ret) { char param[32]; char __maybe_unused *value_str; if (!str || strlen(str) == 0) return 0; strscpy(param, str); #ifdef CONFIG_BTRFS_EXPERIMENTAL /* Separate value from input in policy:value format. */ value_str = strchr(param, ':'); if (value_str) { char *retptr; *value_str = 0; value_str++; if (!value_ret) return -EINVAL; *value_ret = memparse(value_str, &retptr); /* There could be any trailing typos after the value. */ retptr = skip_spaces(retptr); if (*retptr != 0 || *value_ret <= 0) return -EINVAL; } #endif return sysfs_match_string(btrfs_read_policy_name, param); } #ifdef CONFIG_BTRFS_EXPERIMENTAL int __init btrfs_read_policy_init(void) { s64 value; if (btrfs_read_policy_to_enum(read_policy, &value) == -EINVAL) { btrfs_err(NULL, "invalid read policy or value %s", read_policy); return -EINVAL; } return 0; } #endif static ssize_t btrfs_read_policy_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj); const enum btrfs_read_policy policy = READ_ONCE(fs_devices->read_policy); ssize_t ret = 0; int i; for (i = 0; i < BTRFS_NR_READ_POLICY; i++) { if (ret != 0) ret += sysfs_emit_at(buf, ret, " "); if (i == policy) ret += sysfs_emit_at(buf, ret, "["); ret += sysfs_emit_at(buf, ret, "%s", btrfs_read_policy_name[i]); #ifdef CONFIG_BTRFS_EXPERIMENTAL if (i == BTRFS_READ_POLICY_RR) ret += sysfs_emit_at(buf, ret, ":%u", READ_ONCE(fs_devices->rr_min_contig_read)); if (i == BTRFS_READ_POLICY_DEVID) ret += sysfs_emit_at(buf, ret, ":%llu", READ_ONCE(fs_devices->read_devid)); #endif if (i == policy) ret += sysfs_emit_at(buf, ret, "]"); } ret += sysfs_emit_at(buf, ret, "\n"); return ret; } static ssize_t btrfs_read_policy_store(struct kobject *kobj, struct kobj_attribute *a, const char *buf, size_t len) { struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj); int index; s64 value = -1; index = btrfs_read_policy_to_enum(buf, &value); if (index < 0) return -EINVAL; #ifdef CONFIG_BTRFS_EXPERIMENTAL /* If moving from RR then disable collecting fs stats. */ if (fs_devices->read_policy == BTRFS_READ_POLICY_RR && index != BTRFS_READ_POLICY_RR) fs_devices->collect_fs_stats = false; if (index == BTRFS_READ_POLICY_RR) { if (value != -1) { const u32 sectorsize = fs_devices->fs_info->sectorsize; if (!IS_ALIGNED(value, sectorsize)) { u64 temp_value = round_up(value, sectorsize); btrfs_debug(fs_devices->fs_info, "read_policy: min contig read %lld should be multiple of sectorsize %u, rounded to %llu", value, sectorsize, temp_value); value = temp_value; } } else { value = BTRFS_DEFAULT_RR_MIN_CONTIG_READ; } if (index != READ_ONCE(fs_devices->read_policy) || value != READ_ONCE(fs_devices->rr_min_contig_read)) { WRITE_ONCE(fs_devices->read_policy, index); WRITE_ONCE(fs_devices->rr_min_contig_read, value); btrfs_info(fs_devices->fs_info, "read policy set to '%s:%lld'", btrfs_read_policy_name[index], value); } fs_devices->collect_fs_stats = true; return len; } if (index == BTRFS_READ_POLICY_DEVID) { if (value != -1) { BTRFS_DEV_LOOKUP_ARGS(args); /* Validate input devid. */ args.devid = value; if (btrfs_find_device(fs_devices, &args) == NULL) return -EINVAL; } else { /* Set default devid to the devid of the latest device. */ value = fs_devices->latest_dev->devid; } if (index != READ_ONCE(fs_devices->read_policy) || value != READ_ONCE(fs_devices->read_devid)) { WRITE_ONCE(fs_devices->read_policy, index); WRITE_ONCE(fs_devices->read_devid, value); btrfs_info(fs_devices->fs_info, "read policy set to '%s:%llu'", btrfs_read_policy_name[index], value); } return len; } #endif if (index != READ_ONCE(fs_devices->read_policy)) { WRITE_ONCE(fs_devices->read_policy, index); btrfs_info(fs_devices->fs_info, "read policy set to '%s'", btrfs_read_policy_name[index]); } return len; } BTRFS_ATTR_RW(, read_policy, btrfs_read_policy_show, btrfs_read_policy_store); static ssize_t btrfs_bg_reclaim_threshold_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = to_fs_info(kobj); return sysfs_emit(buf, "%d\n", READ_ONCE(fs_info->bg_reclaim_threshold)); } static ssize_t btrfs_bg_reclaim_threshold_store(struct kobject *kobj, struct kobj_attribute *a, const char *buf, size_t len) { struct btrfs_fs_info *fs_info = to_fs_info(kobj); int thresh; int ret; ret = kstrtoint(buf, 10, &thresh); if (ret) return ret; #ifdef CONFIG_BTRFS_DEBUG if (thresh != 0 && (thresh > 100)) return -EINVAL; #else if (thresh != 0 && (thresh <= 50 || thresh > 100)) return -EINVAL; #endif WRITE_ONCE(fs_info->bg_reclaim_threshold, thresh); return len; } BTRFS_ATTR_RW(, bg_reclaim_threshold, btrfs_bg_reclaim_threshold_show, btrfs_bg_reclaim_threshold_store); /* * Per-filesystem information and stats. * * Path: /sys/fs/btrfs/<uuid>/ */ static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, label), BTRFS_ATTR_PTR(, nodesize), BTRFS_ATTR_PTR(, sectorsize), BTRFS_ATTR_PTR(, clone_alignment), BTRFS_ATTR_PTR(, quota_override), BTRFS_ATTR_PTR(, metadata_uuid), BTRFS_ATTR_PTR(, checksum), BTRFS_ATTR_PTR(, exclusive_operation), BTRFS_ATTR_PTR(, generation), BTRFS_ATTR_PTR(, read_policy), BTRFS_ATTR_PTR(, bg_reclaim_threshold), BTRFS_ATTR_PTR(, commit_stats), BTRFS_ATTR_PTR(, temp_fsid), NULL, }; static void btrfs_release_fsid_kobj(struct kobject *kobj) { struct btrfs_fs_devices *fs_devs = to_fs_devs(kobj); memset(&fs_devs->fsid_kobj, 0, sizeof(struct kobject)); complete(&fs_devs->kobj_unregister); } static const struct kobj_type btrfs_ktype = { .sysfs_ops = &kobj_sysfs_ops, .release = btrfs_release_fsid_kobj, }; static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj) { if (kobj->ktype != &btrfs_ktype) return NULL; return container_of(kobj, struct btrfs_fs_devices, fsid_kobj); } static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj) { if (kobj->ktype != &btrfs_ktype) return NULL; return to_fs_devs(kobj)->fs_info; } static struct kobject *get_btrfs_kobj(struct kobject *kobj) { while (kobj) { if (kobj->ktype == &btrfs_ktype) return kobj; kobj = kobj->parent; } return NULL; } #define NUM_FEATURE_BITS 64 #define BTRFS_FEATURE_NAME_MAX 13 static char btrfs_unknown_feature_names[FEAT_MAX][NUM_FEATURE_BITS][BTRFS_FEATURE_NAME_MAX]; static struct btrfs_feature_attr btrfs_feature_attrs[FEAT_MAX][NUM_FEATURE_BITS]; static_assert(ARRAY_SIZE(btrfs_unknown_feature_names) == ARRAY_SIZE(btrfs_feature_attrs)); static_assert(ARRAY_SIZE(btrfs_unknown_feature_names[0]) == ARRAY_SIZE(btrfs_feature_attrs[0])); static const u64 supported_feature_masks[FEAT_MAX] = { [FEAT_COMPAT] = BTRFS_FEATURE_COMPAT_SUPP, [FEAT_COMPAT_RO] = BTRFS_FEATURE_COMPAT_RO_SUPP, [FEAT_INCOMPAT] = BTRFS_FEATURE_INCOMPAT_SUPP, }; static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add) { int set; for (set = 0; set < FEAT_MAX; set++) { int i; struct attribute *attrs[2]; struct attribute_group agroup = { .name = "features", .attrs = attrs, }; u64 features = get_features(fs_info, set); features &= ~supported_feature_masks[set]; if (!features) continue; attrs[1] = NULL; for (i = 0; i < NUM_FEATURE_BITS; i++) { struct btrfs_feature_attr *fa; if (!(features & (1ULL << i))) continue; fa = &btrfs_feature_attrs[set][i]; attrs[0] = &fa->kobj_attr.attr; if (add) { int ret; ret = sysfs_merge_group(&fs_info->fs_devices->fsid_kobj, &agroup); if (ret) return ret; } else sysfs_unmerge_group(&fs_info->fs_devices->fsid_kobj, &agroup); } } return 0; } static void __btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs) { if (fs_devs->devinfo_kobj) { kobject_del(fs_devs->devinfo_kobj); kobject_put(fs_devs->devinfo_kobj); fs_devs->devinfo_kobj = NULL; } if (fs_devs->devices_kobj) { kobject_del(fs_devs->devices_kobj); kobject_put(fs_devs->devices_kobj); fs_devs->devices_kobj = NULL; } if (fs_devs->fsid_kobj.state_initialized) { kobject_del(&fs_devs->fsid_kobj); kobject_put(&fs_devs->fsid_kobj); wait_for_completion(&fs_devs->kobj_unregister); } } /* when fs_devs is NULL it will remove all fsid kobject */ void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs) { struct list_head *fs_uuids = btrfs_get_fs_uuids(); if (fs_devs) { __btrfs_sysfs_remove_fsid(fs_devs); return; } list_for_each_entry(fs_devs, fs_uuids, fs_list) { __btrfs_sysfs_remove_fsid(fs_devs); } } static void btrfs_sysfs_remove_fs_devices(struct btrfs_fs_devices *fs_devices) { struct btrfs_device *device; struct btrfs_fs_devices *seed; list_for_each_entry(device, &fs_devices->devices, dev_list) btrfs_sysfs_remove_device(device); list_for_each_entry(seed, &fs_devices->seed_list, seed_list) { list_for_each_entry(device, &seed->devices, dev_list) btrfs_sysfs_remove_device(device); } } void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info) { struct kobject *fsid_kobj = &fs_info->fs_devices->fsid_kobj; sysfs_remove_link(fsid_kobj, "bdi"); if (fs_info->space_info_kobj) { sysfs_remove_files(fs_info->space_info_kobj, allocation_attrs); kobject_del(fs_info->space_info_kobj); kobject_put(fs_info->space_info_kobj); } if (fs_info->discard_kobj) { sysfs_remove_files(fs_info->discard_kobj, discard_attrs); kobject_del(fs_info->discard_kobj); kobject_put(fs_info->discard_kobj); } #ifdef CONFIG_BTRFS_DEBUG if (fs_info->debug_kobj) { sysfs_remove_files(fs_info->debug_kobj, btrfs_debug_mount_attrs); kobject_del(fs_info->debug_kobj); kobject_put(fs_info->debug_kobj); } #endif addrm_unknown_feature_attrs(fs_info, false); sysfs_remove_group(fsid_kobj, &btrfs_feature_attr_group); sysfs_remove_files(fsid_kobj, btrfs_attrs); btrfs_sysfs_remove_fs_devices(fs_info->fs_devices); } static const char * const btrfs_feature_set_names[FEAT_MAX] = { [FEAT_COMPAT] = "compat", [FEAT_COMPAT_RO] = "compat_ro", [FEAT_INCOMPAT] = "incompat", }; const char *btrfs_feature_set_name(enum btrfs_feature_set set) { return btrfs_feature_set_names[set]; } char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags) { size_t bufsize = 4096; /* safe max, 64 names * 64 bytes */ int len = 0; int i; char *str; str = kmalloc(bufsize, GFP_KERNEL); if (!str) return str; for (i = 0; i < ARRAY_SIZE(btrfs_feature_attrs[set]); i++) { const char *name; if (!(flags & (1ULL << i))) continue; name = btrfs_feature_attrs[set][i].kobj_attr.attr.name; len += scnprintf(str + len, bufsize - len, "%s%s", len ? "," : "", name); } return str; } static void init_feature_attrs(void) { struct btrfs_feature_attr *fa; int set, i; memset(btrfs_feature_attrs, 0, sizeof(btrfs_feature_attrs)); memset(btrfs_unknown_feature_names, 0, sizeof(btrfs_unknown_feature_names)); for (i = 0; btrfs_supported_feature_attrs[i]; i++) { struct btrfs_feature_attr *sfa; struct attribute *a = btrfs_supported_feature_attrs[i]; int bit; sfa = attr_to_btrfs_feature_attr(a); bit = ilog2(sfa->feature_bit); fa = &btrfs_feature_attrs[sfa->feature_set][bit]; fa->kobj_attr.attr.name = sfa->kobj_attr.attr.name; } for (set = 0; set < FEAT_MAX; set++) { for (i = 0; i < ARRAY_SIZE(btrfs_feature_attrs[set]); i++) { char *name = btrfs_unknown_feature_names[set][i]; fa = &btrfs_feature_attrs[set][i]; if (fa->kobj_attr.attr.name) continue; snprintf(name, BTRFS_FEATURE_NAME_MAX, "%s:%u", btrfs_feature_set_names[set], i); fa->kobj_attr.attr.name = name; fa->kobj_attr.attr.mode = S_IRUGO; fa->feature_set = set; fa->feature_bit = 1ULL << i; } } } /* * Create a sysfs entry for a given block group type at path * /sys/fs/btrfs/UUID/allocation/data/TYPE */ void btrfs_sysfs_add_block_group_type(struct btrfs_block_group *cache) { struct btrfs_fs_info *fs_info = cache->fs_info; struct btrfs_space_info *space_info = cache->space_info; struct raid_kobject *rkobj; const int index = btrfs_bg_flags_to_raid_index(cache->flags); unsigned int nofs_flag; int ret; /* * Setup a NOFS context because kobject_add(), deep in its call chain, * does GFP_KERNEL allocations, and we are often called in a context * where if reclaim is triggered we can deadlock (we are either holding * a transaction handle or some lock required for a transaction * commit). */ nofs_flag = memalloc_nofs_save(); rkobj = kzalloc_obj(*rkobj, GFP_NOFS); if (!rkobj) { memalloc_nofs_restore(nofs_flag); btrfs_warn(cache->fs_info, "couldn't alloc memory for raid level kobject"); return; } rkobj->flags = cache->flags; kobject_init(&rkobj->kobj, &btrfs_raid_ktype); /* * We call this either on mount, or if we've created a block group for a * new index type while running (i.e. when restriping). The running * case is tricky because we could race with other threads, so we need * to have this check to make sure we didn't already init the kobject. * * We don't have to protect on the free side because it only happens on * unmount. */ spin_lock(&space_info->lock); if (space_info->block_group_kobjs[index]) { spin_unlock(&space_info->lock); kobject_put(&rkobj->kobj); return; } else { space_info->block_group_kobjs[index] = &rkobj->kobj; } spin_unlock(&space_info->lock); ret = kobject_add(&rkobj->kobj, &space_info->kobj, "%s", btrfs_bg_type_to_raid_name(rkobj->flags)); memalloc_nofs_restore(nofs_flag); if (ret) { spin_lock(&space_info->lock); space_info->block_group_kobjs[index] = NULL; spin_unlock(&space_info->lock); kobject_put(&rkobj->kobj); btrfs_warn(fs_info, "failed to add kobject for block cache, ignoring"); return; } } /* * Remove sysfs directories for all block group types of a given space info and * the space info as well */ void btrfs_sysfs_remove_space_info(struct btrfs_space_info *space_info) { int i; for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { struct kobject *kobj; kobj = space_info->block_group_kobjs[i]; space_info->block_group_kobjs[i] = NULL; if (kobj) { kobject_del(kobj); kobject_put(kobj); } } kobject_del(&space_info->kobj); kobject_put(&space_info->kobj); } static const char *alloc_name(struct btrfs_space_info *space_info) { u64 flags = space_info->flags; switch (flags) { case BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA: return "mixed"; case BTRFS_BLOCK_GROUP_METADATA: switch (space_info->subgroup_id) { case BTRFS_SUB_GROUP_PRIMARY: return "metadata"; case BTRFS_SUB_GROUP_TREELOG: return "metadata-treelog"; default: WARN_ON_ONCE(1); return "metadata (unknown sub-group)"; } case BTRFS_BLOCK_GROUP_DATA: switch (space_info->subgroup_id) { case BTRFS_SUB_GROUP_PRIMARY: return "data"; case BTRFS_SUB_GROUP_DATA_RELOC: return "data-reloc"; default: WARN_ON_ONCE(1); return "data (unknown sub-group)"; } case BTRFS_BLOCK_GROUP_SYSTEM: ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_PRIMARY); return "system"; case BTRFS_BLOCK_GROUP_METADATA_REMAP: return "metadata-remap"; default: WARN_ON(1); return "invalid-combination"; } } /* * Create a sysfs entry for a space info type at path * /sys/fs/btrfs/UUID/allocation/TYPE */ int btrfs_sysfs_add_space_info_type(struct btrfs_space_info *space_info) { int ret; ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype, space_info->fs_info->space_info_kobj, "%s", alloc_name(space_info)); if (ret) { kobject_put(&space_info->kobj); return ret; } return 0; } void btrfs_sysfs_remove_device(struct btrfs_device *device) { struct kobject *devices_kobj; /* * Seed fs_devices devices_kobj aren't used, fetch kobject from the * fs_info::fs_devices. */ devices_kobj = device->fs_info->fs_devices->devices_kobj; ASSERT(devices_kobj); if (device->bdev) sysfs_remove_link(devices_kobj, bdev_kobj(device->bdev)->name); if (device->devid_kobj.state_initialized) { kobject_del(&device->devid_kobj); kobject_put(&device->devid_kobj); wait_for_completion(&device->kobj_unregister); } } static ssize_t btrfs_devinfo_in_fs_metadata_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { int val; struct btrfs_device *device = container_of(kobj, struct btrfs_device, devid_kobj); val = !!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); return sysfs_emit(buf, "%d\n", val); } BTRFS_ATTR(devid, in_fs_metadata, btrfs_devinfo_in_fs_metadata_show); static ssize_t btrfs_devinfo_missing_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { int val; struct btrfs_device *device = container_of(kobj, struct btrfs_device, devid_kobj); val = !!test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); return sysfs_emit(buf, "%d\n", val); } BTRFS_ATTR(devid, missing, btrfs_devinfo_missing_show); static ssize_t btrfs_devinfo_replace_target_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { int val; struct btrfs_device *device = container_of(kobj, struct btrfs_device, devid_kobj); val = !!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); return sysfs_emit(buf, "%d\n", val); } BTRFS_ATTR(devid, replace_target, btrfs_devinfo_replace_target_show); static ssize_t btrfs_devinfo_scrub_speed_max_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_device *device = container_of(kobj, struct btrfs_device, devid_kobj); return sysfs_emit(buf, "%llu\n", READ_ONCE(device->scrub_speed_max)); } static ssize_t btrfs_devinfo_scrub_speed_max_store(struct kobject *kobj, struct kobj_attribute *a, const char *buf, size_t len) { struct btrfs_device *device = container_of(kobj, struct btrfs_device, devid_kobj); char *endptr; unsigned long long limit; limit = memparse(buf, &endptr); /* There could be trailing '\n', also catch any typos after the value. */ endptr = skip_spaces(endptr); if (*endptr != 0) return -EINVAL; WRITE_ONCE(device->scrub_speed_max, limit); return len; } BTRFS_ATTR_RW(devid, scrub_speed_max, btrfs_devinfo_scrub_speed_max_show, btrfs_devinfo_scrub_speed_max_store); static ssize_t btrfs_devinfo_writeable_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { int val; struct btrfs_device *device = container_of(kobj, struct btrfs_device, devid_kobj); val = !!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); return sysfs_emit(buf, "%d\n", val); } BTRFS_ATTR(devid, writeable, btrfs_devinfo_writeable_show); static ssize_t btrfs_devinfo_fsid_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_device *device = container_of(kobj, struct btrfs_device, devid_kobj); return sysfs_emit(buf, "%pU\n", device->fs_devices->fsid); } BTRFS_ATTR(devid, fsid, btrfs_devinfo_fsid_show); static ssize_t btrfs_devinfo_error_stats_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_device *device = container_of(kobj, struct btrfs_device, devid_kobj); if (!device->dev_stats_valid) return sysfs_emit(buf, "invalid\n"); /* * Print all at once so we get a snapshot of all values from the same * time. Keep them in sync and in order of definition of * btrfs_dev_stat_values. */ return sysfs_emit(buf, "write_errs %d\n" "read_errs %d\n" "flush_errs %d\n" "corruption_errs %d\n" "generation_errs %d\n", btrfs_dev_stat_read(device, BTRFS_DEV_STAT_WRITE_ERRS), btrfs_dev_stat_read(device, BTRFS_DEV_STAT_READ_ERRS), btrfs_dev_stat_read(device, BTRFS_DEV_STAT_FLUSH_ERRS), btrfs_dev_stat_read(device, BTRFS_DEV_STAT_CORRUPTION_ERRS), btrfs_dev_stat_read(device, BTRFS_DEV_STAT_GENERATION_ERRS)); } BTRFS_ATTR(devid, error_stats, btrfs_devinfo_error_stats_show); /* * Information about one device. * * Path: /sys/fs/btrfs/<uuid>/devinfo/<devid>/ */ static struct attribute *devid_attrs[] = { BTRFS_ATTR_PTR(devid, error_stats), BTRFS_ATTR_PTR(devid, fsid), BTRFS_ATTR_PTR(devid, in_fs_metadata), BTRFS_ATTR_PTR(devid, missing), BTRFS_ATTR_PTR(devid, replace_target), BTRFS_ATTR_PTR(devid, scrub_speed_max), BTRFS_ATTR_PTR(devid, writeable), NULL }; ATTRIBUTE_GROUPS(devid); static void btrfs_release_devid_kobj(struct kobject *kobj) { struct btrfs_device *device = container_of(kobj, struct btrfs_device, devid_kobj); memset(&device->devid_kobj, 0, sizeof(struct kobject)); complete(&device->kobj_unregister); } static const struct kobj_type devid_ktype = { .sysfs_ops = &kobj_sysfs_ops, .default_groups = devid_groups, .release = btrfs_release_devid_kobj, }; int btrfs_sysfs_add_device(struct btrfs_device *device) { int ret; unsigned int nofs_flag; struct kobject *devices_kobj; struct kobject *devinfo_kobj; /* * Make sure we use the fs_info::fs_devices to fetch the kobjects even * for the seed fs_devices */ devices_kobj = device->fs_info->fs_devices->devices_kobj; devinfo_kobj = device->fs_info->fs_devices->devinfo_kobj; ASSERT(devices_kobj); ASSERT(devinfo_kobj); nofs_flag = memalloc_nofs_save(); if (device->bdev) { struct kobject *disk_kobj = bdev_kobj(device->bdev); ret = sysfs_create_link(devices_kobj, disk_kobj, disk_kobj->name); if (ret) { btrfs_warn(device->fs_info, "creating sysfs device link for devid %llu failed: %d", device->devid, ret); goto out; } } init_completion(&device->kobj_unregister); ret = kobject_init_and_add(&device->devid_kobj, &devid_ktype, devinfo_kobj, "%llu", device->devid); if (ret) { kobject_put(&device->devid_kobj); btrfs_warn(device->fs_info, "devinfo init for devid %llu failed: %d", device->devid, ret); } out: memalloc_nofs_restore(nofs_flag); return ret; } static int btrfs_sysfs_add_fs_devices(struct btrfs_fs_devices *fs_devices) { int ret; struct btrfs_device *device; struct btrfs_fs_devices *seed; list_for_each_entry(device, &fs_devices->devices, dev_list) { ret = btrfs_sysfs_add_device(device); if (ret) goto fail; } list_for_each_entry(seed, &fs_devices->seed_list, seed_list) { list_for_each_entry(device, &seed->devices, dev_list) { ret = btrfs_sysfs_add_device(device); if (ret) goto fail; } } return 0; fail: btrfs_sysfs_remove_fs_devices(fs_devices); return ret; } void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action) { int ret; ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action); if (ret) btrfs_warn(NULL, "sending event %d to kobject: '%s' (%p): failed", action, kobject_name(&disk_to_dev(bdev->bd_disk)->kobj), &disk_to_dev(bdev->bd_disk)->kobj); } void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices) { char fsid_buf[BTRFS_UUID_UNPARSED_SIZE]; /* * Sprouting changes fsid of the mounted filesystem, rename the fsid * directory */ snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", fs_devices->fsid); if (kobject_rename(&fs_devices->fsid_kobj, fsid_buf)) btrfs_warn(fs_devices->fs_info, "sysfs: failed to create fsid for sprout"); } void btrfs_sysfs_update_devid(struct btrfs_device *device) { char tmp[24]; snprintf(tmp, sizeof(tmp), "%llu", device->devid); if (kobject_rename(&device->devid_kobj, tmp)) btrfs_warn(device->fs_devices->fs_info, "sysfs: failed to update devid for %llu", device->devid); } /* /sys/fs/btrfs/ entry */ static struct kset *btrfs_kset; /* * Creates: * /sys/fs/btrfs/UUID * * Can be called by the device discovery thread. */ int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs) { int ret; init_completion(&fs_devs->kobj_unregister); fs_devs->fsid_kobj.kset = btrfs_kset; ret = kobject_init_and_add(&fs_devs->fsid_kobj, &btrfs_ktype, NULL, "%pU", fs_devs->fsid); if (ret) { kobject_put(&fs_devs->fsid_kobj); return ret; } fs_devs->devices_kobj = kobject_create_and_add("devices", &fs_devs->fsid_kobj); if (!fs_devs->devices_kobj) { btrfs_err(fs_devs->fs_info, "failed to init sysfs device interface"); btrfs_sysfs_remove_fsid(fs_devs); return -ENOMEM; } fs_devs->devinfo_kobj = kobject_create_and_add("devinfo", &fs_devs->fsid_kobj); if (!fs_devs->devinfo_kobj) { btrfs_err(fs_devs->fs_info, "failed to init sysfs devinfo kobject"); btrfs_sysfs_remove_fsid(fs_devs); return -ENOMEM; } return 0; } int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info) { int ret; struct btrfs_fs_devices *fs_devs = fs_info->fs_devices; struct kobject *fsid_kobj = &fs_devs->fsid_kobj; ret = btrfs_sysfs_add_fs_devices(fs_devs); if (ret) return ret; ret = sysfs_create_files(fsid_kobj, btrfs_attrs); if (ret) { btrfs_sysfs_remove_fs_devices(fs_devs); return ret; } ret = sysfs_create_group(fsid_kobj, &btrfs_feature_attr_group); if (ret) goto failure; #ifdef CONFIG_BTRFS_DEBUG fs_info->debug_kobj = kobject_create_and_add("debug", fsid_kobj); if (!fs_info->debug_kobj) { ret = -ENOMEM; goto failure; } ret = sysfs_create_files(fs_info->debug_kobj, btrfs_debug_mount_attrs); if (ret) goto failure; #endif /* Discard directory */ fs_info->discard_kobj = kobject_create_and_add("discard", fsid_kobj); if (!fs_info->discard_kobj) { ret = -ENOMEM; goto failure; } ret = sysfs_create_files(fs_info->discard_kobj, discard_attrs); if (ret) goto failure; ret = addrm_unknown_feature_attrs(fs_info, true); if (ret) goto failure; ret = sysfs_create_link(fsid_kobj, &fs_info->sb->s_bdi->dev->kobj, "bdi"); if (ret) goto failure; fs_info->space_info_kobj = kobject_create_and_add("allocation", fsid_kobj); if (!fs_info->space_info_kobj) { ret = -ENOMEM; goto failure; } ret = sysfs_create_files(fs_info->space_info_kobj, allocation_attrs); if (ret) goto failure; return 0; failure: btrfs_sysfs_remove_mounted(fs_info); return ret; } static ssize_t qgroup_enabled_show(struct kobject *qgroups_kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent); bool enabled; spin_lock(&fs_info->qgroup_lock); enabled = fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON; spin_unlock(&fs_info->qgroup_lock); return sysfs_emit(buf, "%d\n", enabled); } BTRFS_ATTR(qgroups, enabled, qgroup_enabled_show); static ssize_t qgroup_mode_show(struct kobject *qgroups_kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent); ssize_t ret = 0; spin_lock(&fs_info->qgroup_lock); ASSERT(btrfs_qgroup_enabled(fs_info)); switch (btrfs_qgroup_mode(fs_info)) { case BTRFS_QGROUP_MODE_FULL: ret = sysfs_emit(buf, "qgroup\n"); break; case BTRFS_QGROUP_MODE_SIMPLE: ret = sysfs_emit(buf, "squota\n"); break; default: btrfs_warn(fs_info, "unexpected qgroup mode %d\n", btrfs_qgroup_mode(fs_info)); break; } spin_unlock(&fs_info->qgroup_lock); return ret; } BTRFS_ATTR(qgroups, mode, qgroup_mode_show); static ssize_t qgroup_inconsistent_show(struct kobject *qgroups_kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent); bool inconsistent; spin_lock(&fs_info->qgroup_lock); inconsistent = (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT); spin_unlock(&fs_info->qgroup_lock); return sysfs_emit(buf, "%d\n", inconsistent); } BTRFS_ATTR(qgroups, inconsistent, qgroup_inconsistent_show); static ssize_t qgroup_drop_subtree_thres_show(struct kobject *qgroups_kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent); u8 result; spin_lock(&fs_info->qgroup_lock); result = fs_info->qgroup_drop_subtree_thres; spin_unlock(&fs_info->qgroup_lock); return sysfs_emit(buf, "%d\n", result); } static ssize_t qgroup_drop_subtree_thres_store(struct kobject *qgroups_kobj, struct kobj_attribute *a, const char *buf, size_t len) { struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent); u8 new_thres; int ret; ret = kstrtou8(buf, 10, &new_thres); if (ret) return -EINVAL; if (new_thres > BTRFS_MAX_LEVEL) return -EINVAL; spin_lock(&fs_info->qgroup_lock); fs_info->qgroup_drop_subtree_thres = new_thres; spin_unlock(&fs_info->qgroup_lock); return len; } BTRFS_ATTR_RW(qgroups, drop_subtree_threshold, qgroup_drop_subtree_thres_show, qgroup_drop_subtree_thres_store); /* * Qgroups global info * * Path: /sys/fs/btrfs/<uuid>/qgroups/ */ static struct attribute *qgroups_attrs[] = { BTRFS_ATTR_PTR(qgroups, enabled), BTRFS_ATTR_PTR(qgroups, inconsistent), BTRFS_ATTR_PTR(qgroups, drop_subtree_threshold), BTRFS_ATTR_PTR(qgroups, mode), NULL }; ATTRIBUTE_GROUPS(qgroups); static void qgroups_release(struct kobject *kobj) { kfree(kobj); } static const struct kobj_type qgroups_ktype = { .sysfs_ops = &kobj_sysfs_ops, .default_groups = qgroups_groups, .release = qgroups_release, }; static inline struct btrfs_fs_info *qgroup_kobj_to_fs_info(struct kobject *kobj) { return to_fs_info(kobj->parent->parent); } #define QGROUP_ATTR(_member, _show_name) \ static ssize_t btrfs_qgroup_show_##_member(struct kobject *qgroup_kobj, \ struct kobj_attribute *a, \ char *buf) \ { \ struct btrfs_fs_info *fs_info = qgroup_kobj_to_fs_info(qgroup_kobj); \ struct btrfs_qgroup *qgroup = container_of(qgroup_kobj, \ struct btrfs_qgroup, kobj); \ return btrfs_show_u64(&qgroup->_member, &fs_info->qgroup_lock, buf); \ } \ BTRFS_ATTR(qgroup, _show_name, btrfs_qgroup_show_##_member) #define QGROUP_RSV_ATTR(_name, _type) \ static ssize_t btrfs_qgroup_rsv_show_##_name(struct kobject *qgroup_kobj, \ struct kobj_attribute *a, \ char *buf) \ { \ struct btrfs_fs_info *fs_info = qgroup_kobj_to_fs_info(qgroup_kobj); \ struct btrfs_qgroup *qgroup = container_of(qgroup_kobj, \ struct btrfs_qgroup, kobj); \ return btrfs_show_u64(&qgroup->rsv.values[_type], \ &fs_info->qgroup_lock, buf); \ } \ BTRFS_ATTR(qgroup, rsv_##_name, btrfs_qgroup_rsv_show_##_name) QGROUP_ATTR(rfer, referenced); QGROUP_ATTR(excl, exclusive); QGROUP_ATTR(max_rfer, max_referenced); QGROUP_ATTR(max_excl, max_exclusive); QGROUP_ATTR(lim_flags, limit_flags); QGROUP_RSV_ATTR(data, BTRFS_QGROUP_RSV_DATA); QGROUP_RSV_ATTR(meta_pertrans, BTRFS_QGROUP_RSV_META_PERTRANS); QGROUP_RSV_ATTR(meta_prealloc, BTRFS_QGROUP_RSV_META_PREALLOC); /* * Qgroup information. * * Path: /sys/fs/btrfs/<uuid>/qgroups/<level>_<qgroupid>/ */ static struct attribute *qgroup_attrs[] = { BTRFS_ATTR_PTR(qgroup, referenced), BTRFS_ATTR_PTR(qgroup, exclusive), BTRFS_ATTR_PTR(qgroup, max_referenced), BTRFS_ATTR_PTR(qgroup, max_exclusive), BTRFS_ATTR_PTR(qgroup, limit_flags), BTRFS_ATTR_PTR(qgroup, rsv_data), BTRFS_ATTR_PTR(qgroup, rsv_meta_pertrans), BTRFS_ATTR_PTR(qgroup, rsv_meta_prealloc), NULL }; ATTRIBUTE_GROUPS(qgroup); static void qgroup_release(struct kobject *kobj) { struct btrfs_qgroup *qgroup = container_of(kobj, struct btrfs_qgroup, kobj); memset(&qgroup->kobj, 0, sizeof(*kobj)); } static const struct kobj_type qgroup_ktype = { .sysfs_ops = &kobj_sysfs_ops, .release = qgroup_release, .default_groups = qgroup_groups, }; int btrfs_sysfs_add_one_qgroup(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup) { struct kobject *qgroups_kobj = fs_info->qgroups_kobj; int ret; if (btrfs_is_testing(fs_info)) return 0; if (qgroup->kobj.state_initialized) return 0; if (!qgroups_kobj) return -EINVAL; ret = kobject_init_and_add(&qgroup->kobj, &qgroup_ktype, qgroups_kobj, "%hu_%llu", btrfs_qgroup_level(qgroup->qgroupid), btrfs_qgroup_subvolid(qgroup->qgroupid)); if (ret < 0) kobject_put(&qgroup->kobj); return ret; } void btrfs_sysfs_del_qgroups(struct btrfs_fs_info *fs_info) { struct btrfs_qgroup *qgroup; struct btrfs_qgroup *next; if (btrfs_is_testing(fs_info)) return; rbtree_postorder_for_each_entry_safe(qgroup, next, &fs_info->qgroup_tree, node) btrfs_sysfs_del_one_qgroup(fs_info, qgroup); if (fs_info->qgroups_kobj) { kobject_del(fs_info->qgroups_kobj); kobject_put(fs_info->qgroups_kobj); fs_info->qgroups_kobj = NULL; } } /* Called when qgroups get initialized, thus there is no need for locking */ int btrfs_sysfs_add_qgroups(struct btrfs_fs_info *fs_info) { struct kobject *fsid_kobj = &fs_info->fs_devices->fsid_kobj; struct btrfs_qgroup *qgroup; struct btrfs_qgroup *next; int ret = 0; if (btrfs_is_testing(fs_info)) return 0; ASSERT(fsid_kobj); if (fs_info->qgroups_kobj) return 0; fs_info->qgroups_kobj = kzalloc_obj(struct kobject); if (!fs_info->qgroups_kobj) return -ENOMEM; ret = kobject_init_and_add(fs_info->qgroups_kobj, &qgroups_ktype, fsid_kobj, "qgroups"); if (ret < 0) goto out; rbtree_postorder_for_each_entry_safe(qgroup, next, &fs_info->qgroup_tree, node) { ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); if (ret < 0) goto out; } out: if (ret < 0) btrfs_sysfs_del_qgroups(fs_info); return ret; } void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup) { if (btrfs_is_testing(fs_info)) return; if (qgroup->kobj.state_initialized) { kobject_del(&qgroup->kobj); kobject_put(&qgroup->kobj); } } /* * Change per-fs features in /sys/fs/btrfs/UUID/features to match current * values in superblock. Call after any changes to incompat/compat_ro flags */ void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info) { struct kobject *fsid_kobj; int ret; if (!fs_info) return; fsid_kobj = &fs_info->fs_devices->fsid_kobj; if (!fsid_kobj->state_initialized) return; ret = sysfs_update_group(fsid_kobj, &btrfs_feature_attr_group); if (ret < 0) btrfs_warn(fs_info, "failed to update /sys/fs/btrfs/%pU/features: %d", fs_info->fs_devices->fsid, ret); } int __init btrfs_init_sysfs(void) { int ret; btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj); if (!btrfs_kset) return -ENOMEM; init_feature_attrs(); ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); if (ret) goto out2; ret = sysfs_merge_group(&btrfs_kset->kobj, &btrfs_static_feature_attr_group); if (ret) goto out_remove_group; #ifdef CONFIG_BTRFS_DEBUG ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_debug_feature_attr_group); if (ret) { sysfs_unmerge_group(&btrfs_kset->kobj, &btrfs_static_feature_attr_group); goto out_remove_group; } #endif return 0; out_remove_group: sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); out2: kset_unregister(btrfs_kset); return ret; } void __cold btrfs_exit_sysfs(void) { sysfs_unmerge_group(&btrfs_kset->kobj, &btrfs_static_feature_attr_group); sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); #ifdef CONFIG_BTRFS_DEBUG sysfs_remove_group(&btrfs_kset->kobj, &btrfs_debug_feature_attr_group); #endif kset_unregister(btrfs_kset); }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 /* SPDX-License-Identifier: GPL-2.0 */ /* * sysctl.h: General linux system control interface * * Begun 24 March 1995, Stephen Tweedie * **************************************************************** **************************************************************** ** ** WARNING: ** The values in this file are exported to user space via ** the sysctl() binary interface. Do *NOT* change the ** numbering of any existing values here, and do not change ** any numbers within any one set of values. If you have to ** redefine an existing interface, use a new number for it. ** The kernel will then return -ENOTDIR to any application using ** the old binary interface. ** **************************************************************** **************************************************************** */ #ifndef _LINUX_SYSCTL_H #define _LINUX_SYSCTL_H #include <linux/list.h> #include <linux/rcupdate.h> #include <linux/wait.h> #include <linux/rbtree.h> #include <linux/uidgid.h> #include <uapi/linux/sysctl.h> /* For the /proc/sys support */ struct completion; struct ctl_table; struct nsproxy; struct ctl_table_root; struct ctl_table_header; struct ctl_dir; /* Keep the same order as in fs/proc/proc_sysctl.c */ #define SYSCTL_ZERO ((void *)&sysctl_vals[0]) #define SYSCTL_ONE ((void *)&sysctl_vals[1]) #define SYSCTL_TWO ((void *)&sysctl_vals[2]) #define SYSCTL_THREE ((void *)&sysctl_vals[3]) #define SYSCTL_FOUR ((void *)&sysctl_vals[4]) #define SYSCTL_ONE_HUNDRED ((void *)&sysctl_vals[5]) #define SYSCTL_TWO_HUNDRED ((void *)&sysctl_vals[6]) #define SYSCTL_ONE_THOUSAND ((void *)&sysctl_vals[7]) #define SYSCTL_THREE_THOUSAND ((void *)&sysctl_vals[8]) #define SYSCTL_INT_MAX ((void *)&sysctl_vals[9]) /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ #define SYSCTL_MAXOLDUID ((void *)&sysctl_vals[10]) #define SYSCTL_NEG_ONE ((void *)&sysctl_vals[11]) extern const int sysctl_vals[]; #define SYSCTL_LONG_ZERO ((void *)&sysctl_long_vals[0]) #define SYSCTL_LONG_ONE ((void *)&sysctl_long_vals[1]) #define SYSCTL_LONG_MAX ((void *)&sysctl_long_vals[2]) /** * * "dir" originates from read_iter (dir = 0) or write_iter (dir = 1) * in the file_operations struct at proc/proc_sysctl.c. Its value means * one of two things for sysctl: * 1. SYSCTL_USER_TO_KERN(dir) Writing to an internal kernel variable from user * space (dir > 0) * 2. SYSCTL_KERN_TO_USER(dir) Writing to a user space buffer from a kernel * variable (dir == 0). */ #define SYSCTL_USER_TO_KERN(dir) (!!(dir)) #define SYSCTL_KERN_TO_USER(dir) (!dir) extern const unsigned long sysctl_long_vals[]; typedef int proc_handler(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); int proc_dostring(const struct ctl_table *, int, void *, size_t *, loff_t *); int proc_dobool(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); int proc_dointvec(const struct ctl_table *, int, void *, size_t *, loff_t *); int proc_dointvec_minmax(const struct ctl_table *table, int dir, void *buffer, size_t *lenp, loff_t *ppos); int proc_dointvec_conv(const struct ctl_table *table, int dir, void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(bool *negp, unsigned long *u_ptr, int *k_ptr, int dir, const struct ctl_table *table)); int proc_int_k2u_conv_kop(ulong *u_ptr, const int *k_ptr, bool *negp, ulong (*k_ptr_op)(const ulong)); int proc_int_u2k_conv_uop(const ulong *u_ptr, int *k_ptr, const bool *negp, ulong (*u_ptr_op)(const ulong)); int proc_int_conv(bool *negp, ulong *u_ptr, int *k_ptr, int dir, const struct ctl_table *tbl, bool k_ptr_range_check, int (*user_to_kern)(const bool *negp, const ulong *u_ptr, int *k_ptr), int (*kern_to_user)(bool *negp, ulong *u_ptr, const int *k_ptr)); int proc_douintvec(const struct ctl_table *, int, void *, size_t *, loff_t *); int proc_douintvec_minmax(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); int proc_douintvec_conv(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(unsigned long *lvalp, unsigned int *valp, int write, const struct ctl_table *table)); int proc_uint_k2u_conv(ulong *u_ptr, const uint *k_ptr); int proc_uint_u2k_conv_uop(const ulong *u_ptr, uint *k_ptr, ulong (*u_ptr_op)(const ulong)); int proc_uint_conv(ulong *u_ptr, uint *k_ptr, int dir, const struct ctl_table *tbl, bool k_ptr_range_check, int (*user_to_kern)(const ulong *u_ptr, uint *k_ptr), int (*kern_to_user)(ulong *u_ptr, const uint *k_ptr)); int proc_dou8vec_minmax(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); int proc_doulongvec_minmax(const struct ctl_table *, int, void *, size_t *, loff_t *); int proc_doulongvec_minmax_conv(const struct ctl_table *table, int dir, void *buffer, size_t *lenp, loff_t *ppos, unsigned long convmul, unsigned long convdiv); int proc_do_large_bitmap(const struct ctl_table *, int, void *, size_t *, loff_t *); int proc_do_static_key(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); /* * Register a set of sysctl names by calling register_sysctl * with an initialised array of struct ctl_table's. * * sysctl names can be mirrored automatically under /proc/sys. The * procname supplied controls /proc naming. * * The table's mode will be honoured for proc-fs access. * * Leaf nodes in the sysctl tree will be represented by a single file * under /proc; non-leaf nodes will be represented by directories. A * null procname disables /proc mirroring at this node. * * The data and maxlen fields of the ctl_table * struct enable minimal validation of the values being written to be * performed, and the mode field allows minimal authentication. * * There must be a proc_handler routine for any terminal nodes * mirrored under /proc/sys (non-terminals are handled by a built-in * directory handler). Several default handlers are available to * cover common cases. */ /* Support for userspace poll() to watch for changes */ struct ctl_table_poll { atomic_t event; wait_queue_head_t wait; }; static inline void *proc_sys_poll_event(struct ctl_table_poll *poll) { return (void *)(unsigned long)atomic_read(&poll->event); } #define __CTL_TABLE_POLL_INITIALIZER(name) { \ .event = ATOMIC_INIT(0), \ .wait = __WAIT_QUEUE_HEAD_INITIALIZER(name.wait) } #define DEFINE_CTL_TABLE_POLL(name) \ struct ctl_table_poll name = __CTL_TABLE_POLL_INITIALIZER(name) /* A sysctl table is an array of struct ctl_table: */ struct ctl_table { const char *procname; /* Text ID for /proc/sys */ void *data; int maxlen; umode_t mode; proc_handler *proc_handler; /* Callback for text formatting */ struct ctl_table_poll *poll; void *extra1; void *extra2; } __randomize_layout; struct ctl_node { struct rb_node node; struct ctl_table_header *header; }; /** * struct ctl_table_header - maintains dynamic lists of struct ctl_table trees * @ctl_table: pointer to the first element in ctl_table array * @ctl_table_size: number of elements pointed by @ctl_table * @used: The entry will never be touched when equal to 0. * @count: Upped every time something is added to @inodes and downed every time * something is removed from inodes * @nreg: When nreg drops to 0 the ctl_table_header will be unregistered. * @rcu: Delays the freeing of the inode. Introduced with "unfuck proc_sysctl ->d_compare()" * * @type: Enumeration to differentiate between ctl target types * @type.SYSCTL_TABLE_TYPE_DEFAULT: ctl target with no special considerations * @type.SYSCTL_TABLE_TYPE_PERMANENTLY_EMPTY: Identifies a permanently empty dir * target to serve as a mount point */ struct ctl_table_header { union { struct { const struct ctl_table *ctl_table; int ctl_table_size; int used; int count; int nreg; }; struct rcu_head rcu; }; struct completion *unregistering; const struct ctl_table *ctl_table_arg; struct ctl_table_root *root; struct ctl_table_set *set; struct ctl_dir *parent; struct ctl_node *node; struct hlist_head inodes; /* head for proc_inode->sysctl_inodes */ enum { SYSCTL_TABLE_TYPE_DEFAULT, SYSCTL_TABLE_TYPE_PERMANENTLY_EMPTY, } type; }; struct ctl_dir { /* Header must be at the start of ctl_dir */ struct ctl_table_header header; struct rb_root root; }; struct ctl_table_set { int (*is_seen)(struct ctl_table_set *); struct ctl_dir dir; }; struct ctl_table_root { struct ctl_table_set default_set; struct ctl_table_set *(*lookup)(struct ctl_table_root *root); void (*set_ownership)(struct ctl_table_header *head, kuid_t *uid, kgid_t *gid); int (*permissions)(struct ctl_table_header *head, const struct ctl_table *table); }; #define register_sysctl(path, table) \ register_sysctl_sz(path, table, ARRAY_SIZE(table)) #ifdef CONFIG_SYSCTL void proc_sys_poll_notify(struct ctl_table_poll *poll); extern void setup_sysctl_set(struct ctl_table_set *p, struct ctl_table_root *root, int (*is_seen)(struct ctl_table_set *)); extern void retire_sysctl_set(struct ctl_table_set *set); struct ctl_table_header *__register_sysctl_table( struct ctl_table_set *set, const char *path, const struct ctl_table *table, size_t table_size); struct ctl_table_header *register_sysctl_sz(const char *path, const struct ctl_table *table, size_t table_size); void unregister_sysctl_table(struct ctl_table_header * table); extern int sysctl_init_bases(void); extern void __register_sysctl_init(const char *path, const struct ctl_table *table, const char *table_name, size_t table_size); #define register_sysctl_init(path, table) \ __register_sysctl_init(path, table, #table, ARRAY_SIZE(table)) extern struct ctl_table_header *register_sysctl_mount_point(const char *path); void do_sysctl_args(void); bool sysctl_is_alias(char *param); extern int unaligned_enabled; extern int no_unaligned_warning; #else /* CONFIG_SYSCTL */ static inline void register_sysctl_init(const char *path, const struct ctl_table *table) { } static inline struct ctl_table_header *register_sysctl_mount_point(const char *path) { return NULL; } static inline struct ctl_table_header *register_sysctl_sz(const char *path, const struct ctl_table *table, size_t table_size) { return NULL; } static inline void unregister_sysctl_table(struct ctl_table_header * table) { } static inline void setup_sysctl_set(struct ctl_table_set *p, struct ctl_table_root *root, int (*is_seen)(struct ctl_table_set *)) { } static inline void do_sysctl_args(void) { } static inline bool sysctl_is_alias(char *param) { return false; } #endif /* CONFIG_SYSCTL */ #endif /* _LINUX_SYSCTL_H */
24 24 136 136 136 136 136 136 136 136 136 136 136 136 136 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 // SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/recovery.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ */ #include <linux/unaligned.h> #include <linux/fs.h> #include <linux/f2fs_fs.h> #include <linux/sched/mm.h> #include "f2fs.h" #include "node.h" #include "segment.h" /* * Roll forward recovery scenarios. * * [Term] F: fsync_mark, D: dentry_mark * * 1. inode(x) | CP | inode(x) | dnode(F) * -> Update the latest inode(x). * * 2. inode(x) | CP | inode(F) | dnode(F) * -> No problem. * * 3. inode(x) | CP | dnode(F) | inode(x) * -> Recover to the latest dnode(F), and drop the last inode(x) * * 4. inode(x) | CP | dnode(F) | inode(F) * -> No problem. * * 5. CP | inode(x) | dnode(F) * -> The inode(DF) was missing. Should drop this dnode(F). * * 6. CP | inode(DF) | dnode(F) * -> No problem. * * 7. CP | dnode(F) | inode(DF) * -> If f2fs_iget fails, then goto next to find inode(DF). * * 8. CP | dnode(F) | inode(x) * -> If f2fs_iget fails, then goto next to find inode(DF). * But it will fail due to no inode(DF). */ static struct kmem_cache *fsync_entry_slab; bool f2fs_space_for_roll_forward(struct f2fs_sb_info *sbi) { s64 nalloc = percpu_counter_sum_positive(&sbi->alloc_valid_block_count); if (sbi->last_valid_block_count + nalloc > sbi->user_block_count) return false; if (NM_I(sbi)->max_rf_node_blocks && percpu_counter_sum_positive(&sbi->rf_node_block_count) >= NM_I(sbi)->max_rf_node_blocks) return false; return true; } static struct fsync_inode_entry *get_fsync_inode(struct list_head *head, nid_t ino) { struct fsync_inode_entry *entry; list_for_each_entry(entry, head, list) if (entry->inode->i_ino == ino) return entry; return NULL; } static struct fsync_inode_entry *add_fsync_inode(struct f2fs_sb_info *sbi, struct list_head *head, nid_t ino, bool quota_inode) { struct inode *inode; struct fsync_inode_entry *entry; int err; inode = f2fs_iget_retry(sbi->sb, ino); if (IS_ERR(inode)) return ERR_CAST(inode); err = f2fs_dquot_initialize(inode); if (err) goto err_out; if (quota_inode) { err = dquot_alloc_inode(inode); if (err) goto err_out; } entry = f2fs_kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO, true, NULL); entry->inode = inode; list_add_tail(&entry->list, head); return entry; err_out: iput(inode); return ERR_PTR(err); } static void del_fsync_inode(struct fsync_inode_entry *entry, int drop) { if (drop) { /* inode should not be recovered, drop it */ f2fs_inode_synced(entry->inode); } iput(entry->inode); list_del(&entry->list); kmem_cache_free(fsync_entry_slab, entry); } static int init_recovered_filename(const struct inode *dir, struct f2fs_inode *raw_inode, struct f2fs_filename *fname, struct qstr *usr_fname) { int err; memset(fname, 0, sizeof(*fname)); fname->disk_name.len = le32_to_cpu(raw_inode->i_namelen); fname->disk_name.name = raw_inode->i_name; if (WARN_ON(fname->disk_name.len > F2FS_NAME_LEN)) return -ENAMETOOLONG; if (!IS_ENCRYPTED(dir)) { usr_fname->name = fname->disk_name.name; usr_fname->len = fname->disk_name.len; fname->usr_fname = usr_fname; } /* Compute the hash of the filename */ if (IS_ENCRYPTED(dir) && IS_CASEFOLDED(dir)) { /* * In this case the hash isn't computable without the key, so it * was saved on-disk. */ if (fname->disk_name.len + sizeof(f2fs_hash_t) > F2FS_NAME_LEN) return -EINVAL; fname->hash = get_unaligned((f2fs_hash_t *) &raw_inode->i_name[fname->disk_name.len]); } else if (IS_CASEFOLDED(dir)) { err = f2fs_init_casefolded_name(dir, fname); if (err) return err; f2fs_hash_filename(dir, fname); /* Case-sensitive match is fine for recovery */ f2fs_free_casefolded_name(fname); } else { f2fs_hash_filename(dir, fname); } return 0; } static int recover_dentry(struct inode *inode, struct folio *ifolio, struct list_head *dir_list) { struct f2fs_inode *raw_inode = F2FS_INODE(ifolio); nid_t pino = le32_to_cpu(raw_inode->i_pino); struct f2fs_dir_entry *de; struct f2fs_filename fname; struct qstr usr_fname; struct folio *folio; struct inode *dir, *einode; struct fsync_inode_entry *entry; int err = 0; char *name; entry = get_fsync_inode(dir_list, pino); if (!entry) { entry = add_fsync_inode(F2FS_I_SB(inode), dir_list, pino, false); if (IS_ERR(entry)) { dir = ERR_CAST(entry); err = PTR_ERR(entry); goto out; } } dir = entry->inode; err = init_recovered_filename(dir, raw_inode, &fname, &usr_fname); if (err) goto out; retry: de = __f2fs_find_entry(dir, &fname, &folio); if (de && inode->i_ino == le32_to_cpu(de->ino)) goto out_put; if (de) { einode = f2fs_iget_retry(inode->i_sb, le32_to_cpu(de->ino)); if (IS_ERR(einode)) { WARN_ON(1); err = PTR_ERR(einode); if (err == -ENOENT) err = -EEXIST; goto out_put; } err = f2fs_dquot_initialize(einode); if (err) { iput(einode); goto out_put; } err = f2fs_acquire_orphan_inode(F2FS_I_SB(inode)); if (err) { iput(einode); goto out_put; } f2fs_delete_entry(de, folio, dir, einode); iput(einode); goto retry; } else if (IS_ERR(folio)) { err = PTR_ERR(folio); } else { err = f2fs_add_dentry(dir, &fname, inode, inode->i_ino, inode->i_mode); } if (err == -ENOMEM) goto retry; goto out; out_put: f2fs_folio_put(folio, false); out: if (file_enc_name(inode)) name = "<encrypted>"; else name = raw_inode->i_name; f2fs_notice(F2FS_I_SB(inode), "%s: ino = %x, name = %s, dir = %lx, err = %d", __func__, ino_of_node(ifolio), name, IS_ERR(dir) ? 0 : dir->i_ino, err); return err; } static int recover_quota_data(struct inode *inode, struct folio *folio) { struct f2fs_inode *raw = F2FS_INODE(folio); struct iattr attr; uid_t i_uid = le32_to_cpu(raw->i_uid); gid_t i_gid = le32_to_cpu(raw->i_gid); int err; memset(&attr, 0, sizeof(attr)); attr.ia_vfsuid = VFSUIDT_INIT(make_kuid(inode->i_sb->s_user_ns, i_uid)); attr.ia_vfsgid = VFSGIDT_INIT(make_kgid(inode->i_sb->s_user_ns, i_gid)); if (!vfsuid_eq(attr.ia_vfsuid, i_uid_into_vfsuid(&nop_mnt_idmap, inode))) attr.ia_valid |= ATTR_UID; if (!vfsgid_eq(attr.ia_vfsgid, i_gid_into_vfsgid(&nop_mnt_idmap, inode))) attr.ia_valid |= ATTR_GID; if (!attr.ia_valid) return 0; err = dquot_transfer(&nop_mnt_idmap, inode, &attr); if (err) set_sbi_flag(F2FS_I_SB(inode), SBI_QUOTA_NEED_REPAIR); return err; } static void recover_inline_flags(struct inode *inode, struct f2fs_inode *ri) { if (ri->i_inline & F2FS_PIN_FILE) set_inode_flag(inode, FI_PIN_FILE); else clear_inode_flag(inode, FI_PIN_FILE); if (ri->i_inline & F2FS_DATA_EXIST) set_inode_flag(inode, FI_DATA_EXIST); else clear_inode_flag(inode, FI_DATA_EXIST); } static int recover_inode(struct inode *inode, struct folio *folio) { struct f2fs_inode *raw = F2FS_INODE(folio); struct f2fs_inode_info *fi = F2FS_I(inode); char *name; int err; inode->i_mode = le16_to_cpu(raw->i_mode); err = recover_quota_data(inode, folio); if (err) return err; i_uid_write(inode, le32_to_cpu(raw->i_uid)); i_gid_write(inode, le32_to_cpu(raw->i_gid)); if (raw->i_inline & F2FS_EXTRA_ATTR) { if (f2fs_sb_has_project_quota(F2FS_I_SB(inode)) && F2FS_FITS_IN_INODE(raw, le16_to_cpu(raw->i_extra_isize), i_projid)) { projid_t i_projid; kprojid_t kprojid; i_projid = (projid_t)le32_to_cpu(raw->i_projid); kprojid = make_kprojid(&init_user_ns, i_projid); if (!projid_eq(kprojid, fi->i_projid)) { err = f2fs_transfer_project_quota(inode, kprojid); if (err) return err; fi->i_projid = kprojid; } } } f2fs_i_size_write(inode, le64_to_cpu(raw->i_size)); inode_set_atime(inode, le64_to_cpu(raw->i_atime), le32_to_cpu(raw->i_atime_nsec)); inode_set_ctime(inode, le64_to_cpu(raw->i_ctime), le32_to_cpu(raw->i_ctime_nsec)); inode_set_mtime(inode, le64_to_cpu(raw->i_mtime), le32_to_cpu(raw->i_mtime_nsec)); fi->i_advise = raw->i_advise; fi->i_flags = le32_to_cpu(raw->i_flags); f2fs_set_inode_flags(inode); fi->i_gc_failures = le16_to_cpu(raw->i_gc_failures); recover_inline_flags(inode, raw); f2fs_mark_inode_dirty_sync(inode, true); if (file_enc_name(inode)) name = "<encrypted>"; else name = F2FS_INODE(folio)->i_name; f2fs_notice(F2FS_I_SB(inode), "recover_inode: ino = %x, name = %s, inline = %x", ino_of_node(folio), name, raw->i_inline); return 0; } static unsigned int adjust_por_ra_blocks(struct f2fs_sb_info *sbi, unsigned int ra_blocks, unsigned int blkaddr, unsigned int next_blkaddr) { if (blkaddr + 1 == next_blkaddr) ra_blocks = min_t(unsigned int, RECOVERY_MAX_RA_BLOCKS, ra_blocks * 2); else if (next_blkaddr % BLKS_PER_SEG(sbi)) ra_blocks = max_t(unsigned int, RECOVERY_MIN_RA_BLOCKS, ra_blocks / 2); return ra_blocks; } /* Detect looped node chain with Floyd's cycle detection algorithm. */ static int sanity_check_node_chain(struct f2fs_sb_info *sbi, block_t blkaddr, block_t *blkaddr_fast, bool *is_detecting) { unsigned int ra_blocks = RECOVERY_MAX_RA_BLOCKS; int i; if (!*is_detecting) return 0; for (i = 0; i < 2; i++) { struct folio *folio; if (!f2fs_is_valid_blkaddr(sbi, *blkaddr_fast, META_POR)) { *is_detecting = false; return 0; } folio = f2fs_get_tmp_folio(sbi, *blkaddr_fast); if (IS_ERR(folio)) return PTR_ERR(folio); if (!is_recoverable_dnode(folio)) { f2fs_folio_put(folio, true); *is_detecting = false; return 0; } ra_blocks = adjust_por_ra_blocks(sbi, ra_blocks, *blkaddr_fast, next_blkaddr_of_node(folio)); *blkaddr_fast = next_blkaddr_of_node(folio); f2fs_folio_put(folio, true); f2fs_ra_meta_pages_cond(sbi, *blkaddr_fast, ra_blocks); } if (*blkaddr_fast == blkaddr) { f2fs_notice(sbi, "%s: Detect looped node chain on blkaddr:%u." " Run fsck to fix it.", __func__, blkaddr); return -EINVAL; } return 0; } static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, bool check_only, bool *new_inode) { struct curseg_info *curseg; block_t blkaddr, blkaddr_fast; bool is_detecting = true; int err = 0; /* get node pages in the current segment */ curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); blkaddr_fast = blkaddr; while (1) { struct fsync_inode_entry *entry; struct folio *folio; if (!f2fs_is_valid_blkaddr(sbi, blkaddr, META_POR)) return 0; folio = f2fs_get_tmp_folio(sbi, blkaddr); if (IS_ERR(folio)) { err = PTR_ERR(folio); break; } if (!is_recoverable_dnode(folio)) { f2fs_folio_put(folio, true); break; } if (!is_fsync_dnode(folio)) goto next; entry = get_fsync_inode(head, ino_of_node(folio)); if (!entry) { bool quota_inode = false; if (!check_only && IS_INODE(folio) && is_dent_dnode(folio)) { err = f2fs_recover_inode_page(sbi, folio); if (err) { f2fs_folio_put(folio, true); break; } quota_inode = true; } entry = add_fsync_inode(sbi, head, ino_of_node(folio), quota_inode); if (IS_ERR(entry)) { err = PTR_ERR(entry); /* * CP | dnode(F) | inode(DF) * For this case, we should not give up now. */ if (err == -ENOENT) { if (check_only) *new_inode = true; goto next; } f2fs_folio_put(folio, true); break; } } entry->blkaddr = blkaddr; if (IS_INODE(folio) && is_dent_dnode(folio)) entry->last_dentry = blkaddr; next: /* check next segment */ blkaddr = next_blkaddr_of_node(folio); f2fs_folio_put(folio, true); err = sanity_check_node_chain(sbi, blkaddr, &blkaddr_fast, &is_detecting); if (err) break; } return err; } static void destroy_fsync_dnodes(struct list_head *head, int drop) { struct fsync_inode_entry *entry, *tmp; list_for_each_entry_safe(entry, tmp, head, list) del_fsync_inode(entry, drop); } static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, block_t blkaddr, struct dnode_of_data *dn) { struct seg_entry *sentry; unsigned int segno = GET_SEGNO(sbi, blkaddr); unsigned short blkoff = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); struct f2fs_summary_block *sum_node; struct f2fs_summary sum; struct folio *sum_folio, *node_folio; struct dnode_of_data tdn = *dn; nid_t ino, nid; struct inode *inode; unsigned int offset, ofs_in_node, max_addrs; block_t bidx; int i; sentry = get_seg_entry(sbi, segno); if (!f2fs_test_bit(blkoff, sentry->cur_valid_map)) return 0; /* Get the previous summary */ for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { struct curseg_info *curseg = CURSEG_I(sbi, i); if (curseg->segno == segno) { sum = sum_entries(curseg->sum_blk)[blkoff]; goto got_it; } } sum_folio = f2fs_get_sum_folio(sbi, segno); if (IS_ERR(sum_folio)) return PTR_ERR(sum_folio); sum_node = SUM_BLK_PAGE_ADDR(sbi, sum_folio, segno); sum = sum_entries(sum_node)[blkoff]; f2fs_folio_put(sum_folio, true); got_it: /* Use the locked dnode page and inode */ nid = le32_to_cpu(sum.nid); ofs_in_node = le16_to_cpu(sum.ofs_in_node); max_addrs = ADDRS_PER_PAGE(dn->node_folio, dn->inode); if (ofs_in_node >= max_addrs) { f2fs_err(sbi, "Inconsistent ofs_in_node:%u in summary, ino:%lu, nid:%u, max:%u", ofs_in_node, dn->inode->i_ino, nid, max_addrs); f2fs_handle_error(sbi, ERROR_INCONSISTENT_SUMMARY); return -EFSCORRUPTED; } if (dn->inode->i_ino == nid) { tdn.nid = nid; if (!dn->inode_folio_locked) folio_lock(dn->inode_folio); tdn.node_folio = dn->inode_folio; tdn.ofs_in_node = ofs_in_node; goto truncate_out; } else if (dn->nid == nid) { tdn.ofs_in_node = ofs_in_node; goto truncate_out; } /* Get the node page */ node_folio = f2fs_get_node_folio(sbi, nid, NODE_TYPE_REGULAR); if (IS_ERR(node_folio)) return PTR_ERR(node_folio); offset = ofs_of_node(node_folio); ino = ino_of_node(node_folio); f2fs_folio_put(node_folio, true); if (ino != dn->inode->i_ino) { int ret; /* Deallocate previous index in the node page */ inode = f2fs_iget_retry(sbi->sb, ino); if (IS_ERR(inode)) return PTR_ERR(inode); ret = f2fs_dquot_initialize(inode); if (ret) { iput(inode); return ret; } } else { inode = dn->inode; } bidx = f2fs_start_bidx_of_node(offset, inode) + le16_to_cpu(sum.ofs_in_node); /* * if inode page is locked, unlock temporarily, but its reference * count keeps alive. */ if (ino == dn->inode->i_ino && dn->inode_folio_locked) folio_unlock(dn->inode_folio); set_new_dnode(&tdn, inode, NULL, NULL, 0); if (f2fs_get_dnode_of_data(&tdn, bidx, LOOKUP_NODE)) goto out; if (tdn.data_blkaddr == blkaddr) f2fs_truncate_data_blocks_range(&tdn, 1); f2fs_put_dnode(&tdn); out: if (ino != dn->inode->i_ino) iput(inode); else if (dn->inode_folio_locked) folio_lock(dn->inode_folio); return 0; truncate_out: if (f2fs_data_blkaddr(&tdn) == blkaddr) f2fs_truncate_data_blocks_range(&tdn, 1); if (dn->inode->i_ino == nid && !dn->inode_folio_locked) folio_unlock(dn->inode_folio); return 0; } static int f2fs_reserve_new_block_retry(struct dnode_of_data *dn) { int i, err = 0; for (i = DEFAULT_FAILURE_RETRY_COUNT; i > 0; i--) { err = f2fs_reserve_new_block(dn); if (!err) break; } return err; } static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, struct folio *folio) { struct dnode_of_data dn; struct node_info ni; unsigned int start = 0, end = 0, index; int err = 0, recovered = 0; /* step 1: recover xattr */ if (IS_INODE(folio)) { err = f2fs_recover_inline_xattr(inode, folio); if (err) goto out; } else if (f2fs_has_xattr_block(ofs_of_node(folio))) { err = f2fs_recover_xattr_data(inode, folio); if (!err) recovered++; goto out; } /* step 2: recover inline data */ err = f2fs_recover_inline_data(inode, folio); if (err) { if (err == 1) err = 0; goto out; } /* step 3: recover data indices */ start = f2fs_start_bidx_of_node(ofs_of_node(folio), inode); end = start + ADDRS_PER_PAGE(folio, inode); set_new_dnode(&dn, inode, NULL, NULL, 0); retry_dn: err = f2fs_get_dnode_of_data(&dn, start, ALLOC_NODE); if (err) { if (err == -ENOMEM) { memalloc_retry_wait(GFP_NOFS); goto retry_dn; } goto out; } f2fs_folio_wait_writeback(dn.node_folio, NODE, true, true); err = f2fs_get_node_info(sbi, dn.nid, &ni, false); if (err) goto err; f2fs_bug_on(sbi, ni.ino != ino_of_node(folio)); if (ofs_of_node(dn.node_folio) != ofs_of_node(folio)) { f2fs_warn(sbi, "Inconsistent ofs_of_node, ino:%lu, ofs:%u, %u", inode->i_ino, ofs_of_node(dn.node_folio), ofs_of_node(folio)); err = -EFSCORRUPTED; f2fs_handle_error(sbi, ERROR_INCONSISTENT_FOOTER); goto err; } for (index = start; index < end; index++, dn.ofs_in_node++) { block_t src, dest; src = f2fs_data_blkaddr(&dn); dest = data_blkaddr(dn.inode, folio, dn.ofs_in_node); if (__is_valid_data_blkaddr(src) && !f2fs_is_valid_blkaddr(sbi, src, META_POR)) { err = -EFSCORRUPTED; goto err; } if (__is_valid_data_blkaddr(dest) && !f2fs_is_valid_blkaddr(sbi, dest, META_POR)) { err = -EFSCORRUPTED; goto err; } /* skip recovering if dest is the same as src */ if (src == dest) continue; /* dest is invalid, just invalidate src block */ if (dest == NULL_ADDR) { f2fs_truncate_data_blocks_range(&dn, 1); continue; } if (!file_keep_isize(inode) && (i_size_read(inode) <= ((loff_t)index << PAGE_SHIFT))) f2fs_i_size_write(inode, (loff_t)(index + 1) << PAGE_SHIFT); /* * dest is reserved block, invalidate src block * and then reserve one new block in dnode page. */ if (dest == NEW_ADDR) { f2fs_truncate_data_blocks_range(&dn, 1); err = f2fs_reserve_new_block_retry(&dn); if (err) goto err; continue; } /* dest is valid block, try to recover from src to dest */ if (f2fs_is_valid_blkaddr(sbi, dest, META_POR)) { if (src == NULL_ADDR) { err = f2fs_reserve_new_block_retry(&dn); if (err) goto err; } retry_prev: /* Check the previous node page having this index */ err = check_index_in_prev_nodes(sbi, dest, &dn); if (err) { if (err == -ENOMEM) { memalloc_retry_wait(GFP_NOFS); goto retry_prev; } goto err; } if (f2fs_is_valid_blkaddr(sbi, dest, DATA_GENERIC_ENHANCE_UPDATE)) { f2fs_err(sbi, "Inconsistent dest blkaddr:%u, ino:%lu, ofs:%u", dest, inode->i_ino, dn.ofs_in_node); err = -EFSCORRUPTED; goto err; } /* write dummy data page */ f2fs_replace_block(sbi, &dn, src, dest, ni.version, false, false); recovered++; } } copy_node_footer(dn.node_folio, folio); fill_node_footer(dn.node_folio, dn.nid, ni.ino, ofs_of_node(folio), false); folio_mark_dirty(dn.node_folio); err: f2fs_put_dnode(&dn); out: f2fs_notice(sbi, "recover_data: ino = %lx, nid = %x (i_size: %s), " "range (%u, %u), recovered = %d, err = %d", inode->i_ino, nid_of_node(folio), file_keep_isize(inode) ? "keep" : "recover", start, end, recovered, err); return err; } static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, struct list_head *tmp_inode_list, struct list_head *dir_list) { struct curseg_info *curseg; int err = 0; block_t blkaddr; unsigned int ra_blocks = RECOVERY_MAX_RA_BLOCKS; unsigned int recoverable_dnode = 0; unsigned int fsynced_dnode = 0; unsigned int total_dnode = 0; unsigned int recovered_inode = 0; unsigned int recovered_dentry = 0; unsigned int recovered_dnode = 0; f2fs_notice(sbi, "do_recover_data: start to recover dnode"); /* get node pages in the current segment */ curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); while (1) { struct fsync_inode_entry *entry; struct folio *folio; if (!f2fs_is_valid_blkaddr(sbi, blkaddr, META_POR)) break; folio = f2fs_get_tmp_folio(sbi, blkaddr); if (IS_ERR(folio)) { err = PTR_ERR(folio); break; } if (!is_recoverable_dnode(folio)) { f2fs_folio_put(folio, true); break; } recoverable_dnode++; entry = get_fsync_inode(inode_list, ino_of_node(folio)); if (!entry) goto next; fsynced_dnode++; /* * inode(x) | CP | inode(x) | dnode(F) * In this case, we can lose the latest inode(x). * So, call recover_inode for the inode update. */ if (IS_INODE(folio)) { err = recover_inode(entry->inode, folio); if (err) { f2fs_folio_put(folio, true); break; } recovered_inode++; } if (entry->last_dentry == blkaddr) { err = recover_dentry(entry->inode, folio, dir_list); if (err) { f2fs_folio_put(folio, true); break; } recovered_dentry++; } err = do_recover_data(sbi, entry->inode, folio); if (err) { f2fs_folio_put(folio, true); break; } recovered_dnode++; if (entry->blkaddr == blkaddr) list_move_tail(&entry->list, tmp_inode_list); next: ra_blocks = adjust_por_ra_blocks(sbi, ra_blocks, blkaddr, next_blkaddr_of_node(folio)); /* check next segment */ blkaddr = next_blkaddr_of_node(folio); f2fs_folio_put(folio, true); f2fs_ra_meta_pages_cond(sbi, blkaddr, ra_blocks); total_dnode++; } if (!err) err = f2fs_allocate_new_segments(sbi); f2fs_notice(sbi, "do_recover_data: dnode: (recoverable: %u, fsynced: %u, " "total: %u), recovered: (inode: %u, dentry: %u, dnode: %u), err: %d", recoverable_dnode, fsynced_dnode, total_dnode, recovered_inode, recovered_dentry, recovered_dnode, err); return err; } int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) { LIST_HEAD(inode_list); LIST_HEAD(tmp_inode_list); LIST_HEAD(dir_list); struct f2fs_lock_context lc; int err; int ret = 0; unsigned long s_flags = sbi->sb->s_flags; bool need_writecp = false; bool new_inode = false; f2fs_notice(sbi, "f2fs_recover_fsync_data: recovery fsync data, " "check_only: %d", check_only); if (is_sbi_flag_set(sbi, SBI_IS_WRITABLE)) f2fs_info(sbi, "recover fsync data on readonly fs"); /* prevent checkpoint */ f2fs_down_write_trace(&sbi->cp_global_sem, &lc); /* step #1: find fsynced inode numbers */ err = find_fsync_dnodes(sbi, &inode_list, check_only, &new_inode); if (err < 0 || (list_empty(&inode_list) && (!check_only || !new_inode))) goto skip; if (check_only) { ret = 1; goto skip; } need_writecp = true; /* step #2: recover data */ err = recover_data(sbi, &inode_list, &tmp_inode_list, &dir_list); if (!err) f2fs_bug_on(sbi, !list_empty(&inode_list)); else f2fs_bug_on(sbi, sbi->sb->s_flags & SB_ACTIVE); skip: destroy_fsync_dnodes(&inode_list, err); destroy_fsync_dnodes(&tmp_inode_list, err); /* truncate meta pages to be used by the recovery */ truncate_inode_pages_range(META_MAPPING(sbi), (loff_t)MAIN_BLKADDR(sbi) << PAGE_SHIFT, -1); if (err) { truncate_inode_pages_final(NODE_MAPPING(sbi)); truncate_inode_pages_final(META_MAPPING(sbi)); } /* * If fsync data succeeds or there is no fsync data to recover, * and the f2fs is not read only, check and fix zoned block devices' * write pointer consistency. */ if (!err) err = f2fs_check_and_fix_write_pointer(sbi); if (!err) clear_sbi_flag(sbi, SBI_POR_DOING); f2fs_up_write_trace(&sbi->cp_global_sem, &lc); /* let's drop all the directory inodes for clean checkpoint */ destroy_fsync_dnodes(&dir_list, err); if (need_writecp) { set_sbi_flag(sbi, SBI_IS_RECOVERED); if (!err) { struct cp_control cpc = { .reason = CP_RECOVERY, }; stat_inc_cp_call_count(sbi, TOTAL_CALL); err = f2fs_write_checkpoint(sbi, &cpc); } } sbi->sb->s_flags = s_flags; /* Restore SB_RDONLY status */ return ret ? ret : err; } int __init f2fs_create_recovery_cache(void) { fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", sizeof(struct fsync_inode_entry)); return fsync_entry_slab ? 0 : -ENOMEM; } void f2fs_destroy_recovery_cache(void) { kmem_cache_destroy(fsync_entry_slab); }
163 27 165 5 1 4 8 9 422 506 506 304 293 504 8 4 12 6 2 4 6 9 8 660 2 1 660 652 4 490 59 2 61 7 2 11 5 660 2 4 625 3 3 12 2 587 65 1 589 657 651 60 651 19 658 657 281 14 37 249 137 137 9 1 3 2 124 129 10 167 3 163 163 101 112 165 64 16 55 83 23 55 37 80 93 94 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 // SPDX-License-Identifier: GPL-2.0 /* * * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved. * */ #include <linux/fs.h> #include "debug.h" #include "ntfs.h" #include "ntfs_fs.h" static inline int compare_attr(const struct ATTRIB *left, enum ATTR_TYPE type, const __le16 *name, u8 name_len, const u16 *upcase) { /* First, compare the type codes. */ int diff = le32_to_cpu(left->type) - le32_to_cpu(type); if (diff) return diff; /* They have the same type code, so we have to compare the names. */ return ntfs_cmp_names(attr_name(left), left->name_len, name, name_len, upcase, true); } /* * mi_new_attt_id * * Return: Unused attribute id that is less than mrec->next_attr_id. */ static __le16 mi_new_attt_id(struct ntfs_inode *ni, struct mft_inode *mi) { u16 free_id, max_id, t16; struct MFT_REC *rec = mi->mrec; struct ATTRIB *attr; __le16 id; id = rec->next_attr_id; free_id = le16_to_cpu(id); if (free_id < 0x7FFF) { rec->next_attr_id = cpu_to_le16(free_id + 1); return id; } /* One record can store up to 1024/24 ~= 42 attributes. */ free_id = 0; max_id = 0; attr = NULL; for (;;) { attr = mi_enum_attr(ni, mi, attr); if (!attr) { rec->next_attr_id = cpu_to_le16(max_id + 1); mi->dirty = true; return cpu_to_le16(free_id); } t16 = le16_to_cpu(attr->id); if (t16 == free_id) { free_id += 1; attr = NULL; } else if (max_id < t16) max_id = t16; } } int mi_get(struct ntfs_sb_info *sbi, CLST rno, struct mft_inode **mi) { int err; struct mft_inode *m = kzalloc_obj(struct mft_inode, GFP_NOFS); if (!m) return -ENOMEM; err = mi_init(m, sbi, rno); if (err) { kfree(m); return err; } err = mi_read(m, false); if (err) { mi_put(m); return err; } *mi = m; return 0; } void mi_put(struct mft_inode *mi) { mi_clear(mi); kfree(mi); } int mi_init(struct mft_inode *mi, struct ntfs_sb_info *sbi, CLST rno) { mi->sbi = sbi; mi->rno = rno; mi->mrec = kmalloc(sbi->record_size, GFP_NOFS); if (!mi->mrec) return -ENOMEM; return 0; } /* * mi_read - Read MFT data. */ int mi_read(struct mft_inode *mi, bool is_mft) { int err; struct MFT_REC *rec = mi->mrec; struct ntfs_sb_info *sbi = mi->sbi; u32 bpr = sbi->record_size; u64 vbo = (u64)mi->rno << sbi->record_bits; struct ntfs_inode *mft_ni = sbi->mft.ni; struct runs_tree *run = mft_ni ? &mft_ni->file.run : NULL; struct rw_semaphore *rw_lock = NULL; if (is_mounted(sbi)) { if (!is_mft && mft_ni) { rw_lock = &mft_ni->file.run_lock; down_read(rw_lock); } } err = ntfs_read_bh(sbi, run, vbo, &rec->rhdr, bpr, &mi->nb); if (rw_lock) up_read(rw_lock); if (!err) goto ok; if (err == -E_NTFS_FIXUP) { mi->dirty = true; goto ok; } if (err != -ENOENT) goto out; if (rw_lock) { ni_lock(mft_ni); down_write(rw_lock); } err = attr_load_runs_vcn(mft_ni, ATTR_DATA, NULL, 0, run, vbo >> sbi->cluster_bits); if (rw_lock) { up_write(rw_lock); ni_unlock(mft_ni); } if (err) goto out; if (rw_lock) down_read(rw_lock); err = ntfs_read_bh(sbi, run, vbo, &rec->rhdr, bpr, &mi->nb); if (rw_lock) up_read(rw_lock); if (err == -E_NTFS_FIXUP) { mi->dirty = true; goto ok; } if (err) goto out; ok: /* Check field 'total' only here. */ if (le32_to_cpu(rec->total) != bpr) { err = -EINVAL; goto out; } return 0; out: if (err == -E_NTFS_CORRUPT) { ntfs_err(sbi->sb, "mft corrupted"); ntfs_set_state(sbi, NTFS_DIRTY_ERROR); err = -EINVAL; } return err; } /* * mi_enum_attr - start/continue attributes enumeration in record. * * NOTE: mi->mrec - memory of size sbi->record_size * here we sure that mi->mrec->total == sbi->record_size (see mi_read) */ struct ATTRIB *mi_enum_attr(struct ntfs_inode *ni, struct mft_inode *mi, struct ATTRIB *attr) { const struct MFT_REC *rec = mi->mrec; u32 used = le32_to_cpu(rec->used); u32 t32, off, asize, prev_type; u16 t16; u64 data_size, alloc_size, tot_size; if (!attr) { u32 total = le32_to_cpu(rec->total); off = le16_to_cpu(rec->attr_off); if (used > total) goto out; if (off >= used || off < MFTRECORD_FIXUP_OFFSET_1 || !IS_ALIGNED(off, 8)) { goto out; } /* Skip non-resident records. */ if (!is_rec_inuse(rec)) return NULL; prev_type = 0; attr = Add2Ptr(rec, off); } else { /* * We don't need to check previous attr here. There is * a bounds checking in the previous round. */ off = PtrOffset(rec, attr); asize = le32_to_cpu(attr->size); prev_type = le32_to_cpu(attr->type); attr = Add2Ptr(attr, asize); off += asize; } /* * Can we use the first fields: * attr->type, * attr->size */ if (off + 8 > used) { static_assert(ALIGN(sizeof(enum ATTR_TYPE), 8) == 8); goto out; } if (attr->type == ATTR_END) { /* End of enumeration. */ return NULL; } /* 0x100 is last known attribute for now. */ t32 = le32_to_cpu(attr->type); if (!t32 || (t32 & 0xf) || (t32 > 0x100)) goto out; /* attributes in record must be ordered by type */ if (t32 < prev_type) goto out; asize = le32_to_cpu(attr->size); if (!IS_ALIGNED(asize, 8)) goto out; /* Check overflow and boundary. */ if (off + asize < off || off + asize > used) goto out; /* Can we use the field attr->non_res. */ if (off + 9 > used) goto out; /* Check size of attribute. */ if (!attr->non_res) { /* Check resident fields. */ if (asize < SIZEOF_RESIDENT) goto out; t16 = le16_to_cpu(attr->res.data_off); if (t16 > asize) goto out; if (le32_to_cpu(attr->res.data_size) > asize - t16) goto out; t32 = sizeof(short) * attr->name_len; if (t32 && le16_to_cpu(attr->name_off) + t32 > t16) goto out; return attr; } /* Check nonresident fields. */ if (attr->non_res != 1) goto out; /* Can we use memory including attr->nres.valid_size? */ if (asize < SIZEOF_NONRESIDENT) goto out; t16 = le16_to_cpu(attr->nres.run_off); if (t16 > asize) goto out; t32 = sizeof(short) * attr->name_len; if (t32 && le16_to_cpu(attr->name_off) + t32 > t16) goto out; /* Check start/end vcn. */ if (le64_to_cpu(attr->nres.svcn) > le64_to_cpu(attr->nres.evcn) + 1) goto out; data_size = le64_to_cpu(attr->nres.data_size); if (le64_to_cpu(attr->nres.valid_size) > data_size) goto out; alloc_size = le64_to_cpu(attr->nres.alloc_size); if (data_size > alloc_size) goto out; t32 = mi->sbi->cluster_mask; if (alloc_size & t32) goto out; if (!attr->nres.svcn && is_attr_ext(attr)) { /* First segment of sparse/compressed attribute */ /* Can we use memory including attr->nres.total_size? */ if (asize < SIZEOF_NONRESIDENT_EX) goto out; tot_size = le64_to_cpu(attr->nres.total_size); if (tot_size & t32) goto out; if (tot_size > alloc_size) goto out; } else { if (attr->nres.c_unit) goto out; if (alloc_size > mi->sbi->volume.size) goto out; } return attr; out: _ntfs_bad_inode(&ni->vfs_inode); return NULL; } /* * mi_find_attr - Find the attribute by type and name and id. */ struct ATTRIB *mi_find_attr(struct ntfs_inode *ni, struct mft_inode *mi, struct ATTRIB *attr, enum ATTR_TYPE type, const __le16 *name, u8 name_len, const __le16 *id) { u32 type_in = le32_to_cpu(type); u32 atype; next_attr: attr = mi_enum_attr(ni, mi, attr); if (!attr) return NULL; atype = le32_to_cpu(attr->type); if (atype > type_in) return NULL; if (atype < type_in) goto next_attr; if (attr->name_len != name_len) goto next_attr; if (name_len && memcmp(attr_name(attr), name, name_len * sizeof(short))) goto next_attr; if (id && *id != attr->id) goto next_attr; return attr; } int mi_write(struct mft_inode *mi, int wait) { struct MFT_REC *rec; int err; struct ntfs_sb_info *sbi; if (!mi->dirty) return 0; sbi = mi->sbi; rec = mi->mrec; err = ntfs_write_bh(sbi, &rec->rhdr, &mi->nb, wait); if (err) return err; if (mi->rno < sbi->mft.recs_mirr) sbi->flags |= NTFS_FLAGS_MFTMIRR; mi->dirty = false; return 0; } int mi_format_new(struct mft_inode *mi, struct ntfs_sb_info *sbi, CLST rno, __le16 flags, bool is_mft) { int err; u16 seq = 1; struct MFT_REC *rec; u64 vbo = (u64)rno << sbi->record_bits; err = mi_init(mi, sbi, rno); if (err) return err; rec = mi->mrec; if (rno == MFT_REC_MFT) { ; } else if (rno < MFT_REC_FREE) { seq = rno; } else if (rno >= sbi->mft.used) { ; } else if (mi_read(mi, is_mft)) { ; } else if (rec->rhdr.sign == NTFS_FILE_SIGNATURE) { /* Record is reused. Update its sequence number. */ seq = le16_to_cpu(rec->seq) + 1; if (!seq) seq = 1; } memcpy(rec, sbi->new_rec, sbi->record_size); rec->seq = cpu_to_le16(seq); rec->flags = RECORD_FLAG_IN_USE | flags; if (MFTRECORD_FIXUP_OFFSET == MFTRECORD_FIXUP_OFFSET_3) rec->mft_record = cpu_to_le32(rno); mi->dirty = true; if (!mi->nb.nbufs) { struct ntfs_inode *ni = sbi->mft.ni; bool lock = false; if (is_mounted(sbi) && !is_mft) { down_read(&ni->file.run_lock); lock = true; } err = ntfs_get_bh(sbi, &ni->file.run, vbo, sbi->record_size, &mi->nb); if (lock) up_read(&ni->file.run_lock); } return err; } /* * mi_insert_attr - Reserve space for new attribute. * * Return: Not full constructed attribute or NULL if not possible to create. */ struct ATTRIB *mi_insert_attr(struct ntfs_inode *ni, struct mft_inode *mi, enum ATTR_TYPE type, const __le16 *name, u8 name_len, u32 asize, u16 name_off) { size_t tail; struct ATTRIB *attr; __le16 id; struct MFT_REC *rec = mi->mrec; struct ntfs_sb_info *sbi = mi->sbi; u32 used = le32_to_cpu(rec->used); const u16 *upcase = sbi->upcase; /* Can we insert mi attribute? */ if (used + asize > sbi->record_size) return NULL; /* * Scan through the list of attributes to find the point * at which we should insert it. */ attr = NULL; while ((attr = mi_enum_attr(ni, mi, attr))) { int diff = compare_attr(attr, type, name, name_len, upcase); if (diff < 0) continue; if (!diff && !is_attr_indexed(attr)) return NULL; break; } if (!attr) { /* Append. */ tail = 8; attr = Add2Ptr(rec, used - 8); } else { /* Insert before 'attr'. */ tail = used - PtrOffset(rec, attr); } id = mi_new_attt_id(ni, mi); memmove(Add2Ptr(attr, asize), attr, tail); memset(attr, 0, asize); attr->type = type; attr->size = cpu_to_le32(asize); attr->name_len = name_len; attr->name_off = cpu_to_le16(name_off); attr->id = id; memmove(Add2Ptr(attr, name_off), name, name_len * sizeof(short)); rec->used = cpu_to_le32(used + asize); mi->dirty = true; return attr; } /* * mi_remove_attr - Remove the attribute from record. * * NOTE: The source attr will point to next attribute. */ bool mi_remove_attr(struct ntfs_inode *ni, struct mft_inode *mi, struct ATTRIB *attr) { struct MFT_REC *rec = mi->mrec; u32 aoff = PtrOffset(rec, attr); u32 used = le32_to_cpu(rec->used); u32 asize = le32_to_cpu(attr->size); if (aoff + asize > used) return false; if (ni && is_attr_indexed(attr) && attr->type == ATTR_NAME) { u16 links = le16_to_cpu(ni->mi.mrec->hard_links); if (!links) { /* minor error. Not critical. */ } else { ni->mi.mrec->hard_links = cpu_to_le16(links - 1); ni->mi.dirty = true; } } used -= asize; memmove(attr, Add2Ptr(attr, asize), used - aoff); rec->used = cpu_to_le32(used); mi->dirty = true; return true; } /* bytes = "new attribute size" - "old attribute size" */ bool mi_resize_attr(struct mft_inode *mi, struct ATTRIB *attr, int bytes) { struct MFT_REC *rec = mi->mrec; u32 aoff = PtrOffset(rec, attr); u32 total, used = le32_to_cpu(rec->used); u32 nsize, asize = le32_to_cpu(attr->size); u32 rsize = le32_to_cpu(attr->res.data_size); int tail = (int)(used - aoff - asize); int dsize; char *next; if (tail < 0 || aoff >= used) return false; if (!bytes) return true; total = le32_to_cpu(rec->total); next = Add2Ptr(attr, asize); if (bytes > 0) { dsize = ALIGN(bytes, 8); if (used + dsize > total) return false; nsize = asize + dsize; /* Move tail */ memmove(next + dsize, next, tail); memset(next, 0, dsize); used += dsize; rsize += dsize; } else { dsize = ALIGN(-bytes, 8); if (dsize > asize) return false; nsize = asize - dsize; memmove(next - dsize, next, tail); used -= dsize; rsize -= dsize; } rec->used = cpu_to_le32(used); attr->size = cpu_to_le32(nsize); if (!attr->non_res) attr->res.data_size = cpu_to_le32(rsize); mi->dirty = true; return true; } /* * Pack runs in MFT record. * If failed record is not changed. */ int mi_pack_runs(struct mft_inode *mi, struct ATTRIB *attr, const struct runs_tree *run, CLST len) { int err = 0; struct ntfs_sb_info *sbi = mi->sbi; u32 new_run_size; CLST plen; struct MFT_REC *rec = mi->mrec; CLST svcn = le64_to_cpu(attr->nres.svcn); u32 used = le32_to_cpu(rec->used); u32 aoff = PtrOffset(rec, attr); u32 asize = le32_to_cpu(attr->size); char *next = Add2Ptr(attr, asize); u16 run_off = le16_to_cpu(attr->nres.run_off); u32 run_size = asize - run_off; u32 tail = used - aoff - asize; u32 dsize = sbi->record_size - used; /* Make a maximum gap in current record. */ memmove(next + dsize, next, tail); /* Pack as much as possible. */ err = run_pack(run, svcn, len, Add2Ptr(attr, run_off), run_size + dsize, &plen); if (err < 0) { memmove(next, next + dsize, tail); return err; } new_run_size = ALIGN(err, 8); memmove(next + new_run_size - run_size, next + dsize, tail); attr->size = cpu_to_le32(asize + new_run_size - run_size); attr->nres.evcn = cpu_to_le64(svcn + plen - 1); rec->used = cpu_to_le32(used + new_run_size - run_size); mi->dirty = true; return 0; }
2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 // SPDX-License-Identifier: GPL-2.0 /* * (C) 2001 Clemson University and The University of Chicago * * See COPYING in top-level directory. */ #include "protocol.h" #include "orangefs-kernel.h" /* tags assigned to kernel upcall operations */ static __u64 next_tag_value; static DEFINE_SPINLOCK(next_tag_value_lock); /* the orangefs memory caches */ /* a cache for orangefs upcall/downcall operations */ static struct kmem_cache *op_cache; int op_cache_initialize(void) { op_cache = kmem_cache_create("orangefs_op_cache", sizeof(struct orangefs_kernel_op_s), 0, 0, NULL); if (!op_cache) { gossip_err("Cannot create orangefs_op_cache\n"); return -ENOMEM; } /* initialize our atomic tag counter */ spin_lock(&next_tag_value_lock); next_tag_value = 100; spin_unlock(&next_tag_value_lock); return 0; } int op_cache_finalize(void) { kmem_cache_destroy(op_cache); return 0; } char *get_opname_string(struct orangefs_kernel_op_s *new_op) { if (new_op) { __s32 type = new_op->upcall.type; if (type == ORANGEFS_VFS_OP_FILE_IO) return "OP_FILE_IO"; else if (type == ORANGEFS_VFS_OP_LOOKUP) return "OP_LOOKUP"; else if (type == ORANGEFS_VFS_OP_CREATE) return "OP_CREATE"; else if (type == ORANGEFS_VFS_OP_GETATTR) return "OP_GETATTR"; else if (type == ORANGEFS_VFS_OP_REMOVE) return "OP_REMOVE"; else if (type == ORANGEFS_VFS_OP_MKDIR) return "OP_MKDIR"; else if (type == ORANGEFS_VFS_OP_READDIR) return "OP_READDIR"; else if (type == ORANGEFS_VFS_OP_READDIRPLUS) return "OP_READDIRPLUS"; else if (type == ORANGEFS_VFS_OP_SETATTR) return "OP_SETATTR"; else if (type == ORANGEFS_VFS_OP_SYMLINK) return "OP_SYMLINK"; else if (type == ORANGEFS_VFS_OP_RENAME) return "OP_RENAME"; else if (type == ORANGEFS_VFS_OP_STATFS) return "OP_STATFS"; else if (type == ORANGEFS_VFS_OP_TRUNCATE) return "OP_TRUNCATE"; else if (type == ORANGEFS_VFS_OP_RA_FLUSH) return "OP_RA_FLUSH"; else if (type == ORANGEFS_VFS_OP_FS_MOUNT) return "OP_FS_MOUNT"; else if (type == ORANGEFS_VFS_OP_FS_UMOUNT) return "OP_FS_UMOUNT"; else if (type == ORANGEFS_VFS_OP_GETXATTR) return "OP_GETXATTR"; else if (type == ORANGEFS_VFS_OP_SETXATTR) return "OP_SETXATTR"; else if (type == ORANGEFS_VFS_OP_LISTXATTR) return "OP_LISTXATTR"; else if (type == ORANGEFS_VFS_OP_REMOVEXATTR) return "OP_REMOVEXATTR"; else if (type == ORANGEFS_VFS_OP_PARAM) return "OP_PARAM"; else if (type == ORANGEFS_VFS_OP_PERF_COUNT) return "OP_PERF_COUNT"; else if (type == ORANGEFS_VFS_OP_CANCEL) return "OP_CANCEL"; else if (type == ORANGEFS_VFS_OP_FSYNC) return "OP_FSYNC"; else if (type == ORANGEFS_VFS_OP_FSKEY) return "OP_FSKEY"; else if (type == ORANGEFS_VFS_OP_FEATURES) return "OP_FEATURES"; } return "OP_UNKNOWN?"; } void orangefs_new_tag(struct orangefs_kernel_op_s *op) { spin_lock(&next_tag_value_lock); op->tag = next_tag_value++; if (next_tag_value == 0) next_tag_value = 100; spin_unlock(&next_tag_value_lock); } struct orangefs_kernel_op_s *op_alloc(__s32 type) { struct orangefs_kernel_op_s *new_op = NULL; new_op = kmem_cache_zalloc(op_cache, GFP_KERNEL); if (new_op) { INIT_LIST_HEAD(&new_op->list); spin_lock_init(&new_op->lock); init_completion(&new_op->waitq); new_op->upcall.type = ORANGEFS_VFS_OP_INVALID; new_op->downcall.type = ORANGEFS_VFS_OP_INVALID; new_op->downcall.status = -1; new_op->op_state = OP_VFS_STATE_UNKNOWN; /* initialize the op specific tag and upcall credentials */ orangefs_new_tag(new_op); new_op->upcall.type = type; new_op->attempts = 0; gossip_debug(GOSSIP_CACHE_DEBUG, "Alloced OP (%p: %llu %s)\n", new_op, llu(new_op->tag), get_opname_string(new_op)); new_op->upcall.uid = from_kuid(&init_user_ns, current_fsuid()); new_op->upcall.gid = from_kgid(&init_user_ns, current_fsgid()); } else { gossip_err("op_alloc: kmem_cache_zalloc failed!\n"); } return new_op; } void op_release(struct orangefs_kernel_op_s *orangefs_op) { if (orangefs_op) { gossip_debug(GOSSIP_CACHE_DEBUG, "Releasing OP (%p: %llu)\n", orangefs_op, llu(orangefs_op->tag)); kmem_cache_free(op_cache, orangefs_op); } else { gossip_err("NULL pointer in op_release\n"); } }
2558 12141 35 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_PREEMPT_H #define __LINUX_PREEMPT_H /* * include/linux/preempt.h - macros for accessing and manipulating * preempt_count (used for kernel preemption, interrupt count, etc.) */ #include <linux/linkage.h> #include <linux/cleanup.h> #include <linux/types.h> /* * We put the hardirq and softirq counter into the preemption * counter. The bitmask has the following meaning: * * - bits 0-7 are the preemption count (max preemption depth: 256) * - bits 8-15 are the softirq count (max # of softirqs: 256) * * The hardirq count could in theory be the same as the number of * interrupts in the system, but we run all interrupt handlers with * interrupts disabled, so we cannot have nesting interrupts. Though * there are a few palaeontologic drivers which reenable interrupts in * the handler, so we need more than one bit here. * * PREEMPT_MASK: 0x000000ff * SOFTIRQ_MASK: 0x0000ff00 * HARDIRQ_MASK: 0x000f0000 * NMI_MASK: 0x00f00000 * PREEMPT_NEED_RESCHED: 0x80000000 */ #define PREEMPT_BITS 8 #define SOFTIRQ_BITS 8 #define HARDIRQ_BITS 4 #define NMI_BITS 4 #define PREEMPT_SHIFT 0 #define SOFTIRQ_SHIFT (PREEMPT_SHIFT + PREEMPT_BITS) #define HARDIRQ_SHIFT (SOFTIRQ_SHIFT + SOFTIRQ_BITS) #define NMI_SHIFT (HARDIRQ_SHIFT + HARDIRQ_BITS) #define __IRQ_MASK(x) ((1UL << (x))-1) #define PREEMPT_MASK (__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT) #define SOFTIRQ_MASK (__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT) #define HARDIRQ_MASK (__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT) #define NMI_MASK (__IRQ_MASK(NMI_BITS) << NMI_SHIFT) #define PREEMPT_OFFSET (1UL << PREEMPT_SHIFT) #define SOFTIRQ_OFFSET (1UL << SOFTIRQ_SHIFT) #define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT) #define NMI_OFFSET (1UL << NMI_SHIFT) #define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET) #define PREEMPT_DISABLED (PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED) /* * Disable preemption until the scheduler is running -- use an unconditional * value so that it also works on !PREEMPT_COUNT kernels. * * Reset by start_kernel()->sched_init()->init_idle()->init_idle_preempt_count(). */ #define INIT_PREEMPT_COUNT PREEMPT_OFFSET /* * Initial preempt_count value; reflects the preempt_count schedule invariant * which states that during context switches: * * preempt_count() == 2*PREEMPT_DISABLE_OFFSET * * Note: PREEMPT_DISABLE_OFFSET is 0 for !PREEMPT_COUNT kernels. * Note: See finish_task_switch(). */ #define FORK_PREEMPT_COUNT (2*PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED) /* preempt_count() and related functions, depends on PREEMPT_NEED_RESCHED */ #include <asm/preempt.h> /** * interrupt_context_level - return interrupt context level * * Returns the current interrupt context level. * 0 - normal context * 1 - softirq context * 2 - hardirq context * 3 - NMI context */ static __always_inline unsigned char interrupt_context_level(void) { unsigned long pc = preempt_count(); unsigned char level = 0; level += !!(pc & (NMI_MASK)); level += !!(pc & (NMI_MASK | HARDIRQ_MASK)); level += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)); return level; } /* * These macro definitions avoid redundant invocations of preempt_count() * because such invocations would result in redundant loads given that * preempt_count() is commonly implemented with READ_ONCE(). */ #define nmi_count() (preempt_count() & NMI_MASK) #define hardirq_count() (preempt_count() & HARDIRQ_MASK) #ifdef CONFIG_PREEMPT_RT # define softirq_count() (current->softirq_disable_cnt & SOFTIRQ_MASK) # define irq_count() ((preempt_count() & (NMI_MASK | HARDIRQ_MASK)) | softirq_count()) #else # define softirq_count() (preempt_count() & SOFTIRQ_MASK) # define irq_count() (preempt_count() & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_MASK)) #endif /* * Macros to retrieve the current execution context: * * in_nmi() - We're in NMI context * in_hardirq() - We're in hard IRQ context * in_serving_softirq() - We're in softirq context * in_task() - We're in task context */ #define in_nmi() (nmi_count()) #define in_hardirq() (hardirq_count()) #define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET) #ifdef CONFIG_PREEMPT_RT # define in_task() (!((preempt_count() & (NMI_MASK | HARDIRQ_MASK)) | in_serving_softirq())) #else # define in_task() (!(preempt_count() & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET))) #endif /* * The following macros are deprecated and should not be used in new code: * in_softirq() - We have BH disabled, or are processing softirqs * in_interrupt() - We're in NMI,IRQ,SoftIRQ context or have BH disabled */ #define in_softirq() (softirq_count()) #define in_interrupt() (irq_count()) /* * The preempt_count offset after preempt_disable(); */ #if defined(CONFIG_PREEMPT_COUNT) # define PREEMPT_DISABLE_OFFSET PREEMPT_OFFSET #else # define PREEMPT_DISABLE_OFFSET 0 #endif /* * The preempt_count offset after spin_lock() */ #if !defined(CONFIG_PREEMPT_RT) #define PREEMPT_LOCK_OFFSET PREEMPT_DISABLE_OFFSET #else /* Locks on RT do not disable preemption */ #define PREEMPT_LOCK_OFFSET 0 #endif /* * The preempt_count offset needed for things like: * * spin_lock_bh() * * Which need to disable both preemption (CONFIG_PREEMPT_COUNT) and * softirqs, such that unlock sequences of: * * spin_unlock(); * local_bh_enable(); * * Work as expected. */ #define SOFTIRQ_LOCK_OFFSET (SOFTIRQ_DISABLE_OFFSET + PREEMPT_LOCK_OFFSET) /* * Are we running in atomic context? WARNING: this macro cannot * always detect atomic context; in particular, it cannot know about * held spinlocks in non-preemptible kernels. Thus it should not be * used in the general case to determine whether sleeping is possible. * Do not use in_atomic() in driver code. */ #define in_atomic() (preempt_count() != 0) /* * Check whether we were atomic before we did preempt_disable(): * (used by the scheduler) */ #define in_atomic_preempt_off() (preempt_count() != PREEMPT_DISABLE_OFFSET) #if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_TRACE_PREEMPT_TOGGLE) extern void preempt_count_add(int val); extern void preempt_count_sub(int val); #define preempt_count_dec_and_test() \ ({ preempt_count_sub(1); should_resched(0); }) #else #define preempt_count_add(val) __preempt_count_add(val) #define preempt_count_sub(val) __preempt_count_sub(val) #define preempt_count_dec_and_test() __preempt_count_dec_and_test() #endif #define __preempt_count_inc() __preempt_count_add(1) #define __preempt_count_dec() __preempt_count_sub(1) #define preempt_count_inc() preempt_count_add(1) #define preempt_count_dec() preempt_count_sub(1) #ifdef CONFIG_PREEMPT_COUNT #define preempt_disable() \ do { \ preempt_count_inc(); \ barrier(); \ } while (0) #define sched_preempt_enable_no_resched() \ do { \ barrier(); \ preempt_count_dec(); \ } while (0) #define preempt_enable_no_resched() sched_preempt_enable_no_resched() #define preemptible() (preempt_count() == 0 && !irqs_disabled()) #ifdef CONFIG_PREEMPTION #define preempt_enable() \ do { \ barrier(); \ if (unlikely(preempt_count_dec_and_test())) \ __preempt_schedule(); \ } while (0) #define preempt_enable_notrace() \ do { \ barrier(); \ if (unlikely(__preempt_count_dec_and_test())) \ __preempt_schedule_notrace(); \ } while (0) #define preempt_check_resched() \ do { \ if (should_resched(0)) \ __preempt_schedule(); \ } while (0) #else /* !CONFIG_PREEMPTION */ #define preempt_enable() \ do { \ barrier(); \ preempt_count_dec(); \ } while (0) #define preempt_enable_notrace() \ do { \ barrier(); \ __preempt_count_dec(); \ } while (0) #define preempt_check_resched() do { } while (0) #endif /* CONFIG_PREEMPTION */ #define preempt_disable_notrace() \ do { \ __preempt_count_inc(); \ barrier(); \ } while (0) #define preempt_enable_no_resched_notrace() \ do { \ barrier(); \ __preempt_count_dec(); \ } while (0) #else /* !CONFIG_PREEMPT_COUNT */ /* * Even if we don't have any preemption, we need preempt disable/enable * to be barriers, so that we don't have things like get_user/put_user * that can cause faults and scheduling migrate into our preempt-protected * region. */ #define preempt_disable() barrier() #define sched_preempt_enable_no_resched() barrier() #define preempt_enable_no_resched() barrier() #define preempt_enable() barrier() #define preempt_check_resched() do { } while (0) #define preempt_disable_notrace() barrier() #define preempt_enable_no_resched_notrace() barrier() #define preempt_enable_notrace() barrier() #define preemptible() 0 #endif /* CONFIG_PREEMPT_COUNT */ #ifdef MODULE /* * Modules have no business playing preemption tricks. */ #undef sched_preempt_enable_no_resched #undef preempt_enable_no_resched #undef preempt_enable_no_resched_notrace #undef preempt_check_resched #endif #define preempt_set_need_resched() \ do { \ set_preempt_need_resched(); \ } while (0) #define preempt_fold_need_resched() \ do { \ if (tif_need_resched()) \ set_preempt_need_resched(); \ } while (0) #ifdef CONFIG_PREEMPT_NOTIFIERS struct preempt_notifier; struct task_struct; /** * preempt_ops - notifiers called when a task is preempted and rescheduled * @sched_in: we're about to be rescheduled: * notifier: struct preempt_notifier for the task being scheduled * cpu: cpu we're scheduled on * @sched_out: we've just been preempted * notifier: struct preempt_notifier for the task being preempted * next: the task that's kicking us out * * Please note that sched_in and out are called under different * contexts. sched_out is called with rq lock held and irq disabled * while sched_in is called without rq lock and irq enabled. This * difference is intentional and depended upon by its users. */ struct preempt_ops { void (*sched_in)(struct preempt_notifier *notifier, int cpu); void (*sched_out)(struct preempt_notifier *notifier, struct task_struct *next); }; /** * preempt_notifier - key for installing preemption notifiers * @link: internal use * @ops: defines the notifier functions to be called * * Usually used in conjunction with container_of(). */ struct preempt_notifier { struct hlist_node link; struct preempt_ops *ops; }; void preempt_notifier_inc(void); void preempt_notifier_dec(void); void preempt_notifier_register(struct preempt_notifier *notifier); void preempt_notifier_unregister(struct preempt_notifier *notifier); static inline void preempt_notifier_init(struct preempt_notifier *notifier, struct preempt_ops *ops) { /* INIT_HLIST_NODE() open coded, to avoid dependency on list.h */ notifier->link.next = NULL; notifier->link.pprev = NULL; notifier->ops = ops; } #endif /* * Migrate-Disable and why it is undesired. * * When a preempted task becomes eligible to run under the ideal model (IOW it * becomes one of the M highest priority tasks), it might still have to wait * for the preemptee's migrate_disable() section to complete. Thereby suffering * a reduction in bandwidth in the exact duration of the migrate_disable() * section. * * Per this argument, the change from preempt_disable() to migrate_disable() * gets us: * * - a higher priority tasks gains reduced wake-up latency; with preempt_disable() * it would have had to wait for the lower priority task. * * - a lower priority tasks; which under preempt_disable() could've instantly * migrated away when another CPU becomes available, is now constrained * by the ability to push the higher priority task away, which might itself be * in a migrate_disable() section, reducing its available bandwidth. * * IOW it trades latency / moves the interference term, but it stays in the * system, and as long as it remains unbounded, the system is not fully * deterministic. * * * The reason we have it anyway. * * PREEMPT_RT breaks a number of assumptions traditionally held. By forcing a * number of primitives into becoming preemptible, they would also allow * migration. This turns out to break a bunch of per-cpu usage. To this end, * all these primitives employ migrate_disable() to restore this implicit * assumption. * * This is a 'temporary' work-around at best. The correct solution is getting * rid of the above assumptions and reworking the code to employ explicit * per-cpu locking or short preempt-disable regions. * * The end goal must be to get rid of migrate_disable(), alternatively we need * a schedulability theory that does not depend on arbitrary migration. * * * Notes on the implementation. * * The implementation is particularly tricky since existing code patterns * dictate neither migrate_disable() nor migrate_enable() is allowed to block. * This means that it cannot use cpus_read_lock() to serialize against hotplug, * nor can it easily migrate itself into a pending affinity mask change on * migrate_enable(). * * * Note: even non-work-conserving schedulers like semi-partitioned depends on * migration, so migrate_disable() is not only a problem for * work-conserving schedulers. * */ /** * preempt_disable_nested - Disable preemption inside a normally preempt disabled section * * Use for code which requires preemption protection inside a critical * section which has preemption disabled implicitly on non-PREEMPT_RT * enabled kernels, by e.g.: * - holding a spinlock/rwlock * - soft interrupt context * - regular interrupt handlers * * On PREEMPT_RT enabled kernels spinlock/rwlock held sections, soft * interrupt context and regular interrupt handlers are preemptible and * only prevent migration. preempt_disable_nested() ensures that preemption * is disabled for cases which require CPU local serialization even on * PREEMPT_RT. For non-PREEMPT_RT kernels this is a NOP. * * The use cases are code sequences which are not serialized by a * particular lock instance, e.g.: * - seqcount write side critical sections where the seqcount is not * associated to a particular lock and therefore the automatic * protection mechanism does not work. This prevents a live lock * against a preempting high priority reader. * - RMW per CPU variable updates like vmstat. */ /* Macro to avoid header recursion hell vs. lockdep */ #define preempt_disable_nested() \ do { \ if (IS_ENABLED(CONFIG_PREEMPT_RT)) \ preempt_disable(); \ else \ lockdep_assert_preemption_disabled(); \ } while (0) /** * preempt_enable_nested - Undo the effect of preempt_disable_nested() */ static __always_inline void preempt_enable_nested(void) { if (IS_ENABLED(CONFIG_PREEMPT_RT)) preempt_enable(); } DEFINE_LOCK_GUARD_0(preempt, preempt_disable(), preempt_enable()) DEFINE_LOCK_GUARD_0(preempt_notrace, preempt_disable_notrace(), preempt_enable_notrace()) #ifdef CONFIG_PREEMPT_DYNAMIC extern bool preempt_model_none(void); extern bool preempt_model_voluntary(void); extern bool preempt_model_full(void); extern bool preempt_model_lazy(void); #else static inline bool preempt_model_none(void) { return IS_ENABLED(CONFIG_PREEMPT_NONE); } static inline bool preempt_model_voluntary(void) { return IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY); } static inline bool preempt_model_full(void) { return IS_ENABLED(CONFIG_PREEMPT); } static inline bool preempt_model_lazy(void) { return IS_ENABLED(CONFIG_PREEMPT_LAZY); } #endif static inline bool preempt_model_rt(void) { return IS_ENABLED(CONFIG_PREEMPT_RT); } extern const char *preempt_model_str(void); /* * Does the preemption model allow non-cooperative preemption? * * For !CONFIG_PREEMPT_DYNAMIC kernels this is an exact match with * CONFIG_PREEMPTION; for CONFIG_PREEMPT_DYNAMIC this doesn't work as the * kernel is *built* with CONFIG_PREEMPTION=y but may run with e.g. the * PREEMPT_NONE model. */ static inline bool preempt_model_preemptible(void) { return preempt_model_full() || preempt_model_lazy() || preempt_model_rt(); } #endif /* __LINUX_PREEMPT_H */
5 103 26 106 149 150 102 148 5 112 5 5 24 25 112 112 113 85 87 1 87 73 8 3 4 26 88 115 114 50 66 7 36 43 72 70 61 59 62 71 63 6 3 69 10 3 60 60 60 61 59 72 73 73 73 4 56 14 28 70 4 57 45 61 61 44 18 50 9 57 4 65 6 24 76 1 76 75 75 73 76 74 73 4 27 37 65 64 71 35 1 44 3 74 69 12 83 81 82 85 83 82 12 53 9 93 85 16 84 31 4 51 13 46 97 2 9 83 84 9 95 1 9 81 80 84 1 2 1 46 3 42 41 1 1 56 27 1 27 2 52 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 // SPDX-License-Identifier: GPL-2.0 /* * This file contains the procedures for the handling of select and poll * * Created for Linux based loosely upon Mathius Lattner's minix * patches by Peter MacDonald. Heavily edited by Linus. * * 4 February 1994 * COFF/ELF binary emulation. If the process has the STICKY_TIMEOUTS * flag set in its personality we do *not* modify the given timeout * parameter to reflect time remaining. * * 24 January 2000 * Changed sys_poll()/do_poll() to use PAGE_SIZE chunk-based allocation * of fds to overcome nfds < 16390 descriptors limit (Tigran Aivazian). */ #include <linux/compat.h> #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/sched/rt.h> #include <linux/syscalls.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/poll.h> #include <linux/personality.h> /* for STICKY_TIMEOUTS */ #include <linux/file.h> #include <linux/fdtable.h> #include <linux/fs.h> #include <linux/rcupdate.h> #include <linux/hrtimer.h> #include <linux/freezer.h> #include <net/busy_poll.h> #include <linux/vmalloc.h> #include <linux/uaccess.h> /* * Estimate expected accuracy in ns from a timeval. * * After quite a bit of churning around, we've settled on * a simple thing of taking 0.1% of the timeout as the * slack, with a cap of 100 msec. * "nice" tasks get a 0.5% slack instead. * * Consider this comment an open invitation to come up with even * better solutions.. */ #define MAX_SLACK (100 * NSEC_PER_MSEC) static long __estimate_accuracy(struct timespec64 *tv) { long slack; int divfactor = 1000; if (tv->tv_sec < 0) return 0; if (task_nice(current) > 0) divfactor = divfactor / 5; if (tv->tv_sec > MAX_SLACK / (NSEC_PER_SEC/divfactor)) return MAX_SLACK; slack = tv->tv_nsec / divfactor; slack += tv->tv_sec * (NSEC_PER_SEC/divfactor); if (slack > MAX_SLACK) return MAX_SLACK; return slack; } u64 select_estimate_accuracy(struct timespec64 *tv) { u64 ret; struct timespec64 now; u64 slack = current->timer_slack_ns; if (slack == 0) return 0; ktime_get_ts64(&now); now = timespec64_sub(*tv, now); ret = __estimate_accuracy(&now); if (ret < slack) return slack; return ret; } struct poll_table_page { struct poll_table_page * next; struct poll_table_entry * entry; struct poll_table_entry entries[]; }; #define POLL_TABLE_FULL(table) \ ((unsigned long)((table)->entry+1) > PAGE_SIZE + (unsigned long)(table)) /* * Ok, Peter made a complicated, but straightforward multiple_wait() function. * I have rewritten this, taking some shortcuts: This code may not be easy to * follow, but it should be free of race-conditions, and it's practical. If you * understand what I'm doing here, then you understand how the linux * sleep/wakeup mechanism works. * * Two very simple procedures, poll_wait() and poll_freewait() make all the * work. poll_wait() is an inline-function defined in <linux/poll.h>, * as all select/poll functions have to call it to add an entry to the * poll table. */ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p); void poll_initwait(struct poll_wqueues *pwq) { init_poll_funcptr(&pwq->pt, __pollwait); pwq->polling_task = current; pwq->triggered = 0; pwq->error = 0; pwq->table = NULL; pwq->inline_index = 0; } EXPORT_SYMBOL(poll_initwait); static void free_poll_entry(struct poll_table_entry *entry) { remove_wait_queue(entry->wait_address, &entry->wait); fput(entry->filp); } void poll_freewait(struct poll_wqueues *pwq) { struct poll_table_page * p = pwq->table; int i; for (i = 0; i < pwq->inline_index; i++) free_poll_entry(pwq->inline_entries + i); while (p) { struct poll_table_entry * entry; struct poll_table_page *old; entry = p->entry; do { entry--; free_poll_entry(entry); } while (entry > p->entries); old = p; p = p->next; free_page((unsigned long) old); } } EXPORT_SYMBOL(poll_freewait); static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p) { struct poll_table_page *table = p->table; if (p->inline_index < N_INLINE_POLL_ENTRIES) return p->inline_entries + p->inline_index++; if (!table || POLL_TABLE_FULL(table)) { struct poll_table_page *new_table; new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL); if (!new_table) { p->error = -ENOMEM; return NULL; } new_table->entry = new_table->entries; new_table->next = table; p->table = new_table; table = new_table; } return table->entry++; } static int __pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct poll_wqueues *pwq = wait->private; DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task); /* * Although this function is called under waitqueue lock, LOCK * doesn't imply write barrier and the users expect write * barrier semantics on wakeup functions. The following * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up() * and is paired with smp_store_mb() in poll_schedule_timeout. */ smp_wmb(); WRITE_ONCE(pwq->triggered, 1); /* * Perform the default wake up operation using a dummy * waitqueue. * * TODO: This is hacky but there currently is no interface to * pass in @sync. @sync is scheduled to be removed and once * that happens, wake_up_process() can be used directly. */ return default_wake_function(&dummy_wait, mode, sync, key); } static int pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct poll_table_entry *entry; entry = container_of(wait, struct poll_table_entry, wait); if (key && !(key_to_poll(key) & entry->key)) return 0; return __pollwake(wait, mode, sync, key); } /* Add a new entry */ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p) { struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt); struct poll_table_entry *entry = poll_get_entry(pwq); if (!entry) return; entry->filp = get_file(filp); entry->wait_address = wait_address; entry->key = p->_key; init_waitqueue_func_entry(&entry->wait, pollwake); entry->wait.private = pwq; add_wait_queue(wait_address, &entry->wait); } static int poll_schedule_timeout(struct poll_wqueues *pwq, int state, ktime_t *expires, unsigned long slack) { int rc = -EINTR; set_current_state(state); if (!READ_ONCE(pwq->triggered)) rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS); __set_current_state(TASK_RUNNING); /* * Prepare for the next iteration. * * The following smp_store_mb() serves two purposes. First, it's * the counterpart rmb of the wmb in pollwake() such that data * written before wake up is always visible after wake up. * Second, the full barrier guarantees that triggered clearing * doesn't pass event check of the next iteration. Note that * this problem doesn't exist for the first iteration as * add_wait_queue() has full barrier semantics. */ smp_store_mb(pwq->triggered, 0); return rc; } /** * poll_select_set_timeout - helper function to setup the timeout value * @to: pointer to timespec64 variable for the final timeout * @sec: seconds (from user space) * @nsec: nanoseconds (from user space) * * Note, we do not use a timespec for the user space value here, That * way we can use the function for timeval and compat interfaces as well. * * Returns -EINVAL if sec/nsec are not normalized. Otherwise 0. */ int poll_select_set_timeout(struct timespec64 *to, time64_t sec, long nsec) { struct timespec64 ts = {.tv_sec = sec, .tv_nsec = nsec}; if (!timespec64_valid(&ts)) return -EINVAL; /* Optimize for the zero timeout value here */ if (!sec && !nsec) { to->tv_sec = to->tv_nsec = 0; } else { ktime_get_ts64(to); *to = timespec64_add_safe(*to, ts); } return 0; } enum poll_time_type { PT_TIMEVAL = 0, PT_OLD_TIMEVAL = 1, PT_TIMESPEC = 2, PT_OLD_TIMESPEC = 3, }; static int poll_select_finish(struct timespec64 *end_time, void __user *p, enum poll_time_type pt_type, int ret) { struct timespec64 rts; restore_saved_sigmask_unless(ret == -ERESTARTNOHAND); if (!p) return ret; if (current->personality & STICKY_TIMEOUTS) goto sticky; /* No update for zero timeout */ if (!end_time->tv_sec && !end_time->tv_nsec) return ret; ktime_get_ts64(&rts); rts = timespec64_sub(*end_time, rts); if (rts.tv_sec < 0) rts.tv_sec = rts.tv_nsec = 0; switch (pt_type) { case PT_TIMEVAL: { struct __kernel_old_timeval rtv; if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec)) memset(&rtv, 0, sizeof(rtv)); rtv.tv_sec = rts.tv_sec; rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC; if (!copy_to_user(p, &rtv, sizeof(rtv))) return ret; } break; case PT_OLD_TIMEVAL: { struct old_timeval32 rtv; rtv.tv_sec = rts.tv_sec; rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC; if (!copy_to_user(p, &rtv, sizeof(rtv))) return ret; } break; case PT_TIMESPEC: if (!put_timespec64(&rts, p)) return ret; break; case PT_OLD_TIMESPEC: if (!put_old_timespec32(&rts, p)) return ret; break; default: BUG(); } /* * If an application puts its timeval in read-only memory, we * don't want the Linux-specific update to the timeval to * cause a fault after the select has completed * successfully. However, because we're not updating the * timeval, we can't restart the system call. */ sticky: if (ret == -ERESTARTNOHAND) ret = -EINTR; return ret; } /* * Scalable version of the fd_set. */ typedef struct { unsigned long *in, *out, *ex; unsigned long *res_in, *res_out, *res_ex; } fd_set_bits; /* * How many longwords for "nr" bits? */ #define FDS_BITPERLONG (8*sizeof(long)) #define FDS_LONGS(nr) (((nr)+FDS_BITPERLONG-1)/FDS_BITPERLONG) #define FDS_BYTES(nr) (FDS_LONGS(nr)*sizeof(long)) /* * Use "unsigned long" accesses to let user-mode fd_set's be long-aligned. */ static inline int get_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset) { nr = FDS_BYTES(nr); if (ufdset) return copy_from_user(fdset, ufdset, nr) ? -EFAULT : 0; memset(fdset, 0, nr); return 0; } static inline unsigned long __must_check set_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset) { if (ufdset) return __copy_to_user(ufdset, fdset, FDS_BYTES(nr)); return 0; } static inline void zero_fd_set(unsigned long nr, unsigned long *fdset) { memset(fdset, 0, FDS_BYTES(nr)); } #define FDS_IN(fds, n) (fds->in + n) #define FDS_OUT(fds, n) (fds->out + n) #define FDS_EX(fds, n) (fds->ex + n) #define BITS(fds, n) (*FDS_IN(fds, n)|*FDS_OUT(fds, n)|*FDS_EX(fds, n)) static int max_select_fd(unsigned long n, fd_set_bits *fds) { unsigned long *open_fds; unsigned long set; int max; struct fdtable *fdt; /* handle last in-complete long-word first */ set = ~(~0UL << (n & (BITS_PER_LONG-1))); n /= BITS_PER_LONG; fdt = files_fdtable(current->files); open_fds = fdt->open_fds + n; max = 0; if (set) { set &= BITS(fds, n); if (set) { if (!(set & ~*open_fds)) goto get_max; return -EBADF; } } while (n) { open_fds--; n--; set = BITS(fds, n); if (!set) continue; if (set & ~*open_fds) return -EBADF; if (max) continue; get_max: do { max++; set >>= 1; } while (set); max += n * BITS_PER_LONG; } return max; } #define POLLIN_SET (EPOLLRDNORM | EPOLLRDBAND | EPOLLIN | EPOLLHUP | EPOLLERR |\ EPOLLNVAL) #define POLLOUT_SET (EPOLLWRBAND | EPOLLWRNORM | EPOLLOUT | EPOLLERR |\ EPOLLNVAL) #define POLLEX_SET (EPOLLPRI | EPOLLNVAL) static inline __poll_t select_poll_one(int fd, poll_table *wait, unsigned long in, unsigned long out, unsigned long bit, __poll_t ll_flag) { CLASS(fd, f)(fd); if (fd_empty(f)) return EPOLLNVAL; wait->_key = POLLEX_SET | ll_flag; if (in & bit) wait->_key |= POLLIN_SET; if (out & bit) wait->_key |= POLLOUT_SET; return vfs_poll(fd_file(f), wait); } static noinline_for_stack int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time) { ktime_t expire, *to = NULL; struct poll_wqueues table; poll_table *wait; int retval, i, timed_out = 0; u64 slack = 0; __poll_t busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0; unsigned long busy_start = 0; rcu_read_lock(); retval = max_select_fd(n, fds); rcu_read_unlock(); if (retval < 0) return retval; n = retval; poll_initwait(&table); wait = &table.pt; if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { wait->_qproc = NULL; timed_out = 1; } if (end_time && !timed_out) slack = select_estimate_accuracy(end_time); retval = 0; for (;;) { unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp; bool can_busy_loop = false; inp = fds->in; outp = fds->out; exp = fds->ex; rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex; for (i = 0; i < n; ++rinp, ++routp, ++rexp) { unsigned long in, out, ex, all_bits, bit = 1, j; unsigned long res_in = 0, res_out = 0, res_ex = 0; __poll_t mask; in = *inp++; out = *outp++; ex = *exp++; all_bits = in | out | ex; if (all_bits == 0) { i += BITS_PER_LONG; continue; } for (j = 0; j < BITS_PER_LONG; ++j, ++i, bit <<= 1) { if (i >= n) break; if (!(bit & all_bits)) continue; mask = select_poll_one(i, wait, in, out, bit, busy_flag); if ((mask & POLLIN_SET) && (in & bit)) { res_in |= bit; retval++; wait->_qproc = NULL; } if ((mask & POLLOUT_SET) && (out & bit)) { res_out |= bit; retval++; wait->_qproc = NULL; } if ((mask & POLLEX_SET) && (ex & bit)) { res_ex |= bit; retval++; wait->_qproc = NULL; } /* got something, stop busy polling */ if (retval) { can_busy_loop = false; busy_flag = 0; /* * only remember a returned * POLL_BUSY_LOOP if we asked for it */ } else if (busy_flag & mask) can_busy_loop = true; } if (res_in) *rinp = res_in; if (res_out) *routp = res_out; if (res_ex) *rexp = res_ex; cond_resched(); } wait->_qproc = NULL; if (retval || timed_out || signal_pending(current)) break; if (table.error) { retval = table.error; break; } /* only if found POLL_BUSY_LOOP sockets && not out of time */ if (can_busy_loop && !need_resched()) { if (!busy_start) { busy_start = busy_loop_current_time(); continue; } if (!busy_loop_timeout(busy_start)) continue; } busy_flag = 0; /* * If this is the first loop and we have a timeout * given, then we convert to ktime_t and set the to * pointer to the expiry value. */ if (end_time && !to) { expire = timespec64_to_ktime(*end_time); to = &expire; } if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE, to, slack)) timed_out = 1; } poll_freewait(&table); return retval; } /* * We can actually return ERESTARTSYS instead of EINTR, but I'd * like to be certain this leads to no problems. So I return * EINTR just for safety. * * Update: ERESTARTSYS breaks at least the xview clock binary, so * I'm trying ERESTARTNOHAND which restart only when you want to. */ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct timespec64 *end_time) { fd_set_bits fds; void *bits; int ret, max_fds; size_t size, alloc_size; struct fdtable *fdt; /* Allocate small arguments on the stack to save memory and be faster */ long stack_fds[SELECT_STACK_ALLOC/sizeof(long)]; ret = -EINVAL; if (unlikely(n < 0)) goto out_nofds; /* max_fds can increase, so grab it once to avoid race */ rcu_read_lock(); fdt = files_fdtable(current->files); max_fds = fdt->max_fds; rcu_read_unlock(); if (n > max_fds) n = max_fds; /* * We need 6 bitmaps (in/out/ex for both incoming and outgoing), * since we used fdset we need to allocate memory in units of * long-words. */ size = FDS_BYTES(n); bits = stack_fds; if (size > sizeof(stack_fds) / 6) { /* Not enough space in on-stack array; must use kmalloc */ ret = -ENOMEM; if (size > (SIZE_MAX / 6)) goto out_nofds; alloc_size = 6 * size; bits = kvmalloc(alloc_size, GFP_KERNEL); if (!bits) goto out_nofds; } fds.in = bits; fds.out = bits + size; fds.ex = bits + 2*size; fds.res_in = bits + 3*size; fds.res_out = bits + 4*size; fds.res_ex = bits + 5*size; if ((ret = get_fd_set(n, inp, fds.in)) || (ret = get_fd_set(n, outp, fds.out)) || (ret = get_fd_set(n, exp, fds.ex))) goto out; zero_fd_set(n, fds.res_in); zero_fd_set(n, fds.res_out); zero_fd_set(n, fds.res_ex); ret = do_select(n, &fds, end_time); if (ret < 0) goto out; if (!ret) { ret = -ERESTARTNOHAND; if (signal_pending(current)) goto out; ret = 0; } if (set_fd_set(n, inp, fds.res_in) || set_fd_set(n, outp, fds.res_out) || set_fd_set(n, exp, fds.res_ex)) ret = -EFAULT; out: if (bits != stack_fds) kvfree(bits); out_nofds: return ret; } static int kern_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct __kernel_old_timeval __user *tvp) { struct timespec64 end_time, *to = NULL; struct __kernel_old_timeval tv; int ret; if (tvp) { if (copy_from_user(&tv, tvp, sizeof(tv))) return -EFAULT; to = &end_time; if (poll_select_set_timeout(to, tv.tv_sec + (tv.tv_usec / USEC_PER_SEC), (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC)) return -EINVAL; } ret = core_sys_select(n, inp, outp, exp, to); return poll_select_finish(&end_time, tvp, PT_TIMEVAL, ret); } SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp, fd_set __user *, exp, struct __kernel_old_timeval __user *, tvp) { return kern_select(n, inp, outp, exp, tvp); } static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, void __user *tsp, const sigset_t __user *sigmask, size_t sigsetsize, enum poll_time_type type) { struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { switch (type) { case PT_TIMESPEC: if (get_timespec64(&ts, tsp)) return -EFAULT; break; case PT_OLD_TIMESPEC: if (get_old_timespec32(&ts, tsp)) return -EFAULT; break; default: BUG(); } to = &end_time; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } ret = set_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = core_sys_select(n, inp, outp, exp, to); return poll_select_finish(&end_time, tsp, type, ret); } /* * Most architectures can't handle 7-argument syscalls. So we provide a * 6-argument version where the sixth argument is a pointer to a structure * which has a pointer to the sigset_t itself followed by a size_t containing * the sigset size. */ struct sigset_argpack { sigset_t __user *p; size_t size; }; static inline int get_sigset_argpack(struct sigset_argpack *to, struct sigset_argpack __user *from) { // the path is hot enough for overhead of copy_from_user() to matter if (from) { scoped_user_read_access(from, Efault) { unsafe_get_user(to->p, &from->p, Efault); unsafe_get_user(to->size, &from->size, Efault); } } return 0; Efault: return -EFAULT; } SYSCALL_DEFINE6(pselect6, int, n, fd_set __user *, inp, fd_set __user *, outp, fd_set __user *, exp, struct __kernel_timespec __user *, tsp, void __user *, sig) { struct sigset_argpack x = {NULL, 0}; if (get_sigset_argpack(&x, sig)) return -EFAULT; return do_pselect(n, inp, outp, exp, tsp, x.p, x.size, PT_TIMESPEC); } #if defined(CONFIG_COMPAT_32BIT_TIME) && !defined(CONFIG_64BIT) SYSCALL_DEFINE6(pselect6_time32, int, n, fd_set __user *, inp, fd_set __user *, outp, fd_set __user *, exp, struct old_timespec32 __user *, tsp, void __user *, sig) { struct sigset_argpack x = {NULL, 0}; if (get_sigset_argpack(&x, sig)) return -EFAULT; return do_pselect(n, inp, outp, exp, tsp, x.p, x.size, PT_OLD_TIMESPEC); } #endif #ifdef __ARCH_WANT_SYS_OLD_SELECT struct sel_arg_struct { unsigned long n; fd_set __user *inp, *outp, *exp; struct __kernel_old_timeval __user *tvp; }; SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg) { struct sel_arg_struct a; if (copy_from_user(&a, arg, sizeof(a))) return -EFAULT; return kern_select(a.n, a.inp, a.outp, a.exp, a.tvp); } #endif struct poll_list { struct poll_list *next; unsigned int len; struct pollfd entries[] __counted_by(len); }; #define POLLFD_PER_PAGE ((PAGE_SIZE-sizeof(struct poll_list)) / sizeof(struct pollfd)) /* * Fish for pollable events on the pollfd->fd file descriptor. We're only * interested in events matching the pollfd->events mask, and the result * matching that mask is both recorded in pollfd->revents and returned. The * pwait poll_table will be used by the fd-provided poll handler for waiting, * if pwait->_qproc is non-NULL. */ static inline __poll_t do_pollfd(struct pollfd *pollfd, poll_table *pwait, bool *can_busy_poll, __poll_t busy_flag) { int fd = pollfd->fd; __poll_t mask, filter; if (unlikely(fd < 0)) return 0; CLASS(fd, f)(fd); if (fd_empty(f)) return EPOLLNVAL; /* userland u16 ->events contains POLL... bitmap */ filter = demangle_poll(pollfd->events) | EPOLLERR | EPOLLHUP; pwait->_key = filter | busy_flag; mask = vfs_poll(fd_file(f), pwait); if (mask & busy_flag) *can_busy_poll = true; return mask & filter; /* Mask out unneeded events. */ } static int do_poll(struct poll_list *list, struct poll_wqueues *wait, struct timespec64 *end_time) { poll_table* pt = &wait->pt; ktime_t expire, *to = NULL; int timed_out = 0, count = 0; u64 slack = 0; __poll_t busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0; unsigned long busy_start = 0; /* Optimise the no-wait case */ if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { pt->_qproc = NULL; timed_out = 1; } if (end_time && !timed_out) slack = select_estimate_accuracy(end_time); for (;;) { struct poll_list *walk; bool can_busy_loop = false; for (walk = list; walk != NULL; walk = walk->next) { struct pollfd * pfd, * pfd_end; pfd = walk->entries; pfd_end = pfd + walk->len; for (; pfd != pfd_end; pfd++) { __poll_t mask; /* * Fish for events. If we found one, record it * and kill poll_table->_qproc, so we don't * needlessly register any other waiters after * this. They'll get immediately deregistered * when we break out and return. */ mask = do_pollfd(pfd, pt, &can_busy_loop, busy_flag); pfd->revents = mangle_poll(mask); if (mask) { count++; pt->_qproc = NULL; /* found something, stop busy polling */ busy_flag = 0; can_busy_loop = false; } } } /* * All waiters have already been registered, so don't provide * a poll_table->_qproc to them on the next loop iteration. */ pt->_qproc = NULL; if (!count) { count = wait->error; if (signal_pending(current)) count = -ERESTARTNOHAND; } if (count || timed_out) break; /* only if found POLL_BUSY_LOOP sockets && not out of time */ if (can_busy_loop && !need_resched()) { if (!busy_start) { busy_start = busy_loop_current_time(); continue; } if (!busy_loop_timeout(busy_start)) continue; } busy_flag = 0; /* * If this is the first loop and we have a timeout * given, then we convert to ktime_t and set the to * pointer to the expiry value. */ if (end_time && !to) { expire = timespec64_to_ktime(*end_time); to = &expire; } if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack)) timed_out = 1; } return count; } #define N_STACK_PPS ((sizeof(stack_pps) - sizeof(struct poll_list)) / \ sizeof(struct pollfd)) static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, struct timespec64 *end_time) { struct poll_wqueues table; int err = -EFAULT, fdcount; /* Allocate small arguments on the stack to save memory and be faster - use long to make sure the buffer is aligned properly on 64 bit archs to avoid unaligned access */ long stack_pps[POLL_STACK_ALLOC/sizeof(long)]; struct poll_list *const head = (struct poll_list *)stack_pps; struct poll_list *walk = head; unsigned int todo = nfds; unsigned int len; if (nfds > rlimit(RLIMIT_NOFILE)) return -EINVAL; len = min_t(unsigned int, nfds, N_STACK_PPS); for (;;) { walk->next = NULL; walk->len = len; if (!len) break; if (copy_from_user(walk->entries, ufds + nfds-todo, sizeof(struct pollfd) * walk->len)) goto out_fds; if (walk->len >= todo) break; todo -= walk->len; len = min(todo, POLLFD_PER_PAGE); walk = walk->next = kmalloc_flex(*walk, entries, len); if (!walk) { err = -ENOMEM; goto out_fds; } } poll_initwait(&table); fdcount = do_poll(head, &table, end_time); poll_freewait(&table); if (!user_write_access_begin(ufds, nfds * sizeof(*ufds))) goto out_fds; for (walk = head; walk; walk = walk->next) { struct pollfd *fds = walk->entries; unsigned int j; for (j = walk->len; j; fds++, ufds++, j--) unsafe_put_user(fds->revents, &ufds->revents, Efault); } user_write_access_end(); err = fdcount; out_fds: walk = head->next; while (walk) { struct poll_list *pos = walk; walk = walk->next; kfree(pos); } return err; Efault: user_write_access_end(); err = -EFAULT; goto out_fds; } static long do_restart_poll(struct restart_block *restart_block) { struct pollfd __user *ufds = restart_block->poll.ufds; int nfds = restart_block->poll.nfds; struct timespec64 *to = NULL; int ret; if (restart_block->poll.has_timeout) to = &restart_block->poll.end_time; ret = do_sys_poll(ufds, nfds, to); if (ret == -ERESTARTNOHAND) ret = set_restart_fn(restart_block, do_restart_poll); return ret; } SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds, int, timeout_msecs) { struct timespec64 end_time, *to = NULL; int ret; if (timeout_msecs >= 0) { to = &end_time; poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC, NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC)); } ret = do_sys_poll(ufds, nfds, to); if (ret == -ERESTARTNOHAND) { struct restart_block *restart_block; restart_block = &current->restart_block; restart_block->poll.ufds = ufds; restart_block->poll.nfds = nfds; if (timeout_msecs >= 0) { restart_block->poll.end_time = end_time; restart_block->poll.has_timeout = 1; } else restart_block->poll.has_timeout = 0; ret = set_restart_fn(restart_block, do_restart_poll); } return ret; } SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds, struct __kernel_timespec __user *, tsp, const sigset_t __user *, sigmask, size_t, sigsetsize) { struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { if (get_timespec64(&ts, tsp)) return -EFAULT; to = &end_time; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } ret = set_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = do_sys_poll(ufds, nfds, to); return poll_select_finish(&end_time, tsp, PT_TIMESPEC, ret); } #if defined(CONFIG_COMPAT_32BIT_TIME) && !defined(CONFIG_64BIT) SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds, unsigned int, nfds, struct old_timespec32 __user *, tsp, const sigset_t __user *, sigmask, size_t, sigsetsize) { struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { if (get_old_timespec32(&ts, tsp)) return -EFAULT; to = &end_time; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } ret = set_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = do_sys_poll(ufds, nfds, to); return poll_select_finish(&end_time, tsp, PT_OLD_TIMESPEC, ret); } #endif #ifdef CONFIG_COMPAT #define __COMPAT_NFDBITS (8 * sizeof(compat_ulong_t)) /* * Ooo, nasty. We need here to frob 32-bit unsigned longs to * 64-bit unsigned longs. */ static int compat_get_fd_set(unsigned long nr, compat_ulong_t __user *ufdset, unsigned long *fdset) { if (ufdset) { return compat_get_bitmap(fdset, ufdset, nr); } else { zero_fd_set(nr, fdset); return 0; } } static int compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset, unsigned long *fdset) { if (!ufdset) return 0; return compat_put_bitmap(ufdset, fdset, nr); } /* * This is a virtual copy of sys_select from fs/select.c and probably * should be compared to it from time to time */ /* * We can actually return ERESTARTSYS instead of EINTR, but I'd * like to be certain this leads to no problems. So I return * EINTR just for safety. * * Update: ERESTARTSYS breaks at least the xview clock binary, so * I'm trying ERESTARTNOHAND which restart only when you want to. */ static int compat_core_sys_select(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp, compat_ulong_t __user *exp, struct timespec64 *end_time) { fd_set_bits fds; void *bits; int size, max_fds, ret = -EINVAL; struct fdtable *fdt; long stack_fds[SELECT_STACK_ALLOC/sizeof(long)]; if (n < 0) goto out_nofds; /* max_fds can increase, so grab it once to avoid race */ rcu_read_lock(); fdt = files_fdtable(current->files); max_fds = fdt->max_fds; rcu_read_unlock(); if (n > max_fds) n = max_fds; /* * We need 6 bitmaps (in/out/ex for both incoming and outgoing), * since we used fdset we need to allocate memory in units of * long-words. */ size = FDS_BYTES(n); bits = stack_fds; if (size > sizeof(stack_fds) / 6) { bits = kmalloc_array(6, size, GFP_KERNEL); ret = -ENOMEM; if (!bits) goto out_nofds; } fds.in = (unsigned long *) bits; fds.out = (unsigned long *) (bits + size); fds.ex = (unsigned long *) (bits + 2*size); fds.res_in = (unsigned long *) (bits + 3*size); fds.res_out = (unsigned long *) (bits + 4*size); fds.res_ex = (unsigned long *) (bits + 5*size); if ((ret = compat_get_fd_set(n, inp, fds.in)) || (ret = compat_get_fd_set(n, outp, fds.out)) || (ret = compat_get_fd_set(n, exp, fds.ex))) goto out; zero_fd_set(n, fds.res_in); zero_fd_set(n, fds.res_out); zero_fd_set(n, fds.res_ex); ret = do_select(n, &fds, end_time); if (ret < 0) goto out; if (!ret) { ret = -ERESTARTNOHAND; if (signal_pending(current)) goto out; ret = 0; } if (compat_set_fd_set(n, inp, fds.res_in) || compat_set_fd_set(n, outp, fds.res_out) || compat_set_fd_set(n, exp, fds.res_ex)) ret = -EFAULT; out: if (bits != stack_fds) kfree(bits); out_nofds: return ret; } static int do_compat_select(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp, compat_ulong_t __user *exp, struct old_timeval32 __user *tvp) { struct timespec64 end_time, *to = NULL; struct old_timeval32 tv; int ret; if (tvp) { if (copy_from_user(&tv, tvp, sizeof(tv))) return -EFAULT; to = &end_time; if (poll_select_set_timeout(to, tv.tv_sec + (tv.tv_usec / USEC_PER_SEC), (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC)) return -EINVAL; } ret = compat_core_sys_select(n, inp, outp, exp, to); return poll_select_finish(&end_time, tvp, PT_OLD_TIMEVAL, ret); } COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp, compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, struct old_timeval32 __user *, tvp) { return do_compat_select(n, inp, outp, exp, tvp); } struct compat_sel_arg_struct { compat_ulong_t n; compat_uptr_t inp; compat_uptr_t outp; compat_uptr_t exp; compat_uptr_t tvp; }; COMPAT_SYSCALL_DEFINE1(old_select, struct compat_sel_arg_struct __user *, arg) { struct compat_sel_arg_struct a; if (copy_from_user(&a, arg, sizeof(a))) return -EFAULT; return do_compat_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp), compat_ptr(a.exp), compat_ptr(a.tvp)); } static long do_compat_pselect(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp, compat_ulong_t __user *exp, void __user *tsp, compat_sigset_t __user *sigmask, compat_size_t sigsetsize, enum poll_time_type type) { struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { switch (type) { case PT_OLD_TIMESPEC: if (get_old_timespec32(&ts, tsp)) return -EFAULT; break; case PT_TIMESPEC: if (get_timespec64(&ts, tsp)) return -EFAULT; break; default: BUG(); } to = &end_time; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } ret = set_compat_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = compat_core_sys_select(n, inp, outp, exp, to); return poll_select_finish(&end_time, tsp, type, ret); } struct compat_sigset_argpack { compat_uptr_t p; compat_size_t size; }; static inline int get_compat_sigset_argpack(struct compat_sigset_argpack *to, struct compat_sigset_argpack __user *from) { if (from) { if (!user_read_access_begin(from, sizeof(*from))) return -EFAULT; unsafe_get_user(to->p, &from->p, Efault); unsafe_get_user(to->size, &from->size, Efault); user_read_access_end(); } return 0; Efault: user_read_access_end(); return -EFAULT; } COMPAT_SYSCALL_DEFINE6(pselect6_time64, int, n, compat_ulong_t __user *, inp, compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, struct __kernel_timespec __user *, tsp, void __user *, sig) { struct compat_sigset_argpack x = {0, 0}; if (get_compat_sigset_argpack(&x, sig)) return -EFAULT; return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(x.p), x.size, PT_TIMESPEC); } #if defined(CONFIG_COMPAT_32BIT_TIME) COMPAT_SYSCALL_DEFINE6(pselect6_time32, int, n, compat_ulong_t __user *, inp, compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, struct old_timespec32 __user *, tsp, void __user *, sig) { struct compat_sigset_argpack x = {0, 0}; if (get_compat_sigset_argpack(&x, sig)) return -EFAULT; return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(x.p), x.size, PT_OLD_TIMESPEC); } #endif #if defined(CONFIG_COMPAT_32BIT_TIME) COMPAT_SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds, unsigned int, nfds, struct old_timespec32 __user *, tsp, const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) { struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { if (get_old_timespec32(&ts, tsp)) return -EFAULT; to = &end_time; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } ret = set_compat_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = do_sys_poll(ufds, nfds, to); return poll_select_finish(&end_time, tsp, PT_OLD_TIMESPEC, ret); } #endif /* New compat syscall for 64 bit time_t*/ COMPAT_SYSCALL_DEFINE5(ppoll_time64, struct pollfd __user *, ufds, unsigned int, nfds, struct __kernel_timespec __user *, tsp, const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) { struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { if (get_timespec64(&ts, tsp)) return -EFAULT; to = &end_time; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } ret = set_compat_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = do_sys_poll(ufds, nfds, to); return poll_select_finish(&end_time, tsp, PT_TIMESPEC, ret); } #endif
283 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef BTRFS_BLOCK_GROUP_H #define BTRFS_BLOCK_GROUP_H #include <linux/atomic.h> #include <linux/mutex.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/refcount.h> #include <linux/wait.h> #include <linux/sizes.h> #include <linux/rwsem.h> #include <linux/rbtree.h> #include <uapi/linux/btrfs_tree.h> #include "free-space-cache.h" struct btrfs_chunk_map; struct btrfs_fs_info; struct btrfs_inode; struct btrfs_trans_handle; enum btrfs_disk_cache_state { BTRFS_DC_WRITTEN, BTRFS_DC_ERROR, BTRFS_DC_CLEAR, BTRFS_DC_SETUP, }; enum btrfs_block_group_size_class { /* Unset */ BTRFS_BG_SZ_NONE, /* 0 < size <= 128K */ BTRFS_BG_SZ_SMALL, /* 128K < size <= 8M */ BTRFS_BG_SZ_MEDIUM, /* 8M < size < BG_LENGTH */ BTRFS_BG_SZ_LARGE, }; /* * This describes the state of the block_group for async discard. This is due * to the two pass nature of it where extent discarding is prioritized over * bitmap discarding. BTRFS_DISCARD_RESET_CURSOR is set when we are resetting * between lists to prevent contention for discard state variables * (eg. discard_cursor). */ enum btrfs_discard_state { BTRFS_DISCARD_EXTENTS, BTRFS_DISCARD_BITMAPS, BTRFS_DISCARD_RESET_CURSOR, BTRFS_DISCARD_FULLY_REMAPPED, }; /* * Control flags for do_chunk_alloc's force field CHUNK_ALLOC_NO_FORCE means to * only allocate a chunk if we really need one. * * CHUNK_ALLOC_LIMITED means to only try and allocate one if we have very few * chunks already allocated. This is used as part of the clustering code to * help make sure we have a good pool of storage to cluster in, without filling * the FS with empty chunks * * CHUNK_ALLOC_FORCE means it must try to allocate one * * CHUNK_ALLOC_FORCE_FOR_EXTENT like CHUNK_ALLOC_FORCE but called from * find_free_extent() that also activates the zone */ enum btrfs_chunk_alloc_enum { CHUNK_ALLOC_NO_FORCE, CHUNK_ALLOC_LIMITED, CHUNK_ALLOC_FORCE, CHUNK_ALLOC_FORCE_FOR_EXTENT, }; /* Block group flags set at runtime */ enum btrfs_block_group_flags { BLOCK_GROUP_FLAG_IREF, BLOCK_GROUP_FLAG_REMOVED, BLOCK_GROUP_FLAG_TO_COPY, BLOCK_GROUP_FLAG_RELOCATING_REPAIR, BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED, BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, /* Does the block group need to be added to the free space tree? */ BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, /* Set after we add a new block group to the free space tree. */ BLOCK_GROUP_FLAG_FREE_SPACE_ADDED, /* Indicate that the block group is placed on a sequential zone */ BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, /* * Indicate that block group is in the list of new block groups of a * transaction. */ BLOCK_GROUP_FLAG_NEW, BLOCK_GROUP_FLAG_FULLY_REMAPPED, BLOCK_GROUP_FLAG_STRIPE_REMOVAL_PENDING, }; enum btrfs_caching_type { BTRFS_CACHE_NO, BTRFS_CACHE_STARTED, BTRFS_CACHE_FINISHED, BTRFS_CACHE_ERROR, }; struct btrfs_caching_control { struct list_head list; struct mutex mutex; wait_queue_head_t wait; struct btrfs_work work; struct btrfs_block_group *block_group; /* Track progress of caching during allocation. */ atomic_t progress; refcount_t count; }; /* Once caching_thread() finds this much free space, it will wake up waiters. */ #define CACHING_CTL_WAKE_UP SZ_2M struct btrfs_block_group { struct btrfs_fs_info *fs_info; struct btrfs_inode *inode; spinlock_t lock; u64 start; u64 length; u64 pinned; u64 reserved; u64 used; u64 delalloc_bytes; u64 bytes_super; u64 flags; u64 cache_generation; u64 global_root_id; u64 remap_bytes; u32 identity_remap_count; /* * The last committed used bytes of this block group, if the above @used * is still the same as @last_used, we don't need to update block * group item of this block group. */ u64 last_used; /* The last committed remap_bytes value of this block group. */ u64 last_remap_bytes; /* The last commited identity_remap_count value of this block group. */ u32 last_identity_remap_count; /* The last committed flags value for this block group. */ u64 last_flags; /* * If the free space extent count exceeds this number, convert the block * group to bitmaps. */ u32 bitmap_high_thresh; /* * If the free space extent count drops below this number, convert the * block group back to extents. */ u32 bitmap_low_thresh; /* * It is just used for the delayed data space allocation because * only the data space allocation and the relative metadata update * can be done cross the transaction. */ struct rw_semaphore data_rwsem; /* For raid56, this is a full stripe, without parity */ unsigned long full_stripe_len; unsigned long runtime_flags; unsigned int ro; int disk_cache_state; /* Cache tracking stuff */ int cached; struct btrfs_caching_control *caching_ctl; struct btrfs_space_info *space_info; /* Free space cache stuff */ struct btrfs_free_space_ctl *free_space_ctl; /* Block group cache stuff */ struct rb_node cache_node; /* For block groups in the same raid type */ struct list_head list; refcount_t refs; /* * List of struct btrfs_free_clusters for this block group. * Today it will only have one thing on it, but that may change */ struct list_head cluster_list; /* * Used for several lists: * * 1) struct btrfs_fs_info::unused_bgs * 2) struct btrfs_fs_info::reclaim_bgs * 3) struct btrfs_transaction::deleted_bgs * 4) struct btrfs_trans_handle::new_bgs */ struct list_head bg_list; /* For read-only block groups */ struct list_head ro_list; /* * When non-zero it means the block group's logical address and its * device extents can not be reused for future block group allocations * until the counter goes down to 0. This is to prevent them from being * reused while some task is still using the block group after it was * deleted - we want to make sure they can only be reused for new block * groups after that task is done with the deleted block group. */ atomic_t frozen; /* For discard operations */ struct list_head discard_list; int discard_index; u64 discard_eligible_time; u64 discard_cursor; enum btrfs_discard_state discard_state; /* For dirty block groups */ struct list_head dirty_list; struct list_head io_list; struct btrfs_io_ctl io_ctl; /* * Incremented when doing extent allocations and holding a read lock * on the space_info's groups_sem semaphore. * Decremented when an ordered extent that represents an IO against this * block group's range is created (after it's added to its inode's * root's list of ordered extents) or immediately after the allocation * if it's a metadata extent or fallocate extent (for these cases we * don't create ordered extents). */ atomic_t reservations; /* * Incremented while holding the spinlock *lock* by a task checking if * it can perform a nocow write (incremented if the value for the *ro* * field is 0). Decremented by such tasks once they create an ordered * extent or before that if some error happens before reaching that step. * This is to prevent races between block group relocation and nocow * writes through direct IO. */ atomic_t nocow_writers; /* Lock for free space tree operations. */ struct mutex free_space_lock; /* Protected by @free_space_lock. */ bool using_free_space_bitmaps; /* Protected by @free_space_lock. */ bool using_free_space_bitmaps_cached; /* * Number of extents in this block group used for swap files. * All accesses protected by the spinlock 'lock'. */ int swap_extents; /* * Allocation offset for the block group to implement sequential * allocation. This is used only on a zoned filesystem. */ u64 alloc_offset; u64 zone_unusable; u64 zone_capacity; u64 meta_write_pointer; struct btrfs_chunk_map *physical_map; struct list_head active_bg_list; struct work_struct zone_finish_work; struct extent_buffer *last_eb; enum btrfs_block_group_size_class size_class; u64 reclaim_mark; }; static inline u64 btrfs_block_group_end(const struct btrfs_block_group *block_group) { return (block_group->start + block_group->length); } static inline bool btrfs_is_block_group_used(const struct btrfs_block_group *bg) { lockdep_assert_held(&bg->lock); return (bg->used > 0 || bg->reserved > 0 || bg->pinned > 0 || bg->remap_bytes > 0); } static inline bool btrfs_is_block_group_data_only(const struct btrfs_block_group *block_group) { /* * In mixed mode the fragmentation is expected to be high, lowering the * efficiency, so only proper data block groups are considered. */ return (block_group->flags & BTRFS_BLOCK_GROUP_DATA) && !(block_group->flags & BTRFS_BLOCK_GROUP_METADATA); } static inline u64 btrfs_block_group_available_space(const struct btrfs_block_group *bg) { lockdep_assert_held(&bg->lock); return (bg->length - bg->used - bg->pinned - bg->reserved - bg->bytes_super - bg->zone_unusable); } #ifdef CONFIG_BTRFS_DEBUG int btrfs_should_fragment_free_space(const struct btrfs_block_group *block_group); #endif struct btrfs_block_group *btrfs_lookup_first_block_group( struct btrfs_fs_info *info, u64 bytenr); struct btrfs_block_group *btrfs_lookup_block_group( struct btrfs_fs_info *info, u64 bytenr); struct btrfs_block_group *btrfs_next_block_group( struct btrfs_block_group *cache); void btrfs_get_block_group(struct btrfs_block_group *cache); void btrfs_put_block_group(struct btrfs_block_group *cache); void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, const u64 start); void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg); struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr); void btrfs_dec_nocow_writers(struct btrfs_block_group *bg); void btrfs_wait_nocow_writers(struct btrfs_block_group *bg); void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache, u64 num_bytes); int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait); struct btrfs_caching_control *btrfs_get_caching_control( struct btrfs_block_group *cache); int btrfs_add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end, u64 *total_added_ret); struct btrfs_trans_handle *btrfs_start_trans_remove_block_group( struct btrfs_fs_info *fs_info, const u64 chunk_offset); void btrfs_remove_bg_from_sinfo(struct btrfs_block_group *bg); int btrfs_remove_block_group(struct btrfs_trans_handle *trans, struct btrfs_chunk_map *map); void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info); void btrfs_mark_bg_unused(struct btrfs_block_group *bg); void btrfs_reclaim_bgs_work(struct work_struct *work); void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info); void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg); int btrfs_read_block_groups(struct btrfs_fs_info *info); struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans, struct btrfs_space_info *space_info, u64 type, u64 chunk_offset, u64 size); void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans); int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, bool do_chunk_alloc); void btrfs_dec_block_group_ro(struct btrfs_block_group *cache); int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans); int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans); int btrfs_setup_space_cache(struct btrfs_trans_handle *trans); int btrfs_update_block_group(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, bool alloc); int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, u64 ram_bytes, u64 num_bytes, bool delalloc, bool force_wrong_size_class); void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, u64 num_bytes, bool is_delalloc); int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_space_info *space_info, u64 flags, enum btrfs_chunk_alloc_enum force); int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type); void check_system_chunk(struct btrfs_trans_handle *trans, const u64 type); void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans, bool is_item_insertion); u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags); void btrfs_put_block_group_cache(struct btrfs_fs_info *info); int btrfs_free_block_groups(struct btrfs_fs_info *info); int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, u64 physical, u64 **logical, int *naddrs, int *stripe_len); static inline u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info) { return btrfs_get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_DATA); } static inline u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info) { return btrfs_get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_METADATA); } static inline u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info) { return btrfs_get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); } static inline int btrfs_block_group_done(const struct btrfs_block_group *cache) { smp_mb(); return cache->cached == BTRFS_CACHE_FINISHED || cache->cached == BTRFS_CACHE_ERROR; } void btrfs_freeze_block_group(struct btrfs_block_group *cache); void btrfs_unfreeze_block_group(struct btrfs_block_group *cache); bool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg); void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount); enum btrfs_block_group_size_class btrfs_calc_block_group_size_class(u64 size); int btrfs_use_block_group_size_class(struct btrfs_block_group *bg, enum btrfs_block_group_size_class size_class, bool force_wrong_size_class); bool btrfs_block_group_should_use_size_class(const struct btrfs_block_group *bg); void btrfs_mark_bg_fully_remapped(struct btrfs_block_group *bg, struct btrfs_trans_handle *trans); int btrfs_populate_fully_remapped_bgs_list(struct btrfs_fs_info *fs_info); #endif /* BTRFS_BLOCK_GROUP_H */
3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 // SPDX-License-Identifier: GPL-2.0-only #include <linux/etherdevice.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/init.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/workqueue.h> #include <linux/spinlock.h> #include <linux/netfilter/nf_conntrack_common.h> #include <linux/netfilter/nf_tables.h> #include <net/ip.h> #include <net/flow.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_extend.h> #include <net/netfilter/nf_flow_table.h> struct nft_flow_offload { struct nft_flowtable *flowtable; }; static bool nft_flow_offload_skip(struct sk_buff *skb, int family) { if (skb_sec_path(skb)) return true; if (family == NFPROTO_IPV4) { const struct ip_options *opt; opt = &(IPCB(skb)->opt); if (unlikely(opt->optlen)) return true; } return false; } static void flow_offload_ct_tcp(struct nf_conn *ct) { /* conntrack will not see all packets, disable tcp window validation. */ spin_lock_bh(&ct->lock); ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; spin_unlock_bh(&ct->lock); } static void nft_flow_offload_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_flow_offload *priv = nft_expr_priv(expr); struct nf_flowtable *flowtable = &priv->flowtable->data; struct tcphdr _tcph, *tcph = NULL; struct nf_flow_route route = {}; enum ip_conntrack_info ctinfo; struct flow_offload *flow; enum ip_conntrack_dir dir; struct nf_conn *ct; int ret; if (nft_flow_offload_skip(pkt->skb, nft_pf(pkt))) goto out; ct = nf_ct_get(pkt->skb, &ctinfo); if (!ct) goto out; switch (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum) { case IPPROTO_TCP: tcph = skb_header_pointer(pkt->skb, nft_thoff(pkt), sizeof(_tcph), &_tcph); if (unlikely(!tcph || tcph->fin || tcph->rst || !nf_conntrack_tcp_established(ct))) goto out; break; case IPPROTO_UDP: break; #ifdef CONFIG_NF_CT_PROTO_GRE case IPPROTO_GRE: { struct nf_conntrack_tuple *tuple; if (ct->status & IPS_NAT_MASK) goto out; tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; /* No support for GRE v1 */ if (tuple->src.u.gre.key || tuple->dst.u.gre.key) goto out; break; } #endif default: goto out; } if (nf_ct_ext_exist(ct, NF_CT_EXT_HELPER) || ct->status & (IPS_SEQ_ADJUST | IPS_NAT_CLASH)) goto out; if (!nf_ct_is_confirmed(ct)) goto out; if (test_and_set_bit(IPS_OFFLOAD_BIT, &ct->status)) goto out; dir = CTINFO2DIR(ctinfo); if (nft_flow_route(pkt, ct, &route, dir, priv->flowtable) < 0) goto err_flow_route; flow = flow_offload_alloc(ct); if (!flow) goto err_flow_alloc; flow_offload_route_init(flow, &route); if (tcph) flow_offload_ct_tcp(ct); __set_bit(NF_FLOW_HW_BIDIRECTIONAL, &flow->flags); ret = flow_offload_add(flowtable, flow); if (ret < 0) goto err_flow_add; return; err_flow_add: flow_offload_free(flow); err_flow_alloc: dst_release(route.tuple[dir].dst); dst_release(route.tuple[!dir].dst); err_flow_route: clear_bit(IPS_OFFLOAD_BIT, &ct->status); out: regs->verdict.code = NFT_BREAK; } static int nft_flow_offload_validate(const struct nft_ctx *ctx, const struct nft_expr *expr) { unsigned int hook_mask = (1 << NF_INET_FORWARD); if (ctx->family != NFPROTO_IPV4 && ctx->family != NFPROTO_IPV6 && ctx->family != NFPROTO_INET) return -EOPNOTSUPP; return nft_chain_validate_hooks(ctx->chain, hook_mask); } static const struct nla_policy nft_flow_offload_policy[NFTA_FLOW_MAX + 1] = { [NFTA_FLOW_TABLE_NAME] = { .type = NLA_STRING, .len = NFT_NAME_MAXLEN - 1 }, }; static int nft_flow_offload_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_flow_offload *priv = nft_expr_priv(expr); u8 genmask = nft_genmask_next(ctx->net); struct nft_flowtable *flowtable; if (!tb[NFTA_FLOW_TABLE_NAME]) return -EINVAL; flowtable = nft_flowtable_lookup(ctx->net, ctx->table, tb[NFTA_FLOW_TABLE_NAME], genmask); if (IS_ERR(flowtable)) return PTR_ERR(flowtable); if (!nft_use_inc(&flowtable->use)) return -EMFILE; priv->flowtable = flowtable; return nf_ct_netns_get(ctx->net, ctx->family); } static void nft_flow_offload_deactivate(const struct nft_ctx *ctx, const struct nft_expr *expr, enum nft_trans_phase phase) { struct nft_flow_offload *priv = nft_expr_priv(expr); nf_tables_deactivate_flowtable(ctx, priv->flowtable, phase); } static void nft_flow_offload_activate(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct nft_flow_offload *priv = nft_expr_priv(expr); nft_use_inc_restore(&priv->flowtable->use); } static void nft_flow_offload_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { nf_ct_netns_put(ctx->net, ctx->family); } static int nft_flow_offload_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { struct nft_flow_offload *priv = nft_expr_priv(expr); if (nla_put_string(skb, NFTA_FLOW_TABLE_NAME, priv->flowtable->name)) goto nla_put_failure; return 0; nla_put_failure: return -1; } static struct nft_expr_type nft_flow_offload_type; static const struct nft_expr_ops nft_flow_offload_ops = { .type = &nft_flow_offload_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_flow_offload)), .eval = nft_flow_offload_eval, .init = nft_flow_offload_init, .activate = nft_flow_offload_activate, .deactivate = nft_flow_offload_deactivate, .destroy = nft_flow_offload_destroy, .validate = nft_flow_offload_validate, .dump = nft_flow_offload_dump, .reduce = NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_flow_offload_type __read_mostly = { .name = "flow_offload", .ops = &nft_flow_offload_ops, .policy = nft_flow_offload_policy, .maxattr = NFTA_FLOW_MAX, .owner = THIS_MODULE, }; static int flow_offload_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); if (event != NETDEV_DOWN) return NOTIFY_DONE; nf_flow_table_cleanup(dev); return NOTIFY_DONE; } static struct notifier_block flow_offload_netdev_notifier = { .notifier_call = flow_offload_netdev_event, }; static int __init nft_flow_offload_module_init(void) { int err; err = register_netdevice_notifier(&flow_offload_netdev_notifier); if (err) goto err; err = nft_register_expr(&nft_flow_offload_type); if (err < 0) goto register_expr; return 0; register_expr: unregister_netdevice_notifier(&flow_offload_netdev_notifier); err: return err; } static void __exit nft_flow_offload_module_exit(void) { nft_unregister_expr(&nft_flow_offload_type); unregister_netdevice_notifier(&flow_offload_netdev_notifier); } module_init(nft_flow_offload_module_init); module_exit(nft_flow_offload_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); MODULE_ALIAS_NFT_EXPR("flow_offload"); MODULE_DESCRIPTION("nftables hardware flow offload module");
91 91 6 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 // SPDX-License-Identifier: GPL-2.0 #include <linux/cache.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/pid_namespace.h> #include "internal.h" /* * /proc/self: */ static const char *proc_self_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { struct pid_namespace *ns = proc_pid_ns(inode->i_sb); pid_t tgid = task_tgid_nr_ns(current, ns); char *name; if (!tgid) return ERR_PTR(-ENOENT); /* max length of unsigned int in decimal + NULL term */ name = kmalloc(10 + 1, dentry ? GFP_KERNEL : GFP_ATOMIC); if (unlikely(!name)) return dentry ? ERR_PTR(-ENOMEM) : ERR_PTR(-ECHILD); sprintf(name, "%u", tgid); set_delayed_call(done, kfree_link, name); return name; } static const struct inode_operations proc_self_inode_operations = { .get_link = proc_self_get_link, }; unsigned self_inum __ro_after_init; int proc_setup_self(struct super_block *s) { struct inode *root_inode = d_inode(s->s_root); struct dentry *self; int ret = -ENOMEM; inode_lock(root_inode); self = d_alloc_name(s->s_root, "self"); if (self) { struct inode *inode = new_inode(s); if (inode) { inode->i_ino = self_inum; simple_inode_init_ts(inode); inode->i_mode = S_IFLNK | S_IRWXUGO; inode->i_uid = GLOBAL_ROOT_UID; inode->i_gid = GLOBAL_ROOT_GID; inode->i_op = &proc_self_inode_operations; d_make_persistent(self, inode); ret = 0; } dput(self); } inode_unlock(root_inode); if (ret) pr_err("proc_fill_super: can't allocate /proc/self\n"); return ret; } void __init proc_self_init(void) { proc_alloc_inum(&self_inum); }
2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 /* SPDX-License-Identifier: GPL-2.0 */ /* * include/linux/userfaultfd_k.h * * Copyright (C) 2015 Red Hat, Inc. * */ #ifndef _LINUX_USERFAULTFD_K_H #define _LINUX_USERFAULTFD_K_H #ifdef CONFIG_USERFAULTFD #include <linux/userfaultfd.h> /* linux/include/uapi/linux/userfaultfd.h */ #include <linux/fcntl.h> #include <linux/mm.h> #include <linux/swap.h> #include <linux/leafops.h> #include <asm-generic/pgtable_uffd.h> #include <linux/hugetlb_inline.h> /* The set of all possible UFFD-related VM flags. */ #define __VM_UFFD_FLAGS (VM_UFFD_MISSING | VM_UFFD_WP | VM_UFFD_MINOR) /* * CAREFUL: Check include/uapi/asm-generic/fcntl.h when defining * new flags, since they might collide with O_* ones. We want * to re-use O_* flags that couldn't possibly have a meaning * from userfaultfd, in order to leave a free define-space for * shared O_* flags. */ #define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK) /* * Start with fault_pending_wqh and fault_wqh so they're more likely * to be in the same cacheline. * * Locking order: * fd_wqh.lock * fault_pending_wqh.lock * fault_wqh.lock * event_wqh.lock * * To avoid deadlocks, IRQs must be disabled when taking any of the above locks, * since fd_wqh.lock is taken by aio_poll() while it's holding a lock that's * also taken in IRQ context. */ struct userfaultfd_ctx { /* waitqueue head for the pending (i.e. not read) userfaults */ wait_queue_head_t fault_pending_wqh; /* waitqueue head for the userfaults */ wait_queue_head_t fault_wqh; /* waitqueue head for the pseudo fd to wakeup poll/read */ wait_queue_head_t fd_wqh; /* waitqueue head for events */ wait_queue_head_t event_wqh; /* a refile sequence protected by fault_pending_wqh lock */ seqcount_spinlock_t refile_seq; /* pseudo fd refcounting */ refcount_t refcount; /* userfaultfd syscall flags */ unsigned int flags; /* features requested from the userspace */ unsigned int features; /* released */ bool released; /* * Prevents userfaultfd operations (fill/move/wp) from happening while * some non-cooperative event(s) is taking place. Increments are done * in write-mode. Whereas, userfaultfd operations, which includes * reading mmap_changing, is done under read-mode. */ struct rw_semaphore map_changing_lock; /* memory mappings are changing because of non-cooperative event */ atomic_t mmap_changing; /* mm with one ore more vmas attached to this userfaultfd_ctx */ struct mm_struct *mm; }; extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason); /* A combined operation mode + behavior flags. */ typedef unsigned int __bitwise uffd_flags_t; /* Mutually exclusive modes of operation. */ enum mfill_atomic_mode { MFILL_ATOMIC_COPY, MFILL_ATOMIC_ZEROPAGE, MFILL_ATOMIC_CONTINUE, MFILL_ATOMIC_POISON, NR_MFILL_ATOMIC_MODES, }; #define MFILL_ATOMIC_MODE_BITS (const_ilog2(NR_MFILL_ATOMIC_MODES - 1) + 1) #define MFILL_ATOMIC_BIT(nr) BIT(MFILL_ATOMIC_MODE_BITS + (nr)) #define MFILL_ATOMIC_FLAG(nr) ((__force uffd_flags_t) MFILL_ATOMIC_BIT(nr)) #define MFILL_ATOMIC_MODE_MASK ((__force uffd_flags_t) (MFILL_ATOMIC_BIT(0) - 1)) static inline bool uffd_flags_mode_is(uffd_flags_t flags, enum mfill_atomic_mode expected) { return (flags & MFILL_ATOMIC_MODE_MASK) == ((__force uffd_flags_t) expected); } static inline uffd_flags_t uffd_flags_set_mode(uffd_flags_t flags, enum mfill_atomic_mode mode) { flags &= ~MFILL_ATOMIC_MODE_MASK; return flags | ((__force uffd_flags_t) mode); } /* Flags controlling behavior. These behavior changes are mode-independent. */ #define MFILL_ATOMIC_WP MFILL_ATOMIC_FLAG(0) extern int mfill_atomic_install_pte(pmd_t *dst_pmd, struct vm_area_struct *dst_vma, unsigned long dst_addr, struct page *page, bool newly_allocated, uffd_flags_t flags); extern ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start, unsigned long src_start, unsigned long len, uffd_flags_t flags); extern ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx, unsigned long dst_start, unsigned long len); extern ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long dst_start, unsigned long len, uffd_flags_t flags); extern ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start, unsigned long len, uffd_flags_t flags); extern int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start, unsigned long len, bool enable_wp); extern long uffd_wp_range(struct vm_area_struct *vma, unsigned long start, unsigned long len, bool enable_wp); /* move_pages */ void double_pt_lock(spinlock_t *ptl1, spinlock_t *ptl2); void double_pt_unlock(spinlock_t *ptl1, spinlock_t *ptl2); ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start, unsigned long src_start, unsigned long len, __u64 flags); int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, unsigned long dst_addr, unsigned long src_addr); /* mm helpers */ static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma, struct vm_userfaultfd_ctx vm_ctx) { return vma->vm_userfaultfd_ctx.ctx == vm_ctx.ctx; } /* * Never enable huge pmd sharing on some uffd registered vmas: * * - VM_UFFD_WP VMAs, because write protect information is per pgtable entry. * * - VM_UFFD_MINOR VMAs, because otherwise we would never get minor faults for * VMAs which share huge pmds. (If you have two mappings to the same * underlying pages, and fault in the non-UFFD-registered one with a write, * with huge pmd sharing this would *also* setup the second UFFD-registered * mapping, and we'd not get minor faults.) */ static inline bool uffd_disable_huge_pmd_share(struct vm_area_struct *vma) { return vma->vm_flags & (VM_UFFD_WP | VM_UFFD_MINOR); } /* * Don't do fault around for either WP or MINOR registered uffd range. For * MINOR registered range, fault around will be a total disaster and ptes can * be installed without notifications; for WP it should mostly be fine as long * as the fault around checks for pte_none() before the installation, however * to be super safe we just forbid it. */ static inline bool uffd_disable_fault_around(struct vm_area_struct *vma) { return vma->vm_flags & (VM_UFFD_WP | VM_UFFD_MINOR); } static inline bool userfaultfd_missing(struct vm_area_struct *vma) { return vma->vm_flags & VM_UFFD_MISSING; } static inline bool userfaultfd_wp(struct vm_area_struct *vma) { return vma->vm_flags & VM_UFFD_WP; } static inline bool userfaultfd_minor(struct vm_area_struct *vma) { return vma->vm_flags & VM_UFFD_MINOR; } static inline bool userfaultfd_pte_wp(struct vm_area_struct *vma, pte_t pte) { return userfaultfd_wp(vma) && pte_uffd_wp(pte); } static inline bool userfaultfd_huge_pmd_wp(struct vm_area_struct *vma, pmd_t pmd) { return userfaultfd_wp(vma) && pmd_uffd_wp(pmd); } static inline bool userfaultfd_armed(struct vm_area_struct *vma) { return vma->vm_flags & __VM_UFFD_FLAGS; } static inline bool vma_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags, bool wp_async) { vm_flags &= __VM_UFFD_FLAGS; if (vma->vm_flags & VM_DROPPABLE) return false; if ((vm_flags & VM_UFFD_MINOR) && (!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma))) return false; /* * If wp async enabled, and WP is the only mode enabled, allow any * memory type. */ if (wp_async && (vm_flags == VM_UFFD_WP)) return true; /* * If user requested uffd-wp but not enabled pte markers for * uffd-wp, then shmem & hugetlbfs are not supported but only * anonymous. */ if (!uffd_supports_wp_marker() && (vm_flags & VM_UFFD_WP) && !vma_is_anonymous(vma)) return false; /* By default, allow any of anon|shmem|hugetlb */ return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) || vma_is_shmem(vma); } static inline bool vma_has_uffd_without_event_remap(struct vm_area_struct *vma) { struct userfaultfd_ctx *uffd_ctx = vma->vm_userfaultfd_ctx.ctx; return uffd_ctx && (uffd_ctx->features & UFFD_FEATURE_EVENT_REMAP) == 0; } extern int dup_userfaultfd(struct vm_area_struct *, struct list_head *); extern void dup_userfaultfd_complete(struct list_head *); void dup_userfaultfd_fail(struct list_head *); extern void mremap_userfaultfd_prep(struct vm_area_struct *, struct vm_userfaultfd_ctx *); extern void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *, unsigned long from, unsigned long to, unsigned long len); void mremap_userfaultfd_fail(struct vm_userfaultfd_ctx *); extern bool userfaultfd_remove(struct vm_area_struct *vma, unsigned long start, unsigned long end); extern int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start, unsigned long end, struct list_head *uf); extern void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf); extern bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma); extern bool userfaultfd_wp_async(struct vm_area_struct *vma); void userfaultfd_reset_ctx(struct vm_area_struct *vma); struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end); int userfaultfd_register_range(struct userfaultfd_ctx *ctx, struct vm_area_struct *vma, vm_flags_t vm_flags, unsigned long start, unsigned long end, bool wp_async); void userfaultfd_release_new(struct userfaultfd_ctx *ctx); void userfaultfd_release_all(struct mm_struct *mm, struct userfaultfd_ctx *ctx); static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma) { /* Only wr-protect mode uses pte markers */ if (!userfaultfd_wp(vma)) return false; /* File-based uffd-wp always need markers */ if (!vma_is_anonymous(vma)) return true; /* * Anonymous uffd-wp only needs the markers if WP_UNPOPULATED * enabled (to apply markers on zero pages). */ return userfaultfd_wp_unpopulated(vma); } /* * Returns true if this is a swap pte and was uffd-wp wr-protected in either * forms (pte marker or a normal swap pte), false otherwise. */ static inline bool pte_swp_uffd_wp_any(pte_t pte) { if (!uffd_supports_wp_marker()) return false; if (pte_present(pte)) return false; if (pte_swp_uffd_wp(pte)) return true; if (pte_is_uffd_wp_marker(pte)) return true; return false; } #else /* CONFIG_USERFAULTFD */ /* mm helpers */ static inline vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) { return VM_FAULT_SIGBUS; } static inline long uffd_wp_range(struct vm_area_struct *vma, unsigned long start, unsigned long len, bool enable_wp) { return false; } static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma, struct vm_userfaultfd_ctx vm_ctx) { return true; } static inline bool userfaultfd_missing(struct vm_area_struct *vma) { return false; } static inline bool userfaultfd_wp(struct vm_area_struct *vma) { return false; } static inline bool userfaultfd_minor(struct vm_area_struct *vma) { return false; } static inline bool userfaultfd_pte_wp(struct vm_area_struct *vma, pte_t pte) { return false; } static inline bool userfaultfd_huge_pmd_wp(struct vm_area_struct *vma, pmd_t pmd) { return false; } static inline bool userfaultfd_armed(struct vm_area_struct *vma) { return false; } static inline int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *l) { return 0; } static inline void dup_userfaultfd_complete(struct list_head *l) { } static inline void dup_userfaultfd_fail(struct list_head *l) { } static inline void mremap_userfaultfd_prep(struct vm_area_struct *vma, struct vm_userfaultfd_ctx *ctx) { } static inline void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *ctx, unsigned long from, unsigned long to, unsigned long len) { } static inline void mremap_userfaultfd_fail(struct vm_userfaultfd_ctx *ctx) { } static inline bool userfaultfd_remove(struct vm_area_struct *vma, unsigned long start, unsigned long end) { return true; } static inline int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start, unsigned long end, struct list_head *uf) { return 0; } static inline void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf) { } static inline bool uffd_disable_fault_around(struct vm_area_struct *vma) { return false; } static inline bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma) { return false; } static inline bool userfaultfd_wp_async(struct vm_area_struct *vma) { return false; } static inline bool vma_has_uffd_without_event_remap(struct vm_area_struct *vma) { return false; } static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma) { return false; } /* * Returns true if this is a swap pte and was uffd-wp wr-protected in either * forms (pte marker or a normal swap pte), false otherwise. */ static inline bool pte_swp_uffd_wp_any(pte_t pte) { return false; } #endif /* CONFIG_USERFAULTFD */ #endif /* _LINUX_USERFAULTFD_K_H */
1 1 29 14 54 14 26 29 11 11 3 2 3 3 3 3 2 2 3 3 3 3 3 3 7 7 7 7 15 15 4 7 7 3 7 1 10 2 2 10 10 10 15 15 15 1 1 2 1 10 8 3 2 10 10 15 9 9 9 9 10 10 10 10 9 9 9 9 9 9 15 10 10 9 15 9 14 20 1 1 1 2 1 2 12 13 5 15 12 3 15 6 9 6 4 3 2 4 3 2 12 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. */ #include <linux/sched.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" #include "locking.h" #include "accessors.h" #include "messages.h" #include "delalloc-space.h" #include "subpage.h" #include "defrag.h" #include "file-item.h" #include "super.h" #include "compression.h" static struct kmem_cache *btrfs_inode_defrag_cachep; /* * When auto defrag is enabled we queue up these defrag structs to remember * which inodes need defragging passes. */ struct inode_defrag { struct rb_node rb_node; /* Inode number */ u64 ino; /* * Transid where the defrag was added, we search for extents newer than * this. */ u64 transid; /* Root objectid */ u64 root; /* * The extent size threshold for autodefrag. * * This value is different for compressed/non-compressed extents, thus * needs to be passed from higher layer. * (aka, inode_should_defrag()) */ u32 extent_thresh; }; static int compare_inode_defrag(const struct inode_defrag *defrag1, const struct inode_defrag *defrag2) { if (defrag1->root > defrag2->root) return 1; else if (defrag1->root < defrag2->root) return -1; else if (defrag1->ino > defrag2->ino) return 1; else if (defrag1->ino < defrag2->ino) return -1; else return 0; } static int inode_defrag_cmp(struct rb_node *new, const struct rb_node *existing) { const struct inode_defrag *new_defrag = rb_entry(new, struct inode_defrag, rb_node); const struct inode_defrag *existing_defrag = rb_entry(existing, struct inode_defrag, rb_node); return compare_inode_defrag(new_defrag, existing_defrag); } /* * Insert a record for an inode into the defrag tree. The lock must be held * already. * * If you're inserting a record for an older transid than an existing record, * the transid already in the tree is lowered. */ static int btrfs_insert_inode_defrag(struct btrfs_inode *inode, struct inode_defrag *defrag) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct rb_node *node; node = rb_find_add(&defrag->rb_node, &fs_info->defrag_inodes, inode_defrag_cmp); if (node) { struct inode_defrag *entry; entry = rb_entry(node, struct inode_defrag, rb_node); /* * If we're reinserting an entry for an old defrag run, make * sure to lower the transid of our existing record. */ if (defrag->transid < entry->transid) entry->transid = defrag->transid; entry->extent_thresh = min(defrag->extent_thresh, entry->extent_thresh); return -EEXIST; } set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags); return 0; } static inline bool need_auto_defrag(struct btrfs_fs_info *fs_info) { if (!btrfs_test_opt(fs_info, AUTO_DEFRAG)) return false; if (btrfs_fs_closing(fs_info)) return false; return true; } /* * Insert a defrag record for this inode if auto defrag is enabled. No errors * returned as they're not considered fatal. */ void btrfs_add_inode_defrag(struct btrfs_inode *inode, u32 extent_thresh) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct inode_defrag *defrag; int ret; if (!need_auto_defrag(fs_info)) return; if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) return; defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS); if (!defrag) return; defrag->ino = btrfs_ino(inode); defrag->transid = btrfs_get_root_last_trans(root); defrag->root = btrfs_root_id(root); defrag->extent_thresh = extent_thresh; spin_lock(&fs_info->defrag_inodes_lock); if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) { /* * If we set IN_DEFRAG flag and evict the inode from memory, * and then re-read this inode, this new inode doesn't have * IN_DEFRAG flag. At the case, we may find the existed defrag. */ ret = btrfs_insert_inode_defrag(inode, defrag); if (ret) kmem_cache_free(btrfs_inode_defrag_cachep, defrag); } else { kmem_cache_free(btrfs_inode_defrag_cachep, defrag); } spin_unlock(&fs_info->defrag_inodes_lock); } /* * Pick the defraggable inode that we want, if it doesn't exist, we will get the * next one. */ static struct inode_defrag *btrfs_pick_defrag_inode( struct btrfs_fs_info *fs_info, u64 root, u64 ino) { struct inode_defrag *entry = NULL; struct inode_defrag tmp; struct rb_node *p; struct rb_node *parent = NULL; int ret; tmp.ino = ino; tmp.root = root; spin_lock(&fs_info->defrag_inodes_lock); p = fs_info->defrag_inodes.rb_node; while (p) { parent = p; entry = rb_entry(parent, struct inode_defrag, rb_node); ret = compare_inode_defrag(&tmp, entry); if (ret < 0) p = parent->rb_left; else if (ret > 0) p = parent->rb_right; else goto out; } if (parent && compare_inode_defrag(&tmp, entry) > 0) { parent = rb_next(parent); entry = rb_entry_safe(parent, struct inode_defrag, rb_node); } out: if (entry) rb_erase(parent, &fs_info->defrag_inodes); spin_unlock(&fs_info->defrag_inodes_lock); return entry; } void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info) { struct inode_defrag *defrag, *next; spin_lock(&fs_info->defrag_inodes_lock); rbtree_postorder_for_each_entry_safe(defrag, next, &fs_info->defrag_inodes, rb_node) kmem_cache_free(btrfs_inode_defrag_cachep, defrag); fs_info->defrag_inodes = RB_ROOT; spin_unlock(&fs_info->defrag_inodes_lock); } #define BTRFS_DEFRAG_BATCH 1024 static int btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, struct inode_defrag *defrag, struct file_ra_state *ra) { struct btrfs_root *inode_root; struct btrfs_inode *inode; struct btrfs_ioctl_defrag_range_args range; int ret = 0; u64 cur = 0; again: if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)) goto cleanup; if (!need_auto_defrag(fs_info)) goto cleanup; /* Get the inode */ inode_root = btrfs_get_fs_root(fs_info, defrag->root, true); if (IS_ERR(inode_root)) { ret = PTR_ERR(inode_root); goto cleanup; } inode = btrfs_iget(defrag->ino, inode_root); btrfs_put_root(inode_root); if (IS_ERR(inode)) { ret = PTR_ERR(inode); goto cleanup; } if (cur >= i_size_read(&inode->vfs_inode)) { iput(&inode->vfs_inode); goto cleanup; } /* Do a chunk of defrag */ clear_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags); memset(&range, 0, sizeof(range)); range.len = (u64)-1; range.start = cur; range.extent_thresh = defrag->extent_thresh; file_ra_state_init(ra, inode->vfs_inode.i_mapping); scoped_guard(super_write, fs_info->sb) ret = btrfs_defrag_file(inode, ra, &range, defrag->transid, BTRFS_DEFRAG_BATCH); iput(&inode->vfs_inode); if (ret < 0) goto cleanup; cur = max(cur + fs_info->sectorsize, range.start); goto again; cleanup: kmem_cache_free(btrfs_inode_defrag_cachep, defrag); return ret; } /* * Run through the list of inodes in the FS that need defragging. */ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) { struct inode_defrag *defrag; u64 first_ino = 0; u64 root_objectid = 0; atomic_inc(&fs_info->defrag_running); while (1) { struct file_ra_state ra = { 0 }; /* Pause the auto defragger. */ if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)) break; if (!need_auto_defrag(fs_info)) break; /* find an inode to defrag */ defrag = btrfs_pick_defrag_inode(fs_info, root_objectid, first_ino); if (!defrag) { if (root_objectid || first_ino) { root_objectid = 0; first_ino = 0; continue; } else { break; } } first_ino = defrag->ino + 1; root_objectid = defrag->root; btrfs_run_defrag_inode(fs_info, defrag, &ra); } atomic_dec(&fs_info->defrag_running); /* * During unmount, we use the transaction_wait queue to wait for the * defragger to stop. */ wake_up(&fs_info->transaction_wait); return 0; } /* * Check if two blocks addresses are close, used by defrag. */ static bool close_blocks(u64 blocknr, u64 other, u32 blocksize) { if (blocknr < other && other - (blocknr + blocksize) < SZ_32K) return true; if (blocknr > other && blocknr - (other + blocksize) < SZ_32K) return true; return false; } /* * Go through all the leaves pointed to by a node and reallocate them so that * disk order is close to key order. */ static int btrfs_realloc_node(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *parent, int start_slot, u64 *last_ret, struct btrfs_key *progress) { struct btrfs_fs_info *fs_info = root->fs_info; const u32 blocksize = fs_info->nodesize; const int end_slot = btrfs_header_nritems(parent) - 1; u64 search_start = *last_ret; u64 last_block = 0; int ret = 0; bool progress_passed = false; /* * COWing must happen through a running transaction, which always * matches the current fs generation (it's a transaction with a state * less than TRANS_STATE_UNBLOCKED). If it doesn't, then turn the fs * into error state to prevent the commit of any transaction. */ if (unlikely(trans->transaction != fs_info->running_transaction || trans->transid != fs_info->generation)) { btrfs_abort_transaction(trans, -EUCLEAN); btrfs_crit(fs_info, "unexpected transaction when attempting to reallocate parent %llu for root %llu, transaction %llu running transaction %llu fs generation %llu", parent->start, btrfs_root_id(root), trans->transid, fs_info->running_transaction->transid, fs_info->generation); return -EUCLEAN; } if (btrfs_header_nritems(parent) <= 1) return 0; for (int i = start_slot; i <= end_slot; i++) { struct extent_buffer *cur; struct btrfs_disk_key disk_key; u64 blocknr; u64 other; bool close = true; btrfs_node_key(parent, &disk_key, i); if (!progress_passed && btrfs_comp_keys(&disk_key, progress) < 0) continue; progress_passed = true; blocknr = btrfs_node_blockptr(parent, i); if (last_block == 0) last_block = blocknr; if (i > 0) { other = btrfs_node_blockptr(parent, i - 1); close = close_blocks(blocknr, other, blocksize); } if (!close && i < end_slot) { other = btrfs_node_blockptr(parent, i + 1); close = close_blocks(blocknr, other, blocksize); } if (close) { last_block = blocknr; continue; } cur = btrfs_read_node_slot(parent, i); if (IS_ERR(cur)) return PTR_ERR(cur); if (search_start == 0) search_start = last_block; btrfs_tree_lock(cur); ret = btrfs_force_cow_block(trans, root, cur, parent, i, &cur, search_start, min(16 * blocksize, (end_slot - i) * blocksize), BTRFS_NESTING_COW); if (ret) { btrfs_tree_unlock(cur); free_extent_buffer(cur); break; } search_start = cur->start; last_block = cur->start; *last_ret = search_start; btrfs_tree_unlock(cur); free_extent_buffer(cur); } return ret; } /* * Defrag all the leaves in a given btree. * Read all the leaves and try to get key order to * better reflect disk order */ static int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_path *path = NULL; struct btrfs_key key; int ret = 0; int wret; int level; int next_key_ret = 0; u64 last_ret = 0; if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) goto out; path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto out; } level = btrfs_header_level(root->node); if (level == 0) goto out; if (root->defrag_progress.objectid == 0) { struct extent_buffer *root_node; u32 nritems; root_node = btrfs_lock_root_node(root); nritems = btrfs_header_nritems(root_node); root->defrag_max.objectid = 0; /* from above we know this is not a leaf */ btrfs_node_key_to_cpu(root_node, &root->defrag_max, nritems - 1); btrfs_tree_unlock(root_node); free_extent_buffer(root_node); memset(&key, 0, sizeof(key)); } else { memcpy(&key, &root->defrag_progress, sizeof(key)); } path->keep_locks = true; ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION); if (ret < 0) goto out; if (ret > 0) { ret = 0; goto out; } btrfs_release_path(path); /* * We don't need a lock on a leaf. btrfs_realloc_node() will lock all * leafs from path->nodes[1], so set lowest_level to 1 to avoid later * a deadlock (attempting to write lock an already write locked leaf). */ path->lowest_level = 1; wret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (wret < 0) { ret = wret; goto out; } if (!path->nodes[1]) { ret = 0; goto out; } /* * The node at level 1 must always be locked when our path has * keep_locks set and lowest_level is 1, regardless of the value of * path->slots[1]. */ ASSERT(path->locks[1] != 0); ret = btrfs_realloc_node(trans, root, path->nodes[1], 0, &last_ret, &root->defrag_progress); if (ret) { WARN_ON(ret == -EAGAIN); goto out; } /* * Now that we reallocated the node we can find the next key. Note that * btrfs_find_next_key() can release our path and do another search * without COWing, this is because even with path->keep_locks == true, * btrfs_search_slot() / ctree.c:unlock_up() does not keeps a lock on a * node when path->slots[node_level - 1] does not point to the last * item or a slot beyond the last item (ctree.c:unlock_up()). Therefore * we search for the next key after reallocating our node. */ path->slots[1] = btrfs_header_nritems(path->nodes[1]); next_key_ret = btrfs_find_next_key(root, path, &key, 1, BTRFS_OLDEST_GENERATION); if (next_key_ret == 0) { memcpy(&root->defrag_progress, &key, sizeof(key)); ret = -EAGAIN; } out: btrfs_free_path(path); if (ret == -EAGAIN) { if (root->defrag_max.objectid > root->defrag_progress.objectid) goto done; if (root->defrag_max.type > root->defrag_progress.type) goto done; if (root->defrag_max.offset > root->defrag_progress.offset) goto done; ret = 0; } done: if (ret != -EAGAIN) memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); return ret; } /* * Defrag a given btree. Every leaf in the btree is read and defragmented. */ int btrfs_defrag_root(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; int ret; if (test_and_set_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state)) return 0; while (1) { struct btrfs_trans_handle *trans; trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); break; } ret = btrfs_defrag_leaves(trans, root); btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); cond_resched(); if (btrfs_fs_closing(fs_info) || ret != -EAGAIN) break; if (btrfs_defrag_cancelled(fs_info)) { btrfs_debug(fs_info, "defrag_root cancelled"); ret = -EAGAIN; break; } } clear_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state); return ret; } /* * Defrag specific helper to get an extent map. * * Differences between this and btrfs_get_extent() are: * * - No extent_map will be added to inode->extent_tree * To reduce memory usage in the long run. * * - Extra optimization to skip file extents older than @newer_than * By using btrfs_search_forward() we can skip entire file ranges that * have extents created in past transactions, because btrfs_search_forward() * will not visit leaves and nodes with a generation smaller than given * minimal generation threshold (@newer_than). * * Return valid em if we find a file extent matching the requirement. * Return NULL if we can not find a file extent matching the requirement. * * Return ERR_PTR() for error. */ static struct extent_map *defrag_get_extent(struct btrfs_inode *inode, u64 start, u64 newer_than) { struct btrfs_root *root = inode->root; struct btrfs_file_extent_item *fi; BTRFS_PATH_AUTO_RELEASE(path); struct extent_map *em; struct btrfs_key key; u64 ino = btrfs_ino(inode); int ret; em = btrfs_alloc_extent_map(); if (!em) { ret = -ENOMEM; goto err; } key.objectid = ino; key.type = BTRFS_EXTENT_DATA_KEY; key.offset = start; if (newer_than) { ret = btrfs_search_forward(root, &key, &path, newer_than); if (ret < 0) goto err; /* Can't find anything newer */ if (ret > 0) goto not_found; } else { ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0); if (ret < 0) goto err; } if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) { /* * If btrfs_search_slot() makes path to point beyond nritems, * we should not have an empty leaf, as this inode must at * least have its INODE_ITEM. */ ASSERT(btrfs_header_nritems(path.nodes[0])); path.slots[0] = btrfs_header_nritems(path.nodes[0]) - 1; } btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]); /* Perfect match, no need to go one slot back */ if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY && key.offset == start) goto iterate; /* We didn't find a perfect match, needs to go one slot back */ if (path.slots[0] > 0) { btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]); if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY) path.slots[0]--; } iterate: /* Iterate through the path to find a file extent covering @start */ while (true) { u64 extent_end; if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) goto next; btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]); /* * We may go one slot back to INODE_REF/XATTR item, then * need to go forward until we reach an EXTENT_DATA. * But we should still has the correct ino as key.objectid. */ if (WARN_ON(key.objectid < ino) || key.type < BTRFS_EXTENT_DATA_KEY) goto next; /* It's beyond our target range, definitely not extent found */ if (key.objectid > ino || key.type > BTRFS_EXTENT_DATA_KEY) goto not_found; /* * | |<- File extent ->| * \- start * * This means there is a hole between start and key.offset. */ if (key.offset > start) { em->start = start; em->disk_bytenr = EXTENT_MAP_HOLE; em->disk_num_bytes = 0; em->ram_bytes = 0; em->offset = 0; em->len = key.offset - start; break; } fi = btrfs_item_ptr(path.nodes[0], path.slots[0], struct btrfs_file_extent_item); extent_end = btrfs_file_extent_end(&path); /* * |<- file extent ->| | * \- start * * We haven't reached start, search next slot. */ if (extent_end <= start) goto next; /* Now this extent covers @start, convert it to em */ btrfs_extent_item_to_extent_map(inode, &path, fi, em); break; next: ret = btrfs_next_item(root, &path); if (ret < 0) goto err; if (ret > 0) goto not_found; } return em; not_found: btrfs_free_extent_map(em); return NULL; err: btrfs_free_extent_map(em); return ERR_PTR(ret); } static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, u64 newer_than, bool locked) { struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_map *em; const u32 sectorsize = BTRFS_I(inode)->root->fs_info->sectorsize; /* * Hopefully we have this extent in the tree already, try without the * full extent lock. */ read_lock(&em_tree->lock); em = btrfs_lookup_extent_mapping(em_tree, start, sectorsize); read_unlock(&em_tree->lock); /* * We can get a merged extent, in that case, we need to re-search * tree to get the original em for defrag. * * This is because even if we have adjacent extents that are contiguous * and compatible (same type and flags), we still want to defrag them * so that we use less metadata (extent items in the extent tree and * file extent items in the inode's subvolume tree). */ if (em && (em->flags & EXTENT_FLAG_MERGED)) { btrfs_free_extent_map(em); em = NULL; } if (!em) { struct extent_state *cached = NULL; u64 end = start + sectorsize - 1; /* Get the big lock and read metadata off disk. */ if (!locked) btrfs_lock_extent(io_tree, start, end, &cached); em = defrag_get_extent(BTRFS_I(inode), start, newer_than); if (!locked) btrfs_unlock_extent(io_tree, start, end, &cached); if (IS_ERR(em)) return NULL; } return em; } static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info, const struct extent_map *em) { if (btrfs_extent_map_is_compressed(em)) return BTRFS_MAX_COMPRESSED; return fs_info->max_extent_size; } static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em, u32 extent_thresh, u64 newer_than, bool locked) { struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct extent_map *next; const u64 em_end = btrfs_extent_map_end(em); bool ret = false; /* This is the last extent */ if (em_end >= i_size_read(inode)) return false; /* * Here we need to pass @newer_then when checking the next extent, or * we will hit a case we mark current extent for defrag, but the next * one will not be a target. * This will just cause extra IO without really reducing the fragments. */ next = defrag_lookup_extent(inode, em_end, newer_than, locked); /* No more em or hole */ if (!next || next->disk_bytenr >= EXTENT_MAP_LAST_BYTE) goto out; if (next->flags & EXTENT_FLAG_PREALLOC) goto out; /* * If the next extent is at its max capacity, defragging current extent * makes no sense, as the total number of extents won't change. */ if (next->len >= get_extent_max_capacity(fs_info, em)) goto out; /* Skip older extent */ if (next->generation < newer_than) goto out; /* Also check extent size */ if (next->len >= extent_thresh) goto out; ret = true; out: btrfs_free_extent_map(next); return ret; } /* * Prepare one page to be defragged. * * This will ensure: * * - Returned page is locked and has been set up properly. * - No ordered extent exists in the page. * - The page is uptodate. * * NOTE: Caller should also wait for page writeback after the cluster is * prepared, here we don't do writeback wait for each page. */ static struct folio *defrag_prepare_one_folio(struct btrfs_inode *inode, pgoff_t index) { struct address_space *mapping = inode->vfs_inode.i_mapping; gfp_t mask = btrfs_alloc_write_mask(mapping); u64 lock_start; u64 lock_end; struct extent_state *cached_state = NULL; struct folio *folio; int ret; again: /* TODO: Add order fgp order flags when large folios are fully enabled. */ folio = __filemap_get_folio(mapping, index, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask); if (IS_ERR(folio)) return folio; /* * Since we can defragment files opened read-only, we can encounter * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). * * The IO for such large folios is not fully tested, thus return * an error to reject such folios unless it's an experimental build. * * Filesystem transparent huge pages are typically only used for * executables that explicitly enable them, so this isn't very * restrictive. */ if (!IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL) && folio_test_large(folio)) { folio_unlock(folio); folio_put(folio); return ERR_PTR(-ETXTBSY); } ret = set_folio_extent_mapped(folio); if (ret < 0) { folio_unlock(folio); folio_put(folio); return ERR_PTR(ret); } lock_start = folio_pos(folio); lock_end = folio_next_pos(folio) - 1; /* Wait for any existing ordered extent in the range */ while (1) { struct btrfs_ordered_extent *ordered; btrfs_lock_extent(&inode->io_tree, lock_start, lock_end, &cached_state); ordered = btrfs_lookup_ordered_range(inode, lock_start, folio_size(folio)); btrfs_unlock_extent(&inode->io_tree, lock_start, lock_end, &cached_state); if (!ordered) break; folio_unlock(folio); btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); folio_lock(folio); /* * We unlocked the folio above, so we need check if it was * released or not. */ if (folio->mapping != mapping || !folio->private) { folio_unlock(folio); folio_put(folio); goto again; } } /* * Now the page range has no ordered extent any more. Read the page to * make it uptodate. */ if (!folio_test_uptodate(folio)) { btrfs_read_folio(NULL, folio); folio_lock(folio); if (folio->mapping != mapping || !folio->private) { folio_unlock(folio); folio_put(folio); goto again; } if (unlikely(!folio_test_uptodate(folio))) { folio_unlock(folio); folio_put(folio); return ERR_PTR(-EIO); } } return folio; } struct defrag_target_range { struct list_head list; u64 start; u64 len; }; /* * Collect all valid target extents. * * @start: file offset to lookup * @len: length to lookup * @extent_thresh: file extent size threshold, any extent size >= this value * will be ignored * @newer_than: only defrag extents newer than this value * @do_compress: whether the defrag is doing compression or no-compression * if true, @extent_thresh will be ignored and all regular * file extents meeting @newer_than will be targets. * @locked: if the range has already held extent lock * @target_list: list of targets file extents */ static int defrag_collect_targets(struct btrfs_inode *inode, u64 start, u64 len, u32 extent_thresh, u64 newer_than, bool do_compress, bool locked, struct list_head *target_list, u64 *last_scanned_ret) { struct btrfs_fs_info *fs_info = inode->root->fs_info; bool last_is_target = false; u64 cur = start; int ret = 0; while (cur < start + len) { struct extent_map *em; struct defrag_target_range *new; bool next_mergeable = true; u64 range_len; last_is_target = false; em = defrag_lookup_extent(&inode->vfs_inode, cur, newer_than, locked); if (!em) break; /* * If the file extent is an inlined one, we may still want to * defrag it (fallthrough) if it will cause a regular extent. * This is for users who want to convert inline extents to * regular ones through max_inline= mount option. */ if (em->disk_bytenr == EXTENT_MAP_INLINE && em->len <= inode->root->fs_info->max_inline) goto next; /* Skip holes and preallocated extents. */ if (em->disk_bytenr == EXTENT_MAP_HOLE || (em->flags & EXTENT_FLAG_PREALLOC)) goto next; /* Skip older extent */ if (em->generation < newer_than) goto next; /* This em is under writeback, no need to defrag */ if (em->generation == (u64)-1) goto next; /* * Our start offset might be in the middle of an existing extent * map, so take that into account. */ range_len = em->len - (cur - em->start); /* * If this range of the extent map is already flagged for delalloc, * skip it, because: * * 1) We could deadlock later, when trying to reserve space for * delalloc, because in case we can't immediately reserve space * the flusher can start delalloc and wait for the respective * ordered extents to complete. The deadlock would happen * because we do the space reservation while holding the range * locked, and starting writeback, or finishing an ordered * extent, requires locking the range; * * 2) If there's delalloc there, it means there's dirty pages for * which writeback has not started yet (we clean the delalloc * flag when starting writeback and after creating an ordered * extent). If we mark pages in an adjacent range for defrag, * then we will have a larger contiguous range for delalloc, * very likely resulting in a larger extent after writeback is * triggered (except in a case of free space fragmentation). */ if (btrfs_test_range_bit_exists(&inode->io_tree, cur, cur + range_len - 1, EXTENT_DELALLOC)) goto next; /* * For do_compress case, we want to compress all valid file * extents, thus no @extent_thresh or mergeable check. */ if (do_compress) goto add; /* Skip too large extent */ if (em->len >= extent_thresh) goto next; /* * Skip extents already at its max capacity, this is mostly for * compressed extents, which max cap is only 128K. */ if (em->len >= get_extent_max_capacity(fs_info, em)) goto next; /* * Normally there are no more extents after an inline one, thus * @next_mergeable will normally be false and not defragged. * So if an inline extent passed all above checks, just add it * for defrag, and be converted to regular extents. */ if (em->disk_bytenr == EXTENT_MAP_INLINE) goto add; next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em, extent_thresh, newer_than, locked); if (!next_mergeable) { struct defrag_target_range *last; /* Empty target list, no way to merge with last entry */ if (list_empty(target_list)) goto next; last = list_last_entry(target_list, struct defrag_target_range, list); /* Not mergeable with last entry */ if (last->start + last->len != cur) goto next; /* Mergeable, fall through to add it to @target_list. */ } add: last_is_target = true; range_len = min(btrfs_extent_map_end(em), start + len) - cur; /* * This one is a good target, check if it can be merged into * last range of the target list. */ if (!list_empty(target_list)) { struct defrag_target_range *last; last = list_last_entry(target_list, struct defrag_target_range, list); ASSERT(last->start + last->len <= cur); if (last->start + last->len == cur) { /* Mergeable, enlarge the last entry */ last->len += range_len; goto next; } /* Fall through to allocate a new entry */ } /* Allocate new defrag_target_range */ new = kmalloc_obj(*new, GFP_NOFS); if (!new) { btrfs_free_extent_map(em); ret = -ENOMEM; break; } new->start = cur; new->len = range_len; list_add_tail(&new->list, target_list); next: cur = btrfs_extent_map_end(em); btrfs_free_extent_map(em); } if (ret < 0) { struct defrag_target_range *entry; struct defrag_target_range *tmp; list_for_each_entry_safe(entry, tmp, target_list, list) { list_del_init(&entry->list); kfree(entry); } } if (!ret && last_scanned_ret) { /* * If the last extent is not a target, the caller can skip to * the end of that extent. * Otherwise, we can only go the end of the specified range. */ if (!last_is_target) *last_scanned_ret = max(cur, *last_scanned_ret); else *last_scanned_ret = max(start + len, *last_scanned_ret); } return ret; } #define CLUSTER_SIZE (SZ_256K) static_assert(PAGE_ALIGNED(CLUSTER_SIZE)); /* * Defrag one contiguous target range. * * @inode: target inode * @target: target range to defrag * @pages: locked pages covering the defrag range * @nr_pages: number of locked pages * * Caller should ensure: * * - Pages are prepared * Pages should be locked, no ordered extent in the pages range, * no writeback. * * - Extent bits are locked */ static int defrag_one_locked_target(struct btrfs_inode *inode, struct defrag_target_range *target, struct folio **folios, int nr_pages, struct extent_state **cached_state) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_changeset *data_reserved = NULL; const u64 start = target->start; const u64 len = target->len; int ret = 0; ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len); if (ret < 0) return ret; btrfs_clear_extent_bit(&inode->io_tree, start, start + len - 1, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, cached_state); btrfs_set_extent_bit(&inode->io_tree, start, start + len - 1, EXTENT_DELALLOC | EXTENT_DEFRAG, cached_state); /* * Update the page status. * Due to possible large folios, we have to check all folios one by one. */ for (int i = 0; i < nr_pages && folios[i]; i++) { struct folio *folio = folios[i]; if (!folio) break; if (start >= folio_next_pos(folio) || start + len <= folio_pos(folio)) continue; btrfs_folio_clamp_clear_checked(fs_info, folio, start, len); btrfs_folio_clamp_set_dirty(fs_info, folio, start, len); } btrfs_delalloc_release_extents(inode, len); extent_changeset_free(data_reserved); return ret; } static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, u32 extent_thresh, u64 newer_than, bool do_compress, u64 *last_scanned_ret) { struct extent_state *cached_state = NULL; struct defrag_target_range *entry; struct defrag_target_range *tmp; LIST_HEAD(target_list); struct folio **folios; const u32 sectorsize = inode->root->fs_info->sectorsize; u64 cur = start; const unsigned int nr_pages = ((start + len - 1) >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; int ret = 0; ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE); ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize)); folios = kzalloc_objs(struct folio *, nr_pages, GFP_NOFS); if (!folios) return -ENOMEM; /* Prepare all pages */ for (int i = 0; cur < start + len && i < nr_pages; i++) { folios[i] = defrag_prepare_one_folio(inode, cur >> PAGE_SHIFT); if (IS_ERR(folios[i])) { ret = PTR_ERR(folios[i]); folios[i] = NULL; goto free_folios; } cur = folio_next_pos(folios[i]); } for (int i = 0; i < nr_pages; i++) { if (!folios[i]) break; folio_wait_writeback(folios[i]); } /* We should get at least one folio. */ ASSERT(folios[0]); /* Lock the pages range */ btrfs_lock_extent(&inode->io_tree, folio_pos(folios[0]), cur - 1, &cached_state); /* * Now we have a consistent view about the extent map, re-check * which range really needs to be defragged. * * And this time we have extent locked already, pass @locked = true * so that we won't relock the extent range and cause deadlock. */ ret = defrag_collect_targets(inode, start, len, extent_thresh, newer_than, do_compress, true, &target_list, last_scanned_ret); if (ret < 0) goto unlock_extent; list_for_each_entry(entry, &target_list, list) { ret = defrag_one_locked_target(inode, entry, folios, nr_pages, &cached_state); if (ret < 0) break; } list_for_each_entry_safe(entry, tmp, &target_list, list) { list_del_init(&entry->list); kfree(entry); } unlock_extent: btrfs_unlock_extent(&inode->io_tree, folio_pos(folios[0]), cur - 1, &cached_state); free_folios: for (int i = 0; i < nr_pages; i++) { if (!folios[i]) break; folio_unlock(folios[i]); folio_put(folios[i]); } kfree(folios); return ret; } static int defrag_one_cluster(struct btrfs_inode *inode, struct file_ra_state *ra, u64 start, u32 len, u32 extent_thresh, u64 newer_than, bool do_compress, unsigned long *sectors_defragged, unsigned long max_sectors, u64 *last_scanned_ret) { const u32 sectorsize = inode->root->fs_info->sectorsize; struct defrag_target_range *entry; struct defrag_target_range *tmp; LIST_HEAD(target_list); int ret; ret = defrag_collect_targets(inode, start, len, extent_thresh, newer_than, do_compress, false, &target_list, NULL); if (ret < 0) goto out; list_for_each_entry(entry, &target_list, list) { u32 range_len = entry->len; /* Reached or beyond the limit */ if (max_sectors && *sectors_defragged >= max_sectors) { ret = 1; break; } if (max_sectors) range_len = min_t(u32, range_len, (max_sectors - *sectors_defragged) * sectorsize); /* * If defrag_one_range() has updated last_scanned_ret, * our range may already be invalid (e.g. hole punched). * Skip if our range is before last_scanned_ret, as there is * no need to defrag the range anymore. */ if (entry->start + range_len <= *last_scanned_ret) continue; page_cache_sync_readahead(inode->vfs_inode.i_mapping, ra, NULL, entry->start >> PAGE_SHIFT, ((entry->start + range_len - 1) >> PAGE_SHIFT) - (entry->start >> PAGE_SHIFT) + 1); /* * Here we may not defrag any range if holes are punched before * we locked the pages. * But that's fine, it only affects the @sectors_defragged * accounting. */ ret = defrag_one_range(inode, entry->start, range_len, extent_thresh, newer_than, do_compress, last_scanned_ret); if (ret < 0) break; *sectors_defragged += range_len >> inode->root->fs_info->sectorsize_bits; } out: list_for_each_entry_safe(entry, tmp, &target_list, list) { list_del_init(&entry->list); kfree(entry); } if (ret >= 0) *last_scanned_ret = max(*last_scanned_ret, start + len); return ret; } /* * Entry point to file defragmentation. * * @inode: inode to be defragged * @ra: readahead state * @range: defrag options including range and flags * @newer_than: minimum transid to defrag * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode * will be defragged. * * Return <0 for error. * Return >=0 for the number of sectors defragged, and range->start will be updated * to indicate the file offset where next defrag should be started at. * (Mostly for autodefrag, which sets @max_to_defrag thus we may exit early without * defragging all the range). */ int btrfs_defrag_file(struct btrfs_inode *inode, struct file_ra_state *ra, struct btrfs_ioctl_defrag_range_args *range, u64 newer_than, unsigned long max_to_defrag) { struct btrfs_fs_info *fs_info = inode->root->fs_info; unsigned long sectors_defragged = 0; u64 isize = i_size_read(&inode->vfs_inode); u64 cur; u64 last_byte; bool do_compress = (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS); bool no_compress = (range->flags & BTRFS_DEFRAG_RANGE_NOCOMPRESS); int compress_type = BTRFS_COMPRESS_ZLIB; int compress_level = 0; int ret = 0; u32 extent_thresh = range->extent_thresh; pgoff_t start_index; ASSERT(ra); if (isize == 0) return 0; if (range->start >= isize) return -EINVAL; if (do_compress) { if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS_LEVEL) { if (range->compress.type >= BTRFS_NR_COMPRESS_TYPES) return -EINVAL; if (range->compress.type) { compress_type = range->compress.type; compress_level = range->compress.level; if (!btrfs_compress_level_valid(compress_type, compress_level)) return -EINVAL; } } else { if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES) return -EINVAL; if (range->compress_type) compress_type = range->compress_type; } } else if (range->flags & BTRFS_DEFRAG_RANGE_NOCOMPRESS) { compress_type = BTRFS_DEFRAG_DONT_COMPRESS; compress_level = 1; } if (extent_thresh == 0) extent_thresh = SZ_256K; if (range->start + range->len > range->start) { /* Got a specific range */ last_byte = min(isize, range->start + range->len); } else { /* Defrag until file end */ last_byte = isize; } /* Align the range */ cur = round_down(range->start, fs_info->sectorsize); last_byte = round_up(last_byte, fs_info->sectorsize) - 1; /* * Make writeback start from the beginning of the range, so that the * defrag range can be written sequentially. */ start_index = cur >> PAGE_SHIFT; if (start_index < inode->vfs_inode.i_mapping->writeback_index) inode->vfs_inode.i_mapping->writeback_index = start_index; while (cur < last_byte) { const unsigned long prev_sectors_defragged = sectors_defragged; u64 last_scanned = cur; u64 cluster_end; if (btrfs_defrag_cancelled(fs_info)) { ret = -EAGAIN; break; } /* We want the cluster end at page boundary when possible */ cluster_end = (((cur >> PAGE_SHIFT) + (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1; cluster_end = min(cluster_end, last_byte); btrfs_inode_lock(inode, 0); if (IS_SWAPFILE(&inode->vfs_inode)) { ret = -ETXTBSY; btrfs_inode_unlock(inode, 0); break; } if (!(inode->vfs_inode.i_sb->s_flags & SB_ACTIVE)) { btrfs_inode_unlock(inode, 0); break; } if (do_compress || no_compress) { inode->defrag_compress = compress_type; inode->defrag_compress_level = compress_level; } ret = defrag_one_cluster(inode, ra, cur, cluster_end + 1 - cur, extent_thresh, newer_than, do_compress || no_compress, &sectors_defragged, max_to_defrag, &last_scanned); if (sectors_defragged > prev_sectors_defragged) balance_dirty_pages_ratelimited(inode->vfs_inode.i_mapping); btrfs_inode_unlock(inode, 0); if (ret < 0) break; cur = max(cluster_end + 1, last_scanned); if (ret > 0) { ret = 0; break; } cond_resched(); } /* * Update range.start for autodefrag, this will indicate where to start * in next run. */ range->start = cur; if (sectors_defragged) { /* * We have defragged some sectors, for compression case they * need to be written back immediately. */ if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) { filemap_flush(inode->vfs_inode.i_mapping); if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags)) filemap_flush(inode->vfs_inode.i_mapping); } if (range->compress_type == BTRFS_COMPRESS_LZO) btrfs_set_fs_incompat(fs_info, COMPRESS_LZO); else if (range->compress_type == BTRFS_COMPRESS_ZSTD) btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD); ret = sectors_defragged; } if (do_compress || no_compress) { btrfs_inode_lock(inode, 0); inode->defrag_compress = BTRFS_COMPRESS_NONE; btrfs_inode_unlock(inode, 0); } return ret; } void __cold btrfs_auto_defrag_exit(void) { kmem_cache_destroy(btrfs_inode_defrag_cachep); } int __init btrfs_auto_defrag_init(void) { btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag", sizeof(struct inode_defrag), 0, 0, NULL); if (!btrfs_inode_defrag_cachep) return -ENOMEM; return 0; }
10 10 10 10 6 2 4 4 1 5 6 6 6 6 1 2 1 3 8 6 10 9 10 10 7 10 8 8 3 4 9 10 10 4 10 10 3 9 1 20 9 13 5 3 21 18 4 21 11 10 17 4 12 8 8 18 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 // SPDX-License-Identifier: GPL-2.0 #include <linux/blkdev.h> #include <linux/fscrypt.h> #include <linux/iversion.h> #include "ctree.h" #include "fs.h" #include "messages.h" #include "compression.h" #include "delalloc-space.h" #include "disk-io.h" #include "reflink.h" #include "transaction.h" #include "subpage.h" #include "accessors.h" #include "file-item.h" #include "file.h" #include "super.h" #define BTRFS_MAX_DEDUPE_LEN SZ_16M static int clone_finish_inode_update(struct btrfs_trans_handle *trans, struct inode *inode, u64 endoff, const u64 destoff, const u64 olen, bool no_time_update) { int ret; inode_inc_iversion(inode); if (!no_time_update) { inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); } /* * We round up to the block size at eof when determining which * extents to clone above, but shouldn't round up the file size. */ if (endoff > destoff + olen) endoff = destoff + olen; if (endoff > inode->i_size) { i_size_write(inode, endoff); btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); } ret = btrfs_update_inode(trans, BTRFS_I(inode)); if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); return ret; } return btrfs_end_transaction(trans); } static int copy_inline_to_page(struct btrfs_inode *inode, const u64 file_offset, char *inline_data, const u64 size, const u64 datal, const u8 comp_type) { struct btrfs_fs_info *fs_info = inode->root->fs_info; const u32 block_size = fs_info->sectorsize; const u64 range_end = file_offset + block_size - 1; const size_t inline_size = size - btrfs_file_extent_calc_inline_size(0); char *data_start = inline_data + btrfs_file_extent_calc_inline_size(0); struct extent_changeset *data_reserved = NULL; struct folio *folio = NULL; struct address_space *mapping = inode->vfs_inode.i_mapping; int ret; ASSERT(IS_ALIGNED(file_offset, block_size)); /* * We have flushed and locked the ranges of the source and destination * inodes, we also have locked the inodes, so we are safe to do a * reservation here. Also we must not do the reservation while holding * a transaction open, otherwise we would deadlock. */ ret = btrfs_delalloc_reserve_space(inode, &data_reserved, file_offset, block_size); if (ret) goto out; folio = __filemap_get_folio(mapping, file_offset >> PAGE_SHIFT, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, btrfs_alloc_write_mask(mapping)); if (IS_ERR(folio)) { ret = PTR_ERR(folio); goto out_unlock; } ret = set_folio_extent_mapped(folio); if (ret < 0) goto out_unlock; btrfs_clear_extent_bit(&inode->io_tree, file_offset, range_end, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, NULL); ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL); if (ret) goto out_unlock; /* * After dirtying the page our caller will need to start a transaction, * and if we are low on metadata free space, that can cause flushing of * delalloc for all inodes in order to get metadata space released. * However we are holding the range locked for the whole duration of * the clone/dedupe operation, so we may deadlock if that happens and no * other task releases enough space. So mark this inode as not being * possible to flush to avoid such deadlock. We will clear that flag * when we finish cloning all extents, since a transaction is started * after finding each extent to clone. */ set_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &inode->runtime_flags); if (comp_type == BTRFS_COMPRESS_NONE) { memcpy_to_folio(folio, offset_in_folio(folio, file_offset), data_start, datal); } else { ret = btrfs_decompress(comp_type, data_start, folio, offset_in_folio(folio, file_offset), inline_size, datal); if (ret) goto out_unlock; flush_dcache_folio(folio); } /* * If our inline data is smaller then the block/page size, then the * remaining of the block/page is equivalent to zeroes. We had something * like the following done: * * $ xfs_io -f -c "pwrite -S 0xab 0 500" file * $ sync # (or fsync) * $ xfs_io -c "falloc 0 4K" file * $ xfs_io -c "pwrite -S 0xcd 4K 4K" * * So what's in the range [500, 4095] corresponds to zeroes. */ if (datal < block_size) folio_zero_range(folio, datal, block_size - datal); btrfs_folio_set_uptodate(fs_info, folio, file_offset, block_size); btrfs_folio_clear_checked(fs_info, folio, file_offset, block_size); btrfs_folio_set_dirty(fs_info, folio, file_offset, block_size); out_unlock: if (!IS_ERR(folio)) { folio_unlock(folio); folio_put(folio); } if (ret) btrfs_delalloc_release_space(inode, data_reserved, file_offset, block_size, true); btrfs_delalloc_release_extents(inode, block_size); out: extent_changeset_free(data_reserved); return ret; } /* * Deal with cloning of inline extents. We try to copy the inline extent from * the source inode to destination inode when possible. When not possible we * copy the inline extent's data into the respective page of the inode. */ static int clone_copy_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path, struct btrfs_key *new_key, const u64 drop_start, const u64 datal, const u64 size, const u8 comp_type, char *inline_data, struct btrfs_trans_handle **trans_out) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; const u64 aligned_end = ALIGN(new_key->offset + datal, fs_info->sectorsize); struct btrfs_trans_handle *trans = NULL; struct btrfs_drop_extents_args drop_args = { 0 }; int ret; struct btrfs_key key; if (new_key->offset > 0) { ret = copy_inline_to_page(inode, new_key->offset, inline_data, size, datal, comp_type); goto out; } key.objectid = btrfs_ino(inode); key.type = BTRFS_EXTENT_DATA_KEY; key.offset = 0; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) { return ret; } else if (ret > 0) { if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { ret = btrfs_next_leaf(root, path); if (ret < 0) return ret; else if (ret > 0) goto copy_inline_extent; } btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); if (key.objectid == btrfs_ino(inode) && key.type == BTRFS_EXTENT_DATA_KEY) { /* * There's an implicit hole at file offset 0, copy the * inline extent's data to the page. */ ASSERT(key.offset > 0); goto copy_to_page; } } else if (i_size_read(&inode->vfs_inode) <= datal) { struct btrfs_file_extent_item *ei; ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_file_extent_item); /* * If it's an inline extent replace it with the source inline * extent, otherwise copy the source inline extent data into * the respective page at the destination inode. */ if (btrfs_file_extent_type(path->nodes[0], ei) == BTRFS_FILE_EXTENT_INLINE) goto copy_inline_extent; goto copy_to_page; } copy_inline_extent: /* * We have no extent items, or we have an extent at offset 0 which may * or may not be inlined. All these cases are dealt the same way. */ if (i_size_read(&inode->vfs_inode) > datal) { /* * At the destination offset 0 we have either a hole, a regular * extent or an inline extent larger then the one we want to * clone. Deal with all these cases by copying the inline extent * data into the respective page at the destination inode. */ goto copy_to_page; } /* * Release path before starting a new transaction so we don't hold locks * that would confuse lockdep. */ btrfs_release_path(path); /* * If we end up here it means were copy the inline extent into a leaf * of the destination inode. We know we will drop or adjust at most one * extent item in the destination root. * * 1 unit - adjusting old extent (we may have to split it) * 1 unit - add new extent * 1 unit - inode update */ trans = btrfs_start_transaction(root, 3); if (IS_ERR(trans)) { ret = PTR_ERR(trans); trans = NULL; goto out; } drop_args.path = path; drop_args.start = drop_start; drop_args.end = aligned_end; drop_args.drop_cache = true; ret = btrfs_drop_extents(trans, root, inode, &drop_args); if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } ret = btrfs_insert_empty_item(trans, root, path, new_key, size); if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } write_extent_buffer(path->nodes[0], inline_data, btrfs_item_ptr_offset(path->nodes[0], path->slots[0]), size); btrfs_update_inode_bytes(inode, datal, drop_args.bytes_found); btrfs_set_inode_full_sync(inode); ret = btrfs_inode_set_file_extent_range(inode, 0, aligned_end); if (unlikely(ret)) btrfs_abort_transaction(trans, ret); out: if (!ret && !trans) { /* * No transaction here means we copied the inline extent into a * page of the destination inode. * * 1 unit to update inode item */ trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { ret = PTR_ERR(trans); trans = NULL; } } if (ret && trans) btrfs_end_transaction(trans); if (!ret) *trans_out = trans; return ret; copy_to_page: /* * Release our path because we don't need it anymore and also because * copy_inline_to_page() needs to reserve data and metadata, which may * need to flush delalloc when we are low on available space and * therefore cause a deadlock if writeback of an inline extent needs to * write to the same leaf or an ordered extent completion needs to write * to the same leaf. */ btrfs_release_path(path); ret = copy_inline_to_page(inode, new_key->offset, inline_data, size, datal, comp_type); goto out; } /* * Clone a range from inode file to another. * * @src: Inode to clone from * @inode: Inode to clone to * @off: Offset within source to start clone from * @olen: Original length, passed by user, of range to clone * @olen_aligned: Block-aligned value of olen * @destoff: Offset within @inode to start clone * @no_time_update: Whether to update mtime/ctime on the target inode */ static int btrfs_clone(struct inode *src, struct inode *inode, const u64 off, const u64 olen, const u64 olen_aligned, const u64 destoff, bool no_time_update) { struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_trans_handle *trans; char AUTO_KVFREE(buf); struct btrfs_key key; u32 nritems; int slot; int ret; const u64 len = olen_aligned; u64 last_dest_end = destoff; u64 prev_extent_end = off; ret = -ENOMEM; buf = kvmalloc(fs_info->nodesize, GFP_KERNEL); if (!buf) return ret; path = btrfs_alloc_path(); if (!path) return ret; path->reada = READA_FORWARD; /* Clone data */ key.objectid = btrfs_ino(BTRFS_I(src)); key.type = BTRFS_EXTENT_DATA_KEY; key.offset = off; while (1) { struct btrfs_file_extent_item *extent; u64 extent_gen; int type; u32 size; struct btrfs_key new_key; u64 disko = 0, diskl = 0; u64 datao = 0, datal = 0; u8 comp; u64 drop_start; /* Note the key will change type as we walk through the tree */ ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path, 0, 0); if (ret < 0) goto out; /* * First search, if no extent item that starts at offset off was * found but the previous item is an extent item, it's possible * it might overlap our target range, therefore process it. */ if (key.offset == off && ret > 0 && path->slots[0] > 0) { btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1); if (key.type == BTRFS_EXTENT_DATA_KEY) path->slots[0]--; } nritems = btrfs_header_nritems(path->nodes[0]); process_slot: if (path->slots[0] >= nritems) { ret = btrfs_next_leaf(BTRFS_I(src)->root, path); if (ret < 0) goto out; if (ret > 0) break; nritems = btrfs_header_nritems(path->nodes[0]); } leaf = path->nodes[0]; slot = path->slots[0]; btrfs_item_key_to_cpu(leaf, &key, slot); if (key.type > BTRFS_EXTENT_DATA_KEY || key.objectid != btrfs_ino(BTRFS_I(src))) break; ASSERT(key.type == BTRFS_EXTENT_DATA_KEY); extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); extent_gen = btrfs_file_extent_generation(leaf, extent); comp = btrfs_file_extent_compression(leaf, extent); type = btrfs_file_extent_type(leaf, extent); if (type == BTRFS_FILE_EXTENT_REG || type == BTRFS_FILE_EXTENT_PREALLOC) { disko = btrfs_file_extent_disk_bytenr(leaf, extent); diskl = btrfs_file_extent_disk_num_bytes(leaf, extent); datao = btrfs_file_extent_offset(leaf, extent); datal = btrfs_file_extent_num_bytes(leaf, extent); } else if (type == BTRFS_FILE_EXTENT_INLINE) { /* Take upper bound, may be compressed */ datal = btrfs_file_extent_ram_bytes(leaf, extent); } /* * The first search might have left us at an extent item that * ends before our target range's start, can happen if we have * holes and NO_HOLES feature enabled. * * Subsequent searches may leave us on a file range we have * processed before - this happens due to a race with ordered * extent completion for a file range that is outside our source * range, but that range was part of a file extent item that * also covered a leading part of our source range. */ if (key.offset + datal <= prev_extent_end) { path->slots[0]++; goto process_slot; } else if (key.offset >= off + len) { break; } prev_extent_end = key.offset + datal; size = btrfs_item_size(leaf, slot); read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot), size); btrfs_release_path(path); memcpy(&new_key, &key, sizeof(new_key)); new_key.objectid = btrfs_ino(BTRFS_I(inode)); if (off <= key.offset) new_key.offset = key.offset + destoff - off; else new_key.offset = destoff; /* * Deal with a hole that doesn't have an extent item that * represents it (NO_HOLES feature enabled). * This hole is either in the middle of the cloning range or at * the beginning (fully overlaps it or partially overlaps it). */ if (new_key.offset != last_dest_end) drop_start = last_dest_end; else drop_start = new_key.offset; if (type == BTRFS_FILE_EXTENT_REG || type == BTRFS_FILE_EXTENT_PREALLOC) { struct btrfs_replace_extent_info clone_info; /* * a | --- range to clone ---| b * | ------------- extent ------------- | */ /* Subtract range b */ if (key.offset + datal > off + len) datal = off + len - key.offset; /* Subtract range a */ if (off > key.offset) { datao += off - key.offset; datal -= off - key.offset; } clone_info.disk_offset = disko; clone_info.disk_len = diskl; clone_info.data_offset = datao; clone_info.data_len = datal; clone_info.file_offset = new_key.offset; clone_info.extent_buf = buf; clone_info.is_new_extent = false; clone_info.update_times = !no_time_update; ret = btrfs_replace_file_extents(BTRFS_I(inode), path, drop_start, new_key.offset + datal - 1, &clone_info, &trans); if (ret) goto out; } else { ASSERT(type == BTRFS_FILE_EXTENT_INLINE); /* * Inline extents always have to start at file offset 0 * and can never be bigger then the sector size. We can * never clone only parts of an inline extent, since all * reflink operations must start at a sector size aligned * offset, and the length must be aligned too or end at * the i_size (which implies the whole inlined data). */ ASSERT(key.offset == 0); ASSERT(datal <= fs_info->sectorsize); if (WARN_ON(type != BTRFS_FILE_EXTENT_INLINE) || WARN_ON(key.offset != 0) || WARN_ON(datal > fs_info->sectorsize)) { ret = -EUCLEAN; goto out; } ret = clone_copy_inline_extent(BTRFS_I(inode), path, &new_key, drop_start, datal, size, comp, buf, &trans); if (ret) goto out; } btrfs_release_path(path); /* * Whenever we share an extent we update the last_reflink_trans * of each inode to the current transaction. This is needed to * make sure fsync does not log multiple checksum items with * overlapping ranges (because some extent items might refer * only to sections of the original extent). For the destination * inode we do this regardless of the generation of the extents * or even if they are inline extents or explicit holes, to make * sure a full fsync does not skip them. For the source inode, * we only need to update last_reflink_trans in case it's a new * extent that is not a hole or an inline extent, to deal with * the checksums problem on fsync. */ if (extent_gen == trans->transid && disko > 0) BTRFS_I(src)->last_reflink_trans = trans->transid; BTRFS_I(inode)->last_reflink_trans = trans->transid; last_dest_end = ALIGN(new_key.offset + datal, fs_info->sectorsize); ret = clone_finish_inode_update(trans, inode, last_dest_end, destoff, olen, no_time_update); if (ret) goto out; if (new_key.offset + datal >= destoff + len) break; btrfs_release_path(path); key.offset = prev_extent_end; if (fatal_signal_pending(current)) { ret = -EINTR; goto out; } cond_resched(); } ret = 0; if (last_dest_end < destoff + len) { /* * We have an implicit hole that fully or partially overlaps our * cloning range at its end. This means that we either have the * NO_HOLES feature enabled or the implicit hole happened due to * mixing buffered and direct IO writes against this file. */ btrfs_release_path(path); /* * When using NO_HOLES and we are cloning a range that covers * only a hole (no extents) into a range beyond the current * i_size, punching a hole in the target range will not create * an extent map defining a hole, because the range starts at or * beyond current i_size. If the file previously had an i_size * greater than the new i_size set by this clone operation, we * need to make sure the next fsync is a full fsync, so that it * detects and logs a hole covering a range from the current * i_size to the new i_size. If the clone range covers extents, * besides a hole, then we know the full sync flag was already * set by previous calls to btrfs_replace_file_extents() that * replaced file extent items. */ if (last_dest_end >= i_size_read(inode)) btrfs_set_inode_full_sync(BTRFS_I(inode)); ret = btrfs_replace_file_extents(BTRFS_I(inode), path, last_dest_end, destoff + len - 1, NULL, &trans); if (ret) goto out; ret = clone_finish_inode_update(trans, inode, destoff + len, destoff, olen, no_time_update); } out: clear_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &BTRFS_I(inode)->runtime_flags); return ret; } static void btrfs_double_mmap_lock(struct btrfs_inode *inode1, struct btrfs_inode *inode2) { if (inode1 < inode2) swap(inode1, inode2); down_write(&inode1->i_mmap_lock); down_write_nested(&inode2->i_mmap_lock, SINGLE_DEPTH_NESTING); } static void btrfs_double_mmap_unlock(struct btrfs_inode *inode1, struct btrfs_inode *inode2) { up_write(&inode1->i_mmap_lock); up_write(&inode2->i_mmap_lock); } static int btrfs_extent_same_range(struct btrfs_inode *src, u64 loff, u64 len, struct btrfs_inode *dst, u64 dst_loff) { const u64 end = dst_loff + len - 1; struct extent_state *cached_state = NULL; struct btrfs_fs_info *fs_info = src->root->fs_info; const u64 bs = fs_info->sectorsize; int ret; /* * Lock destination range to serialize with concurrent readahead(), and * we are safe from concurrency with relocation of source extents * because we have already locked the inode's i_mmap_lock in exclusive * mode. */ btrfs_lock_extent(&dst->io_tree, dst_loff, end, &cached_state); ret = btrfs_clone(&src->vfs_inode, &dst->vfs_inode, loff, len, ALIGN(len, bs), dst_loff, 1); btrfs_unlock_extent(&dst->io_tree, dst_loff, end, &cached_state); btrfs_btree_balance_dirty(fs_info); return ret; } static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, struct inode *dst, u64 dst_loff) { int ret = 0; u64 i, tail_len, chunk_count; struct btrfs_root *root_dst = BTRFS_I(dst)->root; spin_lock(&root_dst->root_item_lock); if (root_dst->send_in_progress) { btrfs_warn_rl(root_dst->fs_info, "cannot deduplicate to root %llu while send operations are using it (%d in progress)", btrfs_root_id(root_dst), root_dst->send_in_progress); spin_unlock(&root_dst->root_item_lock); return -EAGAIN; } root_dst->dedupe_in_progress++; spin_unlock(&root_dst->root_item_lock); tail_len = olen % BTRFS_MAX_DEDUPE_LEN; chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN); for (i = 0; i < chunk_count; i++) { ret = btrfs_extent_same_range(BTRFS_I(src), loff, BTRFS_MAX_DEDUPE_LEN, BTRFS_I(dst), dst_loff); if (ret) goto out; loff += BTRFS_MAX_DEDUPE_LEN; dst_loff += BTRFS_MAX_DEDUPE_LEN; } if (tail_len > 0) ret = btrfs_extent_same_range(BTRFS_I(src), loff, tail_len, BTRFS_I(dst), dst_loff); out: spin_lock(&root_dst->root_item_lock); root_dst->dedupe_in_progress--; spin_unlock(&root_dst->root_item_lock); return ret; } static noinline int btrfs_clone_files(struct file *file, struct file *file_src, u64 off, u64 olen, u64 destoff) { struct extent_state *cached_state = NULL; struct inode *inode = file_inode(file); struct inode *src = file_inode(file_src); struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); int ret; u64 len = olen; u64 bs = fs_info->sectorsize; u64 end; /* * VFS's generic_remap_file_range_prep() protects us from cloning the * eof block into the middle of a file, which would result in corruption * if the file size is not blocksize aligned. So we don't need to check * for that case here. */ if (off + len == src->i_size) len = ALIGN(src->i_size, bs) - off; if (destoff > inode->i_size) { const u64 wb_start = ALIGN_DOWN(inode->i_size, bs); ret = btrfs_cont_expand(BTRFS_I(inode), inode->i_size, destoff); if (ret) return ret; /* * We may have truncated the last block if the inode's size is * not sector size aligned, so we need to wait for writeback to * complete before proceeding further, otherwise we can race * with cloning and attempt to increment a reference to an * extent that no longer exists (writeback completed right after * we found the previous extent covering eof and before we * attempted to increment its reference count). */ ret = btrfs_wait_ordered_range(BTRFS_I(inode), wb_start, destoff - wb_start); if (ret) return ret; } /* * Lock destination range to serialize with concurrent readahead(), and * we are safe from concurrency with relocation of source extents * because we have already locked the inode's i_mmap_lock in exclusive * mode. */ end = destoff + len - 1; btrfs_lock_extent(&BTRFS_I(inode)->io_tree, destoff, end, &cached_state); ret = btrfs_clone(src, inode, off, olen, len, destoff, 0); btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, destoff, end, &cached_state); if (ret < 0) return ret; /* * We may have copied an inline extent into a page of the destination * range. So flush delalloc and wait for ordered extent completion. * This is to ensure the invalidation below does not fail, as if for * example it finds a dirty folio, our folio release callback * (btrfs_release_folio()) returns false, which makes the invalidation * return an -EBUSY error. We can't ignore such failures since they * could come from some range other than the copied inline extent's * destination range and we have no way to know that. */ ret = btrfs_wait_ordered_range(BTRFS_I(inode), destoff, len); if (ret < 0) return ret; /* * Invalidate page cache so that future reads will see the cloned data * immediately and not the previous data. */ ret = filemap_invalidate_inode(inode, false, destoff, end); if (ret < 0) return ret; btrfs_btree_balance_dirty(fs_info); return 0; } static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t *len, unsigned int remap_flags) { struct btrfs_inode *inode_in = BTRFS_I(file_inode(file_in)); struct btrfs_inode *inode_out = BTRFS_I(file_inode(file_out)); u64 bs = inode_out->root->fs_info->sectorsize; u64 wb_len; int ret; if (!(remap_flags & REMAP_FILE_DEDUP)) { struct btrfs_root *root_out = inode_out->root; if (btrfs_root_readonly(root_out)) return -EROFS; ASSERT(inode_in->vfs_inode.i_sb == inode_out->vfs_inode.i_sb); } /* Can only reflink encrypted files if both files are encrypted. */ if (IS_ENCRYPTED(&inode_in->vfs_inode) != IS_ENCRYPTED(&inode_out->vfs_inode)) return -EINVAL; /* Don't make the dst file partly checksummed */ if ((inode_in->flags & BTRFS_INODE_NODATASUM) != (inode_out->flags & BTRFS_INODE_NODATASUM)) { return -EINVAL; } /* * Now that the inodes are locked, we need to start writeback ourselves * and can not rely on the writeback from the VFS's generic helper * generic_remap_file_range_prep() because: * * 1) For compression we must call filemap_fdatawrite_range() range * twice (btrfs_fdatawrite_range() does it for us), and the generic * helper only calls it once; * * 2) filemap_fdatawrite_range(), called by the generic helper only * waits for the writeback to complete, i.e. for IO to be done, and * not for the ordered extents to complete. We need to wait for them * to complete so that new file extent items are in the fs tree. */ if (*len == 0 && !(remap_flags & REMAP_FILE_DEDUP)) wb_len = ALIGN(inode_in->vfs_inode.i_size, bs) - ALIGN_DOWN(pos_in, bs); else wb_len = ALIGN(*len, bs); /* * Workaround to make sure NOCOW buffered write reach disk as NOCOW. * * Btrfs' back references do not have a block level granularity, they * work at the whole extent level. * NOCOW buffered write without data space reserved may not be able * to fall back to CoW due to lack of data space, thus could cause * data loss. * * Here we take a shortcut by flushing the whole inode, so that all * nocow write should reach disk as nocow before we increase the * reference of the extent. We could do better by only flushing NOCOW * data, but that needs extra accounting. * * Also we don't need to check ASYNC_EXTENT, as async extent will be * CoWed anyway, not affecting nocow part. */ ret = filemap_flush(inode_in->vfs_inode.i_mapping); if (ret < 0) return ret; ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs), wb_len); if (ret < 0) return ret; ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs), wb_len); if (ret < 0) return ret; return generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out, len, remap_flags); } static bool file_sync_write(const struct file *file) { if (file->f_flags & (__O_SYNC | O_DSYNC)) return true; if (IS_SYNC(file_inode(file))) return true; return false; } loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, struct file *dst_file, loff_t destoff, loff_t len, unsigned int remap_flags) { struct btrfs_inode *src_inode = BTRFS_I(file_inode(src_file)); struct btrfs_inode *dst_inode = BTRFS_I(file_inode(dst_file)); bool same_inode = dst_inode == src_inode; int ret; if (btrfs_is_shutdown(inode_to_fs_info(file_inode(src_file)))) return -EIO; if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) return -EINVAL; if (same_inode) { btrfs_inode_lock(src_inode, BTRFS_ILOCK_MMAP); } else { lock_two_nondirectories(&src_inode->vfs_inode, &dst_inode->vfs_inode); btrfs_double_mmap_lock(src_inode, dst_inode); } ret = btrfs_remap_file_range_prep(src_file, off, dst_file, destoff, &len, remap_flags); if (ret < 0 || len == 0) goto out_unlock; if (remap_flags & REMAP_FILE_DEDUP) ret = btrfs_extent_same(&src_inode->vfs_inode, off, len, &dst_inode->vfs_inode, destoff); else ret = btrfs_clone_files(dst_file, src_file, off, len, destoff); out_unlock: if (same_inode) { btrfs_inode_unlock(src_inode, BTRFS_ILOCK_MMAP); } else { btrfs_double_mmap_unlock(src_inode, dst_inode); unlock_two_nondirectories(&src_inode->vfs_inode, &dst_inode->vfs_inode); } /* * If either the source or the destination file was opened with O_SYNC, * O_DSYNC or has the S_SYNC attribute, fsync both the destination and * source files/ranges, so that after a successful return (0) followed * by a power failure results in the reflinked data to be readable from * both files/ranges. */ if (ret == 0 && len > 0 && (file_sync_write(src_file) || file_sync_write(dst_file))) { ret = btrfs_sync_file(src_file, off, off + len - 1, 0); if (ret == 0) ret = btrfs_sync_file(dst_file, destoff, destoff + len - 1, 0); } return ret < 0 ? ret : len; }
3 3041 3043 2 2 3 2 3 3 2 3044 3039 2 1793 1795 2 822 2 2 2 2 2 2 2 2 2 2 820 822 12 9 3 11 8 3 2 2 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 // SPDX-License-Identifier: GPL-2.0 /* * Disk events - monitor disk events like media change and eject request. */ #include <linux/export.h> #include <linux/moduleparam.h> #include <linux/blkdev.h> #include "blk.h" struct disk_events { struct list_head node; /* all disk_event's */ struct gendisk *disk; /* the associated disk */ spinlock_t lock; struct mutex block_mutex; /* protects blocking */ int block; /* event blocking depth */ unsigned int pending; /* events already sent out */ unsigned int clearing; /* events being cleared */ long poll_msecs; /* interval, -1 for default */ struct delayed_work dwork; }; static const char *disk_events_strs[] = { [ilog2(DISK_EVENT_MEDIA_CHANGE)] = "media_change", [ilog2(DISK_EVENT_EJECT_REQUEST)] = "eject_request", }; static char *disk_uevents[] = { [ilog2(DISK_EVENT_MEDIA_CHANGE)] = "DISK_MEDIA_CHANGE=1", [ilog2(DISK_EVENT_EJECT_REQUEST)] = "DISK_EJECT_REQUEST=1", }; /* list of all disk_events */ static DEFINE_MUTEX(disk_events_mutex); static LIST_HEAD(disk_events); /* disable in-kernel polling by default */ static unsigned long disk_events_dfl_poll_msecs; static unsigned long disk_events_poll_jiffies(struct gendisk *disk) { struct disk_events *ev = disk->ev; long intv_msecs = 0; /* * If device-specific poll interval is set, always use it. If * the default is being used, poll if the POLL flag is set. */ if (ev->poll_msecs >= 0) intv_msecs = ev->poll_msecs; else if (disk->event_flags & DISK_EVENT_FLAG_POLL) intv_msecs = disk_events_dfl_poll_msecs; return msecs_to_jiffies(intv_msecs); } /** * disk_block_events - block and flush disk event checking * @disk: disk to block events for * * On return from this function, it is guaranteed that event checking * isn't in progress and won't happen until unblocked by * disk_unblock_events(). Events blocking is counted and the actual * unblocking happens after the matching number of unblocks are done. * * Note that this intentionally does not block event checking from * disk_clear_events(). * * CONTEXT: * Might sleep. */ void disk_block_events(struct gendisk *disk) { struct disk_events *ev = disk->ev; unsigned long flags; bool cancel; if (!ev) return; /* * Outer mutex ensures that the first blocker completes canceling * the event work before further blockers are allowed to finish. */ mutex_lock(&ev->block_mutex); spin_lock_irqsave(&ev->lock, flags); cancel = !ev->block++; spin_unlock_irqrestore(&ev->lock, flags); if (cancel) cancel_delayed_work_sync(&disk->ev->dwork); mutex_unlock(&ev->block_mutex); } static void __disk_unblock_events(struct gendisk *disk, bool check_now) { struct disk_events *ev = disk->ev; unsigned long intv; unsigned long flags; spin_lock_irqsave(&ev->lock, flags); if (WARN_ON_ONCE(ev->block <= 0)) goto out_unlock; if (--ev->block) goto out_unlock; intv = disk_events_poll_jiffies(disk); if (check_now) queue_delayed_work(system_freezable_power_efficient_wq, &ev->dwork, 0); else if (intv) queue_delayed_work(system_freezable_power_efficient_wq, &ev->dwork, intv); out_unlock: spin_unlock_irqrestore(&ev->lock, flags); } /** * disk_unblock_events - unblock disk event checking * @disk: disk to unblock events for * * Undo disk_block_events(). When the block count reaches zero, it * starts events polling if configured. * * CONTEXT: * Don't care. Safe to call from irq context. */ void disk_unblock_events(struct gendisk *disk) { if (disk->ev) __disk_unblock_events(disk, false); } /** * disk_flush_events - schedule immediate event checking and flushing * @disk: disk to check and flush events for * @mask: events to flush * * Schedule immediate event checking on @disk if not blocked. Events in * @mask are scheduled to be cleared from the driver. Note that this * doesn't clear the events from @disk->ev. * * CONTEXT: * If @mask is non-zero must be called with disk->open_mutex held. */ void disk_flush_events(struct gendisk *disk, unsigned int mask) { struct disk_events *ev = disk->ev; if (!ev) return; spin_lock_irq(&ev->lock); ev->clearing |= mask; if (!ev->block) mod_delayed_work(system_freezable_power_efficient_wq, &ev->dwork, 0); spin_unlock_irq(&ev->lock); } /* * Tell userland about new events. Only the events listed in @disk->events are * reported, and only if DISK_EVENT_FLAG_UEVENT is set. Otherwise, events are * processed internally but never get reported to userland. */ static void disk_event_uevent(struct gendisk *disk, unsigned int events) { char *envp[ARRAY_SIZE(disk_uevents) + 1] = { }; int nr_events = 0, i; for (i = 0; i < ARRAY_SIZE(disk_uevents); i++) if (events & disk->events & (1 << i)) envp[nr_events++] = disk_uevents[i]; if (nr_events) kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp); } static void disk_check_events(struct disk_events *ev, unsigned int *clearing_ptr) { struct gendisk *disk = ev->disk; unsigned int clearing = *clearing_ptr; unsigned int events; unsigned long intv; /* check events */ events = disk->fops->check_events(disk, clearing); /* accumulate pending events and schedule next poll if necessary */ spin_lock_irq(&ev->lock); events &= ~ev->pending; ev->pending |= events; *clearing_ptr &= ~clearing; intv = disk_events_poll_jiffies(disk); if (!ev->block && intv) queue_delayed_work(system_freezable_power_efficient_wq, &ev->dwork, intv); spin_unlock_irq(&ev->lock); if (events & DISK_EVENT_MEDIA_CHANGE) inc_diskseq(disk); if (disk->event_flags & DISK_EVENT_FLAG_UEVENT) disk_event_uevent(disk, events); } /** * disk_clear_events - synchronously check, clear and return pending events * @disk: disk to fetch and clear events from * @mask: mask of events to be fetched and cleared * * Disk events are synchronously checked and pending events in @mask * are cleared and returned. This ignores the block count. * * CONTEXT: * Might sleep. */ static unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask) { struct disk_events *ev = disk->ev; unsigned int pending; unsigned int clearing = mask; if (!ev) return 0; disk_block_events(disk); /* * store the union of mask and ev->clearing on the stack so that the * race with disk_flush_events does not cause ambiguity (ev->clearing * can still be modified even if events are blocked). */ spin_lock_irq(&ev->lock); clearing |= ev->clearing; ev->clearing = 0; spin_unlock_irq(&ev->lock); disk_check_events(ev, &clearing); /* * if ev->clearing is not 0, the disk_flush_events got called in the * middle of this function, so we want to run the workfn without delay. */ __disk_unblock_events(disk, ev->clearing ? true : false); /* then, fetch and clear pending events */ spin_lock_irq(&ev->lock); pending = ev->pending & mask; ev->pending &= ~mask; spin_unlock_irq(&ev->lock); WARN_ON_ONCE(clearing & mask); return pending; } /** * disk_check_media_change - check if a removable media has been changed * @disk: gendisk to check * * Returns %true and marks the disk for a partition rescan whether a removable * media has been changed, and %false if the media did not change. */ bool disk_check_media_change(struct gendisk *disk) { unsigned int events; events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE | DISK_EVENT_EJECT_REQUEST); if (events & DISK_EVENT_MEDIA_CHANGE) { set_bit(GD_NEED_PART_SCAN, &disk->state); return true; } return false; } EXPORT_SYMBOL(disk_check_media_change); /** * disk_force_media_change - force a media change event * @disk: the disk which will raise the event * * Should be called when the media changes for @disk. Generates a uevent * and attempts to free all dentries and inodes and invalidates all block * device page cache entries in that case. */ void disk_force_media_change(struct gendisk *disk) { disk_event_uevent(disk, DISK_EVENT_MEDIA_CHANGE); inc_diskseq(disk); bdev_mark_dead(disk->part0, true); set_bit(GD_NEED_PART_SCAN, &disk->state); } EXPORT_SYMBOL_GPL(disk_force_media_change); /* * Separate this part out so that a different pointer for clearing_ptr can be * passed in for disk_clear_events. */ static void disk_events_workfn(struct work_struct *work) { struct delayed_work *dwork = to_delayed_work(work); struct disk_events *ev = container_of(dwork, struct disk_events, dwork); disk_check_events(ev, &ev->clearing); } /* * A disk events enabled device has the following sysfs nodes under * its /sys/block/X/ directory. * * events : list of all supported events * events_async : list of events which can be detected w/o polling * (always empty, only for backwards compatibility) * events_poll_msecs : polling interval, 0: disable, -1: system default */ static ssize_t __disk_events_show(unsigned int events, char *buf) { const char *delim = ""; ssize_t pos = 0; int i; for (i = 0; i < ARRAY_SIZE(disk_events_strs); i++) if (events & (1 << i)) { pos += sprintf(buf + pos, "%s%s", delim, disk_events_strs[i]); delim = " "; } if (pos) pos += sprintf(buf + pos, "\n"); return pos; } static ssize_t disk_events_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gendisk *disk = dev_to_disk(dev); if (!(disk->event_flags & DISK_EVENT_FLAG_UEVENT)) return 0; return __disk_events_show(disk->events, buf); } static ssize_t disk_events_async_show(struct device *dev, struct device_attribute *attr, char *buf) { return 0; } static ssize_t disk_events_poll_msecs_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gendisk *disk = dev_to_disk(dev); if (!disk->ev) return sprintf(buf, "-1\n"); return sprintf(buf, "%ld\n", disk->ev->poll_msecs); } static ssize_t disk_events_poll_msecs_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct gendisk *disk = dev_to_disk(dev); long intv; if (!count || !sscanf(buf, "%ld", &intv)) return -EINVAL; if (intv < 0 && intv != -1) return -EINVAL; if (!disk->ev) return -ENODEV; disk_block_events(disk); disk->ev->poll_msecs = intv; __disk_unblock_events(disk, true); return count; } DEVICE_ATTR(events, 0444, disk_events_show, NULL); DEVICE_ATTR(events_async, 0444, disk_events_async_show, NULL); DEVICE_ATTR(events_poll_msecs, 0644, disk_events_poll_msecs_show, disk_events_poll_msecs_store); /* * The default polling interval can be specified by the kernel * parameter block.events_dfl_poll_msecs which defaults to 0 * (disable). This can also be modified runtime by writing to * /sys/module/block/parameters/events_dfl_poll_msecs. */ static int disk_events_set_dfl_poll_msecs(const char *val, const struct kernel_param *kp) { struct disk_events *ev; int ret; ret = param_set_ulong(val, kp); if (ret < 0) return ret; mutex_lock(&disk_events_mutex); list_for_each_entry(ev, &disk_events, node) disk_flush_events(ev->disk, 0); mutex_unlock(&disk_events_mutex); return 0; } static const struct kernel_param_ops disk_events_dfl_poll_msecs_param_ops = { .set = disk_events_set_dfl_poll_msecs, .get = param_get_ulong, }; #undef MODULE_PARAM_PREFIX #define MODULE_PARAM_PREFIX "block." module_param_cb(events_dfl_poll_msecs, &disk_events_dfl_poll_msecs_param_ops, &disk_events_dfl_poll_msecs, 0644); /* * disk_{alloc|add|del|release}_events - initialize and destroy disk_events. */ int disk_alloc_events(struct gendisk *disk) { struct disk_events *ev; if (!disk->fops->check_events || !disk->events) return 0; ev = kzalloc_obj(*ev); if (!ev) { pr_warn("%s: failed to initialize events\n", disk->disk_name); return -ENOMEM; } INIT_LIST_HEAD(&ev->node); ev->disk = disk; spin_lock_init(&ev->lock); mutex_init(&ev->block_mutex); ev->block = 1; ev->poll_msecs = -1; INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn); disk->ev = ev; return 0; } void disk_add_events(struct gendisk *disk) { if (!disk->ev) return; mutex_lock(&disk_events_mutex); list_add_tail(&disk->ev->node, &disk_events); mutex_unlock(&disk_events_mutex); /* * Block count is initialized to 1 and the following initial * unblock kicks it into action. */ __disk_unblock_events(disk, true); } void disk_del_events(struct gendisk *disk) { if (disk->ev) { disk_block_events(disk); mutex_lock(&disk_events_mutex); list_del_init(&disk->ev->node); mutex_unlock(&disk_events_mutex); } } void disk_release_events(struct gendisk *disk) { /* the block count should be 1 from disk_del_events() */ WARN_ON_ONCE(disk->ev && disk->ev->block != 1); kfree(disk->ev); }
1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 // SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/profile.c * Simple profiling. Manages a direct-mapped profile hit count buffer, * with configurable resolution, support for restricting the cpus on * which profiling is done, and switching between cpu time and * schedule() calls via kernel command line parameters passed at boot. * * Scheduler profiling support, Arjan van de Ven and Ingo Molnar, * Red Hat, July 2004 * Consolidation of architecture support code for profiling, * Nadia Yvette Chambers, Oracle, July 2004 * Amortized hit count accounting via per-cpu open-addressed hashtables * to resolve timer interrupt livelocks, Nadia Yvette Chambers, * Oracle, 2004 */ #include <linux/export.h> #include <linux/profile.h> #include <linux/memblock.h> #include <linux/notifier.h> #include <linux/mm.h> #include <linux/cpumask.h> #include <linux/cpu.h> #include <linux/highmem.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/sched/stat.h> #include <asm/sections.h> #include <asm/irq_regs.h> #include <asm/ptrace.h> struct profile_hit { u32 pc, hits; }; #define PROFILE_GRPSHIFT 3 #define PROFILE_GRPSZ (1 << PROFILE_GRPSHIFT) #define NR_PROFILE_HIT (PAGE_SIZE/sizeof(struct profile_hit)) #define NR_PROFILE_GRP (NR_PROFILE_HIT/PROFILE_GRPSZ) static atomic_t *prof_buffer; static unsigned long prof_len; static unsigned short int prof_shift; int prof_on __read_mostly; EXPORT_SYMBOL_GPL(prof_on); int profile_setup(char *str) { static const char schedstr[] = "schedule"; static const char kvmstr[] = "kvm"; const char *select = NULL; int par; if (!strncmp(str, schedstr, strlen(schedstr))) { prof_on = SCHED_PROFILING; select = schedstr; } else if (!strncmp(str, kvmstr, strlen(kvmstr))) { prof_on = KVM_PROFILING; select = kvmstr; } else if (get_option(&str, &par)) { prof_shift = clamp(par, 0, BITS_PER_LONG - 1); prof_on = CPU_PROFILING; pr_info("kernel profiling enabled (shift: %u)\n", prof_shift); } if (select) { if (str[strlen(select)] == ',') str += strlen(select) + 1; if (get_option(&str, &par)) prof_shift = clamp(par, 0, BITS_PER_LONG - 1); pr_info("kernel %s profiling enabled (shift: %u)\n", select, prof_shift); } return 1; } __setup("profile=", profile_setup); int __ref profile_init(void) { int buffer_bytes; if (!prof_on) return 0; /* only text is profiled */ prof_len = (_etext - _stext) >> prof_shift; if (!prof_len) { pr_warn("profiling shift: %u too large\n", prof_shift); prof_on = 0; return -EINVAL; } buffer_bytes = prof_len*sizeof(atomic_t); prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL|__GFP_NOWARN); if (prof_buffer) return 0; prof_buffer = alloc_pages_exact(buffer_bytes, GFP_KERNEL|__GFP_ZERO|__GFP_NOWARN); if (prof_buffer) return 0; prof_buffer = vzalloc(buffer_bytes); if (prof_buffer) return 0; return -ENOMEM; } static void do_profile_hits(int type, void *__pc, unsigned int nr_hits) { unsigned long pc; pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift; if (pc < prof_len) atomic_add(nr_hits, &prof_buffer[pc]); } void profile_hits(int type, void *__pc, unsigned int nr_hits) { if (prof_on != type || !prof_buffer) return; do_profile_hits(type, __pc, nr_hits); } EXPORT_SYMBOL_GPL(profile_hits); void profile_tick(int type) { struct pt_regs *regs = get_irq_regs(); /* This is the old kernel-only legacy profiling */ if (!user_mode(regs)) profile_hit(type, (void *)profile_pc(regs)); } #ifdef CONFIG_PROC_FS #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/uaccess.h> /* * This function accesses profiling information. The returned data is * binary: the sampling step and the actual contents of the profile * buffer. Use of the program readprofile is recommended in order to * get meaningful info out of these data. */ static ssize_t read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos) { unsigned long p = *ppos; ssize_t read; char *pnt; unsigned long sample_step = 1UL << prof_shift; if (p >= (prof_len+1)*sizeof(unsigned int)) return 0; if (count > (prof_len+1)*sizeof(unsigned int) - p) count = (prof_len+1)*sizeof(unsigned int) - p; read = 0; while (p < sizeof(unsigned int) && count > 0) { if (put_user(*((char *)(&sample_step)+p), buf)) return -EFAULT; buf++; p++; count--; read++; } pnt = (char *)prof_buffer + p - sizeof(atomic_t); if (copy_to_user(buf, (void *)pnt, count)) return -EFAULT; read += count; *ppos += read; return read; } /* default is to not implement this call */ int __weak setup_profiling_timer(unsigned mult) { return -EINVAL; } /* * Writing to /proc/profile resets the counters * * Writing a 'profiling multiplier' value into it also re-sets the profiling * interrupt frequency, on architectures that support this. */ static ssize_t write_profile(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { #ifdef CONFIG_SMP if (count == sizeof(int)) { unsigned int multiplier; if (copy_from_user(&multiplier, buf, sizeof(int))) return -EFAULT; if (setup_profiling_timer(multiplier)) return -EINVAL; } #endif memset(prof_buffer, 0, prof_len * sizeof(atomic_t)); return count; } static const struct proc_ops profile_proc_ops = { .proc_read = read_profile, .proc_write = write_profile, .proc_lseek = default_llseek, }; int __ref create_proc_profile(void) { struct proc_dir_entry *entry; int err = 0; if (!prof_on) return 0; entry = proc_create("profile", S_IWUSR | S_IRUGO, NULL, &profile_proc_ops); if (entry) proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t)); return err; } subsys_initcall(create_proc_profile); #endif /* CONFIG_PROC_FS */
1 1 1 1 1 1 4 4 1 1 1 1 4 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 // SPDX-License-Identifier: GPL-2.0-or-later /* * OSS emulation layer for the mixer interface * Copyright (c) by Jaroslav Kysela <perex@perex.cz> */ #include <linux/init.h> #include <linux/slab.h> #include <linux/time.h> #include <linux/string.h> #include <linux/module.h> #include <linux/compat.h> #include <sound/core.h> #include <sound/minors.h> #include <sound/control.h> #include <sound/info.h> #include <sound/mixer_oss.h> #include <linux/soundcard.h> #define OSS_ALSAEMULVER _SIOR ('M', 249, int) MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>"); MODULE_DESCRIPTION("Mixer OSS emulation for ALSA."); MODULE_LICENSE("GPL"); MODULE_ALIAS_SNDRV_MINOR(SNDRV_MINOR_OSS_MIXER); static int snd_mixer_oss_open(struct inode *inode, struct file *file) { struct snd_card *card; struct snd_mixer_oss_file *fmixer; int err; err = nonseekable_open(inode, file); if (err < 0) return err; card = snd_lookup_oss_minor_data(iminor(inode), SNDRV_OSS_DEVICE_TYPE_MIXER); if (card == NULL) return -ENODEV; if (card->mixer_oss == NULL) { snd_card_unref(card); return -ENODEV; } err = snd_card_file_add(card, file); if (err < 0) { snd_card_unref(card); return err; } fmixer = kzalloc_obj(*fmixer); if (fmixer == NULL) { snd_card_file_remove(card, file); snd_card_unref(card); return -ENOMEM; } fmixer->card = card; fmixer->mixer = card->mixer_oss; file->private_data = fmixer; if (!try_module_get(card->module)) { kfree(fmixer); snd_card_file_remove(card, file); snd_card_unref(card); return -EFAULT; } snd_card_unref(card); return 0; } static int snd_mixer_oss_release(struct inode *inode, struct file *file) { struct snd_mixer_oss_file *fmixer; if (file->private_data) { fmixer = file->private_data; module_put(fmixer->card->module); snd_card_file_remove(fmixer->card, file); kfree(fmixer); } return 0; } static int snd_mixer_oss_info(struct snd_mixer_oss_file *fmixer, mixer_info __user *_info) { struct snd_card *card = fmixer->card; struct snd_mixer_oss *mixer = fmixer->mixer; struct mixer_info info; memset(&info, 0, sizeof(info)); strscpy(info.id, mixer && mixer->id[0] ? mixer->id : card->driver, sizeof(info.id)); strscpy(info.name, mixer && mixer->name[0] ? mixer->name : card->mixername, sizeof(info.name)); info.modify_counter = card->mixer_oss_change_count; if (copy_to_user(_info, &info, sizeof(info))) return -EFAULT; return 0; } static int snd_mixer_oss_info_obsolete(struct snd_mixer_oss_file *fmixer, _old_mixer_info __user *_info) { struct snd_card *card = fmixer->card; struct snd_mixer_oss *mixer = fmixer->mixer; _old_mixer_info info; memset(&info, 0, sizeof(info)); strscpy(info.id, mixer && mixer->id[0] ? mixer->id : card->driver, sizeof(info.id)); strscpy(info.name, mixer && mixer->name[0] ? mixer->name : card->mixername, sizeof(info.name)); if (copy_to_user(_info, &info, sizeof(info))) return -EFAULT; return 0; } static int snd_mixer_oss_caps(struct snd_mixer_oss_file *fmixer) { struct snd_mixer_oss *mixer = fmixer->mixer; int result = 0; if (mixer == NULL) return -EIO; if (mixer->get_recsrc && mixer->put_recsrc) result |= SOUND_CAP_EXCL_INPUT; return result; } static int snd_mixer_oss_devmask(struct snd_mixer_oss_file *fmixer) { struct snd_mixer_oss *mixer = fmixer->mixer; struct snd_mixer_oss_slot *pslot; int result = 0, chn; if (mixer == NULL) return -EIO; guard(mutex)(&mixer->reg_mutex); for (chn = 0; chn < 31; chn++) { pslot = &mixer->slots[chn]; if (pslot->put_volume || pslot->put_recsrc) result |= 1 << chn; } return result; } static int snd_mixer_oss_stereodevs(struct snd_mixer_oss_file *fmixer) { struct snd_mixer_oss *mixer = fmixer->mixer; struct snd_mixer_oss_slot *pslot; int result = 0, chn; if (mixer == NULL) return -EIO; guard(mutex)(&mixer->reg_mutex); for (chn = 0; chn < 31; chn++) { pslot = &mixer->slots[chn]; if (pslot->put_volume && pslot->stereo) result |= 1 << chn; } return result; } static int snd_mixer_oss_recmask(struct snd_mixer_oss_file *fmixer) { struct snd_mixer_oss *mixer = fmixer->mixer; int result = 0; if (mixer == NULL) return -EIO; guard(mutex)(&mixer->reg_mutex); if (mixer->put_recsrc && mixer->get_recsrc) { /* exclusive */ result = mixer->mask_recsrc; } else { struct snd_mixer_oss_slot *pslot; int chn; for (chn = 0; chn < 31; chn++) { pslot = &mixer->slots[chn]; if (pslot->put_recsrc) result |= 1 << chn; } } return result; } static int snd_mixer_oss_get_recsrc(struct snd_mixer_oss_file *fmixer) { struct snd_mixer_oss *mixer = fmixer->mixer; int result = 0; if (mixer == NULL) return -EIO; guard(mutex)(&mixer->reg_mutex); if (mixer->put_recsrc && mixer->get_recsrc) { /* exclusive */ unsigned int index; result = mixer->get_recsrc(fmixer, &index); if (result < 0) return result; result = 1 << index; } else { struct snd_mixer_oss_slot *pslot; int chn; for (chn = 0; chn < 31; chn++) { pslot = &mixer->slots[chn]; if (pslot->get_recsrc) { int active = 0; pslot->get_recsrc(fmixer, pslot, &active); if (active) result |= 1 << chn; } } } mixer->oss_recsrc = result; return result; } static int snd_mixer_oss_set_recsrc(struct snd_mixer_oss_file *fmixer, int recsrc) { struct snd_mixer_oss *mixer = fmixer->mixer; struct snd_mixer_oss_slot *pslot; int chn, active; unsigned int index; int result = 0; if (mixer == NULL) return -EIO; guard(mutex)(&mixer->reg_mutex); if (mixer->get_recsrc && mixer->put_recsrc) { /* exclusive input */ if (recsrc & ~mixer->oss_recsrc) recsrc &= ~mixer->oss_recsrc; mixer->put_recsrc(fmixer, ffz(~recsrc)); mixer->get_recsrc(fmixer, &index); result = 1 << index; } for (chn = 0; chn < 31; chn++) { pslot = &mixer->slots[chn]; if (pslot->put_recsrc) { active = (recsrc & (1 << chn)) ? 1 : 0; pslot->put_recsrc(fmixer, pslot, active); } } if (! result) { for (chn = 0; chn < 31; chn++) { pslot = &mixer->slots[chn]; if (pslot->get_recsrc) { active = 0; pslot->get_recsrc(fmixer, pslot, &active); if (active) result |= 1 << chn; } } } return result; } static int snd_mixer_oss_get_volume(struct snd_mixer_oss_file *fmixer, int slot) { struct snd_mixer_oss *mixer = fmixer->mixer; struct snd_mixer_oss_slot *pslot; int result = 0, left, right; if (mixer == NULL || slot > 30) return -EIO; guard(mutex)(&mixer->reg_mutex); pslot = &mixer->slots[slot]; left = pslot->volume[0]; right = pslot->volume[1]; if (pslot->get_volume) result = pslot->get_volume(fmixer, pslot, &left, &right); if (!pslot->stereo) right = left; if (snd_BUG_ON(left < 0 || left > 100)) return -EIO; if (snd_BUG_ON(right < 0 || right > 100)) return -EIO; if (result >= 0) { pslot->volume[0] = left; pslot->volume[1] = right; result = (left & 0xff) | ((right & 0xff) << 8); } return result; } static int snd_mixer_oss_set_volume(struct snd_mixer_oss_file *fmixer, int slot, int volume) { struct snd_mixer_oss *mixer = fmixer->mixer; struct snd_mixer_oss_slot *pslot; int result = 0, left = volume & 0xff, right = (volume >> 8) & 0xff; if (mixer == NULL || slot > 30) return -EIO; guard(mutex)(&mixer->reg_mutex); pslot = &mixer->slots[slot]; if (left > 100) left = 100; if (right > 100) right = 100; if (!pslot->stereo) right = left; if (pslot->put_volume) result = pslot->put_volume(fmixer, pslot, left, right); if (result < 0) return result; pslot->volume[0] = left; pslot->volume[1] = right; result = (left & 0xff) | ((right & 0xff) << 8); return result; } static int snd_mixer_oss_ioctl1(struct snd_mixer_oss_file *fmixer, unsigned int cmd, unsigned long arg) { void __user *argp = (void __user *)arg; int __user *p = argp; int tmp; if (snd_BUG_ON(!fmixer)) return -ENXIO; if (((cmd >> 8) & 0xff) == 'M') { switch (cmd) { case SOUND_MIXER_INFO: return snd_mixer_oss_info(fmixer, argp); case SOUND_OLD_MIXER_INFO: return snd_mixer_oss_info_obsolete(fmixer, argp); case SOUND_MIXER_WRITE_RECSRC: if (get_user(tmp, p)) return -EFAULT; tmp = snd_mixer_oss_set_recsrc(fmixer, tmp); if (tmp < 0) return tmp; return put_user(tmp, p); case OSS_GETVERSION: return put_user(SNDRV_OSS_VERSION, p); case OSS_ALSAEMULVER: return put_user(1, p); case SOUND_MIXER_READ_DEVMASK: tmp = snd_mixer_oss_devmask(fmixer); if (tmp < 0) return tmp; return put_user(tmp, p); case SOUND_MIXER_READ_STEREODEVS: tmp = snd_mixer_oss_stereodevs(fmixer); if (tmp < 0) return tmp; return put_user(tmp, p); case SOUND_MIXER_READ_RECMASK: tmp = snd_mixer_oss_recmask(fmixer); if (tmp < 0) return tmp; return put_user(tmp, p); case SOUND_MIXER_READ_CAPS: tmp = snd_mixer_oss_caps(fmixer); if (tmp < 0) return tmp; return put_user(tmp, p); case SOUND_MIXER_READ_RECSRC: tmp = snd_mixer_oss_get_recsrc(fmixer); if (tmp < 0) return tmp; return put_user(tmp, p); } } if (cmd & SIOC_IN) { if (get_user(tmp, p)) return -EFAULT; tmp = snd_mixer_oss_set_volume(fmixer, cmd & 0xff, tmp); if (tmp < 0) return tmp; return put_user(tmp, p); } else if (cmd & SIOC_OUT) { tmp = snd_mixer_oss_get_volume(fmixer, cmd & 0xff); if (tmp < 0) return tmp; return put_user(tmp, p); } return -ENXIO; } static long snd_mixer_oss_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { return snd_mixer_oss_ioctl1(file->private_data, cmd, arg); } int snd_mixer_oss_ioctl_card(struct snd_card *card, unsigned int cmd, unsigned long arg) { struct snd_mixer_oss_file fmixer; if (snd_BUG_ON(!card)) return -ENXIO; if (card->mixer_oss == NULL) return -ENXIO; memset(&fmixer, 0, sizeof(fmixer)); fmixer.card = card; fmixer.mixer = card->mixer_oss; return snd_mixer_oss_ioctl1(&fmixer, cmd, arg); } EXPORT_SYMBOL(snd_mixer_oss_ioctl_card); #ifdef CONFIG_COMPAT /* all compatible */ static long snd_mixer_oss_ioctl_compat(struct file *file, unsigned int cmd, unsigned long arg) { return snd_mixer_oss_ioctl1(file->private_data, cmd, (unsigned long)compat_ptr(arg)); } #else #define snd_mixer_oss_ioctl_compat NULL #endif /* * REGISTRATION PART */ static const struct file_operations snd_mixer_oss_f_ops = { .owner = THIS_MODULE, .open = snd_mixer_oss_open, .release = snd_mixer_oss_release, .unlocked_ioctl = snd_mixer_oss_ioctl, .compat_ioctl = snd_mixer_oss_ioctl_compat, }; /* * utilities */ static long snd_mixer_oss_conv(long val, long omin, long omax, long nmin, long nmax) { long orange = omax - omin, nrange = nmax - nmin; if (orange == 0) return 0; return DIV_ROUND_CLOSEST(nrange * (val - omin), orange) + nmin; } /* convert from alsa native to oss values (0-100) */ static long snd_mixer_oss_conv1(long val, long min, long max, int *old) { if (val == snd_mixer_oss_conv(*old, 0, 100, min, max)) return *old; return snd_mixer_oss_conv(val, min, max, 0, 100); } /* convert from oss to alsa native values */ static long snd_mixer_oss_conv2(long val, long min, long max) { return snd_mixer_oss_conv(val, 0, 100, min, max); } #if 0 static void snd_mixer_oss_recsrce_set(struct snd_card *card, int slot) { struct snd_mixer_oss *mixer = card->mixer_oss; if (mixer) mixer->mask_recsrc |= 1 << slot; } static int snd_mixer_oss_recsrce_get(struct snd_card *card, int slot) { struct snd_mixer_oss *mixer = card->mixer_oss; if (mixer && (mixer->mask_recsrc & (1 << slot))) return 1; return 0; } #endif #define SNDRV_MIXER_OSS_SIGNATURE 0x65999250 #define SNDRV_MIXER_OSS_ITEM_GLOBAL 0 #define SNDRV_MIXER_OSS_ITEM_GSWITCH 1 #define SNDRV_MIXER_OSS_ITEM_GROUTE 2 #define SNDRV_MIXER_OSS_ITEM_GVOLUME 3 #define SNDRV_MIXER_OSS_ITEM_PSWITCH 4 #define SNDRV_MIXER_OSS_ITEM_PROUTE 5 #define SNDRV_MIXER_OSS_ITEM_PVOLUME 6 #define SNDRV_MIXER_OSS_ITEM_CSWITCH 7 #define SNDRV_MIXER_OSS_ITEM_CROUTE 8 #define SNDRV_MIXER_OSS_ITEM_CVOLUME 9 #define SNDRV_MIXER_OSS_ITEM_CAPTURE 10 #define SNDRV_MIXER_OSS_ITEM_COUNT 11 #define SNDRV_MIXER_OSS_PRESENT_GLOBAL (1<<0) #define SNDRV_MIXER_OSS_PRESENT_GSWITCH (1<<1) #define SNDRV_MIXER_OSS_PRESENT_GROUTE (1<<2) #define SNDRV_MIXER_OSS_PRESENT_GVOLUME (1<<3) #define SNDRV_MIXER_OSS_PRESENT_PSWITCH (1<<4) #define SNDRV_MIXER_OSS_PRESENT_PROUTE (1<<5) #define SNDRV_MIXER_OSS_PRESENT_PVOLUME (1<<6) #define SNDRV_MIXER_OSS_PRESENT_CSWITCH (1<<7) #define SNDRV_MIXER_OSS_PRESENT_CROUTE (1<<8) #define SNDRV_MIXER_OSS_PRESENT_CVOLUME (1<<9) #define SNDRV_MIXER_OSS_PRESENT_CAPTURE (1<<10) struct slot { unsigned int signature; unsigned int present; unsigned int channels; unsigned int numid[SNDRV_MIXER_OSS_ITEM_COUNT]; unsigned int capture_item; const struct snd_mixer_oss_assign_table *assigned; unsigned int allocated: 1; }; #define ID_UNKNOWN ((unsigned int)-1) static struct snd_kcontrol *snd_mixer_oss_test_id(struct snd_mixer_oss *mixer, const char *name, int index) { struct snd_card *card = mixer->card; struct snd_ctl_elem_id id; memset(&id, 0, sizeof(id)); id.iface = SNDRV_CTL_ELEM_IFACE_MIXER; strscpy(id.name, name, sizeof(id.name)); id.index = index; return snd_ctl_find_id(card, &id); } static void snd_mixer_oss_get_volume1_vol(struct snd_mixer_oss_file *fmixer, struct snd_mixer_oss_slot *pslot, unsigned int numid, int *left, int *right) { struct snd_kcontrol *kctl; struct snd_card *card = fmixer->card; if (numid == ID_UNKNOWN) return; guard(rwsem_read)(&card->controls_rwsem); if (card->shutdown) return; kctl = snd_ctl_find_numid(card, numid); if (!kctl) return; struct snd_ctl_elem_info *uinfo __free(kfree) = kzalloc_obj(*uinfo); struct snd_ctl_elem_value *uctl __free(kfree) = kzalloc_obj(*uctl); if (uinfo == NULL || uctl == NULL) return; if (kctl->info(kctl, uinfo)) return; if (kctl->get(kctl, uctl)) return; if (uinfo->type == SNDRV_CTL_ELEM_TYPE_BOOLEAN && uinfo->value.integer.min == 0 && uinfo->value.integer.max == 1) return; *left = snd_mixer_oss_conv1(uctl->value.integer.value[0], uinfo->value.integer.min, uinfo->value.integer.max, &pslot->volume[0]); if (uinfo->count > 1) *right = snd_mixer_oss_conv1(uctl->value.integer.value[1], uinfo->value.integer.min, uinfo->value.integer.max, &pslot->volume[1]); } static void snd_mixer_oss_get_volume1_sw(struct snd_mixer_oss_file *fmixer, struct snd_mixer_oss_slot *pslot, unsigned int numid, int *left, int *right, int route) { struct snd_kcontrol *kctl; struct snd_card *card = fmixer->card; if (numid == ID_UNKNOWN) return; guard(rwsem_read)(&card->controls_rwsem); if (card->shutdown) return; kctl = snd_ctl_find_numid(card, numid); if (!kctl) return; struct snd_ctl_elem_info *uinfo __free(kfree) = kzalloc_obj(*uinfo); struct snd_ctl_elem_value *uctl __free(kfree) = kzalloc_obj(*uctl); if (uinfo == NULL || uctl == NULL) return; if (kctl->info(kctl, uinfo)) return; if (kctl->get(kctl, uctl)) return; if (!uctl->value.integer.value[0]) { *left = 0; if (uinfo->count == 1) *right = 0; } if (uinfo->count > 1 && !uctl->value.integer.value[route ? 3 : 1]) *right = 0; } static int snd_mixer_oss_get_volume1(struct snd_mixer_oss_file *fmixer, struct snd_mixer_oss_slot *pslot, int *left, int *right) { struct slot *slot = pslot->private_data; *left = *right = 100; if (slot->present & SNDRV_MIXER_OSS_PRESENT_PVOLUME) { snd_mixer_oss_get_volume1_vol(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_PVOLUME], left, right); } else if (slot->present & SNDRV_MIXER_OSS_PRESENT_GVOLUME) { snd_mixer_oss_get_volume1_vol(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_GVOLUME], left, right); } else if (slot->present & SNDRV_MIXER_OSS_PRESENT_GLOBAL) { snd_mixer_oss_get_volume1_vol(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_GLOBAL], left, right); } if (slot->present & SNDRV_MIXER_OSS_PRESENT_PSWITCH) { snd_mixer_oss_get_volume1_sw(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_PSWITCH], left, right, 0); } else if (slot->present & SNDRV_MIXER_OSS_PRESENT_GSWITCH) { snd_mixer_oss_get_volume1_sw(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_GSWITCH], left, right, 0); } else if (slot->present & SNDRV_MIXER_OSS_PRESENT_PROUTE) { snd_mixer_oss_get_volume1_sw(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_PROUTE], left, right, 1); } else if (slot->present & SNDRV_MIXER_OSS_PRESENT_GROUTE) { snd_mixer_oss_get_volume1_sw(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_GROUTE], left, right, 1); } return 0; } static void snd_mixer_oss_put_volume1_vol(struct snd_mixer_oss_file *fmixer, struct snd_mixer_oss_slot *pslot, unsigned int numid, int left, int right) { struct snd_kcontrol *kctl; struct snd_card *card = fmixer->card; int res; if (numid == ID_UNKNOWN) return; guard(rwsem_read)(&card->controls_rwsem); if (card->shutdown) return; kctl = snd_ctl_find_numid(card, numid); if (!kctl) return; struct snd_ctl_elem_info *uinfo __free(kfree) = kzalloc_obj(*uinfo); struct snd_ctl_elem_value *uctl __free(kfree) = kzalloc_obj(*uctl); if (uinfo == NULL || uctl == NULL) return; if (kctl->info(kctl, uinfo)) return; if (uinfo->type == SNDRV_CTL_ELEM_TYPE_BOOLEAN && uinfo->value.integer.min == 0 && uinfo->value.integer.max == 1) return; uctl->value.integer.value[0] = snd_mixer_oss_conv2(left, uinfo->value.integer.min, uinfo->value.integer.max); if (uinfo->count > 1) uctl->value.integer.value[1] = snd_mixer_oss_conv2(right, uinfo->value.integer.min, uinfo->value.integer.max); res = kctl->put(kctl, uctl); if (res < 0) return; if (res > 0) snd_ctl_notify(card, SNDRV_CTL_EVENT_MASK_VALUE, &kctl->id); } static void snd_mixer_oss_put_volume1_sw(struct snd_mixer_oss_file *fmixer, struct snd_mixer_oss_slot *pslot, unsigned int numid, int left, int right, int route) { struct snd_kcontrol *kctl; struct snd_card *card = fmixer->card; int res; if (numid == ID_UNKNOWN) return; guard(rwsem_read)(&card->controls_rwsem); if (card->shutdown) return; kctl = snd_ctl_find_numid(card, numid); if (!kctl) return; struct snd_ctl_elem_info *uinfo __free(kfree) = kzalloc_obj(*uinfo); struct snd_ctl_elem_value *uctl __free(kfree) = kzalloc_obj(*uctl); if (uinfo == NULL || uctl == NULL) return; if (kctl->info(kctl, uinfo)) return; if (uinfo->count > 1) { uctl->value.integer.value[0] = left > 0 ? 1 : 0; uctl->value.integer.value[route ? 3 : 1] = right > 0 ? 1 : 0; if (route) { uctl->value.integer.value[1] = uctl->value.integer.value[2] = 0; } } else { uctl->value.integer.value[0] = (left > 0 || right > 0) ? 1 : 0; } res = kctl->put(kctl, uctl); if (res < 0) return; if (res > 0) snd_ctl_notify(card, SNDRV_CTL_EVENT_MASK_VALUE, &kctl->id); } static int snd_mixer_oss_put_volume1(struct snd_mixer_oss_file *fmixer, struct snd_mixer_oss_slot *pslot, int left, int right) { struct slot *slot = pslot->private_data; if (slot->present & SNDRV_MIXER_OSS_PRESENT_PVOLUME) { snd_mixer_oss_put_volume1_vol(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_PVOLUME], left, right); if (slot->present & SNDRV_MIXER_OSS_PRESENT_CVOLUME) snd_mixer_oss_put_volume1_vol(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_CVOLUME], left, right); } else if (slot->present & SNDRV_MIXER_OSS_PRESENT_CVOLUME) { snd_mixer_oss_put_volume1_vol(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_CVOLUME], left, right); } else if (slot->present & SNDRV_MIXER_OSS_PRESENT_GVOLUME) { snd_mixer_oss_put_volume1_vol(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_GVOLUME], left, right); } else if (slot->present & SNDRV_MIXER_OSS_PRESENT_GLOBAL) { snd_mixer_oss_put_volume1_vol(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_GLOBAL], left, right); } if (left || right) { if (slot->present & SNDRV_MIXER_OSS_PRESENT_PSWITCH) snd_mixer_oss_put_volume1_sw(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_PSWITCH], left, right, 0); if (slot->present & SNDRV_MIXER_OSS_PRESENT_CSWITCH) snd_mixer_oss_put_volume1_sw(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_CSWITCH], left, right, 0); if (slot->present & SNDRV_MIXER_OSS_PRESENT_GSWITCH) snd_mixer_oss_put_volume1_sw(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_GSWITCH], left, right, 0); if (slot->present & SNDRV_MIXER_OSS_PRESENT_PROUTE) snd_mixer_oss_put_volume1_sw(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_PROUTE], left, right, 1); if (slot->present & SNDRV_MIXER_OSS_PRESENT_CROUTE) snd_mixer_oss_put_volume1_sw(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_CROUTE], left, right, 1); if (slot->present & SNDRV_MIXER_OSS_PRESENT_GROUTE) snd_mixer_oss_put_volume1_sw(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_GROUTE], left, right, 1); } else { if (slot->present & SNDRV_MIXER_OSS_PRESENT_PSWITCH) { snd_mixer_oss_put_volume1_sw(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_PSWITCH], left, right, 0); } else if (slot->present & SNDRV_MIXER_OSS_PRESENT_CSWITCH) { snd_mixer_oss_put_volume1_sw(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_CSWITCH], left, right, 0); } else if (slot->present & SNDRV_MIXER_OSS_PRESENT_GSWITCH) { snd_mixer_oss_put_volume1_sw(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_GSWITCH], left, right, 0); } else if (slot->present & SNDRV_MIXER_OSS_PRESENT_PROUTE) { snd_mixer_oss_put_volume1_sw(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_PROUTE], left, right, 1); } else if (slot->present & SNDRV_MIXER_OSS_PRESENT_CROUTE) { snd_mixer_oss_put_volume1_sw(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_CROUTE], left, right, 1); } else if (slot->present & SNDRV_MIXER_OSS_PRESENT_GROUTE) { snd_mixer_oss_put_volume1_sw(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_GROUTE], left, right, 1); } } return 0; } static int snd_mixer_oss_get_recsrc1_sw(struct snd_mixer_oss_file *fmixer, struct snd_mixer_oss_slot *pslot, int *active) { struct slot *slot = pslot->private_data; int left, right; left = right = 1; snd_mixer_oss_get_volume1_sw(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_CSWITCH], &left, &right, 0); *active = (left || right) ? 1 : 0; return 0; } static int snd_mixer_oss_get_recsrc1_route(struct snd_mixer_oss_file *fmixer, struct snd_mixer_oss_slot *pslot, int *active) { struct slot *slot = pslot->private_data; int left, right; left = right = 1; snd_mixer_oss_get_volume1_sw(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_CROUTE], &left, &right, 1); *active = (left || right) ? 1 : 0; return 0; } static int snd_mixer_oss_put_recsrc1_sw(struct snd_mixer_oss_file *fmixer, struct snd_mixer_oss_slot *pslot, int active) { struct slot *slot = pslot->private_data; snd_mixer_oss_put_volume1_sw(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_CSWITCH], active, active, 0); return 0; } static int snd_mixer_oss_put_recsrc1_route(struct snd_mixer_oss_file *fmixer, struct snd_mixer_oss_slot *pslot, int active) { struct slot *slot = pslot->private_data; snd_mixer_oss_put_volume1_sw(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_CROUTE], active, active, 1); return 0; } static int snd_mixer_oss_get_recsrc2(struct snd_mixer_oss_file *fmixer, unsigned int *active_index) { struct snd_card *card = fmixer->card; struct snd_mixer_oss *mixer = fmixer->mixer; struct snd_kcontrol *kctl; struct snd_mixer_oss_slot *pslot; struct slot *slot; int err, idx; struct snd_ctl_elem_info *uinfo __free(kfree) = kzalloc_obj(*uinfo); struct snd_ctl_elem_value *uctl __free(kfree) = kzalloc_obj(*uctl); if (uinfo == NULL || uctl == NULL) return -ENOMEM; guard(rwsem_read)(&card->controls_rwsem); if (card->shutdown) return -ENODEV; kctl = snd_mixer_oss_test_id(mixer, "Capture Source", 0); if (!kctl) return -ENOENT; err = kctl->info(kctl, uinfo); if (err < 0) return err; err = kctl->get(kctl, uctl); if (err < 0) return err; for (idx = 0; idx < 32; idx++) { if (!(mixer->mask_recsrc & (1 << idx))) continue; pslot = &mixer->slots[idx]; slot = pslot->private_data; if (slot->signature != SNDRV_MIXER_OSS_SIGNATURE) continue; if (!(slot->present & SNDRV_MIXER_OSS_PRESENT_CAPTURE)) continue; if (slot->capture_item == uctl->value.enumerated.item[0]) { *active_index = idx; break; } } return 0; } static int snd_mixer_oss_put_recsrc2(struct snd_mixer_oss_file *fmixer, unsigned int active_index) { struct snd_card *card = fmixer->card; struct snd_mixer_oss *mixer = fmixer->mixer; struct snd_kcontrol *kctl; struct snd_mixer_oss_slot *pslot; struct slot *slot = NULL; int err; unsigned int idx; struct snd_ctl_elem_info *uinfo __free(kfree) = kzalloc_obj(*uinfo); struct snd_ctl_elem_value *uctl __free(kfree) = kzalloc_obj(*uctl); if (uinfo == NULL || uctl == NULL) return -ENOMEM; guard(rwsem_read)(&card->controls_rwsem); if (card->shutdown) return -ENODEV; kctl = snd_mixer_oss_test_id(mixer, "Capture Source", 0); if (!kctl) return -ENOENT; err = kctl->info(kctl, uinfo); if (err < 0) return err; for (idx = 0; idx < 32; idx++) { if (!(mixer->mask_recsrc & (1 << idx))) continue; pslot = &mixer->slots[idx]; slot = pslot->private_data; if (slot->signature != SNDRV_MIXER_OSS_SIGNATURE) continue; if (!(slot->present & SNDRV_MIXER_OSS_PRESENT_CAPTURE)) continue; if (idx == active_index) break; slot = NULL; } if (!slot) return 0; for (idx = 0; idx < uinfo->count; idx++) uctl->value.enumerated.item[idx] = slot->capture_item; err = kctl->put(kctl, uctl); if (err > 0) snd_ctl_notify(fmixer->card, SNDRV_CTL_EVENT_MASK_VALUE, &kctl->id); return 0; } struct snd_mixer_oss_assign_table { int oss_id; const char *name; int index; }; static int snd_mixer_oss_build_test(struct snd_mixer_oss *mixer, struct slot *slot, const char *name, int index, int item) { struct snd_kcontrol *kcontrol; struct snd_card *card = mixer->card; int err; struct snd_ctl_elem_info *info __free(kfree) = kmalloc(sizeof(*info), GFP_KERNEL); if (!info) return -ENOMEM; scoped_guard(rwsem_read, &card->controls_rwsem) { if (card->shutdown) return -ENODEV; kcontrol = snd_mixer_oss_test_id(mixer, name, index); if (kcontrol == NULL) return 0; err = kcontrol->info(kcontrol, info); if (err < 0) return err; slot->numid[item] = kcontrol->id.numid; } if (info->count > slot->channels) slot->channels = info->count; slot->present |= 1 << item; return 0; } static void snd_mixer_oss_slot_free(struct snd_mixer_oss_slot *chn) { struct slot *p = chn->private_data; if (p) { if (p->allocated && p->assigned) { kfree(p->assigned->name); kfree(p->assigned); } kfree(p); } } static void mixer_slot_clear(struct snd_mixer_oss_slot *rslot) { int idx = rslot->number; /* remember this */ if (rslot->private_free) rslot->private_free(rslot); memset(rslot, 0, sizeof(*rslot)); rslot->number = idx; } /* In a separate function to keep gcc 3.2 happy - do NOT merge this in snd_mixer_oss_build_input! */ static int snd_mixer_oss_build_test_all(struct snd_mixer_oss *mixer, const struct snd_mixer_oss_assign_table *ptr, struct slot *slot) { char str[64]; int err; err = snd_mixer_oss_build_test(mixer, slot, ptr->name, ptr->index, SNDRV_MIXER_OSS_ITEM_GLOBAL); if (err) return err; sprintf(str, "%s Switch", ptr->name); err = snd_mixer_oss_build_test(mixer, slot, str, ptr->index, SNDRV_MIXER_OSS_ITEM_GSWITCH); if (err) return err; sprintf(str, "%s Route", ptr->name); err = snd_mixer_oss_build_test(mixer, slot, str, ptr->index, SNDRV_MIXER_OSS_ITEM_GROUTE); if (err) return err; sprintf(str, "%s Volume", ptr->name); err = snd_mixer_oss_build_test(mixer, slot, str, ptr->index, SNDRV_MIXER_OSS_ITEM_GVOLUME); if (err) return err; sprintf(str, "%s Playback Switch", ptr->name); err = snd_mixer_oss_build_test(mixer, slot, str, ptr->index, SNDRV_MIXER_OSS_ITEM_PSWITCH); if (err) return err; sprintf(str, "%s Playback Route", ptr->name); err = snd_mixer_oss_build_test(mixer, slot, str, ptr->index, SNDRV_MIXER_OSS_ITEM_PROUTE); if (err) return err; sprintf(str, "%s Playback Volume", ptr->name); err = snd_mixer_oss_build_test(mixer, slot, str, ptr->index, SNDRV_MIXER_OSS_ITEM_PVOLUME); if (err) return err; sprintf(str, "%s Capture Switch", ptr->name); err = snd_mixer_oss_build_test(mixer, slot, str, ptr->index, SNDRV_MIXER_OSS_ITEM_CSWITCH); if (err) return err; sprintf(str, "%s Capture Route", ptr->name); err = snd_mixer_oss_build_test(mixer, slot, str, ptr->index, SNDRV_MIXER_OSS_ITEM_CROUTE); if (err) return err; sprintf(str, "%s Capture Volume", ptr->name); err = snd_mixer_oss_build_test(mixer, slot, str, ptr->index, SNDRV_MIXER_OSS_ITEM_CVOLUME); if (err) return err; return 0; } /* * build an OSS mixer element. * ptr_allocated means the entry is dynamically allocated (change via proc file). * when replace_old = 1, the old entry is replaced with the new one. */ static int snd_mixer_oss_build_input(struct snd_mixer_oss *mixer, const struct snd_mixer_oss_assign_table *ptr, int ptr_allocated, int replace_old) { struct slot slot; struct slot *pslot; struct snd_kcontrol *kctl; struct snd_mixer_oss_slot *rslot; const char *str; /* check if already assigned */ if (mixer->slots[ptr->oss_id].get_volume && ! replace_old) return 0; memset(&slot, 0, sizeof(slot)); memset(slot.numid, 0xff, sizeof(slot.numid)); /* ID_UNKNOWN */ if (snd_mixer_oss_build_test_all(mixer, ptr, &slot)) return 0; guard(rwsem_read)(&mixer->card->controls_rwsem); if (mixer->card->shutdown) return -ENODEV; kctl = NULL; if (!ptr->index) kctl = snd_mixer_oss_test_id(mixer, "Capture Source", 0); if (kctl) { struct snd_ctl_elem_info *uinfo __free(kfree) = kzalloc_obj(*uinfo); if (!uinfo) return -ENOMEM; if (kctl->info(kctl, uinfo)) return 0; str = ptr->name; if (!strcmp(str, "Master")) str = "Mix"; else if (!strcmp(str, "Master Mono")) str = "Mix Mono"; slot.capture_item = 0; if (!strcmp(uinfo->value.enumerated.name, str)) { slot.present |= SNDRV_MIXER_OSS_PRESENT_CAPTURE; } else { for (slot.capture_item = 1; slot.capture_item < uinfo->value.enumerated.items; slot.capture_item++) { uinfo->value.enumerated.item = slot.capture_item; if (kctl->info(kctl, uinfo)) return 0; if (!strcmp(uinfo->value.enumerated.name, str)) { slot.present |= SNDRV_MIXER_OSS_PRESENT_CAPTURE; break; } } } } if (slot.present != 0) { pslot = kmalloc_obj(slot); if (! pslot) return -ENOMEM; *pslot = slot; pslot->signature = SNDRV_MIXER_OSS_SIGNATURE; pslot->assigned = ptr; pslot->allocated = ptr_allocated; rslot = &mixer->slots[ptr->oss_id]; mixer_slot_clear(rslot); rslot->stereo = slot.channels > 1 ? 1 : 0; rslot->get_volume = snd_mixer_oss_get_volume1; rslot->put_volume = snd_mixer_oss_put_volume1; /* note: ES18xx have both Capture Source and XX Capture Volume !!! */ if (slot.present & SNDRV_MIXER_OSS_PRESENT_CSWITCH) { rslot->get_recsrc = snd_mixer_oss_get_recsrc1_sw; rslot->put_recsrc = snd_mixer_oss_put_recsrc1_sw; } else if (slot.present & SNDRV_MIXER_OSS_PRESENT_CROUTE) { rslot->get_recsrc = snd_mixer_oss_get_recsrc1_route; rslot->put_recsrc = snd_mixer_oss_put_recsrc1_route; } else if (slot.present & SNDRV_MIXER_OSS_PRESENT_CAPTURE) { mixer->mask_recsrc |= 1 << ptr->oss_id; } rslot->private_data = pslot; rslot->private_free = snd_mixer_oss_slot_free; return 1; } return 0; } #ifdef CONFIG_SND_PROC_FS /* */ #define MIXER_VOL(name) [SOUND_MIXER_##name] = #name static const char * const oss_mixer_names[SNDRV_OSS_MAX_MIXERS] = { MIXER_VOL(VOLUME), MIXER_VOL(BASS), MIXER_VOL(TREBLE), MIXER_VOL(SYNTH), MIXER_VOL(PCM), MIXER_VOL(SPEAKER), MIXER_VOL(LINE), MIXER_VOL(MIC), MIXER_VOL(CD), MIXER_VOL(IMIX), MIXER_VOL(ALTPCM), MIXER_VOL(RECLEV), MIXER_VOL(IGAIN), MIXER_VOL(OGAIN), MIXER_VOL(LINE1), MIXER_VOL(LINE2), MIXER_VOL(LINE3), MIXER_VOL(DIGITAL1), MIXER_VOL(DIGITAL2), MIXER_VOL(DIGITAL3), MIXER_VOL(PHONEIN), MIXER_VOL(PHONEOUT), MIXER_VOL(VIDEO), MIXER_VOL(RADIO), MIXER_VOL(MONITOR), }; /* * /proc interface */ static void snd_mixer_oss_proc_read(struct snd_info_entry *entry, struct snd_info_buffer *buffer) { struct snd_mixer_oss *mixer = entry->private_data; int i; guard(mutex)(&mixer->reg_mutex); for (i = 0; i < SNDRV_OSS_MAX_MIXERS; i++) { struct slot *p; if (! oss_mixer_names[i]) continue; p = (struct slot *)mixer->slots[i].private_data; snd_iprintf(buffer, "%s ", oss_mixer_names[i]); if (p && p->assigned) snd_iprintf(buffer, "\"%s\" %d\n", p->assigned->name, p->assigned->index); else snd_iprintf(buffer, "\"\" 0\n"); } } static void snd_mixer_oss_proc_write(struct snd_info_entry *entry, struct snd_info_buffer *buffer) { struct snd_mixer_oss *mixer = entry->private_data; char line[128], str[32], idxstr[16]; const char *cptr; unsigned int idx; int ch; struct snd_mixer_oss_assign_table *tbl; struct slot *slot; while (!snd_info_get_line(buffer, line, sizeof(line))) { cptr = snd_info_get_str(str, line, sizeof(str)); for (ch = 0; ch < SNDRV_OSS_MAX_MIXERS; ch++) if (oss_mixer_names[ch] && strcmp(oss_mixer_names[ch], str) == 0) break; if (ch >= SNDRV_OSS_MAX_MIXERS) { pr_err("ALSA: mixer_oss: invalid OSS volume '%s'\n", str); continue; } cptr = snd_info_get_str(str, cptr, sizeof(str)); if (! *str) { /* remove the entry */ scoped_guard(mutex, &mixer->reg_mutex) mixer_slot_clear(&mixer->slots[ch]); continue; } snd_info_get_str(idxstr, cptr, sizeof(idxstr)); idx = simple_strtoul(idxstr, NULL, 10); if (idx >= 0x4000) { /* too big */ pr_err("ALSA: mixer_oss: invalid index %d\n", idx); continue; } scoped_guard(mutex, &mixer->reg_mutex) { slot = (struct slot *)mixer->slots[ch].private_data; if (slot && slot->assigned && slot->assigned->index == idx && !strcmp(slot->assigned->name, str)) /* not changed */ break; tbl = kmalloc(sizeof(*tbl), GFP_KERNEL); if (!tbl) break; tbl->oss_id = ch; tbl->name = kstrdup(str, GFP_KERNEL); if (!tbl->name) { kfree(tbl); break; } tbl->index = idx; if (snd_mixer_oss_build_input(mixer, tbl, 1, 1) <= 0) { kfree(tbl->name); kfree(tbl); } } } } static void snd_mixer_oss_proc_init(struct snd_mixer_oss *mixer) { struct snd_info_entry *entry; entry = snd_info_create_card_entry(mixer->card, "oss_mixer", mixer->card->proc_root); if (! entry) return; entry->content = SNDRV_INFO_CONTENT_TEXT; entry->mode = S_IFREG | 0644; entry->c.text.read = snd_mixer_oss_proc_read; entry->c.text.write = snd_mixer_oss_proc_write; entry->private_data = mixer; if (snd_info_register(entry) < 0) { snd_info_free_entry(entry); entry = NULL; } mixer->proc_entry = entry; } static void snd_mixer_oss_proc_done(struct snd_mixer_oss *mixer) { snd_info_free_entry(mixer->proc_entry); mixer->proc_entry = NULL; } #else /* !CONFIG_SND_PROC_FS */ #define snd_mixer_oss_proc_init(mix) #define snd_mixer_oss_proc_done(mix) #endif /* CONFIG_SND_PROC_FS */ static void snd_mixer_oss_build(struct snd_mixer_oss *mixer) { static const struct snd_mixer_oss_assign_table table[] = { { SOUND_MIXER_VOLUME, "Master", 0 }, { SOUND_MIXER_VOLUME, "Front", 0 }, /* fallback */ { SOUND_MIXER_BASS, "Tone Control - Bass", 0 }, { SOUND_MIXER_TREBLE, "Tone Control - Treble", 0 }, { SOUND_MIXER_SYNTH, "Synth", 0 }, { SOUND_MIXER_SYNTH, "FM", 0 }, /* fallback */ { SOUND_MIXER_SYNTH, "Music", 0 }, /* fallback */ { SOUND_MIXER_PCM, "PCM", 0 }, { SOUND_MIXER_SPEAKER, "Beep", 0 }, { SOUND_MIXER_SPEAKER, "PC Speaker", 0 }, /* fallback */ { SOUND_MIXER_SPEAKER, "Speaker", 0 }, /* fallback */ { SOUND_MIXER_LINE, "Line", 0 }, { SOUND_MIXER_MIC, "Mic", 0 }, { SOUND_MIXER_CD, "CD", 0 }, { SOUND_MIXER_IMIX, "Monitor Mix", 0 }, { SOUND_MIXER_ALTPCM, "PCM", 1 }, { SOUND_MIXER_ALTPCM, "Headphone", 0 }, /* fallback */ { SOUND_MIXER_ALTPCM, "Wave", 0 }, /* fallback */ { SOUND_MIXER_RECLEV, "-- nothing --", 0 }, { SOUND_MIXER_IGAIN, "Capture", 0 }, { SOUND_MIXER_OGAIN, "Playback", 0 }, { SOUND_MIXER_LINE1, "Aux", 0 }, { SOUND_MIXER_LINE2, "Aux", 1 }, { SOUND_MIXER_LINE3, "Aux", 2 }, { SOUND_MIXER_DIGITAL1, "Digital", 0 }, { SOUND_MIXER_DIGITAL1, "IEC958", 0 }, /* fallback */ { SOUND_MIXER_DIGITAL1, "IEC958 Optical", 0 }, /* fallback */ { SOUND_MIXER_DIGITAL1, "IEC958 Coaxial", 0 }, /* fallback */ { SOUND_MIXER_DIGITAL2, "Digital", 1 }, { SOUND_MIXER_DIGITAL3, "Digital", 2 }, { SOUND_MIXER_PHONEIN, "Phone", 0 }, { SOUND_MIXER_PHONEOUT, "Master Mono", 0 }, { SOUND_MIXER_PHONEOUT, "Speaker", 0 }, /*fallback*/ { SOUND_MIXER_PHONEOUT, "Mono", 0 }, /*fallback*/ { SOUND_MIXER_PHONEOUT, "Phone", 0 }, /* fallback */ { SOUND_MIXER_VIDEO, "Video", 0 }, { SOUND_MIXER_RADIO, "Radio", 0 }, { SOUND_MIXER_MONITOR, "Monitor", 0 } }; unsigned int idx; for (idx = 0; idx < ARRAY_SIZE(table); idx++) snd_mixer_oss_build_input(mixer, &table[idx], 0, 0); if (mixer->mask_recsrc) { mixer->get_recsrc = snd_mixer_oss_get_recsrc2; mixer->put_recsrc = snd_mixer_oss_put_recsrc2; } } /* * */ static int snd_mixer_oss_free1(void *private) { struct snd_mixer_oss *mixer = private; struct snd_card *card; int idx; if (!mixer) return 0; card = mixer->card; if (snd_BUG_ON(mixer != card->mixer_oss)) return -ENXIO; card->mixer_oss = NULL; for (idx = 0; idx < SNDRV_OSS_MAX_MIXERS; idx++) { struct snd_mixer_oss_slot *chn = &mixer->slots[idx]; if (chn->private_free) chn->private_free(chn); } kfree(mixer); return 0; } static int snd_mixer_oss_notify_handler(struct snd_card *card, int cmd) { struct snd_mixer_oss *mixer; if (cmd == SND_MIXER_OSS_NOTIFY_REGISTER) { int idx, err; mixer = kzalloc_objs(*mixer, 2); if (mixer == NULL) return -ENOMEM; mutex_init(&mixer->reg_mutex); err = snd_register_oss_device(SNDRV_OSS_DEVICE_TYPE_MIXER, card, 0, &snd_mixer_oss_f_ops, card); if (err < 0) { dev_err(card->dev, "unable to register OSS mixer device %i:%i\n", card->number, 0); kfree(mixer); return err; } mixer->oss_dev_alloc = 1; mixer->card = card; if (*card->mixername) strscpy(mixer->name, card->mixername, sizeof(mixer->name)); else snprintf(mixer->name, sizeof(mixer->name), "mixer%i", card->number); #ifdef SNDRV_OSS_INFO_DEV_MIXERS snd_oss_info_register(SNDRV_OSS_INFO_DEV_MIXERS, card->number, mixer->name); #endif for (idx = 0; idx < SNDRV_OSS_MAX_MIXERS; idx++) mixer->slots[idx].number = idx; card->mixer_oss = mixer; snd_mixer_oss_build(mixer); snd_mixer_oss_proc_init(mixer); } else { mixer = card->mixer_oss; if (mixer == NULL) return 0; if (mixer->oss_dev_alloc) { #ifdef SNDRV_OSS_INFO_DEV_MIXERS snd_oss_info_unregister(SNDRV_OSS_INFO_DEV_MIXERS, mixer->card->number); #endif snd_unregister_oss_device(SNDRV_OSS_DEVICE_TYPE_MIXER, mixer->card, 0); mixer->oss_dev_alloc = 0; } if (cmd == SND_MIXER_OSS_NOTIFY_DISCONNECT) return 0; snd_mixer_oss_proc_done(mixer); return snd_mixer_oss_free1(mixer); } return 0; } static int __init alsa_mixer_oss_init(void) { struct snd_card *card; int idx; snd_mixer_oss_notify_callback = snd_mixer_oss_notify_handler; for (idx = 0; idx < SNDRV_CARDS; idx++) { card = snd_card_ref(idx); if (card) { snd_mixer_oss_notify_handler(card, SND_MIXER_OSS_NOTIFY_REGISTER); snd_card_unref(card); } } return 0; } static void __exit alsa_mixer_oss_exit(void) { struct snd_card *card; int idx; snd_mixer_oss_notify_callback = NULL; for (idx = 0; idx < SNDRV_CARDS; idx++) { card = snd_card_ref(idx); if (card) { snd_mixer_oss_notify_handler(card, SND_MIXER_OSS_NOTIFY_FREE); snd_card_unref(card); } } } module_init(alsa_mixer_oss_init) module_exit(alsa_mixer_oss_exit)
20 15 18 29 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 /* * linux/fs/nls/nls_cp869.c * * Charset cp869 translation tables. * Generated automatically from the Unicode and charset * tables from the Unicode Organization (www.unicode.org). * The Unicode to charset table has only exact mappings. */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/nls.h> #include <linux/errno.h> static const wchar_t charset2uni[256] = { /* 0x00*/ 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, /* 0x10*/ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, /* 0x20*/ 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, /* 0x30*/ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, /* 0x40*/ 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, /* 0x50*/ 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, /* 0x60*/ 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, /* 0x70*/ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f, /* 0x80*/ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0386, 0x0000, 0x00b7, 0x00ac, 0x00a6, 0x2018, 0x2019, 0x0388, 0x2015, 0x0389, /* 0x90*/ 0x038a, 0x03aa, 0x038c, 0x0000, 0x0000, 0x038e, 0x03ab, 0x00a9, 0x038f, 0x00b2, 0x00b3, 0x03ac, 0x00a3, 0x03ad, 0x03ae, 0x03af, /* 0xa0*/ 0x03ca, 0x0390, 0x03cc, 0x03cd, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397, 0x00bd, 0x0398, 0x0399, 0x00ab, 0x00bb, /* 0xb0*/ 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x039a, 0x039b, 0x039c, 0x039d, 0x2563, 0x2551, 0x2557, 0x255d, 0x039e, 0x039f, 0x2510, /* 0xc0*/ 0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x03a0, 0x03a1, 0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x03a3, /* 0xd0*/ 0x03a4, 0x03a5, 0x03a6, 0x03a7, 0x03a8, 0x03a9, 0x03b1, 0x03b2, 0x03b3, 0x2518, 0x250c, 0x2588, 0x2584, 0x03b4, 0x03b5, 0x2580, /* 0xe0*/ 0x03b6, 0x03b7, 0x03b8, 0x03b9, 0x03ba, 0x03bb, 0x03bc, 0x03bd, 0x03be, 0x03bf, 0x03c0, 0x03c1, 0x03c3, 0x03c2, 0x03c4, 0x0384, /* 0xf0*/ 0x00ad, 0x00b1, 0x03c5, 0x03c6, 0x03c7, 0x00a7, 0x03c8, 0x0385, 0x00b0, 0x00a8, 0x03c9, 0x03cb, 0x03b0, 0x03ce, 0x25a0, 0x00a0, }; static const unsigned char page00[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0xff, 0x00, 0x00, 0x9c, 0x00, 0x00, 0x8a, 0xf5, /* 0xa0-0xa7 */ 0xf9, 0x97, 0x00, 0xae, 0x89, 0xf0, 0x00, 0x00, /* 0xa8-0xaf */ 0xf8, 0xf1, 0x99, 0x9a, 0x00, 0x00, 0x00, 0x88, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0xaf, 0x00, 0xab, 0x00, 0x00, /* 0xb8-0xbf */ }; static const unsigned char page03[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0xef, 0xf7, 0x86, 0x00, /* 0x80-0x87 */ 0x8d, 0x8f, 0x90, 0x00, 0x92, 0x00, 0x95, 0x98, /* 0x88-0x8f */ 0xa1, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, /* 0x90-0x97 */ 0xac, 0xad, 0xb5, 0xb6, 0xb7, 0xb8, 0xbd, 0xbe, /* 0x98-0x9f */ 0xc6, 0xc7, 0x00, 0xcf, 0xd0, 0xd1, 0xd2, 0xd3, /* 0xa0-0xa7 */ 0xd4, 0xd5, 0x91, 0x96, 0x9b, 0x9d, 0x9e, 0x9f, /* 0xa8-0xaf */ 0xfc, 0xd6, 0xd7, 0xd8, 0xdd, 0xde, 0xe0, 0xe1, /* 0xb0-0xb7 */ 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, /* 0xb8-0xbf */ 0xea, 0xeb, 0xed, 0xec, 0xee, 0xf2, 0xf3, 0xf4, /* 0xc0-0xc7 */ 0xf6, 0xfa, 0xa0, 0xfb, 0xa2, 0xa3, 0xfd, 0x00, /* 0xc8-0xcf */ }; static const unsigned char page20[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x8e, 0x00, 0x00, /* 0x10-0x17 */ 0x8b, 0x8c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ }; static const unsigned char page25[256] = { 0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0xcd, 0xba, 0x00, 0x00, 0xc9, 0x00, 0x00, 0xbb, /* 0x50-0x57 */ 0x00, 0x00, 0xc8, 0x00, 0x00, 0xbc, 0x00, 0x00, /* 0x58-0x5f */ 0xcc, 0x00, 0x00, 0xb9, 0x00, 0x00, 0xcb, 0x00, /* 0x60-0x67 */ 0x00, 0xca, 0x00, 0x00, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0xdb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ }; static const unsigned char *const page_uni2charset[256] = { page00, NULL, NULL, page03, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, page20, NULL, NULL, NULL, NULL, page25, NULL, NULL, }; static const unsigned char charset2lower[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9b, 0x00, /* 0x80-0x87 */ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x9d, 0x8e, 0x9e, /* 0x88-0x8f */ 0x9f, 0xa0, 0xa2, 0x00, 0x00, 0xa3, 0xfb, 0x97, /* 0x90-0x97 */ 0xfd, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ 0xa0, 0xa1, 0xa2, 0xa3, 0xd6, 0xd7, 0xd8, 0xdd, /* 0xa0-0xa7 */ 0xde, 0xe0, 0xe1, 0xab, 0xe2, 0xe3, 0xae, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xe4, 0xe5, 0xe6, /* 0xb0-0xb7 */ 0xe7, 0xb9, 0xba, 0xbb, 0xbc, 0xe8, 0xe9, 0xbf, /* 0xb8-0xbf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xea, 0xeb, /* 0xc0-0xc7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xec, /* 0xc8-0xcf */ 0xee, 0xf2, 0xf3, 0xf4, 0xf6, 0xfa, 0xd6, 0xd7, /* 0xd0-0xd7 */ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ }; static const unsigned char charset2upper[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x86, 0x00, /* 0x80-0x87 */ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ 0x90, 0x91, 0x92, 0x00, 0x00, 0x95, 0x96, 0x97, /* 0x90-0x97 */ 0x98, 0x99, 0x9a, 0x86, 0x9c, 0x8d, 0x8f, 0x90, /* 0x98-0x9f */ 0x91, 0xa1, 0x92, 0x95, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xa4, 0xa5, /* 0xd0-0xd7 */ 0xa6, 0xd9, 0xda, 0xdb, 0xdc, 0xa7, 0xa8, 0xdf, /* 0xd8-0xdf */ 0xa9, 0xaa, 0xac, 0xad, 0xb5, 0xb6, 0xb7, 0xb8, /* 0xe0-0xe7 */ 0xbd, 0xbe, 0xc6, 0xc7, 0xcf, 0xcf, 0xd0, 0xef, /* 0xe8-0xef */ 0xf0, 0xf1, 0xd1, 0xd2, 0xd3, 0xf5, 0xd4, 0xf7, /* 0xf0-0xf7 */ 0xf8, 0xf9, 0xd5, 0x96, 0xfc, 0x98, 0xfe, 0xff, /* 0xf8-0xff */ }; static int uni2char(wchar_t uni, unsigned char *out, int boundlen) { const unsigned char *uni2charset; unsigned char cl = uni & 0x00ff; unsigned char ch = (uni & 0xff00) >> 8; if (boundlen <= 0) return -ENAMETOOLONG; uni2charset = page_uni2charset[ch]; if (uni2charset && uni2charset[cl]) out[0] = uni2charset[cl]; else return -EINVAL; return 1; } static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) { *uni = charset2uni[*rawstring]; if (*uni == 0x0000) return -EINVAL; return 1; } static struct nls_table table = { .charset = "cp869", .uni2char = uni2char, .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, }; static int __init init_nls_cp869(void) { return register_nls(&table); } static void __exit exit_nls_cp869(void) { unregister_nls(&table); } module_init(init_nls_cp869) module_exit(exit_nls_cp869) MODULE_DESCRIPTION("NLS Codepage 869 (Greek)"); MODULE_LICENSE("Dual BSD/GPL");
1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 // SPDX-License-Identifier: GPL-2.0 /* * (C) Copyright 2002-2004, 2007 Greg Kroah-Hartman <greg@kroah.com> * (C) Copyright 2007 Novell Inc. */ #include <linux/pci.h> #include <linux/module.h> #include <linux/init.h> #include <linux/device.h> #include <linux/mempolicy.h> #include <linux/string.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/sched/isolation.h> #include <linux/cpu.h> #include <linux/pm_runtime.h> #include <linux/suspend.h> #include <linux/kexec.h> #include <linux/of_device.h> #include <linux/acpi.h> #include <linux/dma-map-ops.h> #include <linux/iommu.h> #include "pci.h" #include "pcie/portdrv.h" struct pci_dynid { struct list_head node; struct pci_device_id id; }; /** * pci_add_dynid - add a new PCI device ID to this driver and re-probe devices * @drv: target pci driver * @vendor: PCI vendor ID * @device: PCI device ID * @subvendor: PCI subvendor ID * @subdevice: PCI subdevice ID * @class: PCI class * @class_mask: PCI class mask * @driver_data: private driver data * * Adds a new dynamic pci device ID to this driver and causes the * driver to probe for all devices again. @drv must have been * registered prior to calling this function. * * CONTEXT: * Does GFP_KERNEL allocation. * * RETURNS: * 0 on success, -errno on failure. */ int pci_add_dynid(struct pci_driver *drv, unsigned int vendor, unsigned int device, unsigned int subvendor, unsigned int subdevice, unsigned int class, unsigned int class_mask, unsigned long driver_data) { struct pci_dynid *dynid; dynid = kzalloc_obj(*dynid); if (!dynid) return -ENOMEM; dynid->id.vendor = vendor; dynid->id.device = device; dynid->id.subvendor = subvendor; dynid->id.subdevice = subdevice; dynid->id.class = class; dynid->id.class_mask = class_mask; dynid->id.driver_data = driver_data; spin_lock(&drv->dynids.lock); list_add_tail(&dynid->node, &drv->dynids.list); spin_unlock(&drv->dynids.lock); return driver_attach(&drv->driver); } EXPORT_SYMBOL_GPL(pci_add_dynid); static void pci_free_dynids(struct pci_driver *drv) { struct pci_dynid *dynid, *n; spin_lock(&drv->dynids.lock); list_for_each_entry_safe(dynid, n, &drv->dynids.list, node) { list_del(&dynid->node); kfree(dynid); } spin_unlock(&drv->dynids.lock); } /** * pci_match_id - See if a PCI device matches a given pci_id table * @ids: array of PCI device ID structures to search in * @dev: the PCI device structure to match against. * * Used by a driver to check whether a PCI device is in its list of * supported devices. Returns the matching pci_device_id structure or * %NULL if there is no match. * * Deprecated; don't use this as it will not catch any dynamic IDs * that a driver might want to check for. */ const struct pci_device_id *pci_match_id(const struct pci_device_id *ids, struct pci_dev *dev) { if (ids) { while (ids->vendor || ids->subvendor || ids->class_mask) { if (pci_match_one_device(ids, dev)) return ids; ids++; } } return NULL; } EXPORT_SYMBOL(pci_match_id); static const struct pci_device_id pci_device_id_any = { .vendor = PCI_ANY_ID, .device = PCI_ANY_ID, .subvendor = PCI_ANY_ID, .subdevice = PCI_ANY_ID, }; /** * pci_match_device - See if a device matches a driver's list of IDs * @drv: the PCI driver to match against * @dev: the PCI device structure to match against * * Used by a driver to check whether a PCI device is in its list of * supported devices or in the dynids list, which may have been augmented * via the sysfs "new_id" file. Returns the matching pci_device_id * structure or %NULL if there is no match. */ static const struct pci_device_id *pci_match_device(struct pci_driver *drv, struct pci_dev *dev) { struct pci_dynid *dynid; const struct pci_device_id *found_id = NULL, *ids; /* When driver_override is set, only bind to the matching driver */ if (dev->driver_override && strcmp(dev->driver_override, drv->name)) return NULL; /* Look at the dynamic ids first, before the static ones */ spin_lock(&drv->dynids.lock); list_for_each_entry(dynid, &drv->dynids.list, node) { if (pci_match_one_device(&dynid->id, dev)) { found_id = &dynid->id; break; } } spin_unlock(&drv->dynids.lock); if (found_id) return found_id; for (ids = drv->id_table; (found_id = pci_match_id(ids, dev)); ids = found_id + 1) { /* * The match table is split based on driver_override. * In case override_only was set, enforce driver_override * matching. */ if (found_id->override_only) { if (dev->driver_override) return found_id; } else { return found_id; } } /* driver_override will always match, send a dummy id */ if (dev->driver_override) return &pci_device_id_any; return NULL; } /** * new_id_store - sysfs frontend to pci_add_dynid() * @driver: target device driver * @buf: buffer for scanning device ID data * @count: input size * * Allow PCI IDs to be added to an existing driver via sysfs. */ static ssize_t new_id_store(struct device_driver *driver, const char *buf, size_t count) { struct pci_driver *pdrv = to_pci_driver(driver); const struct pci_device_id *ids = pdrv->id_table; u32 vendor, device, subvendor = PCI_ANY_ID, subdevice = PCI_ANY_ID, class = 0, class_mask = 0; unsigned long driver_data = 0; int fields; int retval = 0; fields = sscanf(buf, "%x %x %x %x %x %x %lx", &vendor, &device, &subvendor, &subdevice, &class, &class_mask, &driver_data); if (fields < 2) return -EINVAL; if (fields != 7) { struct pci_dev *pdev = kzalloc_obj(*pdev); if (!pdev) return -ENOMEM; pdev->vendor = vendor; pdev->device = device; pdev->subsystem_vendor = subvendor; pdev->subsystem_device = subdevice; pdev->class = class; if (pci_match_device(pdrv, pdev)) retval = -EEXIST; kfree(pdev); if (retval) return retval; } /* Only accept driver_data values that match an existing id_table entry */ if (ids) { retval = -EINVAL; while (ids->vendor || ids->subvendor || ids->class_mask) { if (driver_data == ids->driver_data) { retval = 0; break; } ids++; } if (retval) /* No match */ return retval; } retval = pci_add_dynid(pdrv, vendor, device, subvendor, subdevice, class, class_mask, driver_data); if (retval) return retval; return count; } static DRIVER_ATTR_WO(new_id); /** * remove_id_store - remove a PCI device ID from this driver * @driver: target device driver * @buf: buffer for scanning device ID data * @count: input size * * Removes a dynamic pci device ID to this driver. */ static ssize_t remove_id_store(struct device_driver *driver, const char *buf, size_t count) { struct pci_dynid *dynid, *n; struct pci_driver *pdrv = to_pci_driver(driver); u32 vendor, device, subvendor = PCI_ANY_ID, subdevice = PCI_ANY_ID, class = 0, class_mask = 0; int fields; size_t retval = -ENODEV; fields = sscanf(buf, "%x %x %x %x %x %x", &vendor, &device, &subvendor, &subdevice, &class, &class_mask); if (fields < 2) return -EINVAL; spin_lock(&pdrv->dynids.lock); list_for_each_entry_safe(dynid, n, &pdrv->dynids.list, node) { struct pci_device_id *id = &dynid->id; if ((id->vendor == vendor) && (id->device == device) && (subvendor == PCI_ANY_ID || id->subvendor == subvendor) && (subdevice == PCI_ANY_ID || id->subdevice == subdevice) && !((id->class ^ class) & class_mask)) { list_del(&dynid->node); kfree(dynid); retval = count; break; } } spin_unlock(&pdrv->dynids.lock); return retval; } static DRIVER_ATTR_WO(remove_id); static struct attribute *pci_drv_attrs[] = { &driver_attr_new_id.attr, &driver_attr_remove_id.attr, NULL, }; ATTRIBUTE_GROUPS(pci_drv); struct drv_dev_and_id { struct pci_driver *drv; struct pci_dev *dev; const struct pci_device_id *id; }; static int local_pci_probe(struct drv_dev_and_id *ddi) { struct pci_dev *pci_dev = ddi->dev; struct pci_driver *pci_drv = ddi->drv; struct device *dev = &pci_dev->dev; int rc; /* * Unbound PCI devices are always put in D0, regardless of * runtime PM status. During probe, the device is set to * active and the usage count is incremented. If the driver * supports runtime PM, it should call pm_runtime_put_noidle(), * or any other runtime PM helper function decrementing the usage * count, in its probe routine and pm_runtime_get_noresume() in * its remove routine. */ pm_runtime_get_sync(dev); pci_dev->driver = pci_drv; rc = pci_drv->probe(pci_dev, ddi->id); if (!rc) return rc; if (rc < 0) { pci_dev->driver = NULL; pm_runtime_put_sync(dev); return rc; } /* * Probe function should return < 0 for failure, 0 for success * Treat values > 0 as success, but warn. */ pci_warn(pci_dev, "Driver probe function unexpectedly returned %d\n", rc); return 0; } static struct workqueue_struct *pci_probe_wq; struct pci_probe_arg { struct drv_dev_and_id *ddi; struct work_struct work; int ret; }; static void local_pci_probe_callback(struct work_struct *work) { struct pci_probe_arg *arg = container_of(work, struct pci_probe_arg, work); arg->ret = local_pci_probe(arg->ddi); } static bool pci_physfn_is_probed(struct pci_dev *dev) { #ifdef CONFIG_PCI_IOV return dev->is_virtfn && dev->physfn->is_probed; #else return false; #endif } static int pci_call_probe(struct pci_driver *drv, struct pci_dev *dev, const struct pci_device_id *id) { int error, node, cpu; struct drv_dev_and_id ddi = { drv, dev, id }; /* * Execute driver initialization on node where the device is * attached. This way the driver likely allocates its local memory * on the right node. */ node = dev_to_node(&dev->dev); dev->is_probed = 1; cpu_hotplug_disable(); /* * Prevent nesting work_on_cpu() for the case where a Virtual Function * device is probed from work_on_cpu() of the Physical device. */ if (node < 0 || node >= MAX_NUMNODES || !node_online(node) || pci_physfn_is_probed(dev)) { error = local_pci_probe(&ddi); } else { struct pci_probe_arg arg = { .ddi = &ddi }; INIT_WORK_ONSTACK(&arg.work, local_pci_probe_callback); /* * The target election and the enqueue of the work must be within * the same RCU read side section so that when the workqueue pool * is flushed after a housekeeping cpumask update, further readers * are guaranteed to queue the probing work to the appropriate * targets. */ rcu_read_lock(); cpu = cpumask_any_and(cpumask_of_node(node), housekeeping_cpumask(HK_TYPE_DOMAIN)); if (cpu < nr_cpu_ids) { struct workqueue_struct *wq = pci_probe_wq; if (WARN_ON_ONCE(!wq)) wq = system_percpu_wq; queue_work_on(cpu, wq, &arg.work); rcu_read_unlock(); flush_work(&arg.work); error = arg.ret; } else { rcu_read_unlock(); error = local_pci_probe(&ddi); } destroy_work_on_stack(&arg.work); } dev->is_probed = 0; cpu_hotplug_enable(); return error; } void pci_probe_flush_workqueue(void) { flush_workqueue(pci_probe_wq); } /** * __pci_device_probe - check if a driver wants to claim a specific PCI device * @drv: driver to call to check if it wants the PCI device * @pci_dev: PCI device being probed * * returns 0 on success, else error. * side-effect: pci_dev->driver is set to drv when drv claims pci_dev. */ static int __pci_device_probe(struct pci_driver *drv, struct pci_dev *pci_dev) { const struct pci_device_id *id; int error = 0; if (drv->probe) { error = -ENODEV; id = pci_match_device(drv, pci_dev); if (id) error = pci_call_probe(drv, pci_dev, id); } return error; } #ifdef CONFIG_PCI_IOV static inline bool pci_device_can_probe(struct pci_dev *pdev) { return (!pdev->is_virtfn || pdev->physfn->sriov->drivers_autoprobe || pdev->driver_override); } #else static inline bool pci_device_can_probe(struct pci_dev *pdev) { return true; } #endif static int pci_device_probe(struct device *dev) { int error; struct pci_dev *pci_dev = to_pci_dev(dev); struct pci_driver *drv = to_pci_driver(dev->driver); if (!pci_device_can_probe(pci_dev)) return -ENODEV; pci_assign_irq(pci_dev); error = pcibios_alloc_irq(pci_dev); if (error < 0) return error; pci_dev_get(pci_dev); error = __pci_device_probe(drv, pci_dev); if (error) { pcibios_free_irq(pci_dev); pci_dev_put(pci_dev); } return error; } static void pci_device_remove(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); struct pci_driver *drv = pci_dev->driver; if (drv->remove) { pm_runtime_get_sync(dev); /* * If the driver provides a .runtime_idle() callback and it has * started to run already, it may continue to run in parallel * with the code below, so wait until all of the runtime PM * activity has completed. */ pm_runtime_barrier(dev); drv->remove(pci_dev); pm_runtime_put_noidle(dev); } pcibios_free_irq(pci_dev); pci_dev->driver = NULL; pci_iov_remove(pci_dev); /* Undo the runtime PM settings in local_pci_probe() */ pm_runtime_put_sync(dev); /* * If the device is still on, set the power state as "unknown", * since it might change by the next time we load the driver. */ if (pci_dev->current_state == PCI_D0) pci_dev->current_state = PCI_UNKNOWN; /* * We would love to complain here if pci_dev->is_enabled is set, that * the driver should have called pci_disable_device(), but the * unfortunate fact is there are too many odd BIOS and bridge setups * that don't like drivers doing that all of the time. * Oh well, we can dream of sane hardware when we sleep, no matter how * horrible the crap we have to deal with is when we are awake... */ pci_dev_put(pci_dev); } static void pci_device_shutdown(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); struct pci_driver *drv = pci_dev->driver; pm_runtime_resume(dev); if (drv && drv->shutdown) drv->shutdown(pci_dev); /* * If this is a kexec reboot, turn off Bus Master bit on the * device to tell it to not continue to do DMA. Don't touch * devices in D3cold or unknown states. * If it is not a kexec reboot, firmware will hit the PCI * devices with big hammer and stop their DMA any way. */ if (kexec_in_progress && (pci_dev->current_state <= PCI_D3hot)) pci_clear_master(pci_dev); } #ifdef CONFIG_PM_SLEEP /* Auxiliary functions used for system resume */ /** * pci_restore_standard_config - restore standard config registers of PCI device * @pci_dev: PCI device to handle */ static int pci_restore_standard_config(struct pci_dev *pci_dev) { pci_update_current_state(pci_dev, PCI_UNKNOWN); if (pci_dev->current_state != PCI_D0) { int error = pci_set_power_state(pci_dev, PCI_D0); if (error) return error; } pci_restore_state(pci_dev); pci_pme_restore(pci_dev); return 0; } #endif /* CONFIG_PM_SLEEP */ #ifdef CONFIG_PM /* Auxiliary functions used for system resume and run-time resume */ static void pci_pm_default_resume(struct pci_dev *pci_dev) { pci_fixup_device(pci_fixup_resume, pci_dev); pci_enable_wake(pci_dev, PCI_D0, false); } static void pci_pm_default_resume_early(struct pci_dev *pci_dev) { pci_pm_power_up_and_verify_state(pci_dev); pci_restore_state(pci_dev); pci_pme_restore(pci_dev); } static void pci_pm_bridge_power_up_actions(struct pci_dev *pci_dev) { int ret; ret = pci_bridge_wait_for_secondary_bus(pci_dev, "resume"); if (ret) { /* * The downstream link failed to come up, so mark the * devices below as disconnected to make sure we don't * attempt to resume them. */ pci_walk_bus(pci_dev->subordinate, pci_dev_set_disconnected, NULL); return; } /* * When powering on a bridge from D3cold, the whole hierarchy may be * powered on into D0uninitialized state, resume them to give them a * chance to suspend again */ pci_resume_bus(pci_dev->subordinate); } #endif /* CONFIG_PM */ #ifdef CONFIG_PM_SLEEP /* * Default "suspend" method for devices that have no driver provided suspend, * or not even a driver at all (second part). */ static void pci_pm_set_unknown_state(struct pci_dev *pci_dev) { /* * mark its power state as "unknown", since we don't know if * e.g. the BIOS will change its device state when we suspend. */ if (pci_dev->current_state == PCI_D0) pci_dev->current_state = PCI_UNKNOWN; } /* * Default "resume" method for devices that have no driver provided resume, * or not even a driver at all (second part). */ static int pci_pm_reenable_device(struct pci_dev *pci_dev) { int retval; /* if the device was enabled before suspend, re-enable */ retval = pci_reenable_device(pci_dev); /* * if the device was busmaster before the suspend, make it busmaster * again */ if (pci_dev->is_busmaster) pci_set_master(pci_dev); return retval; } static int pci_legacy_suspend(struct device *dev, pm_message_t state) { struct pci_dev *pci_dev = to_pci_dev(dev); struct pci_driver *drv = pci_dev->driver; pci_dev->state_saved = false; if (drv && drv->suspend) { pci_power_t prev = pci_dev->current_state; int error; error = drv->suspend(pci_dev, state); suspend_report_result(dev, drv->suspend, error); if (error) return error; if (!pci_dev->state_saved && pci_dev->current_state != PCI_D0 && pci_dev->current_state != PCI_UNKNOWN) { pci_WARN_ONCE(pci_dev, pci_dev->current_state != prev, "PCI PM: Device state not saved by %pS\n", drv->suspend); } } pci_fixup_device(pci_fixup_suspend, pci_dev); return 0; } static int pci_legacy_suspend_late(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); if (!pci_dev->state_saved) pci_save_state(pci_dev); pci_pm_set_unknown_state(pci_dev); pci_fixup_device(pci_fixup_suspend_late, pci_dev); return 0; } static int pci_legacy_resume(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); struct pci_driver *drv = pci_dev->driver; pci_fixup_device(pci_fixup_resume, pci_dev); return drv && drv->resume ? drv->resume(pci_dev) : pci_pm_reenable_device(pci_dev); } /* Auxiliary functions used by the new power management framework */ static void pci_pm_default_suspend(struct pci_dev *pci_dev) { /* Disable non-bridge devices without PM support */ if (!pci_has_subordinate(pci_dev)) pci_disable_enabled_device(pci_dev); } static bool pci_has_legacy_pm_support(struct pci_dev *pci_dev) { struct pci_driver *drv = pci_dev->driver; bool ret = drv && (drv->suspend || drv->resume); /* * Legacy PM support is used by default, so warn if the new framework is * supported as well. Drivers are supposed to support either the * former, or the latter, but not both at the same time. */ pci_WARN(pci_dev, ret && drv->driver.pm, "device %04x:%04x\n", pci_dev->vendor, pci_dev->device); return ret; } /* New power management framework */ static int pci_pm_prepare(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; dev_pm_set_strict_midlayer(dev, true); if (pm && pm->prepare) { int error = pm->prepare(dev); if (error < 0) return error; if (!error && dev_pm_test_driver_flags(dev, DPM_FLAG_SMART_PREPARE)) return 0; } if (pci_dev_need_resume(pci_dev)) return 0; /* * The PME setting needs to be adjusted here in case the direct-complete * optimization is used with respect to this device. */ pci_dev_adjust_pme(pci_dev); return 1; } static void pci_pm_complete(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); pci_dev_complete_resume(pci_dev); pm_generic_complete(dev); /* Resume device if platform firmware has put it in reset-power-on */ if (pm_runtime_suspended(dev) && pm_resume_via_firmware()) { pci_power_t pre_sleep_state = pci_dev->current_state; pci_refresh_power_state(pci_dev); /* * On platforms with ACPI this check may also trigger for * devices sharing power resources if one of those power * resources has been activated as a result of a change of the * power state of another device sharing it. However, in that * case it is also better to resume the device, in general. */ if (pci_dev->current_state < pre_sleep_state) pm_request_resume(dev); } dev_pm_set_strict_midlayer(dev, false); } #else /* !CONFIG_PM_SLEEP */ #define pci_pm_prepare NULL #define pci_pm_complete NULL #endif /* !CONFIG_PM_SLEEP */ #ifdef CONFIG_SUSPEND static void pcie_pme_root_status_cleanup(struct pci_dev *pci_dev) { /* * Some BIOSes forget to clear Root PME Status bits after system * wakeup, which breaks ACPI-based runtime wakeup on PCI Express. * Clear those bits now just in case (shouldn't hurt). */ if (pci_is_pcie(pci_dev) && (pci_pcie_type(pci_dev) == PCI_EXP_TYPE_ROOT_PORT || pci_pcie_type(pci_dev) == PCI_EXP_TYPE_RC_EC)) pcie_clear_root_pme_status(pci_dev); } static int pci_pm_suspend(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; pci_dev->skip_bus_pm = false; /* * Disabling PTM allows some systems, e.g., Intel mobile chips * since Coffee Lake, to enter a lower-power PM state. */ pci_suspend_ptm(pci_dev); if (pci_has_legacy_pm_support(pci_dev)) return pci_legacy_suspend(dev, PMSG_SUSPEND); if (!pm) { pci_pm_default_suspend(pci_dev); return 0; } /* * PCI devices suspended at run time may need to be resumed at this * point, because in general it may be necessary to reconfigure them for * system suspend. Namely, if the device is expected to wake up the * system from the sleep state, it may have to be reconfigured for this * purpose, or if the device is not expected to wake up the system from * the sleep state, it should be prevented from signaling wakeup events * going forward. * * Also if the driver of the device does not indicate that its system * suspend callbacks can cope with runtime-suspended devices, it is * better to resume the device from runtime suspend here. */ if (!dev_pm_smart_suspend(dev) || pci_dev_need_resume(pci_dev)) { pm_runtime_resume(dev); pci_dev->state_saved = false; } else { pci_dev_adjust_pme(pci_dev); } if (pm->suspend) { pci_power_t prev = pci_dev->current_state; int error; error = pm->suspend(dev); suspend_report_result(dev, pm->suspend, error); if (error) return error; if (!pci_dev->state_saved && pci_dev->current_state != PCI_D0 && pci_dev->current_state != PCI_UNKNOWN) { pci_WARN_ONCE(pci_dev, pci_dev->current_state != prev, "PCI PM: State of device not saved by %pS\n", pm->suspend); } } return 0; } static int pci_pm_suspend_late(struct device *dev) { if (dev_pm_skip_suspend(dev)) return 0; pci_fixup_device(pci_fixup_suspend, to_pci_dev(dev)); return pm_generic_suspend_late(dev); } static int pci_pm_suspend_noirq(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; if (dev_pm_skip_suspend(dev)) return 0; if (pci_has_legacy_pm_support(pci_dev)) return pci_legacy_suspend_late(dev); if (!pm) { pci_save_state(pci_dev); goto Fixup; } if (pm->suspend_noirq) { pci_power_t prev = pci_dev->current_state; int error; error = pm->suspend_noirq(dev); suspend_report_result(dev, pm->suspend_noirq, error); if (error) return error; if (!pci_dev->state_saved && pci_dev->current_state != PCI_D0 && pci_dev->current_state != PCI_UNKNOWN) { pci_WARN_ONCE(pci_dev, pci_dev->current_state != prev, "PCI PM: State of device not saved by %pS\n", pm->suspend_noirq); goto Fixup; } } if (!pci_dev->state_saved) { pci_save_state(pci_dev); /* * If the device is a bridge with a child in D0 below it, * it needs to stay in D0, so check skip_bus_pm to avoid * putting it into a low-power state in that case. */ if (!pci_dev->skip_bus_pm && pci_power_manageable(pci_dev)) pci_prepare_to_sleep(pci_dev); } pci_dbg(pci_dev, "PCI PM: Suspend power state: %s\n", pci_power_name(pci_dev->current_state)); if (pci_dev->current_state == PCI_D0) { pci_dev->skip_bus_pm = true; /* * Per PCI PM r1.2, table 6-1, a bridge must be in D0 if any * downstream device is in D0, so avoid changing the power state * of the parent bridge by setting the skip_bus_pm flag for it. */ if (pci_dev->bus->self) pci_dev->bus->self->skip_bus_pm = true; } if (pci_dev->skip_bus_pm && pm_suspend_no_platform()) { pci_dbg(pci_dev, "PCI PM: Skipped\n"); goto Fixup; } pci_pm_set_unknown_state(pci_dev); /* * Some BIOSes from ASUS have a bug: If a USB EHCI host controller's * PCI COMMAND register isn't 0, the BIOS assumes that the controller * hasn't been quiesced and tries to turn it off. If the controller * is already in D3, this can hang or cause memory corruption. * * Since the value of the COMMAND register doesn't matter once the * device has been suspended, we can safely set it to 0 here. */ if (pci_dev->class == PCI_CLASS_SERIAL_USB_EHCI) pci_write_config_word(pci_dev, PCI_COMMAND, 0); Fixup: pci_fixup_device(pci_fixup_suspend_late, pci_dev); /* * If the target system sleep state is suspend-to-idle, it is sufficient * to check whether or not the device's wakeup settings are good for * runtime PM. Otherwise, the pm_resume_via_firmware() check will cause * pci_pm_complete() to take care of fixing up the device's state * anyway, if need be. */ if (device_can_wakeup(dev) && !device_may_wakeup(dev)) dev->power.may_skip_resume = false; return 0; } static int pci_pm_resume_noirq(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; pci_power_t prev_state = pci_dev->current_state; bool skip_bus_pm = pci_dev->skip_bus_pm; if (dev_pm_skip_resume(dev)) return 0; /* * In the suspend-to-idle case, devices left in D0 during suspend will * stay in D0, so it is not necessary to restore or update their * configuration here and attempting to put them into D0 again is * pointless, so avoid doing that. */ if (!(skip_bus_pm && pm_suspend_no_platform())) pci_pm_default_resume_early(pci_dev); pci_fixup_device(pci_fixup_resume_early, pci_dev); pcie_pme_root_status_cleanup(pci_dev); if (!skip_bus_pm && prev_state == PCI_D3cold) pci_pm_bridge_power_up_actions(pci_dev); if (pci_has_legacy_pm_support(pci_dev)) return 0; if (pm && pm->resume_noirq) return pm->resume_noirq(dev); return 0; } static int pci_pm_resume_early(struct device *dev) { if (dev_pm_skip_resume(dev)) return 0; return pm_generic_resume_early(dev); } static int pci_pm_resume(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; /* * This is necessary for the suspend error path in which resume is * called without restoring the standard config registers of the device. */ if (pci_dev->state_saved) pci_restore_standard_config(pci_dev); pci_resume_ptm(pci_dev); if (pci_has_legacy_pm_support(pci_dev)) return pci_legacy_resume(dev); pci_pm_default_resume(pci_dev); if (pm) { if (pm->resume) return pm->resume(dev); } else { pci_pm_reenable_device(pci_dev); } return 0; } #else /* !CONFIG_SUSPEND */ #define pci_pm_suspend NULL #define pci_pm_suspend_late NULL #define pci_pm_suspend_noirq NULL #define pci_pm_resume NULL #define pci_pm_resume_early NULL #define pci_pm_resume_noirq NULL #endif /* !CONFIG_SUSPEND */ #ifdef CONFIG_HIBERNATE_CALLBACKS static int pci_pm_freeze(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; if (pci_has_legacy_pm_support(pci_dev)) return pci_legacy_suspend(dev, PMSG_FREEZE); if (!pm) { pci_pm_default_suspend(pci_dev); if (!pm_runtime_suspended(dev)) pci_dev->state_saved = false; return 0; } /* * Resume all runtime-suspended devices before creating a snapshot * image of system memory, because the restore kernel generally cannot * be expected to always handle them consistently and they need to be * put into the runtime-active metastate during system resume anyway, * so it is better to ensure that the state saved in the image will be * always consistent with that. */ pm_runtime_resume(dev); pci_dev->state_saved = false; if (pm->freeze) { int error; error = pm->freeze(dev); suspend_report_result(dev, pm->freeze, error); if (error) return error; } return 0; } static int pci_pm_freeze_noirq(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; if (pci_has_legacy_pm_support(pci_dev)) return pci_legacy_suspend_late(dev); if (pm && pm->freeze_noirq) { int error; error = pm->freeze_noirq(dev); suspend_report_result(dev, pm->freeze_noirq, error); if (error) return error; } if (!pci_dev->state_saved) pci_save_state(pci_dev); pci_pm_set_unknown_state(pci_dev); return 0; } static int pci_pm_thaw_noirq(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; /* * The pm->thaw_noirq() callback assumes the device has been * returned to D0 and its config state has been restored. * * In addition, pci_restore_state() restores MSI-X state in MMIO * space, which requires the device to be in D0, so return it to D0 * in case the driver's "freeze" callbacks put it into a low-power * state. */ pci_pm_power_up_and_verify_state(pci_dev); pci_restore_state(pci_dev); if (pci_has_legacy_pm_support(pci_dev)) return 0; if (pm && pm->thaw_noirq) return pm->thaw_noirq(dev); return 0; } static int pci_pm_thaw(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; int error = 0; if (pci_has_legacy_pm_support(pci_dev)) return pci_legacy_resume(dev); if (pm) { if (pm->thaw) error = pm->thaw(dev); } else { pci_pm_reenable_device(pci_dev); } return error; } static int pci_pm_poweroff(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; if (pci_has_legacy_pm_support(pci_dev)) return pci_legacy_suspend(dev, PMSG_HIBERNATE); if (!pm) { pci_pm_default_suspend(pci_dev); return 0; } /* The reason to do that is the same as in pci_pm_suspend(). */ if (!dev_pm_smart_suspend(dev) || pci_dev_need_resume(pci_dev)) { pm_runtime_resume(dev); pci_dev->state_saved = false; } else { pci_dev_adjust_pme(pci_dev); } if (pm->poweroff) { int error; error = pm->poweroff(dev); suspend_report_result(dev, pm->poweroff, error); if (error) return error; } return 0; } static int pci_pm_poweroff_late(struct device *dev) { if (dev_pm_skip_suspend(dev)) return 0; pci_fixup_device(pci_fixup_suspend, to_pci_dev(dev)); return pm_generic_poweroff_late(dev); } static int pci_pm_poweroff_noirq(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; if (dev_pm_skip_suspend(dev)) return 0; if (pci_has_legacy_pm_support(pci_dev)) return pci_legacy_suspend_late(dev); if (!pm) { pci_fixup_device(pci_fixup_suspend_late, pci_dev); return 0; } if (pm->poweroff_noirq) { int error; error = pm->poweroff_noirq(dev); suspend_report_result(dev, pm->poweroff_noirq, error); if (error) return error; } if (!pci_dev->state_saved && !pci_has_subordinate(pci_dev)) pci_prepare_to_sleep(pci_dev); /* * The reason for doing this here is the same as for the analogous code * in pci_pm_suspend_noirq(). */ if (pci_dev->class == PCI_CLASS_SERIAL_USB_EHCI) pci_write_config_word(pci_dev, PCI_COMMAND, 0); pci_fixup_device(pci_fixup_suspend_late, pci_dev); return 0; } static int pci_pm_restore_noirq(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; pci_pm_default_resume_early(pci_dev); pci_fixup_device(pci_fixup_resume_early, pci_dev); if (pci_has_legacy_pm_support(pci_dev)) return 0; if (pm && pm->restore_noirq) return pm->restore_noirq(dev); return 0; } static int pci_pm_restore(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; /* * This is necessary for the hibernation error path in which restore is * called without restoring the standard config registers of the device. */ if (pci_dev->state_saved) pci_restore_standard_config(pci_dev); if (pci_has_legacy_pm_support(pci_dev)) return pci_legacy_resume(dev); pci_pm_default_resume(pci_dev); if (pm) { if (pm->restore) return pm->restore(dev); } else { pci_pm_reenable_device(pci_dev); } return 0; } #else /* !CONFIG_HIBERNATE_CALLBACKS */ #define pci_pm_freeze NULL #define pci_pm_freeze_noirq NULL #define pci_pm_thaw NULL #define pci_pm_thaw_noirq NULL #define pci_pm_poweroff NULL #define pci_pm_poweroff_late NULL #define pci_pm_poweroff_noirq NULL #define pci_pm_restore NULL #define pci_pm_restore_noirq NULL #endif /* !CONFIG_HIBERNATE_CALLBACKS */ #ifdef CONFIG_PM static int pci_pm_runtime_suspend(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; pci_power_t prev = pci_dev->current_state; int error; pci_suspend_ptm(pci_dev); /* * If pci_dev->driver is not set (unbound), we leave the device in D0, * but it may go to D3cold when the bridge above it runtime suspends. * Save its config space in case that happens. */ if (!pci_dev->driver) { pci_save_state(pci_dev); return 0; } pci_dev->state_saved = false; if (pm && pm->runtime_suspend) { error = pm->runtime_suspend(dev); /* * -EBUSY and -EAGAIN is used to request the runtime PM core * to schedule a new suspend, so log the event only with debug * log level. */ if (error == -EBUSY || error == -EAGAIN) { pci_dbg(pci_dev, "can't suspend now (%ps returned %d)\n", pm->runtime_suspend, error); return error; } else if (error) { pci_err(pci_dev, "can't suspend (%ps returned %d)\n", pm->runtime_suspend, error); return error; } } pci_fixup_device(pci_fixup_suspend, pci_dev); if (pm && pm->runtime_suspend && !pci_dev->state_saved && pci_dev->current_state != PCI_D0 && pci_dev->current_state != PCI_UNKNOWN) { pci_WARN_ONCE(pci_dev, pci_dev->current_state != prev, "PCI PM: State of device not saved by %pS\n", pm->runtime_suspend); return 0; } if (!pci_dev->state_saved) { pci_save_state(pci_dev); pci_finish_runtime_suspend(pci_dev); } return 0; } static int pci_pm_runtime_resume(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; pci_power_t prev_state = pci_dev->current_state; int error = 0; /* * Restoring config space is necessary even if the device is not bound * to a driver because although we left it in D0, it may have gone to * D3cold when the bridge above it runtime suspended. */ pci_pm_default_resume_early(pci_dev); pci_resume_ptm(pci_dev); if (!pci_dev->driver) return 0; pci_fixup_device(pci_fixup_resume_early, pci_dev); pci_pm_default_resume(pci_dev); if (prev_state == PCI_D3cold) pci_pm_bridge_power_up_actions(pci_dev); if (pm && pm->runtime_resume) error = pm->runtime_resume(dev); return error; } static int pci_pm_runtime_idle(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; /* * If pci_dev->driver is not set (unbound), the device should * always remain in D0 regardless of the runtime PM status */ if (!pci_dev->driver) return 0; if (pm && pm->runtime_idle) return pm->runtime_idle(dev); return 0; } static const struct dev_pm_ops pci_dev_pm_ops = { .prepare = pci_pm_prepare, .complete = pci_pm_complete, .suspend = pci_pm_suspend, .suspend_late = pci_pm_suspend_late, .resume = pci_pm_resume, .resume_early = pci_pm_resume_early, .freeze = pci_pm_freeze, .thaw = pci_pm_thaw, .poweroff = pci_pm_poweroff, .poweroff_late = pci_pm_poweroff_late, .restore = pci_pm_restore, .suspend_noirq = pci_pm_suspend_noirq, .resume_noirq = pci_pm_resume_noirq, .freeze_noirq = pci_pm_freeze_noirq, .thaw_noirq = pci_pm_thaw_noirq, .poweroff_noirq = pci_pm_poweroff_noirq, .restore_noirq = pci_pm_restore_noirq, .runtime_suspend = pci_pm_runtime_suspend, .runtime_resume = pci_pm_runtime_resume, .runtime_idle = pci_pm_runtime_idle, }; #define PCI_PM_OPS_PTR (&pci_dev_pm_ops) #else /* !CONFIG_PM */ #define pci_pm_runtime_suspend NULL #define pci_pm_runtime_resume NULL #define pci_pm_runtime_idle NULL #define PCI_PM_OPS_PTR NULL #endif /* !CONFIG_PM */ /** * __pci_register_driver - register a new pci driver * @drv: the driver structure to register * @owner: owner module of drv * @mod_name: module name string * * Adds the driver structure to the list of registered drivers. * Returns a negative value on error, otherwise 0. * If no error occurred, the driver remains registered even if * no device was claimed during registration. */ int __pci_register_driver(struct pci_driver *drv, struct module *owner, const char *mod_name) { /* initialize common driver fields */ drv->driver.name = drv->name; drv->driver.bus = &pci_bus_type; drv->driver.owner = owner; drv->driver.mod_name = mod_name; drv->driver.groups = drv->groups; drv->driver.dev_groups = drv->dev_groups; spin_lock_init(&drv->dynids.lock); INIT_LIST_HEAD(&drv->dynids.list); /* register with core */ return driver_register(&drv->driver); } EXPORT_SYMBOL(__pci_register_driver); /** * pci_unregister_driver - unregister a pci driver * @drv: the driver structure to unregister * * Deletes the driver structure from the list of registered PCI drivers, * gives it a chance to clean up by calling its remove() function for * each device it was responsible for, and marks those devices as * driverless. */ void pci_unregister_driver(struct pci_driver *drv) { driver_unregister(&drv->driver); pci_free_dynids(drv); } EXPORT_SYMBOL(pci_unregister_driver); static struct pci_driver pci_compat_driver = { .name = "compat" }; /** * pci_dev_driver - get the pci_driver of a device * @dev: the device to query * * Returns the appropriate pci_driver structure or %NULL if there is no * registered driver for the device. */ struct pci_driver *pci_dev_driver(const struct pci_dev *dev) { int i; if (dev->driver) return dev->driver; for (i = 0; i <= PCI_ROM_RESOURCE; i++) if (dev->resource[i].flags & IORESOURCE_BUSY) return &pci_compat_driver; return NULL; } EXPORT_SYMBOL(pci_dev_driver); /** * pci_bus_match - Tell if a PCI device structure has a matching PCI device id structure * @dev: the PCI device structure to match against * @drv: the device driver to search for matching PCI device id structures * * Used by a driver to check whether a PCI device present in the * system is in its list of supported devices. Returns the matching * pci_device_id structure or %NULL if there is no match. */ static int pci_bus_match(struct device *dev, const struct device_driver *drv) { struct pci_dev *pci_dev = to_pci_dev(dev); struct pci_driver *pci_drv; const struct pci_device_id *found_id; if (pci_dev_binding_disallowed(pci_dev)) return 0; pci_drv = (struct pci_driver *)to_pci_driver(drv); found_id = pci_match_device(pci_drv, pci_dev); if (found_id) return 1; return 0; } /** * pci_dev_get - increments the reference count of the pci device structure * @dev: the device being referenced * * Each live reference to a device should be refcounted. * * Drivers for PCI devices should normally record such references in * their probe() methods, when they bind to a device, and release * them by calling pci_dev_put(), in their disconnect() methods. * * A pointer to the device with the incremented reference counter is returned. */ struct pci_dev *pci_dev_get(struct pci_dev *dev) { if (dev) get_device(&dev->dev); return dev; } EXPORT_SYMBOL(pci_dev_get); /** * pci_dev_put - release a use of the pci device structure * @dev: device that's been disconnected * * Must be called when a user of a device is finished with it. When the last * user of the device calls this function, the memory of the device is freed. */ void pci_dev_put(struct pci_dev *dev) { if (dev) put_device(&dev->dev); } EXPORT_SYMBOL(pci_dev_put); static int pci_uevent(const struct device *dev, struct kobj_uevent_env *env) { const struct pci_dev *pdev; if (!dev) return -ENODEV; pdev = to_pci_dev(dev); if (add_uevent_var(env, "PCI_CLASS=%04X", pdev->class)) return -ENOMEM; if (add_uevent_var(env, "PCI_ID=%04X:%04X", pdev->vendor, pdev->device)) return -ENOMEM; if (add_uevent_var(env, "PCI_SUBSYS_ID=%04X:%04X", pdev->subsystem_vendor, pdev->subsystem_device)) return -ENOMEM; if (add_uevent_var(env, "PCI_SLOT_NAME=%s", pci_name(pdev))) return -ENOMEM; if (add_uevent_var(env, "MODALIAS=pci:v%08Xd%08Xsv%08Xsd%08Xbc%02Xsc%02Xi%02X", pdev->vendor, pdev->device, pdev->subsystem_vendor, pdev->subsystem_device, (u8)(pdev->class >> 16), (u8)(pdev->class >> 8), (u8)(pdev->class))) return -ENOMEM; return 0; } #if defined(CONFIG_PCIEAER) || defined(CONFIG_EEH) || defined(CONFIG_S390) /** * pci_uevent_ers - emit a uevent during recovery path of PCI device * @pdev: PCI device undergoing error recovery * @err_type: type of error event */ void pci_uevent_ers(struct pci_dev *pdev, enum pci_ers_result err_type) { int idx = 0; char *envp[3]; switch (err_type) { case PCI_ERS_RESULT_NONE: case PCI_ERS_RESULT_CAN_RECOVER: case PCI_ERS_RESULT_NEED_RESET: envp[idx++] = "ERROR_EVENT=BEGIN_RECOVERY"; envp[idx++] = "DEVICE_ONLINE=0"; break; case PCI_ERS_RESULT_RECOVERED: envp[idx++] = "ERROR_EVENT=SUCCESSFUL_RECOVERY"; envp[idx++] = "DEVICE_ONLINE=1"; break; case PCI_ERS_RESULT_DISCONNECT: envp[idx++] = "ERROR_EVENT=FAILED_RECOVERY"; envp[idx++] = "DEVICE_ONLINE=0"; break; default: break; } if (idx > 0) { envp[idx++] = NULL; kobject_uevent_env(&pdev->dev.kobj, KOBJ_CHANGE, envp); } } #endif static int pci_bus_num_vf(struct device *dev) { return pci_num_vf(to_pci_dev(dev)); } /** * pci_dma_configure - Setup DMA configuration * @dev: ptr to dev structure * * Function to update PCI devices's DMA configuration using the same * info from the OF node or ACPI node of host bridge's parent (if any). */ static int pci_dma_configure(struct device *dev) { const struct device_driver *drv = READ_ONCE(dev->driver); struct device *bridge; int ret = 0; bridge = pci_get_host_bridge_device(to_pci_dev(dev)); if (IS_ENABLED(CONFIG_OF) && bridge->parent && bridge->parent->of_node) { ret = of_dma_configure(dev, bridge->parent->of_node, true); } else if (has_acpi_companion(bridge)) { struct acpi_device *adev = to_acpi_device_node(bridge->fwnode); ret = acpi_dma_configure(dev, acpi_get_dma_attr(adev)); } /* * Attempt to enable ACS regardless of capability because some Root * Ports (e.g. those quirked with *_intel_pch_acs_*) do not have * the standard ACS capability but still support ACS via those * quirks. */ pci_enable_acs(to_pci_dev(dev)); pci_put_host_bridge_device(bridge); /* @drv may not be valid when we're called from the IOMMU layer */ if (!ret && drv && !to_pci_driver(drv)->driver_managed_dma) { ret = iommu_device_use_default_domain(dev); if (ret) arch_teardown_dma_ops(dev); } return ret; } static void pci_dma_cleanup(struct device *dev) { struct pci_driver *driver = to_pci_driver(dev->driver); if (!driver->driver_managed_dma) iommu_device_unuse_default_domain(dev); } /* * pci_device_irq_get_affinity - get IRQ affinity mask for device * @dev: ptr to dev structure * @irq_vec: interrupt vector number * * Return the CPU affinity mask for @dev and @irq_vec. */ static const struct cpumask *pci_device_irq_get_affinity(struct device *dev, unsigned int irq_vec) { return pci_irq_get_affinity(to_pci_dev(dev), irq_vec); } const struct bus_type pci_bus_type = { .name = "pci", .match = pci_bus_match, .uevent = pci_uevent, .probe = pci_device_probe, .remove = pci_device_remove, .shutdown = pci_device_shutdown, .irq_get_affinity = pci_device_irq_get_affinity, .dev_groups = pci_dev_groups, .bus_groups = pci_bus_groups, .drv_groups = pci_drv_groups, .pm = PCI_PM_OPS_PTR, .num_vf = pci_bus_num_vf, .dma_configure = pci_dma_configure, .dma_cleanup = pci_dma_cleanup, }; EXPORT_SYMBOL(pci_bus_type); static int __init pci_driver_init(void) { int ret; pci_probe_wq = alloc_workqueue("sync_wq", WQ_PERCPU, 0); if (!pci_probe_wq) return -ENOMEM; ret = bus_register(&pci_bus_type); if (ret) return ret; #ifdef CONFIG_PCIEPORTBUS ret = bus_register(&pcie_port_bus_type); if (ret) return ret; #endif dma_debug_add_bus(&pci_bus_type); return 0; } postcore_initcall(pci_driver_init);
36 36 36 36 36 36 9 9 9 1 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 /* FUSE: Filesystem in Userspace Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu> This program can be distributed under the terms of the GNU GPL. See the file COPYING. */ #include "fuse_i.h" #include <linux/init.h> #include <linux/module.h> #include <linux/fs_context.h> #include <linux/namei.h> #define FUSE_CTL_SUPER_MAGIC 0x65735543 /* * This is non-NULL when the single instance of the control filesystem * exists. Protected by fuse_mutex */ static struct super_block *fuse_control_sb; static struct fuse_conn *fuse_ctl_file_conn_get(struct file *file) { struct fuse_conn *fc; mutex_lock(&fuse_mutex); fc = file_inode(file)->i_private; if (fc) fc = fuse_conn_get(fc); mutex_unlock(&fuse_mutex); return fc; } static ssize_t fuse_conn_abort_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct fuse_conn *fc = fuse_ctl_file_conn_get(file); if (fc) { if (fc->abort_err) fc->aborted = true; fuse_abort_conn(fc); fuse_conn_put(fc); } return count; } static ssize_t fuse_conn_waiting_read(struct file *file, char __user *buf, size_t len, loff_t *ppos) { char tmp[32]; size_t size; if (!*ppos) { long value; struct fuse_conn *fc = fuse_ctl_file_conn_get(file); if (!fc) return 0; value = atomic_read(&fc->num_waiting); file->private_data = (void *)value; fuse_conn_put(fc); } size = sprintf(tmp, "%ld\n", (long)file->private_data); return simple_read_from_buffer(buf, len, ppos, tmp, size); } static ssize_t fuse_conn_limit_read(struct file *file, char __user *buf, size_t len, loff_t *ppos, unsigned val) { char tmp[32]; size_t size = sprintf(tmp, "%u\n", val); return simple_read_from_buffer(buf, len, ppos, tmp, size); } static ssize_t fuse_conn_limit_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos, unsigned *val, unsigned global_limit) { unsigned long t; unsigned limit = (1 << 16) - 1; int err; if (*ppos) return -EINVAL; err = kstrtoul_from_user(buf, count, 0, &t); if (err) return err; if (!capable(CAP_SYS_ADMIN)) limit = min(limit, global_limit); if (t > limit) return -EINVAL; *val = t; return count; } static ssize_t fuse_conn_max_background_read(struct file *file, char __user *buf, size_t len, loff_t *ppos) { struct fuse_conn *fc; unsigned val; fc = fuse_ctl_file_conn_get(file); if (!fc) return 0; val = READ_ONCE(fc->max_background); fuse_conn_put(fc); return fuse_conn_limit_read(file, buf, len, ppos, val); } static ssize_t fuse_conn_max_background_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { unsigned val; ssize_t ret; ret = fuse_conn_limit_write(file, buf, count, ppos, &val, max_user_bgreq); if (ret > 0) { struct fuse_conn *fc = fuse_ctl_file_conn_get(file); if (fc) { spin_lock(&fc->bg_lock); fc->max_background = val; fc->blocked = fc->num_background >= fc->max_background; if (!fc->blocked) wake_up(&fc->blocked_waitq); spin_unlock(&fc->bg_lock); fuse_conn_put(fc); } } return ret; } static ssize_t fuse_conn_congestion_threshold_read(struct file *file, char __user *buf, size_t len, loff_t *ppos) { struct fuse_conn *fc; unsigned val; fc = fuse_ctl_file_conn_get(file); if (!fc) return 0; val = READ_ONCE(fc->congestion_threshold); fuse_conn_put(fc); return fuse_conn_limit_read(file, buf, len, ppos, val); } static ssize_t fuse_conn_congestion_threshold_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { unsigned val; struct fuse_conn *fc; ssize_t ret; ret = fuse_conn_limit_write(file, buf, count, ppos, &val, max_user_congthresh); if (ret <= 0) goto out; fc = fuse_ctl_file_conn_get(file); if (!fc) goto out; WRITE_ONCE(fc->congestion_threshold, val); fuse_conn_put(fc); out: return ret; } static const struct file_operations fuse_ctl_abort_ops = { .open = nonseekable_open, .write = fuse_conn_abort_write, }; static const struct file_operations fuse_ctl_waiting_ops = { .open = nonseekable_open, .read = fuse_conn_waiting_read, }; static const struct file_operations fuse_conn_max_background_ops = { .open = nonseekable_open, .read = fuse_conn_max_background_read, .write = fuse_conn_max_background_write, }; static const struct file_operations fuse_conn_congestion_threshold_ops = { .open = nonseekable_open, .read = fuse_conn_congestion_threshold_read, .write = fuse_conn_congestion_threshold_write, }; static struct dentry *fuse_ctl_add_dentry(struct dentry *parent, struct fuse_conn *fc, const char *name, int mode, const struct inode_operations *iop, const struct file_operations *fop) { struct dentry *dentry; struct inode *inode; dentry = d_alloc_name(parent, name); if (!dentry) return NULL; inode = new_inode(fuse_control_sb); if (!inode) { dput(dentry); return NULL; } inode->i_ino = get_next_ino(); inode->i_mode = mode; inode->i_uid = fc->user_id; inode->i_gid = fc->group_id; simple_inode_init_ts(inode); /* setting ->i_op to NULL is not allowed */ if (iop) inode->i_op = iop; inode->i_fop = fop; if (S_ISDIR(mode)) { inc_nlink(d_inode(parent)); inc_nlink(inode); } inode->i_private = fc; d_make_persistent(dentry, inode); dput(dentry); /* * We are returning a borrowed reference here - it's only good while * fuse_mutex is held. Actually it's d_make_persistent() return * value... */ return dentry; } /* * Add a connection to the control filesystem (if it exists). Caller * must hold fuse_mutex */ int fuse_ctl_add_conn(struct fuse_conn *fc) { struct dentry *parent; char name[32]; if (!fuse_control_sb || fc->no_control) return 0; parent = fuse_control_sb->s_root; sprintf(name, "%u", fc->dev); parent = fuse_ctl_add_dentry(parent, fc, name, S_IFDIR | 0500, &simple_dir_inode_operations, &simple_dir_operations); if (!parent) goto err; if (!fuse_ctl_add_dentry(parent, fc, "waiting", S_IFREG | 0400, NULL, &fuse_ctl_waiting_ops) || !fuse_ctl_add_dentry(parent, fc, "abort", S_IFREG | 0200, NULL, &fuse_ctl_abort_ops) || !fuse_ctl_add_dentry(parent, fc, "max_background", S_IFREG | 0600, NULL, &fuse_conn_max_background_ops) || !fuse_ctl_add_dentry(parent, fc, "congestion_threshold", S_IFREG | 0600, NULL, &fuse_conn_congestion_threshold_ops)) goto err; return 0; err: fuse_ctl_remove_conn(fc); return -ENOMEM; } static void remove_one(struct dentry *dentry) { d_inode(dentry)->i_private = NULL; } /* * Remove a connection from the control filesystem (if it exists). * Caller must hold fuse_mutex */ void fuse_ctl_remove_conn(struct fuse_conn *fc) { char name[32]; if (!fuse_control_sb || fc->no_control) return; sprintf(name, "%u", fc->dev); simple_remove_by_name(fuse_control_sb->s_root, name, remove_one); } static int fuse_ctl_fill_super(struct super_block *sb, struct fs_context *fsc) { static const struct tree_descr empty_descr = {""}; struct fuse_conn *fc; int err; err = simple_fill_super(sb, FUSE_CTL_SUPER_MAGIC, &empty_descr); if (err) return err; mutex_lock(&fuse_mutex); BUG_ON(fuse_control_sb); fuse_control_sb = sb; list_for_each_entry(fc, &fuse_conn_list, entry) { err = fuse_ctl_add_conn(fc); if (err) { fuse_control_sb = NULL; mutex_unlock(&fuse_mutex); return err; } } mutex_unlock(&fuse_mutex); return 0; } static int fuse_ctl_get_tree(struct fs_context *fsc) { return get_tree_single(fsc, fuse_ctl_fill_super); } static const struct fs_context_operations fuse_ctl_context_ops = { .get_tree = fuse_ctl_get_tree, }; static int fuse_ctl_init_fs_context(struct fs_context *fsc) { fsc->ops = &fuse_ctl_context_ops; return 0; } static void fuse_ctl_kill_sb(struct super_block *sb) { mutex_lock(&fuse_mutex); fuse_control_sb = NULL; mutex_unlock(&fuse_mutex); kill_anon_super(sb); } static struct file_system_type fuse_ctl_fs_type = { .owner = THIS_MODULE, .name = "fusectl", .init_fs_context = fuse_ctl_init_fs_context, .kill_sb = fuse_ctl_kill_sb, }; MODULE_ALIAS_FS("fusectl"); int __init fuse_ctl_init(void) { return register_filesystem(&fuse_ctl_fs_type); } void __exit fuse_ctl_cleanup(void) { unregister_filesystem(&fuse_ctl_fs_type); }
11 1440 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 /* SPDX-License-Identifier: GPL-2.0-only */ /* * include/linux/idr.h * * 2002-10-18 written by Jim Houston jim.houston@ccur.com * Copyright (C) 2002 by Concurrent Computer Corporation * * Small id to pointer translation service avoiding fixed sized * tables. */ #ifndef __IDR_H__ #define __IDR_H__ #include <linux/radix-tree.h> #include <linux/gfp.h> #include <linux/percpu.h> #include <linux/cleanup.h> struct idr { struct radix_tree_root idr_rt; unsigned int idr_base; unsigned int idr_next; }; /* * The IDR API does not expose the tagging functionality of the radix tree * to users. Use tag 0 to track whether a node has free space below it. */ #define IDR_FREE 0 /* Set the IDR flag and the IDR_FREE tag */ #define IDR_RT_MARKER (ROOT_IS_IDR | (__force gfp_t) \ (1 << (ROOT_TAG_SHIFT + IDR_FREE))) #define IDR_INIT_BASE(name, base) { \ .idr_rt = RADIX_TREE_INIT(name, IDR_RT_MARKER), \ .idr_base = (base), \ .idr_next = 0, \ } /** * IDR_INIT() - Initialise an IDR. * @name: Name of IDR. * * A freshly-initialised IDR contains no IDs. */ #define IDR_INIT(name) IDR_INIT_BASE(name, 0) /** * DEFINE_IDR() - Define a statically-allocated IDR. * @name: Name of IDR. * * An IDR defined using this macro is ready for use with no additional * initialisation required. It contains no IDs. */ #define DEFINE_IDR(name) struct idr name = IDR_INIT(name) /** * idr_get_cursor - Return the current position of the cyclic allocator * @idr: idr handle * * The value returned is the value that will be next returned from * idr_alloc_cyclic() if it is free (otherwise the search will start from * this position). */ static inline unsigned int idr_get_cursor(const struct idr *idr) { return READ_ONCE(idr->idr_next); } /** * idr_set_cursor - Set the current position of the cyclic allocator * @idr: idr handle * @val: new position * * The next call to idr_alloc_cyclic() will return @val if it is free * (otherwise the search will start from this position). */ static inline void idr_set_cursor(struct idr *idr, unsigned int val) { WRITE_ONCE(idr->idr_next, val); } /** * DOC: idr sync * idr synchronization (stolen from radix-tree.h) * * idr_find() is able to be called locklessly, using RCU. The caller must * ensure calls to this function are made within rcu_read_lock() regions. * Other readers (lock-free or otherwise) and modifications may be running * concurrently. * * It is still required that the caller manage the synchronization and * lifetimes of the items. So if RCU lock-free lookups are used, typically * this would mean that the items have their own locks, or are amenable to * lock-free access; and that the items are freed by RCU (or only freed after * having been deleted from the idr tree *and* a synchronize_rcu() grace * period). */ #define idr_lock(idr) xa_lock(&(idr)->idr_rt) #define idr_unlock(idr) xa_unlock(&(idr)->idr_rt) #define idr_lock_bh(idr) xa_lock_bh(&(idr)->idr_rt) #define idr_unlock_bh(idr) xa_unlock_bh(&(idr)->idr_rt) #define idr_lock_irq(idr) xa_lock_irq(&(idr)->idr_rt) #define idr_unlock_irq(idr) xa_unlock_irq(&(idr)->idr_rt) #define idr_lock_irqsave(idr, flags) \ xa_lock_irqsave(&(idr)->idr_rt, flags) #define idr_unlock_irqrestore(idr, flags) \ xa_unlock_irqrestore(&(idr)->idr_rt, flags) void idr_preload(gfp_t gfp_mask); int idr_alloc(struct idr *, void *ptr, int start, int end, gfp_t); int __must_check idr_alloc_u32(struct idr *, void *ptr, u32 *id, unsigned long max, gfp_t); int idr_alloc_cyclic(struct idr *, void *ptr, int start, int end, gfp_t); void *idr_remove(struct idr *, unsigned long id); void *idr_find(const struct idr *, unsigned long id); int idr_for_each(const struct idr *, int (*fn)(int id, void *p, void *data), void *data); void *idr_get_next(struct idr *, int *nextid); void *idr_get_next_ul(struct idr *, unsigned long *nextid); void *idr_replace(struct idr *, void *, unsigned long id); void idr_destroy(struct idr *); struct __class_idr { struct idr *idr; int id; }; #define idr_null ((struct __class_idr){ NULL, -1 }) #define take_idr_id(id) __get_and_null(id, idr_null) DEFINE_CLASS(idr_alloc, struct __class_idr, if (_T.id >= 0) idr_remove(_T.idr, _T.id), ((struct __class_idr){ .idr = idr, .id = idr_alloc(idr, ptr, start, end, gfp), }), struct idr *idr, void *ptr, int start, int end, gfp_t gfp); /** * idr_init_base() - Initialise an IDR. * @idr: IDR handle. * @base: The base value for the IDR. * * This variation of idr_init() creates an IDR which will allocate IDs * starting at %base. */ static inline void idr_init_base(struct idr *idr, int base) { INIT_RADIX_TREE(&idr->idr_rt, IDR_RT_MARKER); idr->idr_base = base; idr->idr_next = 0; } /** * idr_init() - Initialise an IDR. * @idr: IDR handle. * * Initialise a dynamically allocated IDR. To initialise a * statically allocated IDR, use DEFINE_IDR(). */ static inline void idr_init(struct idr *idr) { idr_init_base(idr, 0); } /** * idr_is_empty() - Are there any IDs allocated? * @idr: IDR handle. * * Return: %true if any IDs have been allocated from this IDR. */ static inline bool idr_is_empty(const struct idr *idr) { return radix_tree_empty(&idr->idr_rt) && radix_tree_tagged(&idr->idr_rt, IDR_FREE); } /** * idr_preload_end - end preload section started with idr_preload() * * Each idr_preload() should be matched with an invocation of this * function. See idr_preload() for details. */ static inline void idr_preload_end(void) { local_unlock(&radix_tree_preloads.lock); } /** * idr_for_each_entry() - Iterate over an IDR's elements of a given type. * @idr: IDR handle. * @entry: The type * to use as cursor * @id: Entry ID. * * @entry and @id do not need to be initialized before the loop, and * after normal termination @entry is left with the value NULL. This * is convenient for a "not found" value. */ #define idr_for_each_entry(idr, entry, id) \ for (id = 0; ((entry) = idr_get_next(idr, &(id))) != NULL; id += 1U) /** * idr_for_each_entry_ul() - Iterate over an IDR's elements of a given type. * @idr: IDR handle. * @entry: The type * to use as cursor. * @tmp: A temporary placeholder for ID. * @id: Entry ID. * * @entry and @id do not need to be initialized before the loop, and * after normal termination @entry is left with the value NULL. This * is convenient for a "not found" value. */ #define idr_for_each_entry_ul(idr, entry, tmp, id) \ for (tmp = 0, id = 0; \ ((entry) = tmp <= id ? idr_get_next_ul(idr, &(id)) : NULL) != NULL; \ tmp = id, ++id) /** * idr_for_each_entry_continue() - Continue iteration over an IDR's elements of a given type * @idr: IDR handle. * @entry: The type * to use as a cursor. * @id: Entry ID. * * Continue to iterate over entries, continuing after the current position. */ #define idr_for_each_entry_continue(idr, entry, id) \ for ((entry) = idr_get_next((idr), &(id)); \ entry; \ ++id, (entry) = idr_get_next((idr), &(id))) /** * idr_for_each_entry_continue_ul() - Continue iteration over an IDR's elements of a given type * @idr: IDR handle. * @entry: The type * to use as a cursor. * @tmp: A temporary placeholder for ID. * @id: Entry ID. * * Continue to iterate over entries, continuing after the current position. * After normal termination @entry is left with the value NULL. This * is convenient for a "not found" value. */ #define idr_for_each_entry_continue_ul(idr, entry, tmp, id) \ for (tmp = id; \ ((entry) = tmp <= id ? idr_get_next_ul(idr, &(id)) : NULL) != NULL; \ tmp = id, ++id) /* * IDA - ID Allocator, use when translation from id to pointer isn't necessary. */ #define IDA_CHUNK_SIZE 128 /* 128 bytes per chunk */ #define IDA_BITMAP_LONGS (IDA_CHUNK_SIZE / sizeof(long)) #define IDA_BITMAP_BITS (IDA_BITMAP_LONGS * sizeof(long) * 8) struct ida_bitmap { unsigned long bitmap[IDA_BITMAP_LONGS]; }; struct ida { struct xarray xa; }; #define IDA_INIT_FLAGS (XA_FLAGS_LOCK_IRQ | XA_FLAGS_ALLOC) #define IDA_INIT(name) { \ .xa = XARRAY_INIT(name, IDA_INIT_FLAGS) \ } #define DEFINE_IDA(name) struct ida name = IDA_INIT(name) int ida_alloc_range(struct ida *, unsigned int min, unsigned int max, gfp_t); void ida_free(struct ida *, unsigned int id); void ida_destroy(struct ida *ida); int ida_find_first_range(struct ida *ida, unsigned int min, unsigned int max); /** * ida_alloc() - Allocate an unused ID. * @ida: IDA handle. * @gfp: Memory allocation flags. * * Allocate an ID between 0 and %INT_MAX, inclusive. * * Context: Any context. It is safe to call this function without * locking in your code. * Return: The allocated ID, or %-ENOMEM if memory could not be allocated, * or %-ENOSPC if there are no free IDs. */ static inline int ida_alloc(struct ida *ida, gfp_t gfp) { return ida_alloc_range(ida, 0, ~0, gfp); } /** * ida_alloc_min() - Allocate an unused ID. * @ida: IDA handle. * @min: Lowest ID to allocate. * @gfp: Memory allocation flags. * * Allocate an ID between @min and %INT_MAX, inclusive. * * Context: Any context. It is safe to call this function without * locking in your code. * Return: The allocated ID, or %-ENOMEM if memory could not be allocated, * or %-ENOSPC if there are no free IDs. */ static inline int ida_alloc_min(struct ida *ida, unsigned int min, gfp_t gfp) { return ida_alloc_range(ida, min, ~0, gfp); } /** * ida_alloc_max() - Allocate an unused ID. * @ida: IDA handle. * @max: Highest ID to allocate. * @gfp: Memory allocation flags. * * Allocate an ID between 0 and @max, inclusive. * * Context: Any context. It is safe to call this function without * locking in your code. * Return: The allocated ID, or %-ENOMEM if memory could not be allocated, * or %-ENOSPC if there are no free IDs. */ static inline int ida_alloc_max(struct ida *ida, unsigned int max, gfp_t gfp) { return ida_alloc_range(ida, 0, max, gfp); } static inline void ida_init(struct ida *ida) { xa_init_flags(&ida->xa, IDA_INIT_FLAGS); } static inline bool ida_is_empty(const struct ida *ida) { return xa_empty(&ida->xa); } static inline bool ida_exists(struct ida *ida, unsigned int id) { return ida_find_first_range(ida, id, id) == id; } static inline int ida_find_first(struct ida *ida) { return ida_find_first_range(ida, 0, ~0); } #endif /* __IDR_H__ */
3 3 1 3 3 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 4 4 4 4 4 4 4 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 3 4 4 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 // SPDX-License-Identifier: GPL-2.0-or-later /* * Linux NET3: Internet Group Management Protocol [IGMP] * * This code implements the IGMP protocol as defined in RFC1112. There has * been a further revision of this protocol since which is now supported. * * If you have trouble with this module be careful what gcc you have used, * the older version didn't come out right using gcc 2.5.8, the newer one * seems to fall out with gcc 2.6.2. * * Authors: * Alan Cox <alan@lxorguk.ukuu.org.uk> * * Fixes: * * Alan Cox : Added lots of __inline__ to optimise * the memory usage of all the tiny little * functions. * Alan Cox : Dumped the header building experiment. * Alan Cox : Minor tweaks ready for multicast routing * and extended IGMP protocol. * Alan Cox : Removed a load of inline directives. Gcc 2.5.8 * writes utterly bogus code otherwise (sigh) * fixed IGMP loopback to behave in the manner * desired by mrouted, fixed the fact it has been * broken since 1.3.6 and cleaned up a few minor * points. * * Chih-Jen Chang : Tried to revise IGMP to Version 2 * Tsu-Sheng Tsao E-mail: chihjenc@scf.usc.edu and tsusheng@scf.usc.edu * The enhancements are mainly based on Steve Deering's * ipmulti-3.5 source code. * Chih-Jen Chang : Added the igmp_get_mrouter_info and * Tsu-Sheng Tsao igmp_set_mrouter_info to keep track of * the mrouted version on that device. * Chih-Jen Chang : Added the max_resp_time parameter to * Tsu-Sheng Tsao igmp_heard_query(). Using this parameter * to identify the multicast router version * and do what the IGMP version 2 specified. * Chih-Jen Chang : Added a timer to revert to IGMP V2 router * Tsu-Sheng Tsao if the specified time expired. * Alan Cox : Stop IGMP from 0.0.0.0 being accepted. * Alan Cox : Use GFP_ATOMIC in the right places. * Christian Daudt : igmp timer wasn't set for local group * memberships but was being deleted, * which caused a "del_timer() called * from %p with timer not initialized\n" * message (960131). * Christian Daudt : removed del_timer from * igmp_timer_expire function (960205). * Christian Daudt : igmp_heard_report now only calls * igmp_timer_expire if tm->running is * true (960216). * Malcolm Beattie : ttl comparison wrong in igmp_rcv made * igmp_heard_query never trigger. Expiry * miscalculation fixed in igmp_heard_query * and random() made to return unsigned to * prevent negative expiry times. * Alexey Kuznetsov: Wrong group leaving behaviour, backport * fix from pending 2.1.x patches. * Alan Cox: Forget to enable FDDI support earlier. * Alexey Kuznetsov: Fixed leaving groups on device down. * Alexey Kuznetsov: Accordance to igmp-v2-06 draft. * David L Stevens: IGMPv3 support, with help from * Vinay Kulkarni */ #include <linux/module.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/jiffies.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/inetdevice.h> #include <linux/igmp.h> #include "igmp_internal.h" #include <linux/if_arp.h> #include <linux/rtnetlink.h> #include <linux/times.h> #include <linux/pkt_sched.h> #include <linux/byteorder/generic.h> #include <net/net_namespace.h> #include <net/netlink.h> #include <net/addrconf.h> #include <net/arp.h> #include <net/ip.h> #include <net/protocol.h> #include <net/route.h> #include <net/sock.h> #include <net/checksum.h> #include <net/inet_common.h> #include <linux/netfilter_ipv4.h> #ifdef CONFIG_IP_MROUTE #include <linux/mroute.h> #endif #ifdef CONFIG_PROC_FS #include <linux/proc_fs.h> #include <linux/seq_file.h> #endif #ifdef CONFIG_IP_MULTICAST /* Parameter names and values are taken from igmp-v2-06 draft */ #define IGMP_QUERY_INTERVAL (125*HZ) #define IGMP_QUERY_RESPONSE_INTERVAL (10*HZ) #define IGMP_INITIAL_REPORT_DELAY (1) /* IGMP_INITIAL_REPORT_DELAY is not from IGMP specs! * IGMP specs require to report membership immediately after * joining a group, but we delay the first report by a * small interval. It seems more natural and still does not * contradict to specs provided this delay is small enough. */ #define IGMP_V1_SEEN(in_dev) \ (IPV4_DEVCONF_ALL_RO(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 1 || \ IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 1 || \ ((in_dev)->mr_v1_seen && \ time_before(jiffies, (in_dev)->mr_v1_seen))) #define IGMP_V2_SEEN(in_dev) \ (IPV4_DEVCONF_ALL_RO(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 2 || \ IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 2 || \ ((in_dev)->mr_v2_seen && \ time_before(jiffies, (in_dev)->mr_v2_seen))) static int unsolicited_report_interval(struct in_device *in_dev) { int interval_ms, interval_jiffies; if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) interval_ms = IN_DEV_CONF_GET( in_dev, IGMPV2_UNSOLICITED_REPORT_INTERVAL); else /* v3 */ interval_ms = IN_DEV_CONF_GET( in_dev, IGMPV3_UNSOLICITED_REPORT_INTERVAL); interval_jiffies = msecs_to_jiffies(interval_ms); /* _timer functions can't handle a delay of 0 jiffies so ensure * we always return a positive value. */ if (interval_jiffies <= 0) interval_jiffies = 1; return interval_jiffies; } static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im, gfp_t gfp); static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im); static void igmpv3_clear_delrec(struct in_device *in_dev); static int sf_setstate(struct ip_mc_list *pmc); static void sf_markstate(struct ip_mc_list *pmc); #endif static void ip_mc_clear_src(struct ip_mc_list *pmc); static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode, int sfcount, __be32 *psfsrc, int delta); static void ip_ma_put(struct ip_mc_list *im) { if (refcount_dec_and_test(&im->refcnt)) { in_dev_put(im->interface); kfree_rcu(im, rcu); } } #define for_each_pmc_rcu(in_dev, pmc) \ for (pmc = rcu_dereference(in_dev->mc_list); \ pmc != NULL; \ pmc = rcu_dereference(pmc->next_rcu)) #define for_each_pmc_rtnl(in_dev, pmc) \ for (pmc = rtnl_dereference(in_dev->mc_list); \ pmc != NULL; \ pmc = rtnl_dereference(pmc->next_rcu)) static void ip_sf_list_clear_all(struct ip_sf_list *psf) { struct ip_sf_list *next; while (psf) { next = psf->sf_next; kfree(psf); psf = next; } } #ifdef CONFIG_IP_MULTICAST /* * Timer management */ static void igmp_stop_timer(struct ip_mc_list *im) { spin_lock_bh(&im->lock); if (timer_delete(&im->timer)) refcount_dec(&im->refcnt); im->tm_running = 0; im->reporter = 0; im->unsolicit_count = 0; spin_unlock_bh(&im->lock); } /* It must be called with locked im->lock */ static void igmp_start_timer(struct ip_mc_list *im, int max_delay) { int tv = get_random_u32_below(max_delay); im->tm_running = 1; if (refcount_inc_not_zero(&im->refcnt)) { if (mod_timer(&im->timer, jiffies + tv + 2)) ip_ma_put(im); } } static void igmp_gq_start_timer(struct in_device *in_dev) { int tv = get_random_u32_below(READ_ONCE(in_dev->mr_maxdelay)); unsigned long exp = jiffies + tv + 2; if (in_dev->mr_gq_running && time_after_eq(exp, (in_dev->mr_gq_timer).expires)) return; in_dev->mr_gq_running = 1; if (!mod_timer(&in_dev->mr_gq_timer, exp)) in_dev_hold(in_dev); } static void igmp_ifc_start_timer(struct in_device *in_dev, int delay) { int tv = get_random_u32_below(delay); if (!mod_timer(&in_dev->mr_ifc_timer, jiffies+tv+2)) in_dev_hold(in_dev); } static void igmp_mod_timer(struct ip_mc_list *im, int max_delay) { spin_lock_bh(&im->lock); im->unsolicit_count = 0; if (timer_delete(&im->timer)) { if ((long)(im->timer.expires-jiffies) < max_delay) { add_timer(&im->timer); im->tm_running = 1; spin_unlock_bh(&im->lock); return; } refcount_dec(&im->refcnt); } igmp_start_timer(im, max_delay); spin_unlock_bh(&im->lock); } /* * Send an IGMP report. */ #define IGMP_SIZE (sizeof(struct igmphdr)+sizeof(struct iphdr)+4) static int is_in(struct ip_mc_list *pmc, struct ip_sf_list *psf, int type, int gdeleted, int sdeleted) { switch (type) { case IGMPV3_MODE_IS_INCLUDE: case IGMPV3_MODE_IS_EXCLUDE: if (gdeleted || sdeleted) return 0; if (!(pmc->gsquery && !psf->sf_gsresp)) { if (pmc->sfmode == MCAST_INCLUDE) return 1; /* don't include if this source is excluded * in all filters */ if (psf->sf_count[MCAST_INCLUDE]) return type == IGMPV3_MODE_IS_INCLUDE; return pmc->sfcount[MCAST_EXCLUDE] == psf->sf_count[MCAST_EXCLUDE]; } return 0; case IGMPV3_CHANGE_TO_INCLUDE: if (gdeleted || sdeleted) return 0; return psf->sf_count[MCAST_INCLUDE] != 0; case IGMPV3_CHANGE_TO_EXCLUDE: if (gdeleted || sdeleted) return 0; if (pmc->sfcount[MCAST_EXCLUDE] == 0 || psf->sf_count[MCAST_INCLUDE]) return 0; return pmc->sfcount[MCAST_EXCLUDE] == psf->sf_count[MCAST_EXCLUDE]; case IGMPV3_ALLOW_NEW_SOURCES: if (gdeleted || !psf->sf_crcount) return 0; return (pmc->sfmode == MCAST_INCLUDE) ^ sdeleted; case IGMPV3_BLOCK_OLD_SOURCES: if (pmc->sfmode == MCAST_INCLUDE) return gdeleted || (psf->sf_crcount && sdeleted); return psf->sf_crcount && !gdeleted && !sdeleted; } return 0; } static int igmp_scount(struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted) { struct ip_sf_list *psf; int scount = 0; for (psf = pmc->sources; psf; psf = psf->sf_next) { if (!is_in(pmc, psf, type, gdeleted, sdeleted)) continue; scount++; } return scount; } /* source address selection per RFC 3376 section 4.2.13 */ static __be32 igmpv3_get_srcaddr(struct net_device *dev, const struct flowi4 *fl4) { struct in_device *in_dev = __in_dev_get_rcu(dev); const struct in_ifaddr *ifa; if (!in_dev) return htonl(INADDR_ANY); in_dev_for_each_ifa_rcu(ifa, in_dev) { if (fl4->saddr == ifa->ifa_local) return fl4->saddr; } return htonl(INADDR_ANY); } static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu) { struct sk_buff *skb; struct rtable *rt; struct iphdr *pip; struct igmpv3_report *pig; struct net *net = dev_net(dev); struct flowi4 fl4; int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; unsigned int size; size = min(mtu, IP_MAX_MTU); while (1) { skb = alloc_skb(size + hlen + tlen, GFP_ATOMIC | __GFP_NOWARN); if (skb) break; size >>= 1; if (size < 256) return NULL; } skb->priority = TC_PRIO_CONTROL; rt = ip_route_output_ports(net, &fl4, NULL, IGMPV3_ALL_MCR, 0, 0, 0, IPPROTO_IGMP, 0, dev->ifindex); if (IS_ERR(rt)) { kfree_skb(skb); return NULL; } skb_dst_set(skb, &rt->dst); skb->dev = dev; skb_reserve(skb, hlen); skb_tailroom_reserve(skb, mtu, tlen); skb_reset_network_header(skb); pip = ip_hdr(skb); skb_put(skb, sizeof(struct iphdr) + 4); pip->version = 4; pip->ihl = (sizeof(struct iphdr)+4)>>2; pip->tos = 0xc0; pip->frag_off = htons(IP_DF); pip->ttl = 1; pip->daddr = fl4.daddr; rcu_read_lock(); pip->saddr = igmpv3_get_srcaddr(dev, &fl4); rcu_read_unlock(); pip->protocol = IPPROTO_IGMP; pip->tot_len = 0; /* filled in later */ ip_select_ident(net, skb, NULL); ((u8 *)&pip[1])[0] = IPOPT_RA; ((u8 *)&pip[1])[1] = 4; ((u8 *)&pip[1])[2] = 0; ((u8 *)&pip[1])[3] = 0; skb->transport_header = skb->network_header + sizeof(struct iphdr) + 4; skb_put(skb, sizeof(*pig)); pig = igmpv3_report_hdr(skb); pig->type = IGMPV3_HOST_MEMBERSHIP_REPORT; pig->resv1 = 0; pig->csum = 0; pig->resv2 = 0; pig->ngrec = 0; return skb; } static int igmpv3_sendpack(struct sk_buff *skb) { struct igmphdr *pig = igmp_hdr(skb); const int igmplen = skb_tail_pointer(skb) - skb_transport_header(skb); pig->csum = ip_compute_csum(igmp_hdr(skb), igmplen); return ip_local_out(skb_dst_dev_net(skb), skb->sk, skb); } static int grec_size(struct ip_mc_list *pmc, int type, int gdel, int sdel) { return sizeof(struct igmpv3_grec) + 4*igmp_scount(pmc, type, gdel, sdel); } static struct sk_buff *add_grhead(struct sk_buff *skb, struct ip_mc_list *pmc, int type, struct igmpv3_grec **ppgr, unsigned int mtu) { struct net_device *dev = pmc->interface->dev; struct igmpv3_report *pih; struct igmpv3_grec *pgr; if (!skb) { skb = igmpv3_newpack(dev, mtu); if (!skb) return NULL; } pgr = skb_put(skb, sizeof(struct igmpv3_grec)); pgr->grec_type = type; pgr->grec_auxwords = 0; pgr->grec_nsrcs = 0; pgr->grec_mca = pmc->multiaddr; pih = igmpv3_report_hdr(skb); pih->ngrec = htons(ntohs(pih->ngrec)+1); *ppgr = pgr; return skb; } #define AVAILABLE(skb) ((skb) ? skb_availroom(skb) : 0) static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted) { struct net_device *dev = pmc->interface->dev; struct net *net = dev_net(dev); struct igmpv3_report *pih; struct igmpv3_grec *pgr = NULL; struct ip_sf_list *psf, *psf_next, *psf_prev, **psf_list; int scount, stotal, first, isquery, truncate; unsigned int mtu; if (pmc->multiaddr == IGMP_ALL_HOSTS) return skb; if (ipv4_is_local_multicast(pmc->multiaddr) && !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports)) return skb; mtu = READ_ONCE(dev->mtu); if (mtu < IPV4_MIN_MTU) return skb; isquery = type == IGMPV3_MODE_IS_INCLUDE || type == IGMPV3_MODE_IS_EXCLUDE; truncate = type == IGMPV3_MODE_IS_EXCLUDE || type == IGMPV3_CHANGE_TO_EXCLUDE; stotal = scount = 0; psf_list = sdeleted ? &pmc->tomb : &pmc->sources; if (!*psf_list) goto empty_source; pih = skb ? igmpv3_report_hdr(skb) : NULL; /* EX and TO_EX get a fresh packet, if needed */ if (truncate) { if (pih && pih->ngrec && AVAILABLE(skb) < grec_size(pmc, type, gdeleted, sdeleted)) { if (skb) igmpv3_sendpack(skb); skb = igmpv3_newpack(dev, mtu); } } first = 1; psf_prev = NULL; for (psf = *psf_list; psf; psf = psf_next) { __be32 *psrc; psf_next = psf->sf_next; if (!is_in(pmc, psf, type, gdeleted, sdeleted)) { psf_prev = psf; continue; } /* Based on RFC3376 5.1. Should not send source-list change * records when there is a filter mode change. */ if (((gdeleted && pmc->sfmode == MCAST_EXCLUDE) || (!gdeleted && pmc->crcount)) && (type == IGMPV3_ALLOW_NEW_SOURCES || type == IGMPV3_BLOCK_OLD_SOURCES) && psf->sf_crcount) goto decrease_sf_crcount; /* clear marks on query responses */ if (isquery) psf->sf_gsresp = 0; if (AVAILABLE(skb) < sizeof(__be32) + first*sizeof(struct igmpv3_grec)) { if (truncate && !first) break; /* truncate these */ if (pgr) pgr->grec_nsrcs = htons(scount); if (skb) igmpv3_sendpack(skb); skb = igmpv3_newpack(dev, mtu); first = 1; scount = 0; } if (first) { skb = add_grhead(skb, pmc, type, &pgr, mtu); first = 0; } if (!skb) return NULL; psrc = skb_put(skb, sizeof(__be32)); *psrc = psf->sf_inaddr; scount++; stotal++; if ((type == IGMPV3_ALLOW_NEW_SOURCES || type == IGMPV3_BLOCK_OLD_SOURCES) && psf->sf_crcount) { decrease_sf_crcount: psf->sf_crcount--; if ((sdeleted || gdeleted) && psf->sf_crcount == 0) { if (psf_prev) psf_prev->sf_next = psf->sf_next; else *psf_list = psf->sf_next; kfree(psf); continue; } } psf_prev = psf; } empty_source: if (!stotal) { if (type == IGMPV3_ALLOW_NEW_SOURCES || type == IGMPV3_BLOCK_OLD_SOURCES) return skb; if (pmc->crcount || isquery) { /* make sure we have room for group header */ if (skb && AVAILABLE(skb) < sizeof(struct igmpv3_grec)) { igmpv3_sendpack(skb); skb = NULL; /* add_grhead will get a new one */ } skb = add_grhead(skb, pmc, type, &pgr, mtu); } } if (pgr) pgr->grec_nsrcs = htons(scount); if (isquery) pmc->gsquery = 0; /* clear query state on report */ return skb; } static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc) { struct sk_buff *skb = NULL; struct net *net = dev_net(in_dev->dev); int type; if (!pmc) { rcu_read_lock(); for_each_pmc_rcu(in_dev, pmc) { if (pmc->multiaddr == IGMP_ALL_HOSTS) continue; if (ipv4_is_local_multicast(pmc->multiaddr) && !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports)) continue; spin_lock_bh(&pmc->lock); if (pmc->sfcount[MCAST_EXCLUDE]) type = IGMPV3_MODE_IS_EXCLUDE; else type = IGMPV3_MODE_IS_INCLUDE; skb = add_grec(skb, pmc, type, 0, 0); spin_unlock_bh(&pmc->lock); } rcu_read_unlock(); } else { spin_lock_bh(&pmc->lock); if (pmc->sfcount[MCAST_EXCLUDE]) type = IGMPV3_MODE_IS_EXCLUDE; else type = IGMPV3_MODE_IS_INCLUDE; skb = add_grec(skb, pmc, type, 0, 0); spin_unlock_bh(&pmc->lock); } if (!skb) return 0; return igmpv3_sendpack(skb); } /* * remove zero-count source records from a source filter list */ static void igmpv3_clear_zeros(struct ip_sf_list **ppsf) { struct ip_sf_list *psf_prev, *psf_next, *psf; psf_prev = NULL; for (psf = *ppsf; psf; psf = psf_next) { psf_next = psf->sf_next; if (psf->sf_crcount == 0) { if (psf_prev) psf_prev->sf_next = psf->sf_next; else *ppsf = psf->sf_next; kfree(psf); } else psf_prev = psf; } } static void kfree_pmc(struct ip_mc_list *pmc) { ip_sf_list_clear_all(pmc->sources); ip_sf_list_clear_all(pmc->tomb); kfree(pmc); } static void igmpv3_send_cr(struct in_device *in_dev) { struct ip_mc_list *pmc, *pmc_prev, *pmc_next; struct sk_buff *skb = NULL; int type, dtype; rcu_read_lock(); spin_lock_bh(&in_dev->mc_tomb_lock); /* deleted MCA's */ pmc_prev = NULL; for (pmc = in_dev->mc_tomb; pmc; pmc = pmc_next) { pmc_next = pmc->next; if (pmc->sfmode == MCAST_INCLUDE) { type = IGMPV3_BLOCK_OLD_SOURCES; dtype = IGMPV3_BLOCK_OLD_SOURCES; skb = add_grec(skb, pmc, type, 1, 0); skb = add_grec(skb, pmc, dtype, 1, 1); } if (pmc->crcount) { if (pmc->sfmode == MCAST_EXCLUDE) { type = IGMPV3_CHANGE_TO_INCLUDE; skb = add_grec(skb, pmc, type, 1, 0); } pmc->crcount--; if (pmc->crcount == 0) { igmpv3_clear_zeros(&pmc->tomb); igmpv3_clear_zeros(&pmc->sources); } } if (pmc->crcount == 0 && !pmc->tomb && !pmc->sources) { if (pmc_prev) pmc_prev->next = pmc_next; else in_dev->mc_tomb = pmc_next; in_dev_put(pmc->interface); kfree_pmc(pmc); } else pmc_prev = pmc; } spin_unlock_bh(&in_dev->mc_tomb_lock); /* change recs */ for_each_pmc_rcu(in_dev, pmc) { spin_lock_bh(&pmc->lock); if (pmc->sfcount[MCAST_EXCLUDE]) { type = IGMPV3_BLOCK_OLD_SOURCES; dtype = IGMPV3_ALLOW_NEW_SOURCES; } else { type = IGMPV3_ALLOW_NEW_SOURCES; dtype = IGMPV3_BLOCK_OLD_SOURCES; } skb = add_grec(skb, pmc, type, 0, 0); skb = add_grec(skb, pmc, dtype, 0, 1); /* deleted sources */ /* filter mode changes */ if (pmc->crcount) { if (pmc->sfmode == MCAST_EXCLUDE) type = IGMPV3_CHANGE_TO_EXCLUDE; else type = IGMPV3_CHANGE_TO_INCLUDE; skb = add_grec(skb, pmc, type, 0, 0); pmc->crcount--; } spin_unlock_bh(&pmc->lock); } rcu_read_unlock(); if (!skb) return; (void) igmpv3_sendpack(skb); } static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, int type) { struct sk_buff *skb; struct iphdr *iph; struct igmphdr *ih; struct rtable *rt; struct net_device *dev = in_dev->dev; struct net *net = dev_net(dev); __be32 group = pmc ? pmc->multiaddr : 0; struct flowi4 fl4; __be32 dst; int hlen, tlen; if (type == IGMPV3_HOST_MEMBERSHIP_REPORT) return igmpv3_send_report(in_dev, pmc); if (ipv4_is_local_multicast(group) && !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports)) return 0; if (type == IGMP_HOST_LEAVE_MESSAGE) dst = IGMP_ALL_ROUTER; else dst = group; rt = ip_route_output_ports(net, &fl4, NULL, dst, 0, 0, 0, IPPROTO_IGMP, 0, dev->ifindex); if (IS_ERR(rt)) return -1; hlen = LL_RESERVED_SPACE(dev); tlen = dev->needed_tailroom; skb = alloc_skb(IGMP_SIZE + hlen + tlen, GFP_ATOMIC); if (!skb) { ip_rt_put(rt); return -1; } skb->priority = TC_PRIO_CONTROL; skb_dst_set(skb, &rt->dst); skb_reserve(skb, hlen); skb_reset_network_header(skb); iph = ip_hdr(skb); skb_put(skb, sizeof(struct iphdr) + 4); iph->version = 4; iph->ihl = (sizeof(struct iphdr)+4)>>2; iph->tos = 0xc0; iph->frag_off = htons(IP_DF); iph->ttl = 1; iph->daddr = dst; iph->saddr = fl4.saddr; iph->protocol = IPPROTO_IGMP; ip_select_ident(net, skb, NULL); ((u8 *)&iph[1])[0] = IPOPT_RA; ((u8 *)&iph[1])[1] = 4; ((u8 *)&iph[1])[2] = 0; ((u8 *)&iph[1])[3] = 0; ih = skb_put(skb, sizeof(struct igmphdr)); ih->type = type; ih->code = 0; ih->csum = 0; ih->group = group; ih->csum = ip_compute_csum((void *)ih, sizeof(struct igmphdr)); return ip_local_out(net, skb->sk, skb); } static void igmp_gq_timer_expire(struct timer_list *t) { struct in_device *in_dev = timer_container_of(in_dev, t, mr_gq_timer); in_dev->mr_gq_running = 0; igmpv3_send_report(in_dev, NULL); in_dev_put(in_dev); } static void igmp_ifc_timer_expire(struct timer_list *t) { struct in_device *in_dev = timer_container_of(in_dev, t, mr_ifc_timer); u32 mr_ifc_count; igmpv3_send_cr(in_dev); restart: mr_ifc_count = READ_ONCE(in_dev->mr_ifc_count); if (mr_ifc_count) { if (cmpxchg(&in_dev->mr_ifc_count, mr_ifc_count, mr_ifc_count - 1) != mr_ifc_count) goto restart; igmp_ifc_start_timer(in_dev, unsolicited_report_interval(in_dev)); } in_dev_put(in_dev); } static void igmp_ifc_event(struct in_device *in_dev) { struct net *net = dev_net(in_dev->dev); if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) return; WRITE_ONCE(in_dev->mr_ifc_count, in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv)); igmp_ifc_start_timer(in_dev, 1); } static void igmp_timer_expire(struct timer_list *t) { struct ip_mc_list *im = timer_container_of(im, t, timer); struct in_device *in_dev = im->interface; spin_lock(&im->lock); im->tm_running = 0; if (im->unsolicit_count && --im->unsolicit_count) igmp_start_timer(im, unsolicited_report_interval(in_dev)); im->reporter = 1; spin_unlock(&im->lock); if (IGMP_V1_SEEN(in_dev)) igmp_send_report(in_dev, im, IGMP_HOST_MEMBERSHIP_REPORT); else if (IGMP_V2_SEEN(in_dev)) igmp_send_report(in_dev, im, IGMPV2_HOST_MEMBERSHIP_REPORT); else igmp_send_report(in_dev, im, IGMPV3_HOST_MEMBERSHIP_REPORT); ip_ma_put(im); } /* mark EXCLUDE-mode sources */ static int igmp_xmarksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs) { struct ip_sf_list *psf; int i, scount; scount = 0; for (psf = pmc->sources; psf; psf = psf->sf_next) { if (scount == nsrcs) break; for (i = 0; i < nsrcs; i++) { /* skip inactive filters */ if (psf->sf_count[MCAST_INCLUDE] || pmc->sfcount[MCAST_EXCLUDE] != psf->sf_count[MCAST_EXCLUDE]) break; if (srcs[i] == psf->sf_inaddr) { scount++; break; } } } pmc->gsquery = 0; if (scount == nsrcs) /* all sources excluded */ return 0; return 1; } static int igmp_marksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs) { struct ip_sf_list *psf; int i, scount; if (pmc->sfmode == MCAST_EXCLUDE) return igmp_xmarksources(pmc, nsrcs, srcs); /* mark INCLUDE-mode sources */ scount = 0; for (psf = pmc->sources; psf; psf = psf->sf_next) { if (scount == nsrcs) break; for (i = 0; i < nsrcs; i++) if (srcs[i] == psf->sf_inaddr) { psf->sf_gsresp = 1; scount++; break; } } if (!scount) { pmc->gsquery = 0; return 0; } pmc->gsquery = 1; return 1; } /* return true if packet was dropped */ static bool igmp_heard_report(struct in_device *in_dev, __be32 group) { struct ip_mc_list *im; struct net *net = dev_net(in_dev->dev); /* Timers are only set for non-local groups */ if (group == IGMP_ALL_HOSTS) return false; if (ipv4_is_local_multicast(group) && !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports)) return false; rcu_read_lock(); for_each_pmc_rcu(in_dev, im) { if (im->multiaddr == group) { igmp_stop_timer(im); break; } } rcu_read_unlock(); return false; } /* return true if packet was dropped */ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, int len) { struct igmphdr *ih = igmp_hdr(skb); struct igmpv3_query *ih3 = igmpv3_query_hdr(skb); struct ip_mc_list *im; __be32 group = ih->group; int max_delay; int mark = 0; struct net *net = dev_net(in_dev->dev); if (len == 8) { if (ih->code == 0) { /* Alas, old v1 router presents here. */ max_delay = IGMP_QUERY_RESPONSE_INTERVAL; in_dev->mr_v1_seen = jiffies + (in_dev->mr_qrv * in_dev->mr_qi) + in_dev->mr_qri; group = 0; } else { /* v2 router present */ max_delay = ih->code*(HZ/IGMP_TIMER_SCALE); in_dev->mr_v2_seen = jiffies + (in_dev->mr_qrv * in_dev->mr_qi) + in_dev->mr_qri; } /* cancel the interface change timer */ WRITE_ONCE(in_dev->mr_ifc_count, 0); if (timer_delete(&in_dev->mr_ifc_timer)) __in_dev_put(in_dev); /* clear deleted report items */ igmpv3_clear_delrec(in_dev); } else if (len < 12) { return true; /* ignore bogus packet; freed by caller */ } else if (IGMP_V1_SEEN(in_dev)) { /* This is a v3 query with v1 queriers present */ max_delay = IGMP_QUERY_RESPONSE_INTERVAL; group = 0; } else if (IGMP_V2_SEEN(in_dev)) { /* this is a v3 query with v2 queriers present; * Interpretation of the max_delay code is problematic here. * A real v2 host would use ih_code directly, while v3 has a * different encoding. We use the v3 encoding as more likely * to be intended in a v3 query. */ max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE); if (!max_delay) max_delay = 1; /* can't mod w/ 0 */ } else { /* v3 */ if (!pskb_may_pull(skb, sizeof(struct igmpv3_query))) return true; ih3 = igmpv3_query_hdr(skb); if (ih3->nsrcs) { if (!pskb_may_pull(skb, sizeof(struct igmpv3_query) + ntohs(ih3->nsrcs)*sizeof(__be32))) return true; ih3 = igmpv3_query_hdr(skb); } max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE); if (!max_delay) max_delay = 1; /* can't mod w/ 0 */ WRITE_ONCE(in_dev->mr_maxdelay, max_delay); /* RFC3376, 4.1.6. QRV and 4.1.7. QQIC, when the most recently * received value was zero, use the default or statically * configured value. */ in_dev->mr_qrv = ih3->qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv); in_dev->mr_qi = IGMPV3_QQIC(ih3->qqic)*HZ ?: IGMP_QUERY_INTERVAL; /* RFC3376, 8.3. Query Response Interval: * The number of seconds represented by the [Query Response * Interval] must be less than the [Query Interval]. */ if (in_dev->mr_qri >= in_dev->mr_qi) in_dev->mr_qri = (in_dev->mr_qi/HZ - 1)*HZ; if (!group) { /* general query */ if (ih3->nsrcs) return true; /* no sources allowed */ igmp_gq_start_timer(in_dev); return false; } /* mark sources to include, if group & source-specific */ mark = ih3->nsrcs != 0; } /* * - Start the timers in all of our membership records * that the query applies to for the interface on * which the query arrived excl. those that belong * to a "local" group (224.0.0.X) * - For timers already running check if they need to * be reset. * - Use the igmp->igmp_code field as the maximum * delay possible */ rcu_read_lock(); for_each_pmc_rcu(in_dev, im) { int changed; if (group && group != im->multiaddr) continue; if (im->multiaddr == IGMP_ALL_HOSTS) continue; if (ipv4_is_local_multicast(im->multiaddr) && !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports)) continue; spin_lock_bh(&im->lock); if (im->tm_running) im->gsquery = im->gsquery && mark; else im->gsquery = mark; changed = !im->gsquery || igmp_marksources(im, ntohs(ih3->nsrcs), ih3->srcs); spin_unlock_bh(&im->lock); if (changed) igmp_mod_timer(im, max_delay); } rcu_read_unlock(); return false; } /* called in rcu_read_lock() section */ int igmp_rcv(struct sk_buff *skb) { /* This basically follows the spec line by line -- see RFC1112 */ struct igmphdr *ih; struct net_device *dev = skb->dev; struct in_device *in_dev; int len = skb->len; bool dropped = true; if (netif_is_l3_master(dev)) { dev = dev_get_by_index_rcu(dev_net(dev), IPCB(skb)->iif); if (!dev) goto drop; } in_dev = __in_dev_get_rcu(dev); if (!in_dev) goto drop; if (!pskb_may_pull(skb, sizeof(struct igmphdr))) goto drop; if (skb_checksum_simple_validate(skb)) goto drop; ih = igmp_hdr(skb); switch (ih->type) { case IGMP_HOST_MEMBERSHIP_QUERY: dropped = igmp_heard_query(in_dev, skb, len); break; case IGMP_HOST_MEMBERSHIP_REPORT: case IGMPV2_HOST_MEMBERSHIP_REPORT: /* Is it our report looped back? */ if (rt_is_output_route(skb_rtable(skb))) break; /* don't rely on MC router hearing unicast reports */ if (skb->pkt_type == PACKET_MULTICAST || skb->pkt_type == PACKET_BROADCAST) dropped = igmp_heard_report(in_dev, ih->group); break; case IGMP_PIM: #ifdef CONFIG_IP_PIMSM_V1 return pim_rcv_v1(skb); #endif case IGMPV3_HOST_MEMBERSHIP_REPORT: case IGMP_DVMRP: case IGMP_TRACE: case IGMP_HOST_LEAVE_MESSAGE: case IGMP_MTRACE: case IGMP_MTRACE_RESP: break; default: break; } drop: if (dropped) kfree_skb(skb); else consume_skb(skb); return 0; } #endif /* * Add a filter to a device */ static void ip_mc_filter_add(struct in_device *in_dev, __be32 addr) { char buf[MAX_ADDR_LEN]; struct net_device *dev = in_dev->dev; /* Checking for IFF_MULTICAST here is WRONG-WRONG-WRONG. We will get multicast token leakage, when IFF_MULTICAST is changed. This check should be done in ndo_set_rx_mode routine. Something sort of: if (dev->mc_list && dev->flags&IFF_MULTICAST) { do it; } --ANK */ if (arp_mc_map(addr, buf, dev, 0) == 0) dev_mc_add(dev, buf); } /* * Remove a filter from a device */ static void ip_mc_filter_del(struct in_device *in_dev, __be32 addr) { char buf[MAX_ADDR_LEN]; struct net_device *dev = in_dev->dev; if (arp_mc_map(addr, buf, dev, 0) == 0) dev_mc_del(dev, buf); } #ifdef CONFIG_IP_MULTICAST /* * deleted ip_mc_list manipulation */ static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im, gfp_t gfp) { struct ip_mc_list *pmc; struct net *net = dev_net(in_dev->dev); /* this is an "ip_mc_list" for convenience; only the fields below * are actually used. In particular, the refcnt and users are not * used for management of the delete list. Using the same structure * for deleted items allows change reports to use common code with * non-deleted or query-response MCA's. */ pmc = kzalloc_obj(*pmc, gfp); if (!pmc) return; spin_lock_init(&pmc->lock); spin_lock_bh(&im->lock); pmc->interface = im->interface; in_dev_hold(in_dev); pmc->multiaddr = im->multiaddr; pmc->crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv); pmc->sfmode = im->sfmode; if (pmc->sfmode == MCAST_INCLUDE) { struct ip_sf_list *psf; pmc->tomb = im->tomb; pmc->sources = im->sources; im->tomb = im->sources = NULL; for (psf = pmc->sources; psf; psf = psf->sf_next) psf->sf_crcount = pmc->crcount; } spin_unlock_bh(&im->lock); spin_lock_bh(&in_dev->mc_tomb_lock); pmc->next = in_dev->mc_tomb; in_dev->mc_tomb = pmc; spin_unlock_bh(&in_dev->mc_tomb_lock); } /* * restore ip_mc_list deleted records */ static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im) { struct ip_mc_list *pmc, *pmc_prev; struct ip_sf_list *psf; struct net *net = dev_net(in_dev->dev); __be32 multiaddr = im->multiaddr; spin_lock_bh(&in_dev->mc_tomb_lock); pmc_prev = NULL; for (pmc = in_dev->mc_tomb; pmc; pmc = pmc->next) { if (pmc->multiaddr == multiaddr) break; pmc_prev = pmc; } if (pmc) { if (pmc_prev) pmc_prev->next = pmc->next; else in_dev->mc_tomb = pmc->next; } spin_unlock_bh(&in_dev->mc_tomb_lock); spin_lock_bh(&im->lock); if (pmc) { im->interface = pmc->interface; if (im->sfmode == MCAST_INCLUDE) { swap(im->tomb, pmc->tomb); swap(im->sources, pmc->sources); for (psf = im->sources; psf; psf = psf->sf_next) psf->sf_crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv); } else { im->crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv); } in_dev_put(pmc->interface); kfree_pmc(pmc); } spin_unlock_bh(&im->lock); } /* * flush ip_mc_list deleted records */ static void igmpv3_clear_delrec(struct in_device *in_dev) { struct ip_mc_list *pmc, *nextpmc; spin_lock_bh(&in_dev->mc_tomb_lock); pmc = in_dev->mc_tomb; in_dev->mc_tomb = NULL; spin_unlock_bh(&in_dev->mc_tomb_lock); for (; pmc; pmc = nextpmc) { nextpmc = pmc->next; ip_mc_clear_src(pmc); in_dev_put(pmc->interface); kfree_pmc(pmc); } /* clear dead sources, too */ rcu_read_lock(); for_each_pmc_rcu(in_dev, pmc) { struct ip_sf_list *psf; spin_lock_bh(&pmc->lock); psf = pmc->tomb; pmc->tomb = NULL; spin_unlock_bh(&pmc->lock); ip_sf_list_clear_all(psf); } rcu_read_unlock(); } #endif static void __igmp_group_dropped(struct ip_mc_list *im, gfp_t gfp) { struct in_device *in_dev = im->interface; #ifdef CONFIG_IP_MULTICAST struct net *net = dev_net(in_dev->dev); int reporter; #endif if (im->loaded) { im->loaded = 0; ip_mc_filter_del(in_dev, im->multiaddr); } #ifdef CONFIG_IP_MULTICAST if (im->multiaddr == IGMP_ALL_HOSTS) return; if (ipv4_is_local_multicast(im->multiaddr) && !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports)) return; reporter = im->reporter; igmp_stop_timer(im); if (!in_dev->dead) { if (IGMP_V1_SEEN(in_dev)) return; if (IGMP_V2_SEEN(in_dev)) { if (reporter) igmp_send_report(in_dev, im, IGMP_HOST_LEAVE_MESSAGE); return; } /* IGMPv3 */ igmpv3_add_delrec(in_dev, im, gfp); igmp_ifc_event(in_dev); } #endif } static void igmp_group_dropped(struct ip_mc_list *im) { __igmp_group_dropped(im, GFP_KERNEL); } static void igmp_group_added(struct ip_mc_list *im) { struct in_device *in_dev = im->interface; #ifdef CONFIG_IP_MULTICAST struct net *net = dev_net(in_dev->dev); #endif if (im->loaded == 0) { im->loaded = 1; ip_mc_filter_add(in_dev, im->multiaddr); } #ifdef CONFIG_IP_MULTICAST if (im->multiaddr == IGMP_ALL_HOSTS) return; if (ipv4_is_local_multicast(im->multiaddr) && !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports)) return; if (in_dev->dead) return; im->unsolicit_count = READ_ONCE(net->ipv4.sysctl_igmp_qrv); if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) { spin_lock_bh(&im->lock); igmp_start_timer(im, IGMP_INITIAL_REPORT_DELAY); spin_unlock_bh(&im->lock); return; } /* else, v3 */ /* Based on RFC3376 5.1, for newly added INCLUDE SSM, we should * not send filter-mode change record as the mode should be from * IN() to IN(A). */ if (im->sfmode == MCAST_EXCLUDE) im->crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv); igmp_ifc_event(in_dev); #endif } /* * Multicast list managers */ static u32 ip_mc_hash(const struct ip_mc_list *im) { return hash_32((__force u32)im->multiaddr, MC_HASH_SZ_LOG); } static void ip_mc_hash_add(struct in_device *in_dev, struct ip_mc_list *im) { struct ip_mc_list __rcu **mc_hash; u32 hash; mc_hash = rtnl_dereference(in_dev->mc_hash); if (mc_hash) { hash = ip_mc_hash(im); im->next_hash = mc_hash[hash]; rcu_assign_pointer(mc_hash[hash], im); return; } /* do not use a hash table for small number of items */ if (in_dev->mc_count < 4) return; mc_hash = kzalloc(sizeof(struct ip_mc_list *) << MC_HASH_SZ_LOG, GFP_KERNEL); if (!mc_hash) return; for_each_pmc_rtnl(in_dev, im) { hash = ip_mc_hash(im); im->next_hash = mc_hash[hash]; RCU_INIT_POINTER(mc_hash[hash], im); } rcu_assign_pointer(in_dev->mc_hash, mc_hash); } static void ip_mc_hash_remove(struct in_device *in_dev, struct ip_mc_list *im) { struct ip_mc_list __rcu **mc_hash = rtnl_dereference(in_dev->mc_hash); struct ip_mc_list *aux; if (!mc_hash) return; mc_hash += ip_mc_hash(im); while ((aux = rtnl_dereference(*mc_hash)) != im) mc_hash = &aux->next_hash; *mc_hash = im->next_hash; } int inet_fill_ifmcaddr(struct sk_buff *skb, struct net_device *dev, const struct ip_mc_list *im, struct inet_fill_args *args) { struct ifa_cacheinfo ci; struct ifaddrmsg *ifm; struct nlmsghdr *nlh; nlh = nlmsg_put(skb, args->portid, args->seq, args->event, sizeof(struct ifaddrmsg), args->flags); if (!nlh) return -EMSGSIZE; ifm = nlmsg_data(nlh); ifm->ifa_family = AF_INET; ifm->ifa_prefixlen = 32; ifm->ifa_flags = IFA_F_PERMANENT; ifm->ifa_scope = RT_SCOPE_UNIVERSE; ifm->ifa_index = dev->ifindex; ci.cstamp = (READ_ONCE(im->mca_cstamp) - INITIAL_JIFFIES) * 100UL / HZ; ci.tstamp = ci.cstamp; ci.ifa_prefered = INFINITY_LIFE_TIME; ci.ifa_valid = INFINITY_LIFE_TIME; if (nla_put_in_addr(skb, IFA_MULTICAST, im->multiaddr) < 0 || nla_put(skb, IFA_CACHEINFO, sizeof(ci), &ci) < 0) { nlmsg_cancel(skb, nlh); return -EMSGSIZE; } nlmsg_end(skb, nlh); return 0; } static void inet_ifmcaddr_notify(struct net_device *dev, const struct ip_mc_list *im, int event) { struct inet_fill_args fillargs = { .event = event, }; struct net *net = dev_net(dev); struct sk_buff *skb; int err = -ENOMEM; skb = nlmsg_new(NLMSG_ALIGN(sizeof(struct ifaddrmsg)) + nla_total_size(sizeof(__be32)) + nla_total_size(sizeof(struct ifa_cacheinfo)), GFP_KERNEL); if (!skb) goto error; err = inet_fill_ifmcaddr(skb, dev, im, &fillargs); if (err < 0) { WARN_ON_ONCE(err == -EMSGSIZE); nlmsg_free(skb); goto error; } rtnl_notify(skb, net, 0, RTNLGRP_IPV4_MCADDR, NULL, GFP_KERNEL); return; error: rtnl_set_sk_err(net, RTNLGRP_IPV4_MCADDR, err); } /* * A socket has joined a multicast group on device dev. */ static void ____ip_mc_inc_group(struct in_device *in_dev, __be32 addr, unsigned int mode, gfp_t gfp) { struct ip_mc_list __rcu **mc_hash; struct ip_mc_list *im; ASSERT_RTNL(); mc_hash = rtnl_dereference(in_dev->mc_hash); if (mc_hash) { u32 hash = hash_32((__force u32)addr, MC_HASH_SZ_LOG); for (im = rtnl_dereference(mc_hash[hash]); im; im = rtnl_dereference(im->next_hash)) { if (im->multiaddr == addr) break; } } else { for_each_pmc_rtnl(in_dev, im) { if (im->multiaddr == addr) break; } } if (im) { im->users++; ip_mc_add_src(in_dev, &addr, mode, 0, NULL, 0); goto out; } im = kzalloc_obj(*im, gfp); if (!im) goto out; im->users = 1; im->interface = in_dev; in_dev_hold(in_dev); im->multiaddr = addr; im->mca_cstamp = jiffies; im->mca_tstamp = im->mca_cstamp; /* initial mode is (EX, empty) */ im->sfmode = mode; im->sfcount[mode] = 1; refcount_set(&im->refcnt, 1); spin_lock_init(&im->lock); #ifdef CONFIG_IP_MULTICAST timer_setup(&im->timer, igmp_timer_expire, 0); #endif im->next_rcu = in_dev->mc_list; in_dev->mc_count++; rcu_assign_pointer(in_dev->mc_list, im); ip_mc_hash_add(in_dev, im); #ifdef CONFIG_IP_MULTICAST igmpv3_del_delrec(in_dev, im); #endif igmp_group_added(im); inet_ifmcaddr_notify(in_dev->dev, im, RTM_NEWMULTICAST); if (!in_dev->dead) ip_rt_multicast_event(in_dev); out: return; } void __ip_mc_inc_group(struct in_device *in_dev, __be32 addr, gfp_t gfp) { ____ip_mc_inc_group(in_dev, addr, MCAST_EXCLUDE, gfp); } EXPORT_SYMBOL(__ip_mc_inc_group); void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) { __ip_mc_inc_group(in_dev, addr, GFP_KERNEL); } EXPORT_SYMBOL(ip_mc_inc_group); static int ip_mc_check_iphdr(struct sk_buff *skb) { const struct iphdr *iph; unsigned int len; unsigned int offset = skb_network_offset(skb) + sizeof(*iph); if (!pskb_may_pull(skb, offset)) return -EINVAL; iph = ip_hdr(skb); if (iph->version != 4 || ip_hdrlen(skb) < sizeof(*iph)) return -EINVAL; offset += ip_hdrlen(skb) - sizeof(*iph); if (!pskb_may_pull(skb, offset)) return -EINVAL; iph = ip_hdr(skb); if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) return -EINVAL; len = skb_network_offset(skb) + ntohs(iph->tot_len); if (skb->len < len || len < offset) return -EINVAL; skb_set_transport_header(skb, offset); return 0; } static int ip_mc_check_igmp_reportv3(struct sk_buff *skb) { unsigned int len = skb_transport_offset(skb); len += sizeof(struct igmpv3_report); return ip_mc_may_pull(skb, len) ? 0 : -EINVAL; } static int ip_mc_check_igmp_query(struct sk_buff *skb) { unsigned int transport_len = ip_transport_len(skb); unsigned int len; /* IGMPv{1,2}? */ if (transport_len != sizeof(struct igmphdr)) { /* or IGMPv3? */ if (transport_len < sizeof(struct igmpv3_query)) return -EINVAL; len = skb_transport_offset(skb) + sizeof(struct igmpv3_query); if (!ip_mc_may_pull(skb, len)) return -EINVAL; } /* RFC2236+RFC3376 (IGMPv2+IGMPv3) require the multicast link layer * all-systems destination addresses (224.0.0.1) for general queries */ if (!igmp_hdr(skb)->group && ip_hdr(skb)->daddr != htonl(INADDR_ALLHOSTS_GROUP)) return -EINVAL; return 0; } static int ip_mc_check_igmp_msg(struct sk_buff *skb) { switch (igmp_hdr(skb)->type) { case IGMP_HOST_LEAVE_MESSAGE: case IGMP_HOST_MEMBERSHIP_REPORT: case IGMPV2_HOST_MEMBERSHIP_REPORT: return 0; case IGMPV3_HOST_MEMBERSHIP_REPORT: return ip_mc_check_igmp_reportv3(skb); case IGMP_HOST_MEMBERSHIP_QUERY: return ip_mc_check_igmp_query(skb); default: return -ENOMSG; } } static __sum16 ip_mc_validate_checksum(struct sk_buff *skb) { return skb_checksum_simple_validate(skb); } static int ip_mc_check_igmp_csum(struct sk_buff *skb) { unsigned int len = skb_transport_offset(skb) + sizeof(struct igmphdr); unsigned int transport_len = ip_transport_len(skb); struct sk_buff *skb_chk; if (!ip_mc_may_pull(skb, len)) return -EINVAL; skb_chk = skb_checksum_trimmed(skb, transport_len, ip_mc_validate_checksum); if (!skb_chk) return -EINVAL; if (skb_chk != skb) kfree_skb(skb_chk); return 0; } /** * ip_mc_check_igmp - checks whether this is a sane IGMP packet * @skb: the skb to validate * * Checks whether an IPv4 packet is a valid IGMP packet. If so sets * skb transport header accordingly and returns zero. * * -EINVAL: A broken packet was detected, i.e. it violates some internet * standard * -ENOMSG: IP header validation succeeded but it is not an IGMP packet. * -ENOMEM: A memory allocation failure happened. * * Caller needs to set the skb network header and free any returned skb if it * differs from the provided skb. */ int ip_mc_check_igmp(struct sk_buff *skb) { int ret = ip_mc_check_iphdr(skb); if (ret < 0) return ret; if (ip_hdr(skb)->protocol != IPPROTO_IGMP) return -ENOMSG; ret = ip_mc_check_igmp_csum(skb); if (ret < 0) return ret; return ip_mc_check_igmp_msg(skb); } EXPORT_SYMBOL(ip_mc_check_igmp); /* * Resend IGMP JOIN report; used by netdev notifier. */ static void ip_mc_rejoin_groups(struct in_device *in_dev) { #ifdef CONFIG_IP_MULTICAST struct ip_mc_list *im; int type; struct net *net = dev_net(in_dev->dev); ASSERT_RTNL(); for_each_pmc_rtnl(in_dev, im) { if (im->multiaddr == IGMP_ALL_HOSTS) continue; if (ipv4_is_local_multicast(im->multiaddr) && !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports)) continue; /* a failover is happening and switches * must be notified immediately */ if (IGMP_V1_SEEN(in_dev)) type = IGMP_HOST_MEMBERSHIP_REPORT; else if (IGMP_V2_SEEN(in_dev)) type = IGMPV2_HOST_MEMBERSHIP_REPORT; else type = IGMPV3_HOST_MEMBERSHIP_REPORT; igmp_send_report(in_dev, im, type); } #endif } /* * A socket has left a multicast group on device dev */ void __ip_mc_dec_group(struct in_device *in_dev, __be32 addr, gfp_t gfp) { struct ip_mc_list *i; struct ip_mc_list __rcu **ip; ASSERT_RTNL(); for (ip = &in_dev->mc_list; (i = rtnl_dereference(*ip)) != NULL; ip = &i->next_rcu) { if (i->multiaddr == addr) { if (--i->users == 0) { ip_mc_hash_remove(in_dev, i); *ip = i->next_rcu; in_dev->mc_count--; __igmp_group_dropped(i, gfp); inet_ifmcaddr_notify(in_dev->dev, i, RTM_DELMULTICAST); ip_mc_clear_src(i); if (!in_dev->dead) ip_rt_multicast_event(in_dev); ip_ma_put(i); return; } break; } } } EXPORT_SYMBOL(__ip_mc_dec_group); /* Device changing type */ void ip_mc_unmap(struct in_device *in_dev) { struct ip_mc_list *pmc; ASSERT_RTNL(); for_each_pmc_rtnl(in_dev, pmc) igmp_group_dropped(pmc); } void ip_mc_remap(struct in_device *in_dev) { struct ip_mc_list *pmc; ASSERT_RTNL(); for_each_pmc_rtnl(in_dev, pmc) { #ifdef CONFIG_IP_MULTICAST igmpv3_del_delrec(in_dev, pmc); #endif igmp_group_added(pmc); } } /* Device going down */ void ip_mc_down(struct in_device *in_dev) { struct ip_mc_list *pmc; ASSERT_RTNL(); for_each_pmc_rtnl(in_dev, pmc) igmp_group_dropped(pmc); #ifdef CONFIG_IP_MULTICAST WRITE_ONCE(in_dev->mr_ifc_count, 0); if (timer_delete(&in_dev->mr_ifc_timer)) __in_dev_put(in_dev); in_dev->mr_gq_running = 0; if (timer_delete(&in_dev->mr_gq_timer)) __in_dev_put(in_dev); #endif ip_mc_dec_group(in_dev, IGMP_ALL_HOSTS); } #ifdef CONFIG_IP_MULTICAST static void ip_mc_reset(struct in_device *in_dev) { struct net *net = dev_net(in_dev->dev); in_dev->mr_qi = IGMP_QUERY_INTERVAL; in_dev->mr_qri = IGMP_QUERY_RESPONSE_INTERVAL; in_dev->mr_qrv = READ_ONCE(net->ipv4.sysctl_igmp_qrv); } #else static void ip_mc_reset(struct in_device *in_dev) { } #endif void ip_mc_init_dev(struct in_device *in_dev) { ASSERT_RTNL(); #ifdef CONFIG_IP_MULTICAST timer_setup(&in_dev->mr_gq_timer, igmp_gq_timer_expire, 0); timer_setup(&in_dev->mr_ifc_timer, igmp_ifc_timer_expire, 0); #endif ip_mc_reset(in_dev); spin_lock_init(&in_dev->mc_tomb_lock); } /* Device going up */ void ip_mc_up(struct in_device *in_dev) { struct ip_mc_list *pmc; ASSERT_RTNL(); ip_mc_reset(in_dev); ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS); for_each_pmc_rtnl(in_dev, pmc) { #ifdef CONFIG_IP_MULTICAST igmpv3_del_delrec(in_dev, pmc); #endif igmp_group_added(pmc); } } /* * Device is about to be destroyed: clean up. */ void ip_mc_destroy_dev(struct in_device *in_dev) { struct ip_mc_list *i; ASSERT_RTNL(); /* Deactivate timers */ ip_mc_down(in_dev); #ifdef CONFIG_IP_MULTICAST igmpv3_clear_delrec(in_dev); #endif while ((i = rtnl_dereference(in_dev->mc_list)) != NULL) { in_dev->mc_list = i->next_rcu; in_dev->mc_count--; ip_mc_clear_src(i); ip_ma_put(i); } } /* RTNL is locked */ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr) { struct net_device *dev = NULL; struct in_device *idev = NULL; if (imr->imr_ifindex) { idev = inetdev_by_index(net, imr->imr_ifindex); return idev; } if (imr->imr_address.s_addr) { dev = __ip_dev_find(net, imr->imr_address.s_addr, false); if (!dev) return NULL; } if (!dev) { struct rtable *rt = ip_route_output(net, imr->imr_multiaddr.s_addr, 0, 0, 0, RT_SCOPE_UNIVERSE); if (!IS_ERR(rt)) { dev = rt->dst.dev; ip_rt_put(rt); } } if (dev) { imr->imr_ifindex = dev->ifindex; idev = __in_dev_get_rtnl(dev); } return idev; } /* * Join a socket to a group */ static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode, __be32 *psfsrc) { struct ip_sf_list *psf, *psf_prev; int rv = 0; psf_prev = NULL; for (psf = pmc->sources; psf; psf = psf->sf_next) { if (psf->sf_inaddr == *psfsrc) break; psf_prev = psf; } if (!psf || psf->sf_count[sfmode] == 0) { /* source filter not found, or count wrong => bug */ return -ESRCH; } psf->sf_count[sfmode]--; if (psf->sf_count[sfmode] == 0) { ip_rt_multicast_event(pmc->interface); } if (!psf->sf_count[MCAST_INCLUDE] && !psf->sf_count[MCAST_EXCLUDE]) { #ifdef CONFIG_IP_MULTICAST struct in_device *in_dev = pmc->interface; struct net *net = dev_net(in_dev->dev); #endif /* no more filters for this source */ if (psf_prev) psf_prev->sf_next = psf->sf_next; else pmc->sources = psf->sf_next; #ifdef CONFIG_IP_MULTICAST if (psf->sf_oldin && !IGMP_V1_SEEN(in_dev) && !IGMP_V2_SEEN(in_dev)) { psf->sf_crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv); psf->sf_next = pmc->tomb; pmc->tomb = psf; rv = 1; } else #endif kfree(psf); } return rv; } #ifndef CONFIG_IP_MULTICAST #define igmp_ifc_event(x) do { } while (0) #endif static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode, int sfcount, __be32 *psfsrc, int delta) { struct ip_mc_list *pmc; int changerec = 0; int i, err; if (!in_dev) return -ENODEV; rcu_read_lock(); for_each_pmc_rcu(in_dev, pmc) { if (*pmca == pmc->multiaddr) break; } if (!pmc) { /* MCA not found?? bug */ rcu_read_unlock(); return -ESRCH; } spin_lock_bh(&pmc->lock); rcu_read_unlock(); #ifdef CONFIG_IP_MULTICAST sf_markstate(pmc); #endif if (!delta) { err = -EINVAL; if (!pmc->sfcount[sfmode]) goto out_unlock; pmc->sfcount[sfmode]--; } err = 0; for (i = 0; i < sfcount; i++) { int rv = ip_mc_del1_src(pmc, sfmode, &psfsrc[i]); changerec |= rv > 0; if (!err && rv < 0) err = rv; } if (pmc->sfmode == MCAST_EXCLUDE && pmc->sfcount[MCAST_EXCLUDE] == 0 && pmc->sfcount[MCAST_INCLUDE]) { #ifdef CONFIG_IP_MULTICAST struct ip_sf_list *psf; struct net *net = dev_net(in_dev->dev); #endif /* filter mode change */ pmc->sfmode = MCAST_INCLUDE; #ifdef CONFIG_IP_MULTICAST pmc->crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv); WRITE_ONCE(in_dev->mr_ifc_count, pmc->crcount); for (psf = pmc->sources; psf; psf = psf->sf_next) psf->sf_crcount = 0; igmp_ifc_event(pmc->interface); } else if (sf_setstate(pmc) || changerec) { igmp_ifc_event(pmc->interface); #endif } out_unlock: spin_unlock_bh(&pmc->lock); return err; } /* * Add multicast single-source filter to the interface list */ static int ip_mc_add1_src(struct ip_mc_list *pmc, int sfmode, __be32 *psfsrc) { struct ip_sf_list *psf, *psf_prev; psf_prev = NULL; for (psf = pmc->sources; psf; psf = psf->sf_next) { if (psf->sf_inaddr == *psfsrc) break; psf_prev = psf; } if (!psf) { psf = kzalloc_obj(*psf, GFP_ATOMIC); if (!psf) return -ENOBUFS; psf->sf_inaddr = *psfsrc; if (psf_prev) { psf_prev->sf_next = psf; } else pmc->sources = psf; } psf->sf_count[sfmode]++; if (psf->sf_count[sfmode] == 1) { ip_rt_multicast_event(pmc->interface); } return 0; } #ifdef CONFIG_IP_MULTICAST static void sf_markstate(struct ip_mc_list *pmc) { struct ip_sf_list *psf; int mca_xcount = pmc->sfcount[MCAST_EXCLUDE]; for (psf = pmc->sources; psf; psf = psf->sf_next) if (pmc->sfcount[MCAST_EXCLUDE]) { psf->sf_oldin = mca_xcount == psf->sf_count[MCAST_EXCLUDE] && !psf->sf_count[MCAST_INCLUDE]; } else psf->sf_oldin = psf->sf_count[MCAST_INCLUDE] != 0; } static int sf_setstate(struct ip_mc_list *pmc) { struct ip_sf_list *psf, *dpsf; int mca_xcount = pmc->sfcount[MCAST_EXCLUDE]; int qrv = pmc->interface->mr_qrv; int new_in, rv; rv = 0; for (psf = pmc->sources; psf; psf = psf->sf_next) { if (pmc->sfcount[MCAST_EXCLUDE]) { new_in = mca_xcount == psf->sf_count[MCAST_EXCLUDE] && !psf->sf_count[MCAST_INCLUDE]; } else new_in = psf->sf_count[MCAST_INCLUDE] != 0; if (new_in) { if (!psf->sf_oldin) { struct ip_sf_list *prev = NULL; for (dpsf = pmc->tomb; dpsf; dpsf = dpsf->sf_next) { if (dpsf->sf_inaddr == psf->sf_inaddr) break; prev = dpsf; } if (dpsf) { if (prev) prev->sf_next = dpsf->sf_next; else pmc->tomb = dpsf->sf_next; kfree(dpsf); } psf->sf_crcount = qrv; rv++; } } else if (psf->sf_oldin) { psf->sf_crcount = 0; /* * add or update "delete" records if an active filter * is now inactive */ for (dpsf = pmc->tomb; dpsf; dpsf = dpsf->sf_next) if (dpsf->sf_inaddr == psf->sf_inaddr) break; if (!dpsf) { dpsf = kmalloc_obj(*dpsf, GFP_ATOMIC); if (!dpsf) continue; *dpsf = *psf; /* pmc->lock held by callers */ dpsf->sf_next = pmc->tomb; pmc->tomb = dpsf; } dpsf->sf_crcount = qrv; rv++; } } return rv; } #endif /* * Add multicast source filter list to the interface list */ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode, int sfcount, __be32 *psfsrc, int delta) { struct ip_mc_list *pmc; int isexclude; int i, err; if (!in_dev) return -ENODEV; rcu_read_lock(); for_each_pmc_rcu(in_dev, pmc) { if (*pmca == pmc->multiaddr) break; } if (!pmc) { /* MCA not found?? bug */ rcu_read_unlock(); return -ESRCH; } spin_lock_bh(&pmc->lock); rcu_read_unlock(); #ifdef CONFIG_IP_MULTICAST sf_markstate(pmc); #endif isexclude = pmc->sfmode == MCAST_EXCLUDE; if (!delta) pmc->sfcount[sfmode]++; err = 0; for (i = 0; i < sfcount; i++) { err = ip_mc_add1_src(pmc, sfmode, &psfsrc[i]); if (err) break; } if (err) { int j; if (!delta) pmc->sfcount[sfmode]--; for (j = 0; j < i; j++) (void) ip_mc_del1_src(pmc, sfmode, &psfsrc[j]); } else if (isexclude != (pmc->sfcount[MCAST_EXCLUDE] != 0)) { #ifdef CONFIG_IP_MULTICAST struct ip_sf_list *psf; struct net *net = dev_net(pmc->interface->dev); in_dev = pmc->interface; #endif /* filter mode change */ if (pmc->sfcount[MCAST_EXCLUDE]) pmc->sfmode = MCAST_EXCLUDE; else if (pmc->sfcount[MCAST_INCLUDE]) pmc->sfmode = MCAST_INCLUDE; #ifdef CONFIG_IP_MULTICAST /* else no filters; keep old mode for reports */ pmc->crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv); WRITE_ONCE(in_dev->mr_ifc_count, pmc->crcount); for (psf = pmc->sources; psf; psf = psf->sf_next) psf->sf_crcount = 0; igmp_ifc_event(in_dev); } else if (sf_setstate(pmc)) { igmp_ifc_event(in_dev); #endif } spin_unlock_bh(&pmc->lock); return err; } static void ip_mc_clear_src(struct ip_mc_list *pmc) { struct ip_sf_list *tomb, *sources; spin_lock_bh(&pmc->lock); tomb = pmc->tomb; pmc->tomb = NULL; sources = pmc->sources; pmc->sources = NULL; pmc->sfmode = MCAST_EXCLUDE; pmc->sfcount[MCAST_INCLUDE] = 0; pmc->sfcount[MCAST_EXCLUDE] = 1; spin_unlock_bh(&pmc->lock); ip_sf_list_clear_all(tomb); ip_sf_list_clear_all(sources); } /* Join a multicast group */ static int __ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr, unsigned int mode) { __be32 addr = imr->imr_multiaddr.s_addr; struct ip_mc_socklist *iml, *i; struct in_device *in_dev; struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); int ifindex; int count = 0; int err; ASSERT_RTNL(); if (!ipv4_is_multicast(addr)) return -EINVAL; in_dev = ip_mc_find_dev(net, imr); if (!in_dev) { err = -ENODEV; goto done; } err = -EADDRINUSE; ifindex = imr->imr_ifindex; for_each_pmc_rtnl(inet, i) { if (i->multi.imr_multiaddr.s_addr == addr && i->multi.imr_ifindex == ifindex) goto done; count++; } err = -ENOBUFS; if (count >= READ_ONCE(net->ipv4.sysctl_igmp_max_memberships)) goto done; iml = sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL); if (!iml) goto done; memcpy(&iml->multi, imr, sizeof(*imr)); iml->next_rcu = inet->mc_list; iml->sflist = NULL; iml->sfmode = mode; rcu_assign_pointer(inet->mc_list, iml); ____ip_mc_inc_group(in_dev, addr, mode, GFP_KERNEL); err = 0; done: return err; } /* Join ASM (Any-Source Multicast) group */ int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr) { return __ip_mc_join_group(sk, imr, MCAST_EXCLUDE); } EXPORT_SYMBOL(ip_mc_join_group); /* Join SSM (Source-Specific Multicast) group */ int ip_mc_join_group_ssm(struct sock *sk, struct ip_mreqn *imr, unsigned int mode) { return __ip_mc_join_group(sk, imr, mode); } static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml, struct in_device *in_dev) { struct ip_sf_socklist *psf = rtnl_dereference(iml->sflist); int err; if (!psf) { /* any-source empty exclude case */ return ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr, iml->sfmode, 0, NULL, 0); } err = ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr, iml->sfmode, psf->sl_count, psf->sl_addr, 0); RCU_INIT_POINTER(iml->sflist, NULL); /* decrease mem now to avoid the memleak warning */ atomic_sub(struct_size(psf, sl_addr, psf->sl_max), &sk->sk_omem_alloc); kfree_rcu(psf, rcu); return err; } int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) { struct inet_sock *inet = inet_sk(sk); struct ip_mc_socklist *iml; struct ip_mc_socklist __rcu **imlp; struct in_device *in_dev; struct net *net = sock_net(sk); __be32 group = imr->imr_multiaddr.s_addr; u32 ifindex; int ret = -EADDRNOTAVAIL; ASSERT_RTNL(); in_dev = ip_mc_find_dev(net, imr); if (!imr->imr_ifindex && !imr->imr_address.s_addr && !in_dev) { ret = -ENODEV; goto out; } ifindex = imr->imr_ifindex; for (imlp = &inet->mc_list; (iml = rtnl_dereference(*imlp)) != NULL; imlp = &iml->next_rcu) { if (iml->multi.imr_multiaddr.s_addr != group) continue; if (ifindex) { if (iml->multi.imr_ifindex != ifindex) continue; } else if (imr->imr_address.s_addr && imr->imr_address.s_addr != iml->multi.imr_address.s_addr) continue; (void) ip_mc_leave_src(sk, iml, in_dev); *imlp = iml->next_rcu; if (in_dev) ip_mc_dec_group(in_dev, group); /* decrease mem now to avoid the memleak warning */ atomic_sub(sizeof(*iml), &sk->sk_omem_alloc); kfree_rcu(iml, rcu); return 0; } out: return ret; } EXPORT_SYMBOL(ip_mc_leave_group); int ip_mc_source(int add, int omode, struct sock *sk, struct ip_mreq_source *mreqs, int ifindex) { int err; struct ip_mreqn imr; __be32 addr = mreqs->imr_multiaddr; struct ip_mc_socklist *pmc; struct in_device *in_dev = NULL; struct inet_sock *inet = inet_sk(sk); struct ip_sf_socklist *psl; struct net *net = sock_net(sk); int leavegroup = 0; int i, j, rv; if (!ipv4_is_multicast(addr)) return -EINVAL; ASSERT_RTNL(); imr.imr_multiaddr.s_addr = mreqs->imr_multiaddr; imr.imr_address.s_addr = mreqs->imr_interface; imr.imr_ifindex = ifindex; in_dev = ip_mc_find_dev(net, &imr); if (!in_dev) { err = -ENODEV; goto done; } err = -EADDRNOTAVAIL; for_each_pmc_rtnl(inet, pmc) { if ((pmc->multi.imr_multiaddr.s_addr == imr.imr_multiaddr.s_addr) && (pmc->multi.imr_ifindex == imr.imr_ifindex)) break; } if (!pmc) { /* must have a prior join */ err = -EINVAL; goto done; } /* if a source filter was set, must be the same mode as before */ if (pmc->sflist) { if (pmc->sfmode != omode) { err = -EINVAL; goto done; } } else if (pmc->sfmode != omode) { /* allow mode switches for empty-set filters */ ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 0, NULL, 0); ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, pmc->sfmode, 0, NULL, 0); pmc->sfmode = omode; } psl = rtnl_dereference(pmc->sflist); if (!add) { if (!psl) goto done; /* err = -EADDRNOTAVAIL */ rv = !0; for (i = 0; i < psl->sl_count; i++) { rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr, sizeof(__be32)); if (rv == 0) break; } if (rv) /* source not found */ goto done; /* err = -EADDRNOTAVAIL */ /* special case - (INCLUDE, empty) == LEAVE_GROUP */ if (psl->sl_count == 1 && omode == MCAST_INCLUDE) { leavegroup = 1; goto done; } /* update the interface filter */ ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, omode, 1, &mreqs->imr_sourceaddr, 1); for (j = i+1; j < psl->sl_count; j++) psl->sl_addr[j-1] = psl->sl_addr[j]; psl->sl_count--; err = 0; goto done; } /* else, add a new source to the filter */ if (psl && psl->sl_count >= READ_ONCE(net->ipv4.sysctl_igmp_max_msf)) { err = -ENOBUFS; goto done; } if (!psl || psl->sl_count == psl->sl_max) { struct ip_sf_socklist *newpsl; int count = IP_SFBLOCK; if (psl) count += psl->sl_max; newpsl = sock_kmalloc(sk, struct_size(newpsl, sl_addr, count), GFP_KERNEL); if (!newpsl) { err = -ENOBUFS; goto done; } newpsl->sl_max = count; newpsl->sl_count = count - IP_SFBLOCK; if (psl) { for (i = 0; i < psl->sl_count; i++) newpsl->sl_addr[i] = psl->sl_addr[i]; /* decrease mem now to avoid the memleak warning */ atomic_sub(struct_size(psl, sl_addr, psl->sl_max), &sk->sk_omem_alloc); } rcu_assign_pointer(pmc->sflist, newpsl); if (psl) kfree_rcu(psl, rcu); psl = newpsl; } rv = 1; /* > 0 for insert logic below if sl_count is 0 */ for (i = 0; i < psl->sl_count; i++) { rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr, sizeof(__be32)); if (rv == 0) break; } if (rv == 0) /* address already there is an error */ goto done; for (j = psl->sl_count-1; j >= i; j--) psl->sl_addr[j+1] = psl->sl_addr[j]; psl->sl_addr[i] = mreqs->imr_sourceaddr; psl->sl_count++; err = 0; /* update the interface list */ ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 1, &mreqs->imr_sourceaddr, 1); done: if (leavegroup) err = ip_mc_leave_group(sk, &imr); return err; } int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex) { int err = 0; struct ip_mreqn imr; __be32 addr = msf->imsf_multiaddr; struct ip_mc_socklist *pmc; struct in_device *in_dev; struct inet_sock *inet = inet_sk(sk); struct ip_sf_socklist *newpsl, *psl; struct net *net = sock_net(sk); int leavegroup = 0; if (!ipv4_is_multicast(addr)) return -EINVAL; if (msf->imsf_fmode != MCAST_INCLUDE && msf->imsf_fmode != MCAST_EXCLUDE) return -EINVAL; ASSERT_RTNL(); imr.imr_multiaddr.s_addr = msf->imsf_multiaddr; imr.imr_address.s_addr = msf->imsf_interface; imr.imr_ifindex = ifindex; in_dev = ip_mc_find_dev(net, &imr); if (!in_dev) { err = -ENODEV; goto done; } /* special case - (INCLUDE, empty) == LEAVE_GROUP */ if (msf->imsf_fmode == MCAST_INCLUDE && msf->imsf_numsrc == 0) { leavegroup = 1; goto done; } for_each_pmc_rtnl(inet, pmc) { if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr && pmc->multi.imr_ifindex == imr.imr_ifindex) break; } if (!pmc) { /* must have a prior join */ err = -EINVAL; goto done; } if (msf->imsf_numsrc) { newpsl = sock_kmalloc(sk, struct_size(newpsl, sl_addr, msf->imsf_numsrc), GFP_KERNEL); if (!newpsl) { err = -ENOBUFS; goto done; } newpsl->sl_max = newpsl->sl_count = msf->imsf_numsrc; memcpy(newpsl->sl_addr, msf->imsf_slist_flex, flex_array_size(msf, imsf_slist_flex, msf->imsf_numsrc)); err = ip_mc_add_src(in_dev, &msf->imsf_multiaddr, msf->imsf_fmode, newpsl->sl_count, newpsl->sl_addr, 0); if (err) { sock_kfree_s(sk, newpsl, struct_size(newpsl, sl_addr, newpsl->sl_max)); goto done; } } else { newpsl = NULL; (void) ip_mc_add_src(in_dev, &msf->imsf_multiaddr, msf->imsf_fmode, 0, NULL, 0); } psl = rtnl_dereference(pmc->sflist); if (psl) { (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode, psl->sl_count, psl->sl_addr, 0); /* decrease mem now to avoid the memleak warning */ atomic_sub(struct_size(psl, sl_addr, psl->sl_max), &sk->sk_omem_alloc); } else { (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode, 0, NULL, 0); } rcu_assign_pointer(pmc->sflist, newpsl); if (psl) kfree_rcu(psl, rcu); pmc->sfmode = msf->imsf_fmode; err = 0; done: if (leavegroup) err = ip_mc_leave_group(sk, &imr); return err; } int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf, sockptr_t optval, sockptr_t optlen) { int err, len, count, copycount, msf_size; struct ip_mreqn imr; __be32 addr = msf->imsf_multiaddr; struct ip_mc_socklist *pmc; struct in_device *in_dev; struct inet_sock *inet = inet_sk(sk); struct ip_sf_socklist *psl; struct net *net = sock_net(sk); ASSERT_RTNL(); if (!ipv4_is_multicast(addr)) return -EINVAL; imr.imr_multiaddr.s_addr = msf->imsf_multiaddr; imr.imr_address.s_addr = msf->imsf_interface; imr.imr_ifindex = 0; in_dev = ip_mc_find_dev(net, &imr); if (!in_dev) { err = -ENODEV; goto done; } err = -EADDRNOTAVAIL; for_each_pmc_rtnl(inet, pmc) { if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr && pmc->multi.imr_ifindex == imr.imr_ifindex) break; } if (!pmc) /* must have a prior join */ goto done; msf->imsf_fmode = pmc->sfmode; psl = rtnl_dereference(pmc->sflist); if (!psl) { count = 0; } else { count = psl->sl_count; } copycount = count < msf->imsf_numsrc ? count : msf->imsf_numsrc; len = flex_array_size(psl, sl_addr, copycount); msf->imsf_numsrc = count; msf_size = IP_MSFILTER_SIZE(copycount); if (copy_to_sockptr(optlen, &msf_size, sizeof(int)) || copy_to_sockptr(optval, msf, IP_MSFILTER_SIZE(0))) { return -EFAULT; } if (len && copy_to_sockptr_offset(optval, offsetof(struct ip_msfilter, imsf_slist_flex), psl->sl_addr, len)) return -EFAULT; return 0; done: return err; } int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf, sockptr_t optval, size_t ss_offset) { int i, count, copycount; struct sockaddr_in *psin; __be32 addr; struct ip_mc_socklist *pmc; struct inet_sock *inet = inet_sk(sk); struct ip_sf_socklist *psl; ASSERT_RTNL(); psin = (struct sockaddr_in *)&gsf->gf_group; if (psin->sin_family != AF_INET) return -EINVAL; addr = psin->sin_addr.s_addr; if (!ipv4_is_multicast(addr)) return -EINVAL; for_each_pmc_rtnl(inet, pmc) { if (pmc->multi.imr_multiaddr.s_addr == addr && pmc->multi.imr_ifindex == gsf->gf_interface) break; } if (!pmc) /* must have a prior join */ return -EADDRNOTAVAIL; gsf->gf_fmode = pmc->sfmode; psl = rtnl_dereference(pmc->sflist); count = psl ? psl->sl_count : 0; copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc; gsf->gf_numsrc = count; for (i = 0; i < copycount; i++) { struct sockaddr_storage ss; psin = (struct sockaddr_in *)&ss; memset(&ss, 0, sizeof(ss)); psin->sin_family = AF_INET; psin->sin_addr.s_addr = psl->sl_addr[i]; if (copy_to_sockptr_offset(optval, ss_offset, &ss, sizeof(ss))) return -EFAULT; ss_offset += sizeof(ss); } return 0; } /* * check if a multicast source filter allows delivery for a given <src,dst,intf> */ int ip_mc_sf_allow(const struct sock *sk, __be32 loc_addr, __be32 rmt_addr, int dif, int sdif) { const struct inet_sock *inet = inet_sk(sk); struct ip_mc_socklist *pmc; struct ip_sf_socklist *psl; int i; int ret; ret = 1; if (!ipv4_is_multicast(loc_addr)) goto out; rcu_read_lock(); for_each_pmc_rcu(inet, pmc) { if (pmc->multi.imr_multiaddr.s_addr == loc_addr && (pmc->multi.imr_ifindex == dif || (sdif && pmc->multi.imr_ifindex == sdif))) break; } ret = inet_test_bit(MC_ALL, sk); if (!pmc) goto unlock; psl = rcu_dereference(pmc->sflist); ret = (pmc->sfmode == MCAST_EXCLUDE); if (!psl) goto unlock; for (i = 0; i < psl->sl_count; i++) { if (psl->sl_addr[i] == rmt_addr) break; } ret = 0; if (pmc->sfmode == MCAST_INCLUDE && i >= psl->sl_count) goto unlock; if (pmc->sfmode == MCAST_EXCLUDE && i < psl->sl_count) goto unlock; ret = 1; unlock: rcu_read_unlock(); out: return ret; } /* * A socket is closing. */ void ip_mc_drop_socket(struct sock *sk) { struct inet_sock *inet = inet_sk(sk); struct ip_mc_socklist *iml; struct net *net = sock_net(sk); if (!inet->mc_list) return; rtnl_lock(); while ((iml = rtnl_dereference(inet->mc_list)) != NULL) { struct in_device *in_dev; inet->mc_list = iml->next_rcu; in_dev = inetdev_by_index(net, iml->multi.imr_ifindex); (void) ip_mc_leave_src(sk, iml, in_dev); if (in_dev) ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr); /* decrease mem now to avoid the memleak warning */ atomic_sub(sizeof(*iml), &sk->sk_omem_alloc); kfree_rcu(iml, rcu); } rtnl_unlock(); } /* called with rcu_read_lock() */ int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u8 proto) { struct ip_mc_list *im; struct ip_mc_list __rcu **mc_hash; struct ip_sf_list *psf; int rv = 0; mc_hash = rcu_dereference(in_dev->mc_hash); if (mc_hash) { u32 hash = hash_32((__force u32)mc_addr, MC_HASH_SZ_LOG); for (im = rcu_dereference(mc_hash[hash]); im != NULL; im = rcu_dereference(im->next_hash)) { if (im->multiaddr == mc_addr) break; } } else { for_each_pmc_rcu(in_dev, im) { if (im->multiaddr == mc_addr) break; } } if (im && proto == IPPROTO_IGMP) { rv = 1; } else if (im) { if (src_addr) { spin_lock_bh(&im->lock); for (psf = im->sources; psf; psf = psf->sf_next) { if (psf->sf_inaddr == src_addr) break; } if (psf) rv = psf->sf_count[MCAST_INCLUDE] || psf->sf_count[MCAST_EXCLUDE] != im->sfcount[MCAST_EXCLUDE]; else rv = im->sfcount[MCAST_EXCLUDE] != 0; spin_unlock_bh(&im->lock); } else rv = 1; /* unspecified source; tentatively allow */ } return rv; } #if defined(CONFIG_PROC_FS) struct igmp_mc_iter_state { struct seq_net_private p; struct net_device *dev; struct in_device *in_dev; }; #define igmp_mc_seq_private(seq) ((struct igmp_mc_iter_state *)(seq)->private) static inline struct ip_mc_list *igmp_mc_get_first(struct seq_file *seq) { struct net *net = seq_file_net(seq); struct ip_mc_list *im = NULL; struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); state->in_dev = NULL; for_each_netdev_rcu(net, state->dev) { struct in_device *in_dev; in_dev = __in_dev_get_rcu(state->dev); if (!in_dev) continue; im = rcu_dereference(in_dev->mc_list); if (im) { state->in_dev = in_dev; break; } } return im; } static struct ip_mc_list *igmp_mc_get_next(struct seq_file *seq, struct ip_mc_list *im) { struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); im = rcu_dereference(im->next_rcu); while (!im) { state->dev = next_net_device_rcu(state->dev); if (!state->dev) { state->in_dev = NULL; break; } state->in_dev = __in_dev_get_rcu(state->dev); if (!state->in_dev) continue; im = rcu_dereference(state->in_dev->mc_list); } return im; } static struct ip_mc_list *igmp_mc_get_idx(struct seq_file *seq, loff_t pos) { struct ip_mc_list *im = igmp_mc_get_first(seq); if (im) while (pos && (im = igmp_mc_get_next(seq, im)) != NULL) --pos; return pos ? NULL : im; } static void *igmp_mc_seq_start(struct seq_file *seq, loff_t *pos) __acquires(rcu) { rcu_read_lock(); return *pos ? igmp_mc_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; } static void *igmp_mc_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct ip_mc_list *im; if (v == SEQ_START_TOKEN) im = igmp_mc_get_first(seq); else im = igmp_mc_get_next(seq, v); ++*pos; return im; } static void igmp_mc_seq_stop(struct seq_file *seq, void *v) __releases(rcu) { struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); state->in_dev = NULL; state->dev = NULL; rcu_read_unlock(); } static int igmp_mc_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) seq_puts(seq, "Idx\tDevice : Count Querier\tGroup Users Timer\tReporter\n"); else { struct ip_mc_list *im = v; struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); char *querier; long delta; #ifdef CONFIG_IP_MULTICAST querier = IGMP_V1_SEEN(state->in_dev) ? "V1" : IGMP_V2_SEEN(state->in_dev) ? "V2" : "V3"; #else querier = "NONE"; #endif if (rcu_access_pointer(state->in_dev->mc_list) == im) { seq_printf(seq, "%d\t%-10s: %5d %7s\n", state->dev->ifindex, state->dev->name, state->in_dev->mc_count, querier); } delta = im->timer.expires - jiffies; seq_printf(seq, "\t\t\t\t%08X %5d %d:%08lX\t\t%d\n", im->multiaddr, im->users, im->tm_running, im->tm_running ? jiffies_delta_to_clock_t(delta) : 0, im->reporter); } return 0; } static const struct seq_operations igmp_mc_seq_ops = { .start = igmp_mc_seq_start, .next = igmp_mc_seq_next, .stop = igmp_mc_seq_stop, .show = igmp_mc_seq_show, }; struct igmp_mcf_iter_state { struct seq_net_private p; struct net_device *dev; struct in_device *idev; struct ip_mc_list *im; }; #define igmp_mcf_seq_private(seq) ((struct igmp_mcf_iter_state *)(seq)->private) static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq) { struct net *net = seq_file_net(seq); struct ip_sf_list *psf = NULL; struct ip_mc_list *im = NULL; struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq); state->idev = NULL; state->im = NULL; for_each_netdev_rcu(net, state->dev) { struct in_device *idev; idev = __in_dev_get_rcu(state->dev); if (unlikely(!idev)) continue; im = rcu_dereference(idev->mc_list); if (likely(im)) { spin_lock_bh(&im->lock); psf = im->sources; if (likely(psf)) { state->im = im; state->idev = idev; break; } spin_unlock_bh(&im->lock); } } return psf; } static struct ip_sf_list *igmp_mcf_get_next(struct seq_file *seq, struct ip_sf_list *psf) { struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq); psf = psf->sf_next; while (!psf) { spin_unlock_bh(&state->im->lock); state->im = state->im->next; while (!state->im) { state->dev = next_net_device_rcu(state->dev); if (!state->dev) { state->idev = NULL; goto out; } state->idev = __in_dev_get_rcu(state->dev); if (!state->idev) continue; state->im = rcu_dereference(state->idev->mc_list); } spin_lock_bh(&state->im->lock); psf = state->im->sources; } out: return psf; } static struct ip_sf_list *igmp_mcf_get_idx(struct seq_file *seq, loff_t pos) { struct ip_sf_list *psf = igmp_mcf_get_first(seq); if (psf) while (pos && (psf = igmp_mcf_get_next(seq, psf)) != NULL) --pos; return pos ? NULL : psf; } static void *igmp_mcf_seq_start(struct seq_file *seq, loff_t *pos) __acquires(rcu) { rcu_read_lock(); return *pos ? igmp_mcf_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; } static void *igmp_mcf_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct ip_sf_list *psf; if (v == SEQ_START_TOKEN) psf = igmp_mcf_get_first(seq); else psf = igmp_mcf_get_next(seq, v); ++*pos; return psf; } static void igmp_mcf_seq_stop(struct seq_file *seq, void *v) __releases(rcu) { struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq); if (likely(state->im)) { spin_unlock_bh(&state->im->lock); state->im = NULL; } state->idev = NULL; state->dev = NULL; rcu_read_unlock(); } static int igmp_mcf_seq_show(struct seq_file *seq, void *v) { struct ip_sf_list *psf = v; struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq); if (v == SEQ_START_TOKEN) { seq_puts(seq, "Idx Device MCA SRC INC EXC\n"); } else { seq_printf(seq, "%3d %6.6s 0x%08x " "0x%08x %6lu %6lu\n", state->dev->ifindex, state->dev->name, ntohl(state->im->multiaddr), ntohl(psf->sf_inaddr), psf->sf_count[MCAST_INCLUDE], psf->sf_count[MCAST_EXCLUDE]); } return 0; } static const struct seq_operations igmp_mcf_seq_ops = { .start = igmp_mcf_seq_start, .next = igmp_mcf_seq_next, .stop = igmp_mcf_seq_stop, .show = igmp_mcf_seq_show, }; static int __net_init igmp_net_init(struct net *net) { struct proc_dir_entry *pde; int err; pde = proc_create_net("igmp", 0444, net->proc_net, &igmp_mc_seq_ops, sizeof(struct igmp_mc_iter_state)); if (!pde) goto out_igmp; pde = proc_create_net("mcfilter", 0444, net->proc_net, &igmp_mcf_seq_ops, sizeof(struct igmp_mcf_iter_state)); if (!pde) goto out_mcfilter; err = inet_ctl_sock_create(&net->ipv4.mc_autojoin_sk, AF_INET, SOCK_DGRAM, 0, net); if (err < 0) { pr_err("Failed to initialize the IGMP autojoin socket (err %d)\n", err); goto out_sock; } return 0; out_sock: remove_proc_entry("mcfilter", net->proc_net); out_mcfilter: remove_proc_entry("igmp", net->proc_net); out_igmp: return -ENOMEM; } static void __net_exit igmp_net_exit(struct net *net) { remove_proc_entry("mcfilter", net->proc_net); remove_proc_entry("igmp", net->proc_net); inet_ctl_sock_destroy(net->ipv4.mc_autojoin_sk); } static struct pernet_operations igmp_net_ops = { .init = igmp_net_init, .exit = igmp_net_exit, }; #endif static int igmp_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct in_device *in_dev; switch (event) { case NETDEV_RESEND_IGMP: in_dev = __in_dev_get_rtnl(dev); if (in_dev) ip_mc_rejoin_groups(in_dev); break; default: break; } return NOTIFY_DONE; } static struct notifier_block igmp_notifier = { .notifier_call = igmp_netdev_event, }; int __init igmp_mc_init(void) { #if defined(CONFIG_PROC_FS) int err; err = register_pernet_subsys(&igmp_net_ops); if (err) return err; err = register_netdevice_notifier(&igmp_notifier); if (err) goto reg_notif_fail; return 0; reg_notif_fail: unregister_pernet_subsys(&igmp_net_ops); return err; #else return register_netdevice_notifier(&igmp_notifier); #endif }
470 186 177 8 187 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/xattr_security.c * Handler for storing security labels as extended attributes. */ #include <linux/string.h> #include <linux/fs.h> #include <linux/security.h> #include <linux/slab.h> #include "ext4_jbd2.h" #include "ext4.h" #include "xattr.h" static int ext4_xattr_security_get(const struct xattr_handler *handler, struct dentry *unused, struct inode *inode, const char *name, void *buffer, size_t size) { return ext4_xattr_get(inode, EXT4_XATTR_INDEX_SECURITY, name, buffer, size); } static int ext4_xattr_security_set(const struct xattr_handler *handler, struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) { return ext4_xattr_set(inode, EXT4_XATTR_INDEX_SECURITY, name, value, size, flags); } static int ext4_initxattrs(struct inode *inode, const struct xattr *xattr_array, void *fs_info) { const struct xattr *xattr; handle_t *handle = fs_info; int err = 0; for (xattr = xattr_array; xattr->name != NULL; xattr++) { err = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_SECURITY, xattr->name, xattr->value, xattr->value_len, XATTR_CREATE); if (err < 0) break; } return err; } int ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir, const struct qstr *qstr) { return security_inode_init_security(inode, dir, qstr, &ext4_initxattrs, handle); } const struct xattr_handler ext4_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .get = ext4_xattr_security_get, .set = ext4_xattr_security_set, };
126 126 126 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2013 HUAWEI * Author: Cai Zhiyong <caizhiyong@huawei.com> * * Read block device partition table from the command line. * Typically used for fixed block (eMMC) embedded devices. * It has no MBR, so saves storage space. Bootloader can be easily accessed * by absolute address of data on the block device. * Users can easily change the partition. * * The format for the command line is just like mtdparts. * * For further information, see "Documentation/block/cmdline-partition.rst" * */ #include <linux/blkdev.h> #include <linux/fs.h> #include <linux/slab.h> #include "check.h" /* partition flags */ #define PF_RDONLY 0x01 /* Device is read only */ #define PF_POWERUP_LOCK 0x02 /* Always locked after reset */ struct cmdline_subpart { char name[BDEVNAME_SIZE]; /* partition name, such as 'rootfs' */ sector_t from; sector_t size; int flags; struct cmdline_subpart *next_subpart; }; struct cmdline_parts { char name[BDEVNAME_SIZE]; /* block device, such as 'mmcblk0' */ unsigned int nr_subparts; struct cmdline_subpart *subpart; struct cmdline_parts *next_parts; }; static int parse_subpart(struct cmdline_subpart **subpart, char *partdef) { int ret = 0; struct cmdline_subpart *new_subpart; *subpart = NULL; new_subpart = kzalloc_obj(struct cmdline_subpart); if (!new_subpart) return -ENOMEM; if (*partdef == '-') { new_subpart->size = (sector_t)(~0ULL); partdef++; } else { new_subpart->size = (sector_t)memparse(partdef, &partdef); if (new_subpart->size < (sector_t)PAGE_SIZE) { pr_warn("cmdline partition size is invalid."); ret = -EINVAL; goto fail; } } if (*partdef == '@') { partdef++; new_subpart->from = (sector_t)memparse(partdef, &partdef); } else { new_subpart->from = (sector_t)(~0ULL); } if (*partdef == '(') { partdef++; char *next = strsep(&partdef, ")"); if (!next) { pr_warn("cmdline partition format is invalid."); ret = -EINVAL; goto fail; } strscpy(new_subpart->name, next, sizeof(new_subpart->name)); } else new_subpart->name[0] = '\0'; new_subpart->flags = 0; if (!strncmp(partdef, "ro", 2)) { new_subpart->flags |= PF_RDONLY; partdef += 2; } if (!strncmp(partdef, "lk", 2)) { new_subpart->flags |= PF_POWERUP_LOCK; partdef += 2; } *subpart = new_subpart; return 0; fail: kfree(new_subpart); return ret; } static void free_subpart(struct cmdline_parts *parts) { struct cmdline_subpart *subpart; while (parts->subpart) { subpart = parts->subpart; parts->subpart = subpart->next_subpart; kfree(subpart); } } static int parse_parts(struct cmdline_parts **parts, char *bdevdef) { int ret = -EINVAL; char *next; struct cmdline_subpart **next_subpart; struct cmdline_parts *newparts; *parts = NULL; newparts = kzalloc_obj(struct cmdline_parts); if (!newparts) return -ENOMEM; next = strsep(&bdevdef, ":"); if (!next) { pr_warn("cmdline partition has no block device."); goto fail; } strscpy(newparts->name, next, sizeof(newparts->name)); newparts->nr_subparts = 0; next_subpart = &newparts->subpart; while ((next = strsep(&bdevdef, ","))) { ret = parse_subpart(next_subpart, next); if (ret) goto fail; newparts->nr_subparts++; next_subpart = &(*next_subpart)->next_subpart; } if (!newparts->subpart) { pr_warn("cmdline partition has no valid partition."); ret = -EINVAL; goto fail; } *parts = newparts; return 0; fail: free_subpart(newparts); kfree(newparts); return ret; } static void cmdline_parts_free(struct cmdline_parts **parts) { struct cmdline_parts *next_parts; while (*parts) { next_parts = (*parts)->next_parts; free_subpart(*parts); kfree(*parts); *parts = next_parts; } } static int cmdline_parts_parse(struct cmdline_parts **parts, const char *cmdline) { int ret; char *buf; char *pbuf; char *next; struct cmdline_parts **next_parts; *parts = NULL; pbuf = buf = kstrdup(cmdline, GFP_KERNEL); if (!buf) return -ENOMEM; next_parts = parts; while ((next = strsep(&pbuf, ";"))) { ret = parse_parts(next_parts, next); if (ret) goto fail; next_parts = &(*next_parts)->next_parts; } if (!*parts) { pr_warn("cmdline partition has no valid partition."); ret = -EINVAL; goto fail; } ret = 0; done: kfree(buf); return ret; fail: cmdline_parts_free(parts); goto done; } static struct cmdline_parts *cmdline_parts_find(struct cmdline_parts *parts, const char *bdev) { while (parts && strncmp(bdev, parts->name, sizeof(parts->name))) parts = parts->next_parts; return parts; } static char *cmdline; static struct cmdline_parts *bdev_parts; static int add_part(int slot, struct cmdline_subpart *subpart, struct parsed_partitions *state) { struct partition_meta_info *info; char tmp[sizeof(info->volname) + 4]; if (slot >= state->limit) return 1; put_partition(state, slot, subpart->from >> 9, subpart->size >> 9); if (subpart->flags & PF_RDONLY) state->parts[slot].flags |= ADDPART_FLAG_READONLY; info = &state->parts[slot].info; strscpy(info->volname, subpart->name, sizeof(info->volname)); snprintf(tmp, sizeof(tmp), "(%s)", info->volname); strlcat(state->pp_buf, tmp, PAGE_SIZE); state->parts[slot].has_info = true; return 0; } static int cmdline_parts_set(struct cmdline_parts *parts, sector_t disk_size, struct parsed_partitions *state) { sector_t from = 0; struct cmdline_subpart *subpart; int slot = 1; for (subpart = parts->subpart; subpart; subpart = subpart->next_subpart, slot++) { if (subpart->from == (sector_t)(~0ULL)) subpart->from = from; else from = subpart->from; if (from >= disk_size) break; if (subpart->size > (disk_size - from)) subpart->size = disk_size - from; from += subpart->size; if (add_part(slot, subpart, state)) break; } return slot; } static int __init cmdline_parts_setup(char *s) { cmdline = s; return 1; } __setup("blkdevparts=", cmdline_parts_setup); static bool has_overlaps(sector_t from, sector_t size, sector_t from2, sector_t size2) { sector_t end = from + size; sector_t end2 = from2 + size2; if (from >= from2 && from < end2) return true; if (end > from2 && end <= end2) return true; if (from2 >= from && from2 < end) return true; if (end2 > from && end2 <= end) return true; return false; } static inline void overlaps_warns_header(void) { pr_warn("Overlapping partitions are used in command line partitions."); pr_warn("Don't use filesystems on overlapping partitions:"); } static void cmdline_parts_verifier(int slot, struct parsed_partitions *state) { int i; bool header = true; for (; slot < state->limit && state->parts[slot].has_info; slot++) { for (i = slot+1; i < state->limit && state->parts[i].has_info; i++) { if (has_overlaps(state->parts[slot].from, state->parts[slot].size, state->parts[i].from, state->parts[i].size)) { if (header) { header = false; overlaps_warns_header(); } pr_warn("%s[%llu,%llu] overlaps with " "%s[%llu,%llu].", state->parts[slot].info.volname, (u64)state->parts[slot].from << 9, (u64)state->parts[slot].size << 9, state->parts[i].info.volname, (u64)state->parts[i].from << 9, (u64)state->parts[i].size << 9); } } } } /* * Purpose: allocate cmdline partitions. * Returns: * -1 if unable to read the partition table * 0 if this isn't our partition table * 1 if successful */ int cmdline_partition(struct parsed_partitions *state) { sector_t disk_size; struct cmdline_parts *parts; if (cmdline) { if (bdev_parts) cmdline_parts_free(&bdev_parts); if (cmdline_parts_parse(&bdev_parts, cmdline)) { cmdline = NULL; return -1; } cmdline = NULL; } if (!bdev_parts) return 0; parts = cmdline_parts_find(bdev_parts, state->disk->disk_name); if (!parts) return 0; disk_size = get_capacity(state->disk) << 9; cmdline_parts_set(parts, disk_size, state); cmdline_parts_verifier(1, state); strlcat(state->pp_buf, "\n", PAGE_SIZE); return 1; }
40 21 38 70 71 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 /* * linux/fs/nls/nls_koi8-ru.c * * Charset koi8-ru translation based on charset koi8-u. * The Unicode to charset table has only exact mappings. */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/nls.h> #include <linux/errno.h> static struct nls_table *p_nls; static int uni2char(const wchar_t uni, unsigned char *out, int boundlen) { if (boundlen <= 0) return -ENAMETOOLONG; if ((uni & 0xffaf) == 0x040e || (uni & 0xffce) == 0x254c) { /* koi8-ru and koi8-u differ only on two characters */ if (uni == 0x040e) out[0] = 0xbe; else if (uni == 0x045e) out[0] = 0xae; else if (uni == 0x255d || uni == 0x256c) return 0; else return p_nls->uni2char(uni, out, boundlen); return 1; } else /* fast path */ return p_nls->uni2char(uni, out, boundlen); } static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) { int n; if ((*rawstring & 0xef) != 0xae) { /* koi8-ru and koi8-u differ only on two characters */ *uni = (*rawstring & 0x10) ? 0x040e : 0x045e; return 1; } n = p_nls->char2uni(rawstring, boundlen, uni); return n; } static struct nls_table table = { .charset = "koi8-ru", .uni2char = uni2char, .char2uni = char2uni, }; static int __init init_nls_koi8_ru(void) { p_nls = load_nls("koi8-u"); if (p_nls) { table.charset2upper = p_nls->charset2upper; table.charset2lower = p_nls->charset2lower; return register_nls(&table); } return -EINVAL; } static void __exit exit_nls_koi8_ru(void) { unregister_nls(&table); unload_nls(p_nls); } module_init(init_nls_koi8_ru) module_exit(exit_nls_koi8_ru) MODULE_DESCRIPTION("NLS KOI8-RU (Belarusian)"); MODULE_LICENSE("Dual BSD/GPL");
388 220 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_RATELIMIT_H #define _LINUX_RATELIMIT_H #include <linux/ratelimit_types.h> #include <linux/sched.h> #include <linux/spinlock.h> static inline void ratelimit_state_init(struct ratelimit_state *rs, int interval, int burst) { memset(rs, 0, sizeof(*rs)); raw_spin_lock_init(&rs->lock); rs->interval = interval; rs->burst = burst; } static inline void ratelimit_default_init(struct ratelimit_state *rs) { return ratelimit_state_init(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); } static inline void ratelimit_state_inc_miss(struct ratelimit_state *rs) { atomic_inc(&rs->missed); } static inline int ratelimit_state_get_miss(struct ratelimit_state *rs) { return atomic_read(&rs->missed); } static inline int ratelimit_state_reset_miss(struct ratelimit_state *rs) { return atomic_xchg_relaxed(&rs->missed, 0); } static inline void ratelimit_state_reset_interval(struct ratelimit_state *rs, int interval_init) { unsigned long flags; raw_spin_lock_irqsave(&rs->lock, flags); rs->interval = interval_init; rs->flags &= ~RATELIMIT_INITIALIZED; atomic_set(&rs->rs_n_left, rs->burst); ratelimit_state_reset_miss(rs); raw_spin_unlock_irqrestore(&rs->lock, flags); } static inline void ratelimit_state_exit(struct ratelimit_state *rs) { int m; if (!(rs->flags & RATELIMIT_MSG_ON_RELEASE)) return; m = ratelimit_state_reset_miss(rs); if (m) pr_warn("%s: %d output lines suppressed due to ratelimiting\n", current->comm, m); } static inline void ratelimit_set_flags(struct ratelimit_state *rs, unsigned long flags) { rs->flags = flags; } extern struct ratelimit_state printk_ratelimit_state; #ifdef CONFIG_PRINTK #define WARN_ON_RATELIMIT(condition, state) ({ \ bool __rtn_cond = !!(condition); \ WARN_ON(__rtn_cond && __ratelimit(state)); \ __rtn_cond; \ }) #define WARN_RATELIMIT(condition, format, ...) \ ({ \ static DEFINE_RATELIMIT_STATE(_rs, \ DEFAULT_RATELIMIT_INTERVAL, \ DEFAULT_RATELIMIT_BURST); \ int rtn = !!(condition); \ \ if (unlikely(rtn && __ratelimit(&_rs))) \ WARN(rtn, format, ##__VA_ARGS__); \ \ rtn; \ }) #else #define WARN_ON_RATELIMIT(condition, state) \ WARN_ON(condition) #define WARN_RATELIMIT(condition, format, ...) \ ({ \ int rtn = WARN(condition, format, ##__VA_ARGS__); \ rtn; \ }) #endif #endif /* _LINUX_RATELIMIT_H */
8442 8500 8535 8904 8912 20 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __X86_KERNEL_FPU_CONTEXT_H #define __X86_KERNEL_FPU_CONTEXT_H #include <asm/fpu/xstate.h> #include <asm/trace/fpu.h> /* Functions related to FPU context tracking */ /* * The in-register FPU state for an FPU context on a CPU is assumed to be * valid if the fpu->last_cpu matches the CPU, and the fpu_fpregs_owner_ctx * matches the FPU. * * If the FPU register state is valid, the kernel can skip restoring the * FPU state from memory. * * Any code that clobbers the FPU registers or updates the in-memory * FPU state for a task MUST let the rest of the kernel know that the * FPU registers are no longer valid for this task. * * Invalidate a resource you control: CPU if using the CPU for something else * (with preemption disabled), FPU for the current task, or a task that * is prevented from running by the current task. */ static inline void __cpu_invalidate_fpregs_state(void) { __this_cpu_write(fpu_fpregs_owner_ctx, NULL); } static inline void __fpu_invalidate_fpregs_state(struct fpu *fpu) { fpu->last_cpu = -1; } static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu) { return fpu == this_cpu_read(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu; } static inline void fpregs_deactivate(struct fpu *fpu) { __this_cpu_write(fpu_fpregs_owner_ctx, NULL); trace_x86_fpu_regs_deactivated(fpu); } static inline void fpregs_activate(struct fpu *fpu) { __this_cpu_write(fpu_fpregs_owner_ctx, fpu); trace_x86_fpu_regs_activated(fpu); } /* Internal helper for switch_fpu_return() and signal frame setup */ static inline void fpregs_restore_userregs(void) { struct fpu *fpu = x86_task_fpu(current); int cpu = smp_processor_id(); if (WARN_ON_ONCE(current->flags & (PF_KTHREAD | PF_USER_WORKER))) return; if (!fpregs_state_valid(fpu, cpu)) { /* * This restores _all_ xstate which has not been * established yet. * * If PKRU is enabled, then the PKRU value is already * correct because it was either set in switch_to() or in * flush_thread(). So it is excluded because it might be * not up to date in current->thread.fpu->xsave state. * * XFD state is handled in restore_fpregs_from_fpstate(). */ restore_fpregs_from_fpstate(fpu->fpstate, XFEATURE_MASK_FPSTATE); fpregs_activate(fpu); fpu->last_cpu = cpu; } clear_thread_flag(TIF_NEED_FPU_LOAD); } #endif
6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2011 STRATO. All rights reserved. */ #ifndef BTRFS_BACKREF_H #define BTRFS_BACKREF_H #include <linux/types.h> #include <linux/rbtree.h> #include <linux/list.h> #include <linux/slab.h> #include <uapi/linux/btrfs.h> #include <uapi/linux/btrfs_tree.h> #include "messages.h" #include "locking.h" #include "disk-io.h" #include "extent_io.h" #include "ctree.h" struct extent_inode_elem; struct ulist; struct btrfs_extent_item; struct btrfs_trans_handle; struct btrfs_fs_info; /* * Used by implementations of iterate_extent_inodes_t (see definition below) to * signal that backref iteration can stop immediately and no error happened. * The value must be non-negative and must not be 0, 1 (which is a common return * value from things like btrfs_search_slot() and used internally in the backref * walking code) and different from BACKREF_FOUND_SHARED and * BACKREF_FOUND_NOT_SHARED */ #define BTRFS_ITERATE_EXTENT_INODES_STOP 5 /* * Should return 0 if no errors happened and iteration of backrefs should * continue. Can return BTRFS_ITERATE_EXTENT_INODES_STOP or any other non-zero * value to immediately stop iteration and possibly signal an error back to * the caller. */ typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 num_bytes, u64 root, void *ctx); /* * Context and arguments for backref walking functions. Some of the fields are * to be filled by the caller of such functions while other are filled by the * functions themselves, as described below. */ struct btrfs_backref_walk_ctx { /* * The address of the extent for which we are doing backref walking. * Can be either a data extent or a metadata extent. * * Must always be set by the top level caller. */ u64 bytenr; /* * Offset relative to the target extent. This is only used for data * extents, and it's meaningful because we can have file extent items * that point only to a section of a data extent ("bookend" extents), * and we want to filter out any that don't point to a section of the * data extent containing the given offset. * * Must always be set by the top level caller. */ u64 extent_item_pos; /* * If true and bytenr corresponds to a data extent, then references from * all file extent items that point to the data extent are considered, * @extent_item_pos is ignored. */ bool ignore_extent_item_pos; /* * If true and bytenr corresponds to a data extent, then the inode list * (each member describing inode number, file offset and root) is not * added to each reference added to the @refs ulist. */ bool skip_inode_ref_list; /* A valid transaction handle or NULL. */ struct btrfs_trans_handle *trans; /* * The file system's info object, can not be NULL. * * Must always be set by the top level caller. */ struct btrfs_fs_info *fs_info; /* * Time sequence acquired from btrfs_get_tree_mod_seq(), in case the * caller joined the tree mod log to get a consistent view of b+trees * while we do backref walking, or BTRFS_SEQ_LAST. * When using BTRFS_SEQ_LAST, delayed refs are not checked and it uses * commit roots when searching b+trees - this is a special case for * qgroups used during a transaction commit. */ u64 time_seq; /* * Used to collect the bytenr of metadata extents that point to the * target extent. */ struct ulist *refs; /* * List used to collect the IDs of the roots from which the target * extent is accessible. Can be NULL in case the caller does not care * about collecting root IDs. */ struct ulist *roots; /* * Used by iterate_extent_inodes() and the main backref walk code * (find_parent_nodes()). Lookup and store functions for an optional * cache which maps the logical address (bytenr) of leaves to an array * of root IDs. */ bool (*cache_lookup)(u64 leaf_bytenr, void *user_ctx, const u64 **root_ids_ret, int *root_count_ret); void (*cache_store)(u64 leaf_bytenr, const struct ulist *root_ids, void *user_ctx); /* * If this is not NULL, then the backref walking code will call this * for each indirect data extent reference as soon as it finds one, * before collecting all the remaining backrefs and before resolving * indirect backrefs. This allows for the caller to terminate backref * walking as soon as it finds one backref that matches some specific * criteria. The @cache_lookup and @cache_store callbacks should not * be NULL in order to use this callback. */ iterate_extent_inodes_t *indirect_ref_iterator; /* * If this is not NULL, then the backref walking code will call this for * each extent item it's meant to process before it actually starts * processing it. If this returns anything other than 0, then it stops * the backref walking code immediately. */ int (*check_extent_item)(u64 bytenr, const struct btrfs_extent_item *ei, const struct extent_buffer *leaf, void *user_ctx); /* * If this is not NULL, then the backref walking code will call this for * each extent data ref it finds (BTRFS_EXTENT_DATA_REF_KEY keys) before * processing that data ref. If this callback return false, then it will * ignore this data ref and it will never resolve the indirect data ref, * saving time searching for leaves in a fs tree with file extent items * matching the data ref. */ bool (*skip_data_ref)(u64 root, u64 ino, u64 offset, void *user_ctx); /* Context object to pass to the callbacks defined above. */ void *user_ctx; }; struct inode_fs_paths { struct btrfs_path *btrfs_path; struct btrfs_root *fs_root; struct btrfs_data_container *fspath; }; struct btrfs_backref_shared_cache_entry { u64 bytenr; u64 gen; bool is_shared; }; #define BTRFS_BACKREF_CTX_PREV_EXTENTS_SIZE 8 struct btrfs_backref_share_check_ctx { /* Ulists used during backref walking. */ struct ulist refs; /* * The current leaf the caller of btrfs_is_data_extent_shared() is at. * Typically the caller (at the moment only fiemap) tries to determine * the sharedness of data extents point by file extent items from entire * leaves. */ u64 curr_leaf_bytenr; /* * The previous leaf the caller was at in the previous call to * btrfs_is_data_extent_shared(). This may be the same as the current * leaf. On the first call it must be 0. */ u64 prev_leaf_bytenr; /* * A path from a root to a leaf that has a file extent item pointing to * a given data extent should never exceed the maximum b+tree height. */ struct btrfs_backref_shared_cache_entry path_cache_entries[BTRFS_MAX_LEVEL]; bool use_path_cache; /* * Cache the sharedness result for the last few extents we have found, * but only for extents for which we have multiple file extent items * that point to them. * It's very common to have several file extent items that point to the * same extent (bytenr) but with different offsets and lengths. This * typically happens for COW writes, partial writes into prealloc * extents, NOCOW writes after snapshotting a root, hole punching or * reflinking within the same file (less common perhaps). * So keep a small cache with the lookup results for the extent pointed * by the last few file extent items. This cache is checked, with a * linear scan, whenever btrfs_is_data_extent_shared() is called, so * it must be small so that it does not negatively affect performance in * case we don't have multiple file extent items that point to the same * data extent. */ struct { u64 bytenr; bool is_shared; } prev_extents_cache[BTRFS_BACKREF_CTX_PREV_EXTENTS_SIZE]; /* * The slot in the prev_extents_cache array that will be used for * storing the sharedness result of a new data extent. */ int prev_extents_cache_slot; }; struct btrfs_backref_share_check_ctx *btrfs_alloc_backref_share_check_ctx(void); void btrfs_free_backref_share_ctx(struct btrfs_backref_share_check_ctx *ctx); int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, struct btrfs_path *path, struct btrfs_key *found_key, u64 *flags); int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, struct btrfs_key *key, struct btrfs_extent_item *ei, u32 item_size, u64 *out_root, u8 *out_level); int iterate_extent_inodes(struct btrfs_backref_walk_ctx *ctx, bool search_commit_root, iterate_extent_inodes_t *iterate, void *user_ctx); int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, void *ctx, bool ignore_offset); int paths_from_inode(u64 inum, struct inode_fs_paths *ipath); int btrfs_find_all_leafs(struct btrfs_backref_walk_ctx *ctx); int btrfs_find_all_roots(struct btrfs_backref_walk_ctx *ctx, bool skip_commit_root_sem); char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, u32 name_len, unsigned long name_off, struct extent_buffer *eb_in, u64 parent, char *dest, u32 size); struct btrfs_data_container *init_data_container(u32 total_bytes); struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, struct btrfs_path *path); DEFINE_FREE(inode_fs_paths, struct inode_fs_paths *, if (_T) { kvfree(_T->fspath); kfree(_T); }) int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, u64 start_off, struct btrfs_path *path, struct btrfs_inode_extref **ret_extref, u64 *found_off); int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, u64 extent_gen, struct btrfs_backref_share_check_ctx *ctx); int __init btrfs_prelim_ref_init(void); void __cold btrfs_prelim_ref_exit(void); struct prelim_ref { struct rb_node rbnode; u64 root_id; struct btrfs_key key_for_search; u8 level; int count; struct extent_inode_elem *inode_list; u64 parent; u64 wanted_disk_byte; }; /* * Iterate backrefs of one extent. * * Now it only supports iteration of tree block in commit root. */ struct btrfs_backref_iter { u64 bytenr; struct btrfs_path *path; struct btrfs_fs_info *fs_info; struct btrfs_key cur_key; u32 item_ptr; u32 cur_ptr; u32 end_ptr; }; struct btrfs_backref_iter *btrfs_backref_iter_alloc(struct btrfs_fs_info *fs_info); /* * For metadata with EXTENT_ITEM key (non-skinny) case, the first inline data * is btrfs_tree_block_info, without a btrfs_extent_inline_ref header. * * This helper determines if that's the case. */ static inline bool btrfs_backref_has_tree_block_info( struct btrfs_backref_iter *iter) { if (iter->cur_key.type == BTRFS_EXTENT_ITEM_KEY && iter->cur_ptr - iter->item_ptr == sizeof(struct btrfs_extent_item)) return true; return false; } int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr); int btrfs_backref_iter_next(struct btrfs_backref_iter *iter); /* * Backref cache related structures * * The whole objective of backref_cache is to build a bi-directional map * of tree blocks (represented by backref_node) and all their parents. */ /* * Represent a tree block in the backref cache */ struct btrfs_backref_node { union{ /* Use rb_simple_node for search/insert */ struct { struct rb_node rb_node; u64 bytenr; }; struct rb_simple_node simple_node; }; /* * This is a sanity check, whenever we COW a block we will update * new_bytenr with it's current location, and we will check this in * various places to validate that the cache makes sense, it shouldn't * be used for anything else. */ u64 new_bytenr; /* Objectid of tree block owner, can be not uptodate */ u64 owner; /* Link to pending, changed or detached list */ struct list_head list; /* List of upper level edges, which link this node to its parents */ struct list_head upper; /* List of lower level edges, which link this node to its children */ struct list_head lower; /* NULL if this node is not tree root */ struct btrfs_root *root; /* Extent buffer got by COWing the block */ struct extent_buffer *eb; /* Level of the tree block */ unsigned int level:8; /* Is the extent buffer locked */ unsigned int locked:1; /* Has the block been processed */ unsigned int processed:1; /* Have backrefs of this block been checked */ unsigned int checked:1; /* * 1 if corresponding block has been COWed but some upper level block * pointers may not point to the new location */ unsigned int pending:1; /* 1 if the backref node isn't connected to any other backref node */ unsigned int detached:1; /* * For generic purpose backref cache, where we only care if it's a reloc * root, doesn't care the source subvolid. */ unsigned int is_reloc_root:1; }; #define LOWER 0 #define UPPER 1 /* * Represent an edge connecting upper and lower backref nodes. */ struct btrfs_backref_edge { /* * list[LOWER] is linked to btrfs_backref_node::upper of lower level * node, and list[UPPER] is linked to btrfs_backref_node::lower of * upper level node. * * Also, build_backref_tree() uses list[UPPER] for pending edges, before * linking list[UPPER] to its upper level nodes. */ struct list_head list[2]; /* Two related nodes */ struct btrfs_backref_node *node[2]; }; struct btrfs_backref_cache { /* Red black tree of all backref nodes in the cache */ struct rb_root rb_root; /* For passing backref nodes to btrfs_reloc_cow_block */ struct btrfs_backref_node *path[BTRFS_MAX_LEVEL]; /* * List of blocks that have been COWed but some block pointers in upper * level blocks may not reflect the new location */ struct list_head pending[BTRFS_MAX_LEVEL]; u64 last_trans; int nr_nodes; int nr_edges; /* List of unchecked backref edges during backref cache build */ struct list_head pending_edge; /* List of useless backref nodes during backref cache build */ struct list_head useless_node; struct btrfs_fs_info *fs_info; /* * Whether this cache is for relocation * * Relocation backref cache require more info for reloc root compared * to generic backref cache. */ bool is_reloc; }; void btrfs_backref_init_cache(struct btrfs_fs_info *fs_info, struct btrfs_backref_cache *cache, bool is_reloc); struct btrfs_backref_node *btrfs_backref_alloc_node( struct btrfs_backref_cache *cache, u64 bytenr, int level); struct btrfs_backref_edge *btrfs_backref_alloc_edge( struct btrfs_backref_cache *cache); void btrfs_backref_free_node(struct btrfs_backref_cache *cache, struct btrfs_backref_node *node); void btrfs_backref_free_edge(struct btrfs_backref_cache *cache, struct btrfs_backref_edge *edge); void btrfs_backref_unlock_node_buffer(struct btrfs_backref_node *node); void btrfs_backref_drop_node_buffer(struct btrfs_backref_node *node); void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache, struct btrfs_backref_node *node); void btrfs_backref_drop_node(struct btrfs_backref_cache *tree, struct btrfs_backref_node *node); void btrfs_backref_release_cache(struct btrfs_backref_cache *cache); static inline void btrfs_backref_panic(struct btrfs_fs_info *fs_info, u64 bytenr, int error) { btrfs_panic(fs_info, error, "Inconsistency in backref cache found at offset %llu", bytenr); } int btrfs_backref_add_tree_node(struct btrfs_trans_handle *trans, struct btrfs_backref_cache *cache, struct btrfs_path *path, struct btrfs_backref_iter *iter, struct btrfs_key *node_key, struct btrfs_backref_node *cur); int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache, struct btrfs_backref_node *start); void btrfs_backref_error_cleanup(struct btrfs_backref_cache *cache, struct btrfs_backref_node *node); #endif
69 69 69 69 69 2 69 68 1 67 67 8 65 5 62 5 5 103 69 66 60 60 60 103 103 101 69 69 66 9 133 73 60 284 182 102 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 // SPDX-License-Identifier: GPL-2.0-only /* * blockcheck.c * * Checksum and ECC codes for the OCFS2 userspace library. * * Copyright (C) 2006, 2008 Oracle. All rights reserved. */ #include <linux/kernel.h> #include <linux/types.h> #include <linux/crc32.h> #include <linux/buffer_head.h> #include <linux/bitops.h> #include <linux/debugfs.h> #include <linux/module.h> #include <linux/fs.h> #include <asm/byteorder.h> #include <cluster/masklog.h> #include "ocfs2.h" #include "blockcheck.h" /* * We use the following conventions: * * d = # data bits * p = # parity bits * c = # total code bits (d + p) */ /* * Calculate the bit offset in the hamming code buffer based on the bit's * offset in the data buffer. Since the hamming code reserves all * power-of-two bits for parity, the data bit number and the code bit * number are offset by all the parity bits beforehand. * * Recall that bit numbers in hamming code are 1-based. This function * takes the 0-based data bit from the caller. * * An example. Take bit 1 of the data buffer. 1 is a power of two (2^0), * so it's a parity bit. 2 is a power of two (2^1), so it's a parity bit. * 3 is not a power of two. So bit 1 of the data buffer ends up as bit 3 * in the code buffer. * * The caller can pass in *p if it wants to keep track of the most recent * number of parity bits added. This allows the function to start the * calculation at the last place. */ static unsigned int calc_code_bit(unsigned int i, unsigned int *p_cache) { unsigned int b, p = 0; /* * Data bits are 0-based, but we're talking code bits, which * are 1-based. */ b = i + 1; /* Use the cache if it is there */ if (p_cache) p = *p_cache; b += p; /* * For every power of two below our bit number, bump our bit. * * We compare with (b + 1) because we have to compare with what b * would be _if_ it were bumped up by the parity bit. Capice? * * p is set above. */ for (; (1 << p) < (b + 1); p++) b++; if (p_cache) *p_cache = p; return b; } /* * This is the low level encoder function. It can be called across * multiple hunks just like the crc32 code. 'd' is the number of bits * _in_this_hunk_. nr is the bit offset of this hunk. So, if you had * two 512B buffers, you would do it like so: * * parity = ocfs2_hamming_encode(0, buf1, 512 * 8, 0); * parity = ocfs2_hamming_encode(parity, buf2, 512 * 8, 512 * 8); * * If you just have one buffer, use ocfs2_hamming_encode_block(). */ u32 ocfs2_hamming_encode(u32 parity, void *data, unsigned int d, unsigned int nr) { unsigned int i, b, p = 0; BUG_ON(!d); /* * b is the hamming code bit number. Hamming code specifies a * 1-based array, but C uses 0-based. So 'i' is for C, and 'b' is * for the algorithm. * * The i++ in the for loop is so that the start offset passed * to ocfs2_find_next_bit_set() is one greater than the previously * found bit. */ for (i = 0; (i = ocfs2_find_next_bit(data, d, i)) < d; i++) { /* * i is the offset in this hunk, nr + i is the total bit * offset. */ b = calc_code_bit(nr + i, &p); /* * Data bits in the resultant code are checked by * parity bits that are part of the bit number * representation. Huh? * * <wikipedia href="https://en.wikipedia.org/wiki/Hamming_code"> * In other words, the parity bit at position 2^k * checks bits in positions having bit k set in * their binary representation. Conversely, for * instance, bit 13, i.e. 1101(2), is checked by * bits 1000(2) = 8, 0100(2)=4 and 0001(2) = 1. * </wikipedia> * * Note that 'k' is the _code_ bit number. 'b' in * our loop. */ parity ^= b; } /* While the data buffer was treated as little endian, the * return value is in host endian. */ return parity; } u32 ocfs2_hamming_encode_block(void *data, unsigned int blocksize) { return ocfs2_hamming_encode(0, data, blocksize * 8, 0); } /* * Like ocfs2_hamming_encode(), this can handle hunks. nr is the bit * offset of the current hunk. If bit to be fixed is not part of the * current hunk, this does nothing. * * If you only have one hunk, use ocfs2_hamming_fix_block(). */ void ocfs2_hamming_fix(void *data, unsigned int d, unsigned int nr, unsigned int fix) { unsigned int i, b; BUG_ON(!d); /* * If the bit to fix has an hweight of 1, it's a parity bit. One * busted parity bit is its own error. Nothing to do here. */ if (hweight32(fix) == 1) return; /* * nr + d is the bit right past the data hunk we're looking at. * If fix after that, nothing to do */ if (fix >= calc_code_bit(nr + d, NULL)) return; /* * nr is the offset in the data hunk we're starting at. Let's * start b at the offset in the code buffer. See hamming_encode() * for a more detailed description of 'b'. */ b = calc_code_bit(nr, NULL); /* If the fix is before this hunk, nothing to do */ if (fix < b) return; for (i = 0; i < d; i++, b++) { /* Skip past parity bits */ while (hweight32(b) == 1) b++; /* * i is the offset in this data hunk. * nr + i is the offset in the total data buffer. * b is the offset in the total code buffer. * * Thus, when b == fix, bit i in the current hunk needs * fixing. */ if (b == fix) { if (ocfs2_test_bit(i, data)) ocfs2_clear_bit(i, data); else ocfs2_set_bit(i, data); break; } } } void ocfs2_hamming_fix_block(void *data, unsigned int blocksize, unsigned int fix) { ocfs2_hamming_fix(data, blocksize * 8, 0, fix); } /* * Debugfs handling. */ #ifdef CONFIG_DEBUG_FS static int blockcheck_u64_get(void *data, u64 *val) { *val = *(u64 *)data; return 0; } DEFINE_DEBUGFS_ATTRIBUTE(blockcheck_fops, blockcheck_u64_get, NULL, "%llu\n"); static void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats *stats) { if (stats) { debugfs_remove_recursive(stats->b_debug_dir); stats->b_debug_dir = NULL; } } static void ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats, struct dentry *parent) { struct dentry *dir; dir = debugfs_create_dir("blockcheck", parent); stats->b_debug_dir = dir; debugfs_create_file("blocks_checked", S_IFREG | S_IRUSR, dir, &stats->b_check_count, &blockcheck_fops); debugfs_create_file("checksums_failed", S_IFREG | S_IRUSR, dir, &stats->b_failure_count, &blockcheck_fops); debugfs_create_file("ecc_recoveries", S_IFREG | S_IRUSR, dir, &stats->b_recover_count, &blockcheck_fops); } #else static inline void ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats, struct dentry *parent) { } static inline void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats *stats) { } #endif /* CONFIG_DEBUG_FS */ /* Always-called wrappers for starting and stopping the debugfs files */ void ocfs2_blockcheck_stats_debugfs_install(struct ocfs2_blockcheck_stats *stats, struct dentry *parent) { ocfs2_blockcheck_debug_install(stats, parent); } void ocfs2_blockcheck_stats_debugfs_remove(struct ocfs2_blockcheck_stats *stats) { ocfs2_blockcheck_debug_remove(stats); } static void ocfs2_blockcheck_inc_check(struct ocfs2_blockcheck_stats *stats) { u64 new_count; if (!stats) return; spin_lock(&stats->b_lock); stats->b_check_count++; new_count = stats->b_check_count; spin_unlock(&stats->b_lock); if (!new_count) mlog(ML_NOTICE, "Block check count has wrapped\n"); } static void ocfs2_blockcheck_inc_failure(struct ocfs2_blockcheck_stats *stats) { u64 new_count; if (!stats) return; spin_lock(&stats->b_lock); stats->b_failure_count++; new_count = stats->b_failure_count; spin_unlock(&stats->b_lock); if (!new_count) mlog(ML_NOTICE, "Checksum failure count has wrapped\n"); } static void ocfs2_blockcheck_inc_recover(struct ocfs2_blockcheck_stats *stats) { u64 new_count; if (!stats) return; spin_lock(&stats->b_lock); stats->b_recover_count++; new_count = stats->b_recover_count; spin_unlock(&stats->b_lock); if (!new_count) mlog(ML_NOTICE, "ECC recovery count has wrapped\n"); } /* * These are the low-level APIs for using the ocfs2_block_check structure. */ /* * This function generates check information for a block. * data is the block to be checked. bc is a pointer to the * ocfs2_block_check structure describing the crc32 and the ecc. * * bc should be a pointer inside data, as the function will * take care of zeroing it before calculating the check information. If * bc does not point inside data, the caller must make sure any inline * ocfs2_block_check structures are zeroed. * * The data buffer must be in on-disk endian (little endian for ocfs2). * bc will be filled with little-endian values and will be ready to go to * disk. */ void ocfs2_block_check_compute(void *data, size_t blocksize, struct ocfs2_block_check *bc) { u32 crc; u32 ecc; memset(bc, 0, sizeof(struct ocfs2_block_check)); crc = crc32_le(~0, data, blocksize); ecc = ocfs2_hamming_encode_block(data, blocksize); /* * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no * larger than 16 bits. */ BUG_ON(ecc > USHRT_MAX); bc->bc_crc32e = cpu_to_le32(crc); bc->bc_ecc = cpu_to_le16((u16)ecc); } /* * This function validates existing check information. Like _compute, * the function will take care of zeroing bc before calculating check codes. * If bc is not a pointer inside data, the caller must have zeroed any * inline ocfs2_block_check structures. * * Again, the data passed in should be the on-disk endian. */ int ocfs2_block_check_validate(void *data, size_t blocksize, struct ocfs2_block_check *bc, struct ocfs2_blockcheck_stats *stats) { int rc = 0; u32 bc_crc32e; u16 bc_ecc; u32 crc, ecc; ocfs2_blockcheck_inc_check(stats); bc_crc32e = le32_to_cpu(bc->bc_crc32e); bc_ecc = le16_to_cpu(bc->bc_ecc); memset(bc, 0, sizeof(struct ocfs2_block_check)); /* Fast path - if the crc32 validates, we're good to go */ crc = crc32_le(~0, data, blocksize); if (crc == bc_crc32e) goto out; ocfs2_blockcheck_inc_failure(stats); mlog(ML_ERROR, "CRC32 failed: stored: 0x%x, computed 0x%x. Applying ECC.\n", (unsigned int)bc_crc32e, (unsigned int)crc); /* Ok, try ECC fixups */ ecc = ocfs2_hamming_encode_block(data, blocksize); ocfs2_hamming_fix_block(data, blocksize, ecc ^ bc_ecc); /* And check the crc32 again */ crc = crc32_le(~0, data, blocksize); if (crc == bc_crc32e) { ocfs2_blockcheck_inc_recover(stats); goto out; } mlog(ML_ERROR, "Fixed CRC32 failed: stored: 0x%x, computed 0x%x\n", (unsigned int)bc_crc32e, (unsigned int)crc); rc = -EIO; out: bc->bc_crc32e = cpu_to_le32(bc_crc32e); bc->bc_ecc = cpu_to_le16(bc_ecc); return rc; } /* * This function generates check information for a list of buffer_heads. * bhs is the blocks to be checked. bc is a pointer to the * ocfs2_block_check structure describing the crc32 and the ecc. * * bc should be a pointer inside data, as the function will * take care of zeroing it before calculating the check information. If * bc does not point inside data, the caller must make sure any inline * ocfs2_block_check structures are zeroed. * * The data buffer must be in on-disk endian (little endian for ocfs2). * bc will be filled with little-endian values and will be ready to go to * disk. */ void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr, struct ocfs2_block_check *bc) { int i; u32 crc, ecc; BUG_ON(nr < 0); if (!nr) return; memset(bc, 0, sizeof(struct ocfs2_block_check)); for (i = 0, crc = ~0, ecc = 0; i < nr; i++) { crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size); /* * The number of bits in a buffer is obviously b_size*8. * The offset of this buffer is b_size*i, so the bit offset * of this buffer is b_size*8*i. */ ecc = (u16)ocfs2_hamming_encode(ecc, bhs[i]->b_data, bhs[i]->b_size * 8, bhs[i]->b_size * 8 * i); } /* * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no * larger than 16 bits. */ BUG_ON(ecc > USHRT_MAX); bc->bc_crc32e = cpu_to_le32(crc); bc->bc_ecc = cpu_to_le16((u16)ecc); } /* * This function validates existing check information on a list of * buffer_heads. Like _compute_bhs, the function will take care of * zeroing bc before calculating check codes. If bc is not a pointer * inside data, the caller must have zeroed any inline * ocfs2_block_check structures. * * Again, the data passed in should be the on-disk endian. */ int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr, struct ocfs2_block_check *bc, struct ocfs2_blockcheck_stats *stats) { int i, rc = 0; u32 bc_crc32e; u16 bc_ecc; u32 crc, ecc, fix; BUG_ON(nr < 0); if (!nr) return 0; ocfs2_blockcheck_inc_check(stats); bc_crc32e = le32_to_cpu(bc->bc_crc32e); bc_ecc = le16_to_cpu(bc->bc_ecc); memset(bc, 0, sizeof(struct ocfs2_block_check)); /* Fast path - if the crc32 validates, we're good to go */ for (i = 0, crc = ~0; i < nr; i++) crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size); if (crc == bc_crc32e) goto out; ocfs2_blockcheck_inc_failure(stats); mlog(ML_ERROR, "CRC32 failed: stored: %u, computed %u. Applying ECC.\n", (unsigned int)bc_crc32e, (unsigned int)crc); /* Ok, try ECC fixups */ for (i = 0, ecc = 0; i < nr; i++) { /* * The number of bits in a buffer is obviously b_size*8. * The offset of this buffer is b_size*i, so the bit offset * of this buffer is b_size*8*i. */ ecc = (u16)ocfs2_hamming_encode(ecc, bhs[i]->b_data, bhs[i]->b_size * 8, bhs[i]->b_size * 8 * i); } fix = ecc ^ bc_ecc; for (i = 0; i < nr; i++) { /* * Try the fix against each buffer. It will only affect * one of them. */ ocfs2_hamming_fix(bhs[i]->b_data, bhs[i]->b_size * 8, bhs[i]->b_size * 8 * i, fix); } /* And check the crc32 again */ for (i = 0, crc = ~0; i < nr; i++) crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size); if (crc == bc_crc32e) { ocfs2_blockcheck_inc_recover(stats); goto out; } mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n", (unsigned int)bc_crc32e, (unsigned int)crc); rc = -EIO; out: bc->bc_crc32e = cpu_to_le32(bc_crc32e); bc->bc_ecc = cpu_to_le16(bc_ecc); return rc; } /* * These are the main API. They check the superblock flag before * calling the underlying operations. * * They expect the buffer(s) to be in disk format. */ void ocfs2_compute_meta_ecc(struct super_block *sb, void *data, struct ocfs2_block_check *bc) { if (ocfs2_meta_ecc(OCFS2_SB(sb))) ocfs2_block_check_compute(data, sb->s_blocksize, bc); } int ocfs2_validate_meta_ecc(struct super_block *sb, void *data, struct ocfs2_block_check *bc) { int rc = 0; struct ocfs2_super *osb = OCFS2_SB(sb); if (ocfs2_meta_ecc(osb)) rc = ocfs2_block_check_validate(data, sb->s_blocksize, bc, &osb->osb_ecc_stats); return rc; } void ocfs2_compute_meta_ecc_bhs(struct super_block *sb, struct buffer_head **bhs, int nr, struct ocfs2_block_check *bc) { if (ocfs2_meta_ecc(OCFS2_SB(sb))) ocfs2_block_check_compute_bhs(bhs, nr, bc); } int ocfs2_validate_meta_ecc_bhs(struct super_block *sb, struct buffer_head **bhs, int nr, struct ocfs2_block_check *bc) { int rc = 0; struct ocfs2_super *osb = OCFS2_SB(sb); if (ocfs2_meta_ecc(osb)) rc = ocfs2_block_check_validate_bhs(bhs, nr, bc, &osb->osb_ecc_stats); return rc; }
2 1 1 3 3 3 3 4 4 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 // SPDX-License-Identifier: GPL-2.0-or-later /* * Generic PPP layer for Linux. * * Copyright 1999-2002 Paul Mackerras. * * The generic PPP layer handles the PPP network interfaces, the * /dev/ppp device, packet and VJ compression, and multilink. * It talks to PPP `channels' via the interface defined in * include/linux/ppp_channel.h. Channels provide the basic means for * sending and receiving PPP frames on some kind of communications * channel. * * Part of the code in this driver was inspired by the old async-only * PPP driver, written by Michael Callahan and Al Longyear, and * subsequently hacked by Paul Mackerras. * * ==FILEVERSION 20041108== */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/kmod.h> #include <linux/init.h> #include <linux/list.h> #include <linux/idr.h> #include <linux/netdevice.h> #include <linux/poll.h> #include <linux/ppp_defs.h> #include <linux/filter.h> #include <linux/ppp-ioctl.h> #include <linux/ppp_channel.h> #include <linux/ppp-comp.h> #include <linux/skbuff.h> #include <linux/rculist.h> #include <linux/rtnetlink.h> #include <linux/if_arp.h> #include <linux/ip.h> #include <linux/tcp.h> #include <linux/spinlock.h> #include <linux/rwsem.h> #include <linux/stddef.h> #include <linux/device.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/file.h> #include <linux/unaligned.h> #include <net/netdev_lock.h> #include <net/slhc_vj.h> #include <linux/atomic.h> #include <linux/refcount.h> #include <linux/nsproxy.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #define PPP_VERSION "2.4.2" /* * Network protocols we support. */ #define NP_IP 0 /* Internet Protocol V4 */ #define NP_IPV6 1 /* Internet Protocol V6 */ #define NP_IPX 2 /* IPX protocol */ #define NP_AT 3 /* Appletalk protocol */ #define NP_MPLS_UC 4 /* MPLS unicast */ #define NP_MPLS_MC 5 /* MPLS multicast */ #define NUM_NP 6 /* Number of NPs. */ #define MPHDRLEN 6 /* multilink protocol header length */ #define MPHDRLEN_SSN 4 /* ditto with short sequence numbers */ #define PPP_PROTO_LEN 2 #define PPP_LCP_HDRLEN 4 /* The filter instructions generated by libpcap are constructed * assuming a four-byte PPP header on each packet, where the last * 2 bytes are the protocol field defined in the RFC and the first * byte of the first 2 bytes indicates the direction. * The second byte is currently unused, but we still need to initialize * it to prevent crafted BPF programs from reading them which would * cause reading of uninitialized data. */ #define PPP_FILTER_OUTBOUND_TAG 0x0100 #define PPP_FILTER_INBOUND_TAG 0x0000 /* * An instance of /dev/ppp can be associated with either a ppp * interface unit or a ppp channel. In both cases, file->private_data * points to one of these. */ struct ppp_file { enum { INTERFACE=1, CHANNEL } kind; struct sk_buff_head xq; /* pppd transmit queue */ struct sk_buff_head rq; /* receive queue for pppd */ wait_queue_head_t rwait; /* for poll on reading /dev/ppp */ refcount_t refcnt; /* # refs (incl /dev/ppp attached) */ int hdrlen; /* space to leave for headers */ int index; /* interface unit / channel number */ int dead; /* unit/channel has been shut down */ }; #define PF_TO_X(pf, X) container_of(pf, X, file) #define PF_TO_PPP(pf) PF_TO_X(pf, struct ppp) #define PF_TO_CHANNEL(pf) PF_TO_X(pf, struct channel) struct ppp_xmit_recursion { struct task_struct *owner; local_lock_t bh_lock; }; /* * Data structure describing one ppp unit. * A ppp unit corresponds to a ppp network interface device * and represents a multilink bundle. * It can have 0 or more ppp channels connected to it. */ struct ppp { struct ppp_file file; /* stuff for read/write/poll 0 */ struct file *owner; /* file that owns this unit 48 */ struct list_head channels; /* list of attached channels 4c */ int n_channels; /* how many channels are attached 54 */ spinlock_t rlock; /* lock for receive side 58 */ spinlock_t wlock; /* lock for transmit side 5c */ struct ppp_xmit_recursion __percpu *xmit_recursion; /* xmit recursion detect */ int mru; /* max receive unit 60 */ unsigned int flags; /* control bits 64 */ unsigned int xstate; /* transmit state bits 68 */ unsigned int rstate; /* receive state bits 6c */ int debug; /* debug flags 70 */ struct slcompress *vj; /* state for VJ header compression */ enum NPmode npmode[NUM_NP]; /* what to do with each net proto 78 */ struct sk_buff *xmit_pending; /* a packet ready to go out 88 */ struct compressor *xcomp; /* transmit packet compressor 8c */ void *xc_state; /* its internal state 90 */ struct compressor *rcomp; /* receive decompressor 94 */ void *rc_state; /* its internal state 98 */ unsigned long last_xmit; /* jiffies when last pkt sent 9c */ unsigned long last_recv; /* jiffies when last pkt rcvd a0 */ struct net_device *dev; /* network interface device a4 */ int closing; /* is device closing down? a8 */ #ifdef CONFIG_PPP_MULTILINK int nxchan; /* next channel to send something on */ u32 nxseq; /* next sequence number to send */ int mrru; /* MP: max reconst. receive unit */ u32 nextseq; /* MP: seq no of next packet */ u32 minseq; /* MP: min of most recent seqnos */ struct sk_buff_head mrq; /* MP: receive reconstruction queue */ #endif /* CONFIG_PPP_MULTILINK */ #ifdef CONFIG_PPP_FILTER struct bpf_prog *pass_filter; /* filter for packets to pass */ struct bpf_prog *active_filter; /* filter for pkts to reset idle */ #endif /* CONFIG_PPP_FILTER */ struct net *ppp_net; /* the net we belong to */ }; /* * Bits in flags: SC_NO_TCP_CCID, SC_CCP_OPEN, SC_CCP_UP, SC_LOOP_TRAFFIC, * SC_MULTILINK, SC_MP_SHORTSEQ, SC_MP_XSHORTSEQ, SC_COMP_TCP, SC_REJ_COMP_TCP, * SC_MUST_COMP * Bits in rstate: SC_DECOMP_RUN, SC_DC_ERROR, SC_DC_FERROR. * Bits in xstate: SC_COMP_RUN */ #define SC_FLAG_BITS (SC_NO_TCP_CCID|SC_CCP_OPEN|SC_CCP_UP|SC_LOOP_TRAFFIC \ |SC_MULTILINK|SC_MP_SHORTSEQ|SC_MP_XSHORTSEQ \ |SC_COMP_TCP|SC_REJ_COMP_TCP|SC_MUST_COMP) /* * Private data structure for each channel. * This includes the data structure used for multilink. */ struct channel { struct ppp_file file; /* stuff for read/write/poll */ struct list_head list; /* link in all/new_channels list */ struct ppp_channel *chan; /* public channel data structure */ struct rw_semaphore chan_sem; /* protects `chan' during chan ioctl */ spinlock_t downl; /* protects `chan', file.xq dequeue */ struct ppp __rcu *ppp; /* ppp unit we're connected to */ struct net *chan_net; /* the net channel belongs to */ netns_tracker ns_tracker; struct list_head clist; /* link in list of channels per unit */ spinlock_t upl; /* protects `ppp' and 'bridge' */ struct channel __rcu *bridge; /* "bridged" ppp channel */ #ifdef CONFIG_PPP_MULTILINK u8 avail; /* flag used in multilink stuff */ u8 had_frag; /* >= 1 fragments have been sent */ u32 lastseq; /* MP: last sequence # received */ int speed; /* speed of the corresponding ppp channel*/ #endif /* CONFIG_PPP_MULTILINK */ }; struct ppp_config { struct file *file; s32 unit; bool ifname_is_set; }; /* * SMP locking issues: * Both the ppp.rlock and ppp.wlock locks protect the ppp.channels * list and the ppp.n_channels field, you need to take both locks * before you modify them. * The lock ordering is: channel.upl -> ppp.wlock -> ppp.rlock -> * channel.downl. */ static DEFINE_MUTEX(ppp_mutex); static atomic_t ppp_unit_count = ATOMIC_INIT(0); static atomic_t channel_count = ATOMIC_INIT(0); /* per-net private data for this module */ static unsigned int ppp_net_id __read_mostly; struct ppp_net { /* units to ppp mapping */ struct idr units_idr; /* * all_ppp_mutex protects the units_idr mapping. * It also ensures that finding a ppp unit in the units_idr * map and updating its file.refcnt field is atomic. */ struct mutex all_ppp_mutex; /* channels */ struct list_head all_channels; struct list_head new_channels; int last_channel_index; /* * all_channels_lock protects all_channels and * last_channel_index, and the atomicity of find * a channel and updating its file.refcnt field. */ spinlock_t all_channels_lock; }; /* Get the PPP protocol number from a skb */ #define PPP_PROTO(skb) get_unaligned_be16((skb)->data) /* We limit the length of ppp->file.rq to this (arbitrary) value */ #define PPP_MAX_RQLEN 32 /* * Maximum number of multilink fragments queued up. * This has to be large enough to cope with the maximum latency of * the slowest channel relative to the others. Strictly it should * depend on the number of channels and their characteristics. */ #define PPP_MP_MAX_QLEN 128 /* Multilink header bits. */ #define B 0x80 /* this fragment begins a packet */ #define E 0x40 /* this fragment ends a packet */ /* Compare multilink sequence numbers (assumed to be 32 bits wide) */ #define seq_before(a, b) ((s32)((a) - (b)) < 0) #define seq_after(a, b) ((s32)((a) - (b)) > 0) /* Prototypes. */ static int ppp_unattached_ioctl(struct net *net, struct ppp_file *pf, struct file *file, unsigned int cmd, unsigned long arg); static void ppp_xmit_process(struct ppp *ppp, struct sk_buff *skb); static void ppp_send_frame(struct ppp *ppp, struct sk_buff *skb); static void ppp_push(struct ppp *ppp); static void ppp_channel_push(struct channel *pch); static void ppp_receive_frame(struct ppp *ppp, struct sk_buff *skb, struct channel *pch); static void ppp_receive_error(struct ppp *ppp); static void ppp_receive_nonmp_frame(struct ppp *ppp, struct sk_buff *skb); static struct sk_buff *ppp_decompress_frame(struct ppp *ppp, struct sk_buff *skb); #ifdef CONFIG_PPP_MULTILINK static void ppp_receive_mp_frame(struct ppp *ppp, struct sk_buff *skb, struct channel *pch); static void ppp_mp_insert(struct ppp *ppp, struct sk_buff *skb); static struct sk_buff *ppp_mp_reconstruct(struct ppp *ppp); static int ppp_mp_explode(struct ppp *ppp, struct sk_buff *skb); #endif /* CONFIG_PPP_MULTILINK */ static int ppp_set_compress(struct ppp *ppp, struct ppp_option_data *data); static void ppp_ccp_peek(struct ppp *ppp, struct sk_buff *skb, int inbound); static void ppp_ccp_closed(struct ppp *ppp); static struct compressor *find_compressor(int type); static void ppp_get_stats(struct ppp *ppp, struct ppp_stats *st); static int ppp_create_interface(struct net *net, struct file *file, int *unit); static void init_ppp_file(struct ppp_file *pf, int kind); static void ppp_destroy_interface(struct ppp *ppp); static struct ppp *ppp_find_unit(struct ppp_net *pn, int unit); static struct channel *ppp_find_channel(struct ppp_net *pn, int unit); static int ppp_connect_channel(struct channel *pch, int unit); static int ppp_disconnect_channel(struct channel *pch); static void ppp_destroy_channel(struct channel *pch); static int unit_get(struct idr *p, void *ptr, int min); static int unit_set(struct idr *p, void *ptr, int n); static void unit_put(struct idr *p, int n); static void *unit_find(struct idr *p, int n); static void ppp_setup(struct net_device *dev); static const struct net_device_ops ppp_netdev_ops; static const struct class ppp_class = { .name = "ppp", }; /* per net-namespace data */ static inline struct ppp_net *ppp_pernet(struct net *net) { return net_generic(net, ppp_net_id); } /* Translates a PPP protocol number to a NP index (NP == network protocol) */ static inline int proto_to_npindex(int proto) { switch (proto) { case PPP_IP: return NP_IP; case PPP_IPV6: return NP_IPV6; case PPP_IPX: return NP_IPX; case PPP_AT: return NP_AT; case PPP_MPLS_UC: return NP_MPLS_UC; case PPP_MPLS_MC: return NP_MPLS_MC; } return -EINVAL; } /* Translates an NP index into a PPP protocol number */ static const int npindex_to_proto[NUM_NP] = { PPP_IP, PPP_IPV6, PPP_IPX, PPP_AT, PPP_MPLS_UC, PPP_MPLS_MC, }; /* Translates an ethertype into an NP index */ static inline int ethertype_to_npindex(int ethertype) { switch (ethertype) { case ETH_P_IP: return NP_IP; case ETH_P_IPV6: return NP_IPV6; case ETH_P_IPX: return NP_IPX; case ETH_P_PPPTALK: case ETH_P_ATALK: return NP_AT; case ETH_P_MPLS_UC: return NP_MPLS_UC; case ETH_P_MPLS_MC: return NP_MPLS_MC; } return -1; } /* Translates an NP index into an ethertype */ static const int npindex_to_ethertype[NUM_NP] = { ETH_P_IP, ETH_P_IPV6, ETH_P_IPX, ETH_P_PPPTALK, ETH_P_MPLS_UC, ETH_P_MPLS_MC, }; /* * Locking shorthand. */ #define ppp_xmit_lock(ppp) spin_lock_bh(&(ppp)->wlock) #define ppp_xmit_unlock(ppp) spin_unlock_bh(&(ppp)->wlock) #define ppp_recv_lock(ppp) spin_lock_bh(&(ppp)->rlock) #define ppp_recv_unlock(ppp) spin_unlock_bh(&(ppp)->rlock) #define ppp_lock(ppp) do { ppp_xmit_lock(ppp); \ ppp_recv_lock(ppp); } while (0) #define ppp_unlock(ppp) do { ppp_recv_unlock(ppp); \ ppp_xmit_unlock(ppp); } while (0) /* * /dev/ppp device routines. * The /dev/ppp device is used by pppd to control the ppp unit. * It supports the read, write, ioctl and poll functions. * Open instances of /dev/ppp can be in one of three states: * unattached, attached to a ppp unit, or attached to a ppp channel. */ static int ppp_open(struct inode *inode, struct file *file) { /* * This could (should?) be enforced by the permissions on /dev/ppp. */ if (!ns_capable(file->f_cred->user_ns, CAP_NET_ADMIN)) return -EPERM; return 0; } static int ppp_release(struct inode *unused, struct file *file) { struct ppp_file *pf = file->private_data; struct ppp *ppp; if (pf) { file->private_data = NULL; if (pf->kind == INTERFACE) { ppp = PF_TO_PPP(pf); rtnl_lock(); if (file == ppp->owner) unregister_netdevice(ppp->dev); rtnl_unlock(); } if (refcount_dec_and_test(&pf->refcnt)) { switch (pf->kind) { case INTERFACE: ppp_destroy_interface(PF_TO_PPP(pf)); break; case CHANNEL: ppp_destroy_channel(PF_TO_CHANNEL(pf)); break; } } } return 0; } static ssize_t ppp_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct ppp_file *pf = file->private_data; DECLARE_WAITQUEUE(wait, current); ssize_t ret; struct sk_buff *skb = NULL; struct iovec iov; struct iov_iter to; ret = count; if (!pf) return -ENXIO; add_wait_queue(&pf->rwait, &wait); for (;;) { set_current_state(TASK_INTERRUPTIBLE); skb = skb_dequeue(&pf->rq); if (skb) break; ret = 0; if (pf->dead) break; if (pf->kind == INTERFACE) { /* * Return 0 (EOF) on an interface that has no * channels connected, unless it is looping * network traffic (demand mode). */ struct ppp *ppp = PF_TO_PPP(pf); ppp_recv_lock(ppp); if (ppp->n_channels == 0 && (ppp->flags & SC_LOOP_TRAFFIC) == 0) { ppp_recv_unlock(ppp); break; } ppp_recv_unlock(ppp); } ret = -EAGAIN; if (file->f_flags & O_NONBLOCK) break; ret = -ERESTARTSYS; if (signal_pending(current)) break; schedule(); } set_current_state(TASK_RUNNING); remove_wait_queue(&pf->rwait, &wait); if (!skb) goto out; ret = -EOVERFLOW; if (skb->len > count) goto outf; ret = -EFAULT; iov.iov_base = buf; iov.iov_len = count; iov_iter_init(&to, ITER_DEST, &iov, 1, count); if (skb_copy_datagram_iter(skb, 0, &to, skb->len)) goto outf; ret = skb->len; outf: kfree_skb(skb); out: return ret; } static bool ppp_check_packet(struct sk_buff *skb, size_t count) { /* LCP packets must include LCP header which 4 bytes long: * 1-byte code, 1-byte identifier, and 2-byte length. */ return get_unaligned_be16(skb->data) != PPP_LCP || count >= PPP_PROTO_LEN + PPP_LCP_HDRLEN; } static ssize_t ppp_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct ppp_file *pf = file->private_data; struct sk_buff *skb; ssize_t ret; if (!pf) return -ENXIO; /* All PPP packets should start with the 2-byte protocol */ if (count < PPP_PROTO_LEN) return -EINVAL; ret = -ENOMEM; skb = alloc_skb(count + pf->hdrlen, GFP_KERNEL); if (!skb) goto out; skb_reserve(skb, pf->hdrlen); ret = -EFAULT; if (copy_from_user(skb_put(skb, count), buf, count)) { kfree_skb(skb); goto out; } ret = -EINVAL; if (unlikely(!ppp_check_packet(skb, count))) { kfree_skb(skb); goto out; } switch (pf->kind) { case INTERFACE: ppp_xmit_process(PF_TO_PPP(pf), skb); break; case CHANNEL: skb_queue_tail(&pf->xq, skb); ppp_channel_push(PF_TO_CHANNEL(pf)); break; } ret = count; out: return ret; } /* No kernel lock - fine */ static __poll_t ppp_poll(struct file *file, poll_table *wait) { struct ppp_file *pf = file->private_data; __poll_t mask; if (!pf) return 0; poll_wait(file, &pf->rwait, wait); mask = EPOLLOUT | EPOLLWRNORM; if (skb_peek(&pf->rq)) mask |= EPOLLIN | EPOLLRDNORM; if (pf->dead) mask |= EPOLLHUP; else if (pf->kind == INTERFACE) { /* see comment in ppp_read */ struct ppp *ppp = PF_TO_PPP(pf); ppp_recv_lock(ppp); if (ppp->n_channels == 0 && (ppp->flags & SC_LOOP_TRAFFIC) == 0) mask |= EPOLLIN | EPOLLRDNORM; ppp_recv_unlock(ppp); } return mask; } #ifdef CONFIG_PPP_FILTER static struct bpf_prog *get_filter(struct sock_fprog *uprog) { struct sock_fprog_kern fprog; struct bpf_prog *res = NULL; int err; if (!uprog->len) return NULL; /* uprog->len is unsigned short, so no overflow here */ fprog.len = uprog->len; fprog.filter = memdup_array_user(uprog->filter, uprog->len, sizeof(struct sock_filter)); if (IS_ERR(fprog.filter)) return ERR_CAST(fprog.filter); err = bpf_prog_create(&res, &fprog); kfree(fprog.filter); return err ? ERR_PTR(err) : res; } static struct bpf_prog *ppp_get_filter(struct sock_fprog __user *p) { struct sock_fprog uprog; if (copy_from_user(&uprog, p, sizeof(struct sock_fprog))) return ERR_PTR(-EFAULT); return get_filter(&uprog); } #ifdef CONFIG_COMPAT struct sock_fprog32 { unsigned short len; compat_caddr_t filter; }; #define PPPIOCSPASS32 _IOW('t', 71, struct sock_fprog32) #define PPPIOCSACTIVE32 _IOW('t', 70, struct sock_fprog32) static struct bpf_prog *compat_ppp_get_filter(struct sock_fprog32 __user *p) { struct sock_fprog32 uprog32; struct sock_fprog uprog; if (copy_from_user(&uprog32, p, sizeof(struct sock_fprog32))) return ERR_PTR(-EFAULT); uprog.len = uprog32.len; uprog.filter = compat_ptr(uprog32.filter); return get_filter(&uprog); } #endif #endif /* Bridge one PPP channel to another. * When two channels are bridged, ppp_input on one channel is redirected to * the other's ops->start_xmit handler. * In order to safely bridge channels we must reject channels which are already * part of a bridge instance, or which form part of an existing unit. * Once successfully bridged, each channel holds a reference on the other * to prevent it being freed while the bridge is extant. */ static int ppp_bridge_channels(struct channel *pch, struct channel *pchb) { spin_lock(&pch->upl); if (rcu_dereference_protected(pch->ppp, lockdep_is_held(&pch->upl)) || rcu_dereference_protected(pch->bridge, lockdep_is_held(&pch->upl))) { spin_unlock(&pch->upl); return -EALREADY; } refcount_inc(&pchb->file.refcnt); rcu_assign_pointer(pch->bridge, pchb); spin_unlock(&pch->upl); spin_lock(&pchb->upl); if (rcu_dereference_protected(pchb->ppp, lockdep_is_held(&pchb->upl)) || rcu_dereference_protected(pchb->bridge, lockdep_is_held(&pchb->upl))) { spin_unlock(&pchb->upl); goto err_unset; } refcount_inc(&pch->file.refcnt); rcu_assign_pointer(pchb->bridge, pch); spin_unlock(&pchb->upl); return 0; err_unset: spin_lock(&pch->upl); /* Re-read pch->bridge with upl held in case it was modified concurrently */ pchb = rcu_dereference_protected(pch->bridge, lockdep_is_held(&pch->upl)); RCU_INIT_POINTER(pch->bridge, NULL); spin_unlock(&pch->upl); synchronize_rcu(); if (pchb) if (refcount_dec_and_test(&pchb->file.refcnt)) ppp_destroy_channel(pchb); return -EALREADY; } static int ppp_unbridge_channels(struct channel *pch) { struct channel *pchb, *pchbb; spin_lock(&pch->upl); pchb = rcu_dereference_protected(pch->bridge, lockdep_is_held(&pch->upl)); if (!pchb) { spin_unlock(&pch->upl); return -EINVAL; } RCU_INIT_POINTER(pch->bridge, NULL); spin_unlock(&pch->upl); /* Only modify pchb if phcb->bridge points back to pch. * If not, it implies that there has been a race unbridging (and possibly * even rebridging) pchb. We should leave pchb alone to avoid either a * refcount underflow, or breaking another established bridge instance. */ spin_lock(&pchb->upl); pchbb = rcu_dereference_protected(pchb->bridge, lockdep_is_held(&pchb->upl)); if (pchbb == pch) RCU_INIT_POINTER(pchb->bridge, NULL); spin_unlock(&pchb->upl); synchronize_rcu(); if (pchbb == pch) if (refcount_dec_and_test(&pch->file.refcnt)) ppp_destroy_channel(pch); if (refcount_dec_and_test(&pchb->file.refcnt)) ppp_destroy_channel(pchb); return 0; } static long ppp_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct ppp_file *pf; struct ppp *ppp; int err = -EFAULT, val, val2, i; struct ppp_idle32 idle32; struct ppp_idle64 idle64; struct npioctl npi; int unit, cflags; struct slcompress *vj; void __user *argp = (void __user *)arg; int __user *p = argp; mutex_lock(&ppp_mutex); pf = file->private_data; if (!pf) { err = ppp_unattached_ioctl(current->nsproxy->net_ns, pf, file, cmd, arg); goto out; } if (cmd == PPPIOCDETACH) { /* * PPPIOCDETACH is no longer supported as it was heavily broken, * and is only known to have been used by pppd older than * ppp-2.4.2 (released November 2003). */ pr_warn_once("%s (%d) used obsolete PPPIOCDETACH ioctl\n", current->comm, current->pid); err = -EINVAL; goto out; } if (pf->kind == CHANNEL) { struct channel *pch, *pchb; struct ppp_channel *chan; struct ppp_net *pn; pch = PF_TO_CHANNEL(pf); switch (cmd) { case PPPIOCCONNECT: if (get_user(unit, p)) break; err = ppp_connect_channel(pch, unit); break; case PPPIOCDISCONN: err = ppp_disconnect_channel(pch); break; case PPPIOCBRIDGECHAN: if (get_user(unit, p)) break; err = -ENXIO; pn = ppp_pernet(current->nsproxy->net_ns); spin_lock_bh(&pn->all_channels_lock); pchb = ppp_find_channel(pn, unit); /* Hold a reference to prevent pchb being freed while * we establish the bridge. */ if (pchb) refcount_inc(&pchb->file.refcnt); spin_unlock_bh(&pn->all_channels_lock); if (!pchb) break; err = ppp_bridge_channels(pch, pchb); /* Drop earlier refcount now bridge establishment is complete */ if (refcount_dec_and_test(&pchb->file.refcnt)) ppp_destroy_channel(pchb); break; case PPPIOCUNBRIDGECHAN: err = ppp_unbridge_channels(pch); break; default: down_read(&pch->chan_sem); chan = pch->chan; err = -ENOTTY; if (chan && chan->ops->ioctl) err = chan->ops->ioctl(chan, cmd, arg); up_read(&pch->chan_sem); } goto out; } if (pf->kind != INTERFACE) { /* can't happen */ pr_err("PPP: not interface or channel??\n"); err = -EINVAL; goto out; } ppp = PF_TO_PPP(pf); switch (cmd) { case PPPIOCSMRU: if (get_user(val, p)) break; ppp->mru = val; err = 0; break; case PPPIOCSFLAGS: if (get_user(val, p)) break; ppp_lock(ppp); cflags = ppp->flags & ~val; #ifdef CONFIG_PPP_MULTILINK if (!(ppp->flags & SC_MULTILINK) && (val & SC_MULTILINK)) ppp->nextseq = 0; #endif ppp->flags = val & SC_FLAG_BITS; ppp_unlock(ppp); if (cflags & SC_CCP_OPEN) ppp_ccp_closed(ppp); err = 0; break; case PPPIOCGFLAGS: val = ppp->flags | ppp->xstate | ppp->rstate; if (put_user(val, p)) break; err = 0; break; case PPPIOCSCOMPRESS: { struct ppp_option_data data; if (copy_from_user(&data, argp, sizeof(data))) err = -EFAULT; else err = ppp_set_compress(ppp, &data); break; } case PPPIOCGUNIT: if (put_user(ppp->file.index, p)) break; err = 0; break; case PPPIOCSDEBUG: if (get_user(val, p)) break; ppp->debug = val; err = 0; break; case PPPIOCGDEBUG: if (put_user(ppp->debug, p)) break; err = 0; break; case PPPIOCGIDLE32: idle32.xmit_idle = (jiffies - ppp->last_xmit) / HZ; idle32.recv_idle = (jiffies - ppp->last_recv) / HZ; if (copy_to_user(argp, &idle32, sizeof(idle32))) break; err = 0; break; case PPPIOCGIDLE64: idle64.xmit_idle = (jiffies - ppp->last_xmit) / HZ; idle64.recv_idle = (jiffies - ppp->last_recv) / HZ; if (copy_to_user(argp, &idle64, sizeof(idle64))) break; err = 0; break; case PPPIOCSMAXCID: if (get_user(val, p)) break; val2 = 15; if ((val >> 16) != 0) { val2 = val >> 16; val &= 0xffff; } vj = slhc_init(val2+1, val+1); if (IS_ERR(vj)) { err = PTR_ERR(vj); break; } ppp_lock(ppp); if (ppp->vj) slhc_free(ppp->vj); ppp->vj = vj; ppp_unlock(ppp); err = 0; break; case PPPIOCGNPMODE: case PPPIOCSNPMODE: if (copy_from_user(&npi, argp, sizeof(npi))) break; err = proto_to_npindex(npi.protocol); if (err < 0) break; i = err; if (cmd == PPPIOCGNPMODE) { err = -EFAULT; npi.mode = ppp->npmode[i]; if (copy_to_user(argp, &npi, sizeof(npi))) break; } else { ppp->npmode[i] = npi.mode; /* we may be able to transmit more packets now (??) */ netif_wake_queue(ppp->dev); } err = 0; break; #ifdef CONFIG_PPP_FILTER case PPPIOCSPASS: case PPPIOCSACTIVE: { struct bpf_prog *filter = ppp_get_filter(argp); struct bpf_prog **which; if (IS_ERR(filter)) { err = PTR_ERR(filter); break; } if (cmd == PPPIOCSPASS) which = &ppp->pass_filter; else which = &ppp->active_filter; ppp_lock(ppp); if (*which) bpf_prog_destroy(*which); *which = filter; ppp_unlock(ppp); err = 0; break; } #endif /* CONFIG_PPP_FILTER */ #ifdef CONFIG_PPP_MULTILINK case PPPIOCSMRRU: if (get_user(val, p)) break; ppp_recv_lock(ppp); ppp->mrru = val; ppp_recv_unlock(ppp); err = 0; break; #endif /* CONFIG_PPP_MULTILINK */ default: err = -ENOTTY; } out: mutex_unlock(&ppp_mutex); return err; } #ifdef CONFIG_COMPAT struct ppp_option_data32 { compat_uptr_t ptr; u32 length; compat_int_t transmit; }; #define PPPIOCSCOMPRESS32 _IOW('t', 77, struct ppp_option_data32) static long ppp_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct ppp_file *pf; int err = -ENOIOCTLCMD; void __user *argp = (void __user *)arg; mutex_lock(&ppp_mutex); pf = file->private_data; if (pf && pf->kind == INTERFACE) { struct ppp *ppp = PF_TO_PPP(pf); switch (cmd) { #ifdef CONFIG_PPP_FILTER case PPPIOCSPASS32: case PPPIOCSACTIVE32: { struct bpf_prog *filter = compat_ppp_get_filter(argp); struct bpf_prog **which; if (IS_ERR(filter)) { err = PTR_ERR(filter); break; } if (cmd == PPPIOCSPASS32) which = &ppp->pass_filter; else which = &ppp->active_filter; ppp_lock(ppp); if (*which) bpf_prog_destroy(*which); *which = filter; ppp_unlock(ppp); err = 0; break; } #endif /* CONFIG_PPP_FILTER */ case PPPIOCSCOMPRESS32: { struct ppp_option_data32 data32; if (copy_from_user(&data32, argp, sizeof(data32))) { err = -EFAULT; } else { struct ppp_option_data data = { .ptr = compat_ptr(data32.ptr), .length = data32.length, .transmit = data32.transmit }; err = ppp_set_compress(ppp, &data); } break; } } } mutex_unlock(&ppp_mutex); /* all other commands have compatible arguments */ if (err == -ENOIOCTLCMD) err = ppp_ioctl(file, cmd, (unsigned long)compat_ptr(arg)); return err; } #endif static int ppp_unattached_ioctl(struct net *net, struct ppp_file *pf, struct file *file, unsigned int cmd, unsigned long arg) { int unit, err = -EFAULT; struct ppp *ppp; struct channel *chan; struct ppp_net *pn; int __user *p = (int __user *)arg; switch (cmd) { case PPPIOCNEWUNIT: /* Create a new ppp unit */ if (get_user(unit, p)) break; err = ppp_create_interface(net, file, &unit); if (err < 0) break; err = -EFAULT; if (put_user(unit, p)) break; err = 0; break; case PPPIOCATTACH: /* Attach to an existing ppp unit */ if (get_user(unit, p)) break; err = -ENXIO; pn = ppp_pernet(net); mutex_lock(&pn->all_ppp_mutex); ppp = ppp_find_unit(pn, unit); if (ppp) { refcount_inc(&ppp->file.refcnt); file->private_data = &ppp->file; err = 0; } mutex_unlock(&pn->all_ppp_mutex); break; case PPPIOCATTCHAN: if (get_user(unit, p)) break; err = -ENXIO; pn = ppp_pernet(net); spin_lock_bh(&pn->all_channels_lock); chan = ppp_find_channel(pn, unit); if (chan) { refcount_inc(&chan->file.refcnt); file->private_data = &chan->file; err = 0; } spin_unlock_bh(&pn->all_channels_lock); break; default: err = -ENOTTY; } return err; } static const struct file_operations ppp_device_fops = { .owner = THIS_MODULE, .read = ppp_read, .write = ppp_write, .poll = ppp_poll, .unlocked_ioctl = ppp_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ppp_compat_ioctl, #endif .open = ppp_open, .release = ppp_release, .llseek = noop_llseek, }; static void ppp_nl_dellink(struct net_device *dev, struct list_head *head); static __net_init int ppp_init_net(struct net *net) { struct ppp_net *pn = net_generic(net, ppp_net_id); idr_init(&pn->units_idr); mutex_init(&pn->all_ppp_mutex); INIT_LIST_HEAD(&pn->all_channels); INIT_LIST_HEAD(&pn->new_channels); spin_lock_init(&pn->all_channels_lock); return 0; } static __net_exit void ppp_exit_rtnl_net(struct net *net, struct list_head *dev_to_kill) { struct ppp_net *pn = net_generic(net, ppp_net_id); struct ppp *ppp; int id; idr_for_each_entry(&pn->units_idr, ppp, id) ppp_nl_dellink(ppp->dev, dev_to_kill); } static __net_exit void ppp_exit_net(struct net *net) { struct ppp_net *pn = net_generic(net, ppp_net_id); mutex_destroy(&pn->all_ppp_mutex); idr_destroy(&pn->units_idr); WARN_ON_ONCE(!list_empty(&pn->all_channels)); WARN_ON_ONCE(!list_empty(&pn->new_channels)); } static struct pernet_operations ppp_net_ops = { .init = ppp_init_net, .exit_rtnl = ppp_exit_rtnl_net, .exit = ppp_exit_net, .id = &ppp_net_id, .size = sizeof(struct ppp_net), }; static int ppp_unit_register(struct ppp *ppp, int unit, bool ifname_is_set) { struct ppp_net *pn = ppp_pernet(ppp->ppp_net); int ret; mutex_lock(&pn->all_ppp_mutex); if (unit < 0) { ret = unit_get(&pn->units_idr, ppp, 0); if (ret < 0) goto err; if (!ifname_is_set) { while (1) { snprintf(ppp->dev->name, IFNAMSIZ, "ppp%i", ret); if (!netdev_name_in_use(ppp->ppp_net, ppp->dev->name)) break; unit_put(&pn->units_idr, ret); ret = unit_get(&pn->units_idr, ppp, ret + 1); if (ret < 0) goto err; } } } else { /* Caller asked for a specific unit number. Fail with -EEXIST * if unavailable. For backward compatibility, return -EEXIST * too if idr allocation fails; this makes pppd retry without * requesting a specific unit number. */ if (unit_find(&pn->units_idr, unit)) { ret = -EEXIST; goto err; } ret = unit_set(&pn->units_idr, ppp, unit); if (ret < 0) { /* Rewrite error for backward compatibility */ ret = -EEXIST; goto err; } } ppp->file.index = ret; if (!ifname_is_set) snprintf(ppp->dev->name, IFNAMSIZ, "ppp%i", ppp->file.index); mutex_unlock(&pn->all_ppp_mutex); ret = register_netdevice(ppp->dev); if (ret < 0) goto err_unit; atomic_inc(&ppp_unit_count); return 0; err_unit: mutex_lock(&pn->all_ppp_mutex); unit_put(&pn->units_idr, ppp->file.index); err: mutex_unlock(&pn->all_ppp_mutex); return ret; } static int ppp_dev_configure(struct net *src_net, struct net_device *dev, const struct ppp_config *conf) { struct ppp *ppp = netdev_priv(dev); int indx; int err; int cpu; ppp->dev = dev; ppp->ppp_net = src_net; ppp->mru = PPP_MRU; ppp->owner = conf->file; init_ppp_file(&ppp->file, INTERFACE); ppp->file.hdrlen = PPP_HDRLEN - 2; /* don't count proto bytes */ for (indx = 0; indx < NUM_NP; ++indx) ppp->npmode[indx] = NPMODE_PASS; INIT_LIST_HEAD(&ppp->channels); spin_lock_init(&ppp->rlock); spin_lock_init(&ppp->wlock); ppp->xmit_recursion = alloc_percpu(struct ppp_xmit_recursion); if (!ppp->xmit_recursion) { err = -ENOMEM; goto err1; } for_each_possible_cpu(cpu) { struct ppp_xmit_recursion *xmit_recursion; xmit_recursion = per_cpu_ptr(ppp->xmit_recursion, cpu); xmit_recursion->owner = NULL; local_lock_init(&xmit_recursion->bh_lock); } #ifdef CONFIG_PPP_MULTILINK ppp->minseq = -1; skb_queue_head_init(&ppp->mrq); #endif /* CONFIG_PPP_MULTILINK */ #ifdef CONFIG_PPP_FILTER ppp->pass_filter = NULL; ppp->active_filter = NULL; #endif /* CONFIG_PPP_FILTER */ err = ppp_unit_register(ppp, conf->unit, conf->ifname_is_set); if (err < 0) goto err2; conf->file->private_data = &ppp->file; return 0; err2: free_percpu(ppp->xmit_recursion); err1: return err; } static const struct nla_policy ppp_nl_policy[IFLA_PPP_MAX + 1] = { [IFLA_PPP_DEV_FD] = { .type = NLA_S32 }, }; static int ppp_nl_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { if (!data) return -EINVAL; if (!data[IFLA_PPP_DEV_FD]) return -EINVAL; if (nla_get_s32(data[IFLA_PPP_DEV_FD]) < 0) return -EBADF; return 0; } static int ppp_nl_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct net *link_net = rtnl_newlink_link_net(params); struct nlattr **data = params->data; struct nlattr **tb = params->tb; struct ppp_config conf = { .unit = -1, .ifname_is_set = true, }; struct file *file; int err; file = fget(nla_get_s32(data[IFLA_PPP_DEV_FD])); if (!file) return -EBADF; /* rtnl_lock is already held here, but ppp_create_interface() locks * ppp_mutex before holding rtnl_lock. Using mutex_trylock() avoids * possible deadlock due to lock order inversion, at the cost of * pushing the problem back to userspace. */ if (!mutex_trylock(&ppp_mutex)) { err = -EBUSY; goto out; } if (file->f_op != &ppp_device_fops || file->private_data) { err = -EBADF; goto out_unlock; } conf.file = file; /* Don't use device name generated by the rtnetlink layer when ifname * isn't specified. Let ppp_dev_configure() set the device name using * the PPP unit identifer as suffix (i.e. ppp<unit_id>). This allows * userspace to infer the device name using to the PPPIOCGUNIT ioctl. */ if (!tb[IFLA_IFNAME] || !nla_len(tb[IFLA_IFNAME]) || !*(char *)nla_data(tb[IFLA_IFNAME])) conf.ifname_is_set = false; err = ppp_dev_configure(link_net, dev, &conf); out_unlock: mutex_unlock(&ppp_mutex); out: fput(file); return err; } static void ppp_nl_dellink(struct net_device *dev, struct list_head *head) { unregister_netdevice_queue(dev, head); } static size_t ppp_nl_get_size(const struct net_device *dev) { return 0; } static int ppp_nl_fill_info(struct sk_buff *skb, const struct net_device *dev) { return 0; } static struct net *ppp_nl_get_link_net(const struct net_device *dev) { struct ppp *ppp = netdev_priv(dev); return READ_ONCE(ppp->ppp_net); } static struct rtnl_link_ops ppp_link_ops __read_mostly = { .kind = "ppp", .maxtype = IFLA_PPP_MAX, .policy = ppp_nl_policy, .priv_size = sizeof(struct ppp), .setup = ppp_setup, .validate = ppp_nl_validate, .newlink = ppp_nl_newlink, .dellink = ppp_nl_dellink, .get_size = ppp_nl_get_size, .fill_info = ppp_nl_fill_info, .get_link_net = ppp_nl_get_link_net, }; #define PPP_MAJOR 108 /* Called at boot time if ppp is compiled into the kernel, or at module load time (from init_module) if compiled as a module. */ static int __init ppp_init(void) { int err; pr_info("PPP generic driver version " PPP_VERSION "\n"); err = register_pernet_device(&ppp_net_ops); if (err) { pr_err("failed to register PPP pernet device (%d)\n", err); goto out; } err = register_chrdev(PPP_MAJOR, "ppp", &ppp_device_fops); if (err) { pr_err("failed to register PPP device (%d)\n", err); goto out_net; } err = class_register(&ppp_class); if (err) goto out_chrdev; err = rtnl_link_register(&ppp_link_ops); if (err) { pr_err("failed to register rtnetlink PPP handler\n"); goto out_class; } /* not a big deal if we fail here :-) */ device_create(&ppp_class, NULL, MKDEV(PPP_MAJOR, 0), NULL, "ppp"); return 0; out_class: class_unregister(&ppp_class); out_chrdev: unregister_chrdev(PPP_MAJOR, "ppp"); out_net: unregister_pernet_device(&ppp_net_ops); out: return err; } /* * Network interface unit routines. */ static netdev_tx_t ppp_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct ppp *ppp = netdev_priv(dev); int npi, proto; unsigned char *pp; npi = ethertype_to_npindex(ntohs(skb->protocol)); if (npi < 0) goto outf; /* Drop, accept or reject the packet */ switch (ppp->npmode[npi]) { case NPMODE_PASS: break; case NPMODE_QUEUE: /* it would be nice to have a way to tell the network system to queue this one up for later. */ goto outf; case NPMODE_DROP: case NPMODE_ERROR: goto outf; } /* Put the 2-byte PPP protocol number on the front, making sure there is room for the address and control fields. */ if (skb_cow_head(skb, PPP_HDRLEN)) goto outf; pp = skb_push(skb, 2); proto = npindex_to_proto[npi]; put_unaligned_be16(proto, pp); skb_scrub_packet(skb, !net_eq(ppp->ppp_net, dev_net(dev))); ppp_xmit_process(ppp, skb); return NETDEV_TX_OK; outf: kfree_skb(skb); ++dev->stats.tx_dropped; return NETDEV_TX_OK; } static int ppp_net_siocdevprivate(struct net_device *dev, struct ifreq *ifr, void __user *addr, int cmd) { struct ppp *ppp = netdev_priv(dev); int err = -EFAULT; struct ppp_stats stats; struct ppp_comp_stats cstats; char *vers; switch (cmd) { case SIOCGPPPSTATS: ppp_get_stats(ppp, &stats); if (copy_to_user(addr, &stats, sizeof(stats))) break; err = 0; break; case SIOCGPPPCSTATS: memset(&cstats, 0, sizeof(cstats)); if (ppp->xc_state) ppp->xcomp->comp_stat(ppp->xc_state, &cstats.c); if (ppp->rc_state) ppp->rcomp->decomp_stat(ppp->rc_state, &cstats.d); if (copy_to_user(addr, &cstats, sizeof(cstats))) break; err = 0; break; case SIOCGPPPVER: vers = PPP_VERSION; if (copy_to_user(addr, vers, strlen(vers) + 1)) break; err = 0; break; default: err = -EINVAL; } return err; } static void ppp_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats64) { stats64->rx_errors = dev->stats.rx_errors; stats64->tx_errors = dev->stats.tx_errors; stats64->rx_dropped = dev->stats.rx_dropped; stats64->tx_dropped = dev->stats.tx_dropped; stats64->rx_length_errors = dev->stats.rx_length_errors; dev_fetch_sw_netstats(stats64, dev->tstats); } static int ppp_dev_init(struct net_device *dev) { struct ppp *ppp; netdev_lockdep_set_classes(dev); ppp = netdev_priv(dev); /* Let the netdevice take a reference on the ppp file. This ensures * that ppp_destroy_interface() won't run before the device gets * unregistered. */ refcount_inc(&ppp->file.refcnt); return 0; } static void ppp_dev_uninit(struct net_device *dev) { struct ppp *ppp = netdev_priv(dev); struct ppp_net *pn = ppp_pernet(ppp->ppp_net); ppp_lock(ppp); ppp->closing = 1; ppp_unlock(ppp); mutex_lock(&pn->all_ppp_mutex); unit_put(&pn->units_idr, ppp->file.index); mutex_unlock(&pn->all_ppp_mutex); ppp->owner = NULL; ppp->file.dead = 1; wake_up_interruptible(&ppp->file.rwait); } static void ppp_dev_priv_destructor(struct net_device *dev) { struct ppp *ppp; ppp = netdev_priv(dev); if (refcount_dec_and_test(&ppp->file.refcnt)) ppp_destroy_interface(ppp); } static int ppp_fill_forward_path(struct net_device_path_ctx *ctx, struct net_device_path *path) { struct ppp *ppp = netdev_priv(ctx->dev); struct ppp_channel *chan; struct channel *pch; if (ppp->flags & SC_MULTILINK) return -EOPNOTSUPP; pch = list_first_or_null_rcu(&ppp->channels, struct channel, clist); if (!pch) return -ENODEV; chan = READ_ONCE(pch->chan); if (!chan) return -ENODEV; if (!chan->ops->fill_forward_path) return -EOPNOTSUPP; return chan->ops->fill_forward_path(ctx, path, chan); } static const struct net_device_ops ppp_netdev_ops = { .ndo_init = ppp_dev_init, .ndo_uninit = ppp_dev_uninit, .ndo_start_xmit = ppp_start_xmit, .ndo_siocdevprivate = ppp_net_siocdevprivate, .ndo_get_stats64 = ppp_get_stats64, .ndo_fill_forward_path = ppp_fill_forward_path, }; static const struct device_type ppp_type = { .name = "ppp", }; static void ppp_setup(struct net_device *dev) { dev->netdev_ops = &ppp_netdev_ops; SET_NETDEV_DEVTYPE(dev, &ppp_type); dev->lltx = true; dev->hard_header_len = PPP_HDRLEN; dev->mtu = PPP_MRU; dev->addr_len = 0; dev->tx_queue_len = 3; dev->type = ARPHRD_PPP; dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST; dev->priv_destructor = ppp_dev_priv_destructor; dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; dev->features = NETIF_F_SG | NETIF_F_FRAGLIST; dev->hw_features = dev->features; netif_keep_dst(dev); } /* * Transmit-side routines. */ /* Called to do any work queued up on the transmit side that can now be done */ static void __ppp_xmit_process(struct ppp *ppp, struct sk_buff *skb) { ppp_xmit_lock(ppp); if (!ppp->closing) { ppp_push(ppp); if (skb) skb_queue_tail(&ppp->file.xq, skb); while (!ppp->xmit_pending && (skb = skb_dequeue(&ppp->file.xq))) ppp_send_frame(ppp, skb); /* If there's no work left to do, tell the core net code that we can accept some more. */ if (!ppp->xmit_pending && !skb_peek(&ppp->file.xq)) netif_wake_queue(ppp->dev); else netif_stop_queue(ppp->dev); } else { kfree_skb(skb); } ppp_xmit_unlock(ppp); } static void ppp_xmit_process(struct ppp *ppp, struct sk_buff *skb) { struct ppp_xmit_recursion *xmit_recursion; local_bh_disable(); xmit_recursion = this_cpu_ptr(ppp->xmit_recursion); if (xmit_recursion->owner == current) goto err; local_lock_nested_bh(&ppp->xmit_recursion->bh_lock); xmit_recursion->owner = current; __ppp_xmit_process(ppp, skb); xmit_recursion->owner = NULL; local_unlock_nested_bh(&ppp->xmit_recursion->bh_lock); local_bh_enable(); return; err: local_bh_enable(); kfree_skb(skb); if (net_ratelimit()) netdev_err(ppp->dev, "recursion detected\n"); } static inline struct sk_buff * pad_compress_skb(struct ppp *ppp, struct sk_buff *skb) { struct sk_buff *new_skb; int len; int new_skb_size = ppp->dev->mtu + ppp->xcomp->comp_extra + ppp->dev->hard_header_len; int compressor_skb_size = ppp->dev->mtu + ppp->xcomp->comp_extra + PPP_HDRLEN; if (skb_linearize(skb)) return NULL; new_skb = alloc_skb(new_skb_size, GFP_ATOMIC); if (!new_skb) { if (net_ratelimit()) netdev_err(ppp->dev, "PPP: no memory (comp pkt)\n"); return NULL; } if (ppp->dev->hard_header_len > PPP_HDRLEN) skb_reserve(new_skb, ppp->dev->hard_header_len - PPP_HDRLEN); /* compressor still expects A/C bytes in hdr */ len = ppp->xcomp->compress(ppp->xc_state, skb->data - 2, new_skb->data, skb->len + 2, compressor_skb_size); if (len > 0 && (ppp->flags & SC_CCP_UP)) { consume_skb(skb); skb = new_skb; skb_put(skb, len); skb_pull(skb, 2); /* pull off A/C bytes */ } else if (len == 0) { /* didn't compress, or CCP not up yet */ consume_skb(new_skb); new_skb = skb; } else { /* * (len < 0) * MPPE requires that we do not send unencrypted * frames. The compressor will return -1 if we * should drop the frame. We cannot simply test * the compress_proto because MPPE and MPPC share * the same number. */ if (net_ratelimit()) netdev_err(ppp->dev, "ppp: compressor dropped pkt\n"); consume_skb(new_skb); new_skb = NULL; } return new_skb; } /* * Compress and send a frame. * The caller should have locked the xmit path, * and xmit_pending should be 0. */ static void ppp_send_frame(struct ppp *ppp, struct sk_buff *skb) { int proto = PPP_PROTO(skb); struct sk_buff *new_skb; int len; unsigned char *cp; skb->dev = ppp->dev; if (proto < 0x8000) { #ifdef CONFIG_PPP_FILTER /* check if the packet passes the pass and active filters. * See comment for PPP_FILTER_OUTBOUND_TAG above. */ *(__be16 *)skb_push(skb, 2) = htons(PPP_FILTER_OUTBOUND_TAG); if (ppp->pass_filter && bpf_prog_run(ppp->pass_filter, skb) == 0) { if (ppp->debug & 1) netdev_printk(KERN_DEBUG, ppp->dev, "PPP: outbound frame " "not passed\n"); kfree_skb(skb); return; } /* if this packet passes the active filter, record the time */ if (!(ppp->active_filter && bpf_prog_run(ppp->active_filter, skb) == 0)) ppp->last_xmit = jiffies; skb_pull(skb, 2); #else /* for data packets, record the time */ ppp->last_xmit = jiffies; #endif /* CONFIG_PPP_FILTER */ } dev_sw_netstats_tx_add(ppp->dev, 1, skb->len - PPP_PROTO_LEN); switch (proto) { case PPP_IP: if (!ppp->vj || (ppp->flags & SC_COMP_TCP) == 0) break; if (skb_linearize(skb)) goto drop; /* try to do VJ TCP header compression */ new_skb = alloc_skb(skb->len + ppp->dev->hard_header_len - 2, GFP_ATOMIC); if (!new_skb) { netdev_err(ppp->dev, "PPP: no memory (VJ comp pkt)\n"); goto drop; } skb_reserve(new_skb, ppp->dev->hard_header_len - 2); cp = skb->data + 2; len = slhc_compress(ppp->vj, cp, skb->len - 2, new_skb->data + 2, &cp, !(ppp->flags & SC_NO_TCP_CCID)); if (cp == skb->data + 2) { /* didn't compress */ consume_skb(new_skb); } else { if (cp[0] & SL_TYPE_COMPRESSED_TCP) { proto = PPP_VJC_COMP; cp[0] &= ~SL_TYPE_COMPRESSED_TCP; } else { proto = PPP_VJC_UNCOMP; cp[0] = skb->data[2]; } consume_skb(skb); skb = new_skb; cp = skb_put(skb, len + 2); cp[0] = 0; cp[1] = proto; } break; case PPP_CCP: /* peek at outbound CCP frames */ ppp_ccp_peek(ppp, skb, 0); break; } /* try to do packet compression */ if ((ppp->xstate & SC_COMP_RUN) && ppp->xc_state && proto != PPP_LCP && proto != PPP_CCP) { if (!(ppp->flags & SC_CCP_UP) && (ppp->flags & SC_MUST_COMP)) { if (net_ratelimit()) netdev_err(ppp->dev, "ppp: compression required but " "down - pkt dropped.\n"); goto drop; } new_skb = pad_compress_skb(ppp, skb); if (!new_skb) goto drop; skb = new_skb; } /* * If we are waiting for traffic (demand dialling), * queue it up for pppd to receive. */ if (ppp->flags & SC_LOOP_TRAFFIC) { if (ppp->file.rq.qlen > PPP_MAX_RQLEN) goto drop; skb_queue_tail(&ppp->file.rq, skb); wake_up_interruptible(&ppp->file.rwait); return; } ppp->xmit_pending = skb; ppp_push(ppp); return; drop: kfree_skb(skb); ++ppp->dev->stats.tx_errors; } /* * Try to send the frame in xmit_pending. * The caller should have the xmit path locked. */ static void ppp_push(struct ppp *ppp) { struct list_head *list; struct channel *pch; struct sk_buff *skb = ppp->xmit_pending; if (!skb) return; list = &ppp->channels; if (list_empty(list)) { /* nowhere to send the packet, just drop it */ ppp->xmit_pending = NULL; kfree_skb(skb); return; } if ((ppp->flags & SC_MULTILINK) == 0) { struct ppp_channel *chan; /* not doing multilink: send it down the first channel */ list = list->next; pch = list_entry(list, struct channel, clist); spin_lock(&pch->downl); chan = pch->chan; if (unlikely(!chan || (!chan->direct_xmit && skb_linearize(skb)))) { /* channel got unregistered, or it requires a linear * skb but linearization failed */ kfree_skb(skb); ppp->xmit_pending = NULL; goto out; } if (chan->ops->start_xmit(chan, skb)) ppp->xmit_pending = NULL; out: spin_unlock(&pch->downl); return; } #ifdef CONFIG_PPP_MULTILINK /* Multilink: fragment the packet over as many links as can take the packet at the moment. */ if (!ppp_mp_explode(ppp, skb)) return; #endif /* CONFIG_PPP_MULTILINK */ ppp->xmit_pending = NULL; kfree_skb(skb); } #ifdef CONFIG_PPP_MULTILINK static bool mp_protocol_compress __read_mostly = true; module_param(mp_protocol_compress, bool, 0644); MODULE_PARM_DESC(mp_protocol_compress, "compress protocol id in multilink fragments"); /* * Divide a packet to be transmitted into fragments and * send them out the individual links. */ static int ppp_mp_explode(struct ppp *ppp, struct sk_buff *skb) { int len, totlen; int i, bits, hdrlen, mtu; int flen; int navail, nfree, nzero; int nbigger; int totspeed; int totfree; unsigned char *p, *q; struct list_head *list; struct channel *pch; struct sk_buff *frag; struct ppp_channel *chan; totspeed = 0; /*total bitrate of the bundle*/ nfree = 0; /* # channels which have no packet already queued */ navail = 0; /* total # of usable channels (not deregistered) */ nzero = 0; /* number of channels with zero speed associated*/ totfree = 0; /*total # of channels available and *having no queued packets before *starting the fragmentation*/ hdrlen = (ppp->flags & SC_MP_XSHORTSEQ)? MPHDRLEN_SSN: MPHDRLEN; i = 0; list_for_each_entry(pch, &ppp->channels, clist) { if (pch->chan) { pch->avail = 1; navail++; pch->speed = pch->chan->speed; } else { pch->avail = 0; } if (pch->avail) { if (skb_queue_empty(&pch->file.xq) || !pch->had_frag) { if (pch->speed == 0) nzero++; else totspeed += pch->speed; pch->avail = 2; ++nfree; ++totfree; } if (!pch->had_frag && i < ppp->nxchan) ppp->nxchan = i; } ++i; } /* * Don't start sending this packet unless at least half of * the channels are free. This gives much better TCP * performance if we have a lot of channels. */ if (nfree == 0 || nfree < navail / 2) return 0; /* can't take now, leave it in xmit_pending */ /* Do protocol field compression */ if (skb_linearize(skb)) goto err_linearize; p = skb->data; len = skb->len; if (*p == 0 && mp_protocol_compress) { ++p; --len; } totlen = len; nbigger = len % nfree; /* skip to the channel after the one we last used and start at that one */ list = &ppp->channels; for (i = 0; i < ppp->nxchan; ++i) { list = list->next; if (list == &ppp->channels) { i = 0; break; } } /* create a fragment for each channel */ bits = B; while (len > 0) { list = list->next; if (list == &ppp->channels) { i = 0; continue; } pch = list_entry(list, struct channel, clist); ++i; if (!pch->avail) continue; /* * Skip this channel if it has a fragment pending already and * we haven't given a fragment to all of the free channels. */ if (pch->avail == 1) { if (nfree > 0) continue; } else { pch->avail = 1; } /* check the channel's mtu and whether it is still attached. */ spin_lock(&pch->downl); if (pch->chan == NULL) { /* can't use this channel, it's being deregistered */ if (pch->speed == 0) nzero--; else totspeed -= pch->speed; spin_unlock(&pch->downl); pch->avail = 0; totlen = len; totfree--; nfree--; if (--navail == 0) break; continue; } /* *if the channel speed is not set divide *the packet evenly among the free channels; *otherwise divide it according to the speed *of the channel we are going to transmit on */ flen = len; if (nfree > 0) { if (pch->speed == 0) { flen = len/nfree; if (nbigger > 0) { flen++; nbigger--; } } else { flen = (((totfree - nzero)*(totlen + hdrlen*totfree)) / ((totspeed*totfree)/pch->speed)) - hdrlen; if (nbigger > 0) { flen += ((totfree - nzero)*pch->speed)/totspeed; nbigger -= ((totfree - nzero)*pch->speed)/ totspeed; } } nfree--; } /* *check if we are on the last channel or *we exceded the length of the data to *fragment */ if ((nfree <= 0) || (flen > len)) flen = len; /* *it is not worth to tx on slow channels: *in that case from the resulting flen according to the *above formula will be equal or less than zero. *Skip the channel in this case */ if (flen <= 0) { pch->avail = 2; spin_unlock(&pch->downl); continue; } /* * hdrlen includes the 2-byte PPP protocol field, but the * MTU counts only the payload excluding the protocol field. * (RFC1661 Section 2) */ mtu = pch->chan->mtu - (hdrlen - 2); if (mtu < 4) mtu = 4; if (flen > mtu) flen = mtu; if (flen == len) bits |= E; frag = alloc_skb(flen + hdrlen + (flen == 0), GFP_ATOMIC); if (!frag) goto noskb; q = skb_put(frag, flen + hdrlen); /* make the MP header */ put_unaligned_be16(PPP_MP, q); if (ppp->flags & SC_MP_XSHORTSEQ) { q[2] = bits + ((ppp->nxseq >> 8) & 0xf); q[3] = ppp->nxseq; } else { q[2] = bits; q[3] = ppp->nxseq >> 16; q[4] = ppp->nxseq >> 8; q[5] = ppp->nxseq; } memcpy(q + hdrlen, p, flen); /* try to send it down the channel */ chan = pch->chan; if (!skb_queue_empty(&pch->file.xq) || !chan->ops->start_xmit(chan, frag)) skb_queue_tail(&pch->file.xq, frag); pch->had_frag = 1; p += flen; len -= flen; ++ppp->nxseq; bits = 0; spin_unlock(&pch->downl); } ppp->nxchan = i; return 1; noskb: spin_unlock(&pch->downl); err_linearize: if (ppp->debug & 1) netdev_err(ppp->dev, "PPP: no memory (fragment)\n"); ++ppp->dev->stats.tx_errors; ++ppp->nxseq; return 1; /* abandon the frame */ } #endif /* CONFIG_PPP_MULTILINK */ /* Try to send data out on a channel */ static void __ppp_channel_push(struct channel *pch, struct ppp *ppp) { struct sk_buff *skb; spin_lock(&pch->downl); if (pch->chan) { while (!skb_queue_empty(&pch->file.xq)) { skb = skb_dequeue(&pch->file.xq); if (!pch->chan->ops->start_xmit(pch->chan, skb)) { /* put the packet back and try again later */ skb_queue_head(&pch->file.xq, skb); break; } } } else { /* channel got deregistered */ skb_queue_purge(&pch->file.xq); } spin_unlock(&pch->downl); /* see if there is anything from the attached unit to be sent */ if (skb_queue_empty(&pch->file.xq)) { if (ppp) __ppp_xmit_process(ppp, NULL); } } static void ppp_channel_push(struct channel *pch) { struct ppp_xmit_recursion *xmit_recursion; struct ppp *ppp; rcu_read_lock_bh(); ppp = rcu_dereference_bh(pch->ppp); if (ppp) { xmit_recursion = this_cpu_ptr(ppp->xmit_recursion); local_lock_nested_bh(&ppp->xmit_recursion->bh_lock); xmit_recursion->owner = current; __ppp_channel_push(pch, ppp); xmit_recursion->owner = NULL; local_unlock_nested_bh(&ppp->xmit_recursion->bh_lock); } else { __ppp_channel_push(pch, NULL); } rcu_read_unlock_bh(); } /* * Receive-side routines. */ struct ppp_mp_skb_parm { u32 sequence; u8 BEbits; }; #define PPP_MP_CB(skb) ((struct ppp_mp_skb_parm *)((skb)->cb)) static inline void ppp_do_recv(struct ppp *ppp, struct sk_buff *skb, struct channel *pch) { ppp_recv_lock(ppp); if (!ppp->closing) ppp_receive_frame(ppp, skb, pch); else kfree_skb(skb); ppp_recv_unlock(ppp); } /** * __ppp_decompress_proto - Decompress protocol field, slim version. * @skb: Socket buffer where protocol field should be decompressed. It must have * at least 1 byte of head room and 1 byte of linear data. First byte of * data must be a protocol field byte. * * Decompress protocol field in PPP header if it's compressed, e.g. when * Protocol-Field-Compression (PFC) was negotiated. No checks w.r.t. skb data * length are done in this function. */ static void __ppp_decompress_proto(struct sk_buff *skb) { if (skb->data[0] & 0x01) *(u8 *)skb_push(skb, 1) = 0x00; } /** * ppp_decompress_proto - Check skb data room and decompress protocol field. * @skb: Socket buffer where protocol field should be decompressed. First byte * of data must be a protocol field byte. * * Decompress protocol field in PPP header if it's compressed, e.g. when * Protocol-Field-Compression (PFC) was negotiated. This function also makes * sure that skb data room is sufficient for Protocol field, before and after * decompression. * * Return: true - decompressed successfully, false - not enough room in skb. */ static bool ppp_decompress_proto(struct sk_buff *skb) { /* At least one byte should be present (if protocol is compressed) */ if (!pskb_may_pull(skb, 1)) return false; __ppp_decompress_proto(skb); /* Protocol field should occupy 2 bytes when not compressed */ return pskb_may_pull(skb, 2); } /* Attempt to handle a frame via. a bridged channel, if one exists. * If the channel is bridged, the frame is consumed by the bridge. * If not, the caller must handle the frame by normal recv mechanisms. * Returns true if the frame is consumed, false otherwise. */ static bool ppp_channel_bridge_input(struct channel *pch, struct sk_buff *skb) { struct channel *pchb; rcu_read_lock(); pchb = rcu_dereference(pch->bridge); if (!pchb) goto out_rcu; spin_lock_bh(&pchb->downl); if (!pchb->chan) { /* channel got unregistered */ kfree_skb(skb); goto outl; } skb_scrub_packet(skb, !net_eq(pch->chan_net, pchb->chan_net)); if (!pchb->chan->ops->start_xmit(pchb->chan, skb)) kfree_skb(skb); outl: spin_unlock_bh(&pchb->downl); out_rcu: rcu_read_unlock(); /* If pchb is set then we've consumed the packet */ return !!pchb; } void ppp_input(struct ppp_channel *chan, struct sk_buff *skb) { struct channel *pch = chan->ppp; struct ppp *ppp; int proto; if (!pch) { kfree_skb(skb); return; } /* If the channel is bridged, transmit via. bridge */ if (ppp_channel_bridge_input(pch, skb)) return; rcu_read_lock_bh(); ppp = rcu_dereference_bh(pch->ppp); if (!ppp_decompress_proto(skb)) { kfree_skb(skb); if (ppp) { ++ppp->dev->stats.rx_length_errors; ppp_receive_error(ppp); } goto done; } proto = PPP_PROTO(skb); if (!ppp || proto >= 0xc000 || proto == PPP_CCPFRAG) { /* put it on the channel queue */ skb_queue_tail(&pch->file.rq, skb); /* drop old frames if queue too long */ while (pch->file.rq.qlen > PPP_MAX_RQLEN && (skb = skb_dequeue(&pch->file.rq))) kfree_skb(skb); wake_up_interruptible(&pch->file.rwait); } else { ppp_do_recv(ppp, skb, pch); } done: rcu_read_unlock_bh(); } /* Put a 0-length skb in the receive queue as an error indication */ void ppp_input_error(struct ppp_channel *chan, int code) { struct channel *pch = chan->ppp; struct sk_buff *skb; struct ppp *ppp; if (!pch) return; rcu_read_lock_bh(); ppp = rcu_dereference_bh(pch->ppp); if (ppp) { skb = alloc_skb(0, GFP_ATOMIC); if (skb) { skb->len = 0; /* probably unnecessary */ skb->cb[0] = code; ppp_do_recv(ppp, skb, pch); } } rcu_read_unlock_bh(); } /* * We come in here to process a received frame. * The receive side of the ppp unit is locked. */ static void ppp_receive_frame(struct ppp *ppp, struct sk_buff *skb, struct channel *pch) { /* note: a 0-length skb is used as an error indication */ if (skb->len > 0) { skb_checksum_complete_unset(skb); #ifdef CONFIG_PPP_MULTILINK /* XXX do channel-level decompression here */ if (PPP_PROTO(skb) == PPP_MP) ppp_receive_mp_frame(ppp, skb, pch); else #endif /* CONFIG_PPP_MULTILINK */ ppp_receive_nonmp_frame(ppp, skb); } else { kfree_skb(skb); ppp_receive_error(ppp); } } static void ppp_receive_error(struct ppp *ppp) { ++ppp->dev->stats.rx_errors; if (ppp->vj) slhc_toss(ppp->vj); } static void ppp_receive_nonmp_frame(struct ppp *ppp, struct sk_buff *skb) { struct sk_buff *ns; int proto, len, npi; /* * Decompress the frame, if compressed. * Note that some decompressors need to see uncompressed frames * that come in as well as compressed frames. */ if (ppp->rc_state && (ppp->rstate & SC_DECOMP_RUN) && (ppp->rstate & (SC_DC_FERROR | SC_DC_ERROR)) == 0) skb = ppp_decompress_frame(ppp, skb); if (ppp->flags & SC_MUST_COMP && ppp->rstate & SC_DC_FERROR) goto err; /* At this point the "Protocol" field MUST be decompressed, either in * ppp_input(), ppp_decompress_frame() or in ppp_receive_mp_frame(). */ proto = PPP_PROTO(skb); switch (proto) { case PPP_VJC_COMP: /* decompress VJ compressed packets */ if (!ppp->vj || (ppp->flags & SC_REJ_COMP_TCP)) goto err; if (skb_tailroom(skb) < 124 || skb_cloned(skb)) { /* copy to a new sk_buff with more tailroom */ ns = dev_alloc_skb(skb->len + 128); if (!ns) { netdev_err(ppp->dev, "PPP: no memory " "(VJ decomp)\n"); goto err; } skb_reserve(ns, 2); skb_copy_bits(skb, 0, skb_put(ns, skb->len), skb->len); consume_skb(skb); skb = ns; } else skb->ip_summed = CHECKSUM_NONE; len = slhc_uncompress(ppp->vj, skb->data + 2, skb->len - 2); if (len <= 0) { netdev_printk(KERN_DEBUG, ppp->dev, "PPP: VJ decompression error\n"); goto err; } len += 2; if (len > skb->len) skb_put(skb, len - skb->len); else if (len < skb->len) skb_trim(skb, len); proto = PPP_IP; break; case PPP_VJC_UNCOMP: if (!ppp->vj || (ppp->flags & SC_REJ_COMP_TCP)) goto err; /* Until we fix the decompressor need to make sure * data portion is linear. */ if (!pskb_may_pull(skb, skb->len)) goto err; if (slhc_remember(ppp->vj, skb->data + 2, skb->len - 2) <= 0) { netdev_err(ppp->dev, "PPP: VJ uncompressed error\n"); goto err; } proto = PPP_IP; break; case PPP_CCP: ppp_ccp_peek(ppp, skb, 1); break; } dev_sw_netstats_rx_add(ppp->dev, skb->len - PPP_PROTO_LEN); npi = proto_to_npindex(proto); if (npi < 0) { /* control or unknown frame - pass it to pppd */ skb_queue_tail(&ppp->file.rq, skb); /* limit queue length by dropping old frames */ while (ppp->file.rq.qlen > PPP_MAX_RQLEN && (skb = skb_dequeue(&ppp->file.rq))) kfree_skb(skb); /* wake up any process polling or blocking on read */ wake_up_interruptible(&ppp->file.rwait); } else { /* network protocol frame - give it to the kernel */ #ifdef CONFIG_PPP_FILTER if (ppp->pass_filter || ppp->active_filter) { if (skb_unclone(skb, GFP_ATOMIC)) goto err; /* Check if the packet passes the pass and active filters. * See comment for PPP_FILTER_INBOUND_TAG above. */ *(__be16 *)skb_push(skb, 2) = htons(PPP_FILTER_INBOUND_TAG); if (ppp->pass_filter && bpf_prog_run(ppp->pass_filter, skb) == 0) { if (ppp->debug & 1) netdev_printk(KERN_DEBUG, ppp->dev, "PPP: inbound frame " "not passed\n"); kfree_skb(skb); return; } if (!(ppp->active_filter && bpf_prog_run(ppp->active_filter, skb) == 0)) ppp->last_recv = jiffies; __skb_pull(skb, 2); } else #endif /* CONFIG_PPP_FILTER */ ppp->last_recv = jiffies; if ((ppp->dev->flags & IFF_UP) == 0 || ppp->npmode[npi] != NPMODE_PASS) { kfree_skb(skb); } else { /* chop off protocol */ skb_pull_rcsum(skb, 2); skb->dev = ppp->dev; skb->protocol = htons(npindex_to_ethertype[npi]); skb_reset_mac_header(skb); skb_scrub_packet(skb, !net_eq(ppp->ppp_net, dev_net(ppp->dev))); netif_rx(skb); } } return; err: kfree_skb(skb); ppp_receive_error(ppp); } static struct sk_buff * ppp_decompress_frame(struct ppp *ppp, struct sk_buff *skb) { int proto = PPP_PROTO(skb); struct sk_buff *ns; int len; /* Until we fix all the decompressor's need to make sure * data portion is linear. */ if (!pskb_may_pull(skb, skb->len)) goto err; if (proto == PPP_COMP) { int obuff_size; switch(ppp->rcomp->compress_proto) { case CI_MPPE: obuff_size = ppp->mru + PPP_HDRLEN + 1; break; default: obuff_size = ppp->mru + PPP_HDRLEN; break; } ns = dev_alloc_skb(obuff_size); if (!ns) { netdev_err(ppp->dev, "ppp_decompress_frame: " "no memory\n"); goto err; } /* the decompressor still expects the A/C bytes in the hdr */ len = ppp->rcomp->decompress(ppp->rc_state, skb->data - 2, skb->len + 2, ns->data, obuff_size); if (len < 0) { /* Pass the compressed frame to pppd as an error indication. */ if (len == DECOMP_FATALERROR) ppp->rstate |= SC_DC_FERROR; kfree_skb(ns); goto err; } consume_skb(skb); skb = ns; skb_put(skb, len); skb_pull(skb, 2); /* pull off the A/C bytes */ /* Don't call __ppp_decompress_proto() here, but instead rely on * corresponding algo (mppe/bsd/deflate) to decompress it. */ } else { /* Uncompressed frame - pass to decompressor so it can update its dictionary if necessary. */ if (ppp->rcomp->incomp) ppp->rcomp->incomp(ppp->rc_state, skb->data - 2, skb->len + 2); } return skb; err: ppp->rstate |= SC_DC_ERROR; ppp_receive_error(ppp); return skb; } #ifdef CONFIG_PPP_MULTILINK /* * Receive a multilink frame. * We put it on the reconstruction queue and then pull off * as many completed frames as we can. */ static void ppp_receive_mp_frame(struct ppp *ppp, struct sk_buff *skb, struct channel *pch) { u32 mask, seq; struct channel *ch; int mphdrlen = (ppp->flags & SC_MP_SHORTSEQ)? MPHDRLEN_SSN: MPHDRLEN; if (!pskb_may_pull(skb, mphdrlen + 1) || ppp->mrru == 0) goto err; /* no good, throw it away */ /* Decode sequence number and begin/end bits */ if (ppp->flags & SC_MP_SHORTSEQ) { seq = ((skb->data[2] & 0x0f) << 8) | skb->data[3]; mask = 0xfff; } else { seq = (skb->data[3] << 16) | (skb->data[4] << 8)| skb->data[5]; mask = 0xffffff; } PPP_MP_CB(skb)->BEbits = skb->data[2]; skb_pull(skb, mphdrlen); /* pull off PPP and MP headers */ /* * Do protocol ID decompression on the first fragment of each packet. * We have to do that here, because ppp_receive_nonmp_frame() expects * decompressed protocol field. */ if (PPP_MP_CB(skb)->BEbits & B) __ppp_decompress_proto(skb); /* * Expand sequence number to 32 bits, making it as close * as possible to ppp->minseq. */ seq |= ppp->minseq & ~mask; if ((int)(ppp->minseq - seq) > (int)(mask >> 1)) seq += mask + 1; else if ((int)(seq - ppp->minseq) > (int)(mask >> 1)) seq -= mask + 1; /* should never happen */ PPP_MP_CB(skb)->sequence = seq; pch->lastseq = seq; /* * If this packet comes before the next one we were expecting, * drop it. */ if (seq_before(seq, ppp->nextseq)) { kfree_skb(skb); ++ppp->dev->stats.rx_dropped; ppp_receive_error(ppp); return; } /* * Reevaluate minseq, the minimum over all channels of the * last sequence number received on each channel. Because of * the increasing sequence number rule, we know that any fragment * before `minseq' which hasn't arrived is never going to arrive. * The list of channels can't change because we have the receive * side of the ppp unit locked. */ list_for_each_entry(ch, &ppp->channels, clist) { if (seq_before(ch->lastseq, seq)) seq = ch->lastseq; } if (seq_before(ppp->minseq, seq)) ppp->minseq = seq; /* Put the fragment on the reconstruction queue */ ppp_mp_insert(ppp, skb); /* If the queue is getting long, don't wait any longer for packets before the start of the queue. */ if (skb_queue_len(&ppp->mrq) >= PPP_MP_MAX_QLEN) { struct sk_buff *mskb = skb_peek(&ppp->mrq); if (seq_before(ppp->minseq, PPP_MP_CB(mskb)->sequence)) ppp->minseq = PPP_MP_CB(mskb)->sequence; } /* Pull completed packets off the queue and receive them. */ while ((skb = ppp_mp_reconstruct(ppp))) { if (pskb_may_pull(skb, 2)) ppp_receive_nonmp_frame(ppp, skb); else { ++ppp->dev->stats.rx_length_errors; kfree_skb(skb); ppp_receive_error(ppp); } } return; err: kfree_skb(skb); ppp_receive_error(ppp); } /* * Insert a fragment on the MP reconstruction queue. * The queue is ordered by increasing sequence number. */ static void ppp_mp_insert(struct ppp *ppp, struct sk_buff *skb) { struct sk_buff *p; struct sk_buff_head *list = &ppp->mrq; u32 seq = PPP_MP_CB(skb)->sequence; /* N.B. we don't need to lock the list lock because we have the ppp unit receive-side lock. */ skb_queue_walk(list, p) { if (seq_before(seq, PPP_MP_CB(p)->sequence)) break; } __skb_queue_before(list, p, skb); } /* * Reconstruct a packet from the MP fragment queue. * We go through increasing sequence numbers until we find a * complete packet, or we get to the sequence number for a fragment * which hasn't arrived but might still do so. */ static struct sk_buff * ppp_mp_reconstruct(struct ppp *ppp) { u32 seq = ppp->nextseq; u32 minseq = ppp->minseq; struct sk_buff_head *list = &ppp->mrq; struct sk_buff *p, *tmp; struct sk_buff *head, *tail; struct sk_buff *skb = NULL; int lost = 0, len = 0; if (ppp->mrru == 0) /* do nothing until mrru is set */ return NULL; head = __skb_peek(list); tail = NULL; skb_queue_walk_safe(list, p, tmp) { again: if (seq_before(PPP_MP_CB(p)->sequence, seq)) { /* this can't happen, anyway ignore the skb */ netdev_err(ppp->dev, "ppp_mp_reconstruct bad " "seq %u < %u\n", PPP_MP_CB(p)->sequence, seq); __skb_unlink(p, list); kfree_skb(p); continue; } if (PPP_MP_CB(p)->sequence != seq) { u32 oldseq; /* Fragment `seq' is missing. If it is after minseq, it might arrive later, so stop here. */ if (seq_after(seq, minseq)) break; /* Fragment `seq' is lost, keep going. */ lost = 1; oldseq = seq; seq = seq_before(minseq, PPP_MP_CB(p)->sequence)? minseq + 1: PPP_MP_CB(p)->sequence; if (ppp->debug & 1) netdev_printk(KERN_DEBUG, ppp->dev, "lost frag %u..%u\n", oldseq, seq-1); goto again; } /* * At this point we know that all the fragments from * ppp->nextseq to seq are either present or lost. * Also, there are no complete packets in the queue * that have no missing fragments and end before this * fragment. */ /* B bit set indicates this fragment starts a packet */ if (PPP_MP_CB(p)->BEbits & B) { head = p; lost = 0; len = 0; } len += p->len; /* Got a complete packet yet? */ if (lost == 0 && (PPP_MP_CB(p)->BEbits & E) && (PPP_MP_CB(head)->BEbits & B)) { if (len > ppp->mrru + 2) { ++ppp->dev->stats.rx_length_errors; netdev_printk(KERN_DEBUG, ppp->dev, "PPP: reconstructed packet" " is too long (%d)\n", len); } else { tail = p; break; } ppp->nextseq = seq + 1; } /* * If this is the ending fragment of a packet, * and we haven't found a complete valid packet yet, * we can discard up to and including this fragment. */ if (PPP_MP_CB(p)->BEbits & E) { struct sk_buff *tmp2; skb_queue_reverse_walk_from_safe(list, p, tmp2) { if (ppp->debug & 1) netdev_printk(KERN_DEBUG, ppp->dev, "discarding frag %u\n", PPP_MP_CB(p)->sequence); __skb_unlink(p, list); kfree_skb(p); } head = skb_peek(list); if (!head) break; } ++seq; } /* If we have a complete packet, copy it all into one skb. */ if (tail != NULL) { /* If we have discarded any fragments, signal a receive error. */ if (PPP_MP_CB(head)->sequence != ppp->nextseq) { skb_queue_walk_safe(list, p, tmp) { if (p == head) break; if (ppp->debug & 1) netdev_printk(KERN_DEBUG, ppp->dev, "discarding frag %u\n", PPP_MP_CB(p)->sequence); __skb_unlink(p, list); kfree_skb(p); } if (ppp->debug & 1) netdev_printk(KERN_DEBUG, ppp->dev, " missed pkts %u..%u\n", ppp->nextseq, PPP_MP_CB(head)->sequence-1); ++ppp->dev->stats.rx_dropped; ppp_receive_error(ppp); } skb = head; if (head != tail) { struct sk_buff **fragpp = &skb_shinfo(skb)->frag_list; p = skb_queue_next(list, head); __skb_unlink(skb, list); skb_queue_walk_from_safe(list, p, tmp) { __skb_unlink(p, list); *fragpp = p; p->next = NULL; fragpp = &p->next; skb->len += p->len; skb->data_len += p->len; skb->truesize += p->truesize; if (p == tail) break; } } else { __skb_unlink(skb, list); } ppp->nextseq = PPP_MP_CB(tail)->sequence + 1; } return skb; } #endif /* CONFIG_PPP_MULTILINK */ /* * Channel interface. */ /* Create a new, unattached ppp channel. */ int ppp_register_channel(struct ppp_channel *chan) { return ppp_register_net_channel(current->nsproxy->net_ns, chan); } /* Create a new, unattached ppp channel for specified net. */ int ppp_register_net_channel(struct net *net, struct ppp_channel *chan) { struct channel *pch; struct ppp_net *pn; pch = kzalloc_obj(struct channel); if (!pch) return -ENOMEM; pn = ppp_pernet(net); pch->chan = chan; pch->chan_net = get_net_track(net, &pch->ns_tracker, GFP_KERNEL); chan->ppp = pch; init_ppp_file(&pch->file, CHANNEL); pch->file.hdrlen = chan->hdrlen; #ifdef CONFIG_PPP_MULTILINK pch->lastseq = -1; #endif /* CONFIG_PPP_MULTILINK */ init_rwsem(&pch->chan_sem); spin_lock_init(&pch->downl); spin_lock_init(&pch->upl); spin_lock_bh(&pn->all_channels_lock); pch->file.index = ++pn->last_channel_index; list_add(&pch->list, &pn->new_channels); atomic_inc(&channel_count); spin_unlock_bh(&pn->all_channels_lock); return 0; } /* * Return the index of a channel. */ int ppp_channel_index(struct ppp_channel *chan) { struct channel *pch = chan->ppp; if (pch) return pch->file.index; return -1; } /* * Return the PPP unit number to which a channel is connected. */ int ppp_unit_number(struct ppp_channel *chan) { struct channel *pch = chan->ppp; struct ppp *ppp; int unit = -1; if (pch) { rcu_read_lock(); ppp = rcu_dereference(pch->ppp); if (ppp) unit = ppp->file.index; rcu_read_unlock(); } return unit; } /* * Return the PPP device interface name of a channel. */ char *ppp_dev_name(struct ppp_channel *chan) { struct channel *pch = chan->ppp; char *name = NULL; struct ppp *ppp; if (pch) { rcu_read_lock(); ppp = rcu_dereference(pch->ppp); if (ppp && ppp->dev) name = ppp->dev->name; rcu_read_unlock(); } return name; } /* * Disconnect a channel from the generic layer. * This must be called in process context. */ void ppp_unregister_channel(struct ppp_channel *chan) { struct channel *pch = chan->ppp; struct ppp_net *pn; if (!pch) return; /* should never happen */ chan->ppp = NULL; /* * This ensures that we have returned from any calls into * the channel's start_xmit or ioctl routine before we proceed. */ down_write(&pch->chan_sem); spin_lock_bh(&pch->downl); WRITE_ONCE(pch->chan, NULL); spin_unlock_bh(&pch->downl); up_write(&pch->chan_sem); ppp_disconnect_channel(pch); pn = ppp_pernet(pch->chan_net); spin_lock_bh(&pn->all_channels_lock); list_del(&pch->list); spin_unlock_bh(&pn->all_channels_lock); ppp_unbridge_channels(pch); pch->file.dead = 1; wake_up_interruptible(&pch->file.rwait); if (refcount_dec_and_test(&pch->file.refcnt)) ppp_destroy_channel(pch); } /* * Callback from a channel when it can accept more to transmit. * This should be called at BH/softirq level, not interrupt level. */ void ppp_output_wakeup(struct ppp_channel *chan) { struct channel *pch = chan->ppp; if (!pch) return; ppp_channel_push(pch); } /* * Compression control. */ /* Process the PPPIOCSCOMPRESS ioctl. */ static int ppp_set_compress(struct ppp *ppp, struct ppp_option_data *data) { int err = -EFAULT; struct compressor *cp, *ocomp; void *state, *ostate; unsigned char ccp_option[CCP_MAX_OPTION_LENGTH]; if (data->length > CCP_MAX_OPTION_LENGTH) goto out; if (copy_from_user(ccp_option, data->ptr, data->length)) goto out; err = -EINVAL; if (data->length < 2 || ccp_option[1] < 2 || ccp_option[1] > data->length) goto out; cp = try_then_request_module( find_compressor(ccp_option[0]), "ppp-compress-%d", ccp_option[0]); if (!cp) goto out; err = -ENOBUFS; if (data->transmit) { state = cp->comp_alloc(ccp_option, data->length); if (state) { ppp_xmit_lock(ppp); ppp->xstate &= ~SC_COMP_RUN; ocomp = ppp->xcomp; ostate = ppp->xc_state; ppp->xcomp = cp; ppp->xc_state = state; ppp_xmit_unlock(ppp); if (ostate) { ocomp->comp_free(ostate); module_put(ocomp->owner); } err = 0; } else module_put(cp->owner); } else { state = cp->decomp_alloc(ccp_option, data->length); if (state) { ppp_recv_lock(ppp); ppp->rstate &= ~SC_DECOMP_RUN; ocomp = ppp->rcomp; ostate = ppp->rc_state; ppp->rcomp = cp; ppp->rc_state = state; ppp_recv_unlock(ppp); if (ostate) { ocomp->decomp_free(ostate); module_put(ocomp->owner); } err = 0; } else module_put(cp->owner); } out: return err; } /* * Look at a CCP packet and update our state accordingly. * We assume the caller has the xmit or recv path locked. */ static void ppp_ccp_peek(struct ppp *ppp, struct sk_buff *skb, int inbound) { unsigned char *dp; int len; if (!pskb_may_pull(skb, CCP_HDRLEN + 2)) return; /* no header */ dp = skb->data + 2; switch (CCP_CODE(dp)) { case CCP_CONFREQ: /* A ConfReq starts negotiation of compression * in one direction of transmission, * and hence brings it down...but which way? * * Remember: * A ConfReq indicates what the sender would like to receive */ if(inbound) /* He is proposing what I should send */ ppp->xstate &= ~SC_COMP_RUN; else /* I am proposing to what he should send */ ppp->rstate &= ~SC_DECOMP_RUN; break; case CCP_TERMREQ: case CCP_TERMACK: /* * CCP is going down, both directions of transmission */ ppp->rstate &= ~SC_DECOMP_RUN; ppp->xstate &= ~SC_COMP_RUN; break; case CCP_CONFACK: if ((ppp->flags & (SC_CCP_OPEN | SC_CCP_UP)) != SC_CCP_OPEN) break; len = CCP_LENGTH(dp); if (!pskb_may_pull(skb, len + 2)) return; /* too short */ dp += CCP_HDRLEN; len -= CCP_HDRLEN; if (len < CCP_OPT_MINLEN || len < CCP_OPT_LENGTH(dp)) break; if (inbound) { /* we will start receiving compressed packets */ if (!ppp->rc_state) break; if (ppp->rcomp->decomp_init(ppp->rc_state, dp, len, ppp->file.index, 0, ppp->mru, ppp->debug)) { ppp->rstate |= SC_DECOMP_RUN; ppp->rstate &= ~(SC_DC_ERROR | SC_DC_FERROR); } } else { /* we will soon start sending compressed packets */ if (!ppp->xc_state) break; if (ppp->xcomp->comp_init(ppp->xc_state, dp, len, ppp->file.index, 0, ppp->debug)) ppp->xstate |= SC_COMP_RUN; } break; case CCP_RESETACK: /* reset the [de]compressor */ if ((ppp->flags & SC_CCP_UP) == 0) break; if (inbound) { if (ppp->rc_state && (ppp->rstate & SC_DECOMP_RUN)) { ppp->rcomp->decomp_reset(ppp->rc_state); ppp->rstate &= ~SC_DC_ERROR; } } else { if (ppp->xc_state && (ppp->xstate & SC_COMP_RUN)) ppp->xcomp->comp_reset(ppp->xc_state); } break; } } /* Free up compression resources. */ static void ppp_ccp_closed(struct ppp *ppp) { void *xstate, *rstate; struct compressor *xcomp, *rcomp; ppp_lock(ppp); ppp->flags &= ~(SC_CCP_OPEN | SC_CCP_UP); ppp->xstate = 0; xcomp = ppp->xcomp; xstate = ppp->xc_state; ppp->xc_state = NULL; ppp->rstate = 0; rcomp = ppp->rcomp; rstate = ppp->rc_state; ppp->rc_state = NULL; ppp_unlock(ppp); if (xstate) { xcomp->comp_free(xstate); module_put(xcomp->owner); } if (rstate) { rcomp->decomp_free(rstate); module_put(rcomp->owner); } } /* List of compressors. */ static LIST_HEAD(compressor_list); static DEFINE_SPINLOCK(compressor_list_lock); struct compressor_entry { struct list_head list; struct compressor *comp; }; static struct compressor_entry * find_comp_entry(int proto) { struct compressor_entry *ce; list_for_each_entry(ce, &compressor_list, list) { if (ce->comp->compress_proto == proto) return ce; } return NULL; } /* Register a compressor */ int ppp_register_compressor(struct compressor *cp) { struct compressor_entry *ce; int ret; spin_lock(&compressor_list_lock); ret = -EEXIST; if (find_comp_entry(cp->compress_proto)) goto out; ret = -ENOMEM; ce = kmalloc_obj(struct compressor_entry, GFP_ATOMIC); if (!ce) goto out; ret = 0; ce->comp = cp; list_add(&ce->list, &compressor_list); out: spin_unlock(&compressor_list_lock); return ret; } /* Unregister a compressor */ void ppp_unregister_compressor(struct compressor *cp) { struct compressor_entry *ce; spin_lock(&compressor_list_lock); ce = find_comp_entry(cp->compress_proto); if (ce && ce->comp == cp) { list_del(&ce->list); kfree(ce); } spin_unlock(&compressor_list_lock); } /* Find a compressor. */ static struct compressor * find_compressor(int type) { struct compressor_entry *ce; struct compressor *cp = NULL; spin_lock(&compressor_list_lock); ce = find_comp_entry(type); if (ce) { cp = ce->comp; if (!try_module_get(cp->owner)) cp = NULL; } spin_unlock(&compressor_list_lock); return cp; } /* * Miscelleneous stuff. */ static void ppp_get_stats(struct ppp *ppp, struct ppp_stats *st) { struct slcompress *vj = ppp->vj; int cpu; memset(st, 0, sizeof(*st)); for_each_possible_cpu(cpu) { struct pcpu_sw_netstats *p = per_cpu_ptr(ppp->dev->tstats, cpu); u64 rx_packets, rx_bytes, tx_packets, tx_bytes; rx_packets = u64_stats_read(&p->rx_packets); rx_bytes = u64_stats_read(&p->rx_bytes); tx_packets = u64_stats_read(&p->tx_packets); tx_bytes = u64_stats_read(&p->tx_bytes); st->p.ppp_ipackets += rx_packets; st->p.ppp_ibytes += rx_bytes; st->p.ppp_opackets += tx_packets; st->p.ppp_obytes += tx_bytes; } st->p.ppp_ierrors = ppp->dev->stats.rx_errors; st->p.ppp_oerrors = ppp->dev->stats.tx_errors; if (!vj) return; st->vj.vjs_packets = vj->sls_o_compressed + vj->sls_o_uncompressed; st->vj.vjs_compressed = vj->sls_o_compressed; st->vj.vjs_searches = vj->sls_o_searches; st->vj.vjs_misses = vj->sls_o_misses; st->vj.vjs_errorin = vj->sls_i_error; st->vj.vjs_tossed = vj->sls_i_tossed; st->vj.vjs_uncompressedin = vj->sls_i_uncompressed; st->vj.vjs_compressedin = vj->sls_i_compressed; } /* * Stuff for handling the lists of ppp units and channels * and for initialization. */ /* * Create a new ppp interface unit. Fails if it can't allocate memory * or if there is already a unit with the requested number. * unit == -1 means allocate a new number. */ static int ppp_create_interface(struct net *net, struct file *file, int *unit) { struct ppp_config conf = { .file = file, .unit = *unit, .ifname_is_set = false, }; struct net_device *dev; struct ppp *ppp; int err; dev = alloc_netdev(sizeof(struct ppp), "", NET_NAME_ENUM, ppp_setup); if (!dev) { err = -ENOMEM; goto err; } dev_net_set(dev, net); dev->rtnl_link_ops = &ppp_link_ops; rtnl_lock(); err = ppp_dev_configure(net, dev, &conf); if (err < 0) goto err_dev; ppp = netdev_priv(dev); *unit = ppp->file.index; rtnl_unlock(); return 0; err_dev: rtnl_unlock(); free_netdev(dev); err: return err; } /* * Initialize a ppp_file structure. */ static void init_ppp_file(struct ppp_file *pf, int kind) { pf->kind = kind; skb_queue_head_init(&pf->xq); skb_queue_head_init(&pf->rq); refcount_set(&pf->refcnt, 1); init_waitqueue_head(&pf->rwait); } /* * Free the memory used by a ppp unit. This is only called once * there are no channels connected to the unit and no file structs * that reference the unit. */ static void ppp_destroy_interface(struct ppp *ppp) { atomic_dec(&ppp_unit_count); if (!ppp->file.dead || ppp->n_channels) { /* "can't happen" */ netdev_err(ppp->dev, "ppp: destroying ppp struct %p " "but dead=%d n_channels=%d !\n", ppp, ppp->file.dead, ppp->n_channels); return; } ppp_ccp_closed(ppp); if (ppp->vj) { slhc_free(ppp->vj); ppp->vj = NULL; } skb_queue_purge(&ppp->file.xq); skb_queue_purge(&ppp->file.rq); #ifdef CONFIG_PPP_MULTILINK skb_queue_purge(&ppp->mrq); #endif /* CONFIG_PPP_MULTILINK */ #ifdef CONFIG_PPP_FILTER if (ppp->pass_filter) { bpf_prog_destroy(ppp->pass_filter); ppp->pass_filter = NULL; } if (ppp->active_filter) { bpf_prog_destroy(ppp->active_filter); ppp->active_filter = NULL; } #endif /* CONFIG_PPP_FILTER */ kfree_skb(ppp->xmit_pending); free_percpu(ppp->xmit_recursion); free_netdev(ppp->dev); } /* * Locate an existing ppp unit. * The caller should have locked the all_ppp_mutex. */ static struct ppp * ppp_find_unit(struct ppp_net *pn, int unit) { return unit_find(&pn->units_idr, unit); } /* * Locate an existing ppp channel. * The caller should have locked the all_channels_lock. * First we look in the new_channels list, then in the * all_channels list. If found in the new_channels list, * we move it to the all_channels list. This is for speed * when we have a lot of channels in use. */ static struct channel * ppp_find_channel(struct ppp_net *pn, int unit) { struct channel *pch; list_for_each_entry(pch, &pn->new_channels, list) { if (pch->file.index == unit) { list_move(&pch->list, &pn->all_channels); return pch; } } list_for_each_entry(pch, &pn->all_channels, list) { if (pch->file.index == unit) return pch; } return NULL; } /* * Connect a PPP channel to a PPP interface unit. */ static int ppp_connect_channel(struct channel *pch, int unit) { struct ppp *ppp; struct ppp_net *pn; int ret = -ENXIO; int hdrlen; pn = ppp_pernet(pch->chan_net); mutex_lock(&pn->all_ppp_mutex); ppp = ppp_find_unit(pn, unit); if (!ppp) goto out; spin_lock(&pch->upl); ret = -EINVAL; if (rcu_dereference_protected(pch->ppp, lockdep_is_held(&pch->upl)) || rcu_dereference_protected(pch->bridge, lockdep_is_held(&pch->upl))) goto outl; ppp_lock(ppp); spin_lock_bh(&pch->downl); if (!pch->chan) { /* Don't connect unregistered channels */ spin_unlock_bh(&pch->downl); ppp_unlock(ppp); ret = -ENOTCONN; goto outl; } if (pch->chan->direct_xmit) ppp->dev->priv_flags |= IFF_NO_QUEUE; else ppp->dev->priv_flags &= ~IFF_NO_QUEUE; spin_unlock_bh(&pch->downl); if (pch->file.hdrlen > ppp->file.hdrlen) ppp->file.hdrlen = pch->file.hdrlen; hdrlen = pch->file.hdrlen + 2; /* for protocol bytes */ if (hdrlen > ppp->dev->hard_header_len) ppp->dev->hard_header_len = hdrlen; list_add_tail_rcu(&pch->clist, &ppp->channels); ++ppp->n_channels; rcu_assign_pointer(pch->ppp, ppp); refcount_inc(&ppp->file.refcnt); ppp_unlock(ppp); ret = 0; outl: spin_unlock(&pch->upl); out: mutex_unlock(&pn->all_ppp_mutex); return ret; } /* * Disconnect a channel from its ppp unit. */ static int ppp_disconnect_channel(struct channel *pch) { struct ppp *ppp; int err = -EINVAL; spin_lock(&pch->upl); ppp = rcu_replace_pointer(pch->ppp, NULL, lockdep_is_held(&pch->upl)); spin_unlock(&pch->upl); if (ppp) { /* remove it from the ppp unit's list */ ppp_lock(ppp); list_del_rcu(&pch->clist); if (--ppp->n_channels == 0) wake_up_interruptible(&ppp->file.rwait); ppp_unlock(ppp); synchronize_net(); if (refcount_dec_and_test(&ppp->file.refcnt)) ppp_destroy_interface(ppp); err = 0; } return err; } /* * Free up the resources used by a ppp channel. */ static void ppp_destroy_channel(struct channel *pch) { put_net_track(pch->chan_net, &pch->ns_tracker); pch->chan_net = NULL; atomic_dec(&channel_count); if (!pch->file.dead) { /* "can't happen" */ pr_err("ppp: destroying undead channel %p !\n", pch); return; } skb_queue_purge(&pch->file.xq); skb_queue_purge(&pch->file.rq); kfree(pch); } static void __exit ppp_cleanup(void) { /* should never happen */ if (atomic_read(&ppp_unit_count) || atomic_read(&channel_count)) pr_err("PPP: removing module but units remain!\n"); rtnl_link_unregister(&ppp_link_ops); unregister_chrdev(PPP_MAJOR, "ppp"); device_destroy(&ppp_class, MKDEV(PPP_MAJOR, 0)); class_unregister(&ppp_class); unregister_pernet_device(&ppp_net_ops); } /* * Units handling. Caller must protect concurrent access * by holding all_ppp_mutex */ /* associate pointer with specified number */ static int unit_set(struct idr *p, void *ptr, int n) { int unit; unit = idr_alloc(p, ptr, n, n + 1, GFP_KERNEL); if (unit == -ENOSPC) unit = -EINVAL; return unit; } /* get new free unit number and associate pointer with it */ static int unit_get(struct idr *p, void *ptr, int min) { return idr_alloc(p, ptr, min, 0, GFP_KERNEL); } /* put unit number back to a pool */ static void unit_put(struct idr *p, int n) { idr_remove(p, n); } /* get pointer associated with the number */ static void *unit_find(struct idr *p, int n) { return idr_find(p, n); } /* Module/initialization stuff */ module_init(ppp_init); module_exit(ppp_cleanup); EXPORT_SYMBOL(ppp_register_net_channel); EXPORT_SYMBOL(ppp_register_channel); EXPORT_SYMBOL(ppp_unregister_channel); EXPORT_SYMBOL(ppp_channel_index); EXPORT_SYMBOL(ppp_unit_number); EXPORT_SYMBOL(ppp_dev_name); EXPORT_SYMBOL(ppp_input); EXPORT_SYMBOL(ppp_input_error); EXPORT_SYMBOL(ppp_output_wakeup); EXPORT_SYMBOL(ppp_register_compressor); EXPORT_SYMBOL(ppp_unregister_compressor); MODULE_DESCRIPTION("Generic PPP layer driver"); MODULE_LICENSE("GPL"); MODULE_ALIAS_CHARDEV(PPP_MAJOR, 0); MODULE_ALIAS_RTNL_LINK("ppp"); MODULE_ALIAS("devname:ppp");
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * include/net/l3mdev.h - L3 master device API * Copyright (c) 2015 Cumulus Networks * Copyright (c) 2015 David Ahern <dsa@cumulusnetworks.com> */ #ifndef _NET_L3MDEV_H_ #define _NET_L3MDEV_H_ #include <net/dst.h> #include <net/fib_rules.h> enum l3mdev_type { L3MDEV_TYPE_UNSPEC, L3MDEV_TYPE_VRF, __L3MDEV_TYPE_MAX }; #define L3MDEV_TYPE_MAX (__L3MDEV_TYPE_MAX - 1) typedef int (*lookup_by_table_id_t)(struct net *net, u32 table_d); /** * struct l3mdev_ops - l3mdev operations * * @l3mdev_fib_table: Get FIB table id to use for lookups * * @l3mdev_l3_rcv: Hook in L3 receive path * * @l3mdev_l3_out: Hook in L3 output path * * @l3mdev_link_scope_lookup: IPv6 lookup for linklocal and mcast destinations */ struct l3mdev_ops { u32 (*l3mdev_fib_table)(const struct net_device *dev); struct sk_buff * (*l3mdev_l3_rcv)(struct net_device *dev, struct sk_buff *skb, u16 proto); struct sk_buff * (*l3mdev_l3_out)(struct net_device *dev, struct sock *sk, struct sk_buff *skb, u16 proto); /* IPv6 ops */ struct dst_entry * (*l3mdev_link_scope_lookup)(const struct net_device *dev, struct flowi6 *fl6); }; #ifdef CONFIG_NET_L3_MASTER_DEV int l3mdev_table_lookup_register(enum l3mdev_type l3type, lookup_by_table_id_t fn); void l3mdev_table_lookup_unregister(enum l3mdev_type l3type, lookup_by_table_id_t fn); int l3mdev_ifindex_lookup_by_table_id(enum l3mdev_type l3type, struct net *net, u32 table_id); int l3mdev_fib_rule_match(struct net *net, struct flowi *fl, struct fib_lookup_arg *arg); static inline bool l3mdev_fib_rule_iif_match(const struct flowi *fl, int iifindex) { return !(fl->flowi_flags & FLOWI_FLAG_L3MDEV_OIF) && fl->flowi_l3mdev == iifindex; } static inline bool l3mdev_fib_rule_oif_match(const struct flowi *fl, int oifindex) { return fl->flowi_flags & FLOWI_FLAG_L3MDEV_OIF && fl->flowi_l3mdev == oifindex; } void l3mdev_update_flow(struct net *net, struct flowi *fl); int l3mdev_master_ifindex_rcu(const struct net_device *dev); static inline int l3mdev_master_ifindex(struct net_device *dev) { int ifindex; rcu_read_lock(); ifindex = l3mdev_master_ifindex_rcu(dev); rcu_read_unlock(); return ifindex; } static inline int l3mdev_master_ifindex_by_index(struct net *net, int ifindex) { struct net_device *dev; int rc = 0; if (ifindex) { rcu_read_lock(); dev = dev_get_by_index_rcu(net, ifindex); if (dev) rc = l3mdev_master_ifindex_rcu(dev); rcu_read_unlock(); } return rc; } static inline struct net_device *l3mdev_master_dev_rcu(const struct net_device *_dev) { /* netdev_master_upper_dev_get_rcu calls * list_first_or_null_rcu to walk the upper dev list. * list_first_or_null_rcu does not handle a const arg. We aren't * making changes, just want the master device from that list so * typecast to remove the const */ struct net_device *dev = (struct net_device *)_dev; struct net_device *master; if (!dev) return NULL; if (netif_is_l3_master(dev)) master = dev; else if (netif_is_l3_slave(dev)) master = netdev_master_upper_dev_get_rcu(dev); else master = NULL; return master; } int l3mdev_master_upper_ifindex_by_index_rcu(struct net *net, int ifindex); static inline int l3mdev_master_upper_ifindex_by_index(struct net *net, int ifindex) { rcu_read_lock(); ifindex = l3mdev_master_upper_ifindex_by_index_rcu(net, ifindex); rcu_read_unlock(); return ifindex; } u32 l3mdev_fib_table_rcu(const struct net_device *dev); u32 l3mdev_fib_table_by_index(struct net *net, int ifindex); static inline u32 l3mdev_fib_table(const struct net_device *dev) { u32 tb_id; rcu_read_lock(); tb_id = l3mdev_fib_table_rcu(dev); rcu_read_unlock(); return tb_id; } static inline bool netif_index_is_l3_master(struct net *net, int ifindex) { struct net_device *dev; bool rc = false; if (ifindex == 0) return false; rcu_read_lock(); dev = dev_get_by_index_rcu(net, ifindex); if (dev) rc = netif_is_l3_master(dev); rcu_read_unlock(); return rc; } struct dst_entry *l3mdev_link_scope_lookup(struct net *net, struct flowi6 *fl6); static inline struct sk_buff *l3mdev_l3_rcv(struct sk_buff *skb, u16 proto) { struct net_device *master = NULL; if (netif_is_l3_slave(skb->dev)) master = netdev_master_upper_dev_get_rcu(skb->dev); else if (netif_is_l3_master(skb->dev) || netif_has_l3_rx_handler(skb->dev)) master = skb->dev; if (master && master->l3mdev_ops->l3mdev_l3_rcv) skb = master->l3mdev_ops->l3mdev_l3_rcv(master, skb, proto); return skb; } static inline struct sk_buff *l3mdev_ip_rcv(struct sk_buff *skb) { return l3mdev_l3_rcv(skb, AF_INET); } static inline struct sk_buff *l3mdev_ip6_rcv(struct sk_buff *skb) { return l3mdev_l3_rcv(skb, AF_INET6); } static inline struct sk_buff *l3mdev_l3_out(struct sock *sk, struct sk_buff *skb, u16 proto) { struct net_device *dev; rcu_read_lock(); dev = skb_dst_dev_rcu(skb); if (netif_is_l3_slave(dev)) { struct net_device *master; master = netdev_master_upper_dev_get_rcu(dev); if (master && master->l3mdev_ops->l3mdev_l3_out) skb = master->l3mdev_ops->l3mdev_l3_out(master, sk, skb, proto); } rcu_read_unlock(); return skb; } static inline struct sk_buff *l3mdev_ip_out(struct sock *sk, struct sk_buff *skb) { return l3mdev_l3_out(sk, skb, AF_INET); } static inline struct sk_buff *l3mdev_ip6_out(struct sock *sk, struct sk_buff *skb) { return l3mdev_l3_out(sk, skb, AF_INET6); } #else static inline int l3mdev_master_ifindex_rcu(const struct net_device *dev) { return 0; } static inline int l3mdev_master_ifindex(struct net_device *dev) { return 0; } static inline int l3mdev_master_ifindex_by_index(struct net *net, int ifindex) { return 0; } static inline int l3mdev_master_upper_ifindex_by_index_rcu(struct net *net, int ifindex) { return 0; } static inline int l3mdev_master_upper_ifindex_by_index(struct net *net, int ifindex) { return 0; } static inline struct net_device *l3mdev_master_dev_rcu(const struct net_device *dev) { return NULL; } static inline u32 l3mdev_fib_table_rcu(const struct net_device *dev) { return 0; } static inline u32 l3mdev_fib_table(const struct net_device *dev) { return 0; } static inline u32 l3mdev_fib_table_by_index(struct net *net, int ifindex) { return 0; } static inline bool netif_index_is_l3_master(struct net *net, int ifindex) { return false; } static inline struct dst_entry *l3mdev_link_scope_lookup(struct net *net, struct flowi6 *fl6) { return NULL; } static inline struct sk_buff *l3mdev_ip_rcv(struct sk_buff *skb) { return skb; } static inline struct sk_buff *l3mdev_ip6_rcv(struct sk_buff *skb) { return skb; } static inline struct sk_buff *l3mdev_ip_out(struct sock *sk, struct sk_buff *skb) { return skb; } static inline struct sk_buff *l3mdev_ip6_out(struct sock *sk, struct sk_buff *skb) { return skb; } static inline int l3mdev_table_lookup_register(enum l3mdev_type l3type, lookup_by_table_id_t fn) { return -EOPNOTSUPP; } static inline void l3mdev_table_lookup_unregister(enum l3mdev_type l3type, lookup_by_table_id_t fn) { } static inline int l3mdev_ifindex_lookup_by_table_id(enum l3mdev_type l3type, struct net *net, u32 table_id) { return -ENODEV; } static inline int l3mdev_fib_rule_match(struct net *net, struct flowi *fl, struct fib_lookup_arg *arg) { return 1; } static inline bool l3mdev_fib_rule_iif_match(const struct flowi *fl, int iifindex) { return false; } static inline bool l3mdev_fib_rule_oif_match(const struct flowi *fl, int oifindex) { return false; } static inline void l3mdev_update_flow(struct net *net, struct flowi *fl) { } #endif #endif /* _NET_L3MDEV_H_ */
4 4 4 3 4 4 4 4 4 4 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 // SPDX-License-Identifier: GPL-2.0-or-later /* * ip6_flowlabel.c IPv6 flowlabel manager. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> */ #include <linux/capability.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/in6.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/export.h> #include <linux/pid_namespace.h> #include <linux/jump_label_ratelimit.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/ipv6.h> #include <net/rawv6.h> #include <net/transp_v6.h> #include <linux/uaccess.h> #define FL_MIN_LINGER 6 /* Minimal linger. It is set to 6sec specified in old IPv6 RFC. Well, it was reasonable value. */ #define FL_MAX_LINGER 150 /* Maximal linger timeout */ /* FL hash table */ #define FL_MAX_PER_SOCK 32 #define FL_MAX_SIZE 4096 #define FL_HASH_MASK 255 #define FL_HASH(l) (ntohl(l)&FL_HASH_MASK) static atomic_t fl_size = ATOMIC_INIT(0); static struct ip6_flowlabel __rcu *fl_ht[FL_HASH_MASK+1]; static void ip6_fl_gc(struct timer_list *unused); static DEFINE_TIMER(ip6_fl_gc_timer, ip6_fl_gc); /* FL hash table lock: it protects only of GC */ static DEFINE_SPINLOCK(ip6_fl_lock); /* Big socket sock */ static DEFINE_SPINLOCK(ip6_sk_fl_lock); DEFINE_STATIC_KEY_DEFERRED_FALSE(ipv6_flowlabel_exclusive, HZ); EXPORT_SYMBOL(ipv6_flowlabel_exclusive); #define for_each_fl_rcu(hash, fl) \ for (fl = rcu_dereference(fl_ht[(hash)]); \ fl != NULL; \ fl = rcu_dereference(fl->next)) #define for_each_fl_continue_rcu(fl) \ for (fl = rcu_dereference(fl->next); \ fl != NULL; \ fl = rcu_dereference(fl->next)) #define for_each_sk_fl_rcu(sk, sfl) \ for (sfl = rcu_dereference(inet_sk(sk)->ipv6_fl_list); \ sfl != NULL; \ sfl = rcu_dereference(sfl->next)) static inline struct ip6_flowlabel *__fl_lookup(struct net *net, __be32 label) { struct ip6_flowlabel *fl; for_each_fl_rcu(FL_HASH(label), fl) { if (fl->label == label && net_eq(fl->fl_net, net)) return fl; } return NULL; } static struct ip6_flowlabel *fl_lookup(struct net *net, __be32 label) { struct ip6_flowlabel *fl; rcu_read_lock(); fl = __fl_lookup(net, label); if (fl && !atomic_inc_not_zero(&fl->users)) fl = NULL; rcu_read_unlock(); return fl; } static bool fl_shared_exclusive(struct ip6_flowlabel *fl) { return fl->share == IPV6_FL_S_EXCL || fl->share == IPV6_FL_S_PROCESS || fl->share == IPV6_FL_S_USER; } static void fl_free_rcu(struct rcu_head *head) { struct ip6_flowlabel *fl = container_of(head, struct ip6_flowlabel, rcu); if (fl->share == IPV6_FL_S_PROCESS) put_pid(fl->owner.pid); kfree(fl->opt); kfree(fl); } static void fl_free(struct ip6_flowlabel *fl) { if (!fl) return; if (fl_shared_exclusive(fl) || fl->opt) static_branch_slow_dec_deferred(&ipv6_flowlabel_exclusive); call_rcu(&fl->rcu, fl_free_rcu); } static void fl_release(struct ip6_flowlabel *fl) { spin_lock_bh(&ip6_fl_lock); fl->lastuse = jiffies; if (atomic_dec_and_test(&fl->users)) { unsigned long ttd = fl->lastuse + fl->linger; if (time_after(ttd, fl->expires)) fl->expires = ttd; ttd = fl->expires; if (!timer_pending(&ip6_fl_gc_timer) || time_after(ip6_fl_gc_timer.expires, ttd)) mod_timer(&ip6_fl_gc_timer, ttd); } spin_unlock_bh(&ip6_fl_lock); } static void ip6_fl_gc(struct timer_list *unused) { int i; unsigned long now = jiffies; unsigned long sched = 0; spin_lock(&ip6_fl_lock); for (i = 0; i <= FL_HASH_MASK; i++) { struct ip6_flowlabel *fl; struct ip6_flowlabel __rcu **flp; flp = &fl_ht[i]; while ((fl = rcu_dereference_protected(*flp, lockdep_is_held(&ip6_fl_lock))) != NULL) { if (atomic_read(&fl->users) == 0) { unsigned long ttd = fl->lastuse + fl->linger; if (time_after(ttd, fl->expires)) fl->expires = ttd; ttd = fl->expires; if (time_after_eq(now, ttd)) { *flp = fl->next; fl_free(fl); atomic_dec(&fl_size); continue; } if (!sched || time_before(ttd, sched)) sched = ttd; } flp = &fl->next; } } if (!sched && atomic_read(&fl_size)) sched = now + FL_MAX_LINGER; if (sched) { mod_timer(&ip6_fl_gc_timer, sched); } spin_unlock(&ip6_fl_lock); } static void __net_exit ip6_fl_purge(struct net *net) { int i; spin_lock_bh(&ip6_fl_lock); for (i = 0; i <= FL_HASH_MASK; i++) { struct ip6_flowlabel *fl; struct ip6_flowlabel __rcu **flp; flp = &fl_ht[i]; while ((fl = rcu_dereference_protected(*flp, lockdep_is_held(&ip6_fl_lock))) != NULL) { if (net_eq(fl->fl_net, net) && atomic_read(&fl->users) == 0) { *flp = fl->next; fl_free(fl); atomic_dec(&fl_size); continue; } flp = &fl->next; } } spin_unlock_bh(&ip6_fl_lock); } static struct ip6_flowlabel *fl_intern(struct net *net, struct ip6_flowlabel *fl, __be32 label) { struct ip6_flowlabel *lfl; fl->label = label & IPV6_FLOWLABEL_MASK; rcu_read_lock(); spin_lock_bh(&ip6_fl_lock); if (label == 0) { for (;;) { fl->label = htonl(get_random_u32())&IPV6_FLOWLABEL_MASK; if (fl->label) { lfl = __fl_lookup(net, fl->label); if (!lfl) break; } } } else { /* * we dropper the ip6_fl_lock, so this entry could reappear * and we need to recheck with it. * * OTOH no need to search the active socket first, like it is * done in ipv6_flowlabel_opt - sock is locked, so new entry * with the same label can only appear on another sock */ lfl = __fl_lookup(net, fl->label); if (lfl) { atomic_inc(&lfl->users); spin_unlock_bh(&ip6_fl_lock); rcu_read_unlock(); return lfl; } } fl->lastuse = jiffies; fl->next = fl_ht[FL_HASH(fl->label)]; rcu_assign_pointer(fl_ht[FL_HASH(fl->label)], fl); atomic_inc(&fl_size); spin_unlock_bh(&ip6_fl_lock); rcu_read_unlock(); return NULL; } /* Socket flowlabel lists */ struct ip6_flowlabel *__fl6_sock_lookup(struct sock *sk, __be32 label) { struct ipv6_fl_socklist *sfl; label &= IPV6_FLOWLABEL_MASK; rcu_read_lock(); for_each_sk_fl_rcu(sk, sfl) { struct ip6_flowlabel *fl = sfl->fl; if (fl->label == label && atomic_inc_not_zero(&fl->users)) { fl->lastuse = jiffies; rcu_read_unlock(); return fl; } } rcu_read_unlock(); return NULL; } EXPORT_SYMBOL_GPL(__fl6_sock_lookup); void fl6_free_socklist(struct sock *sk) { struct inet_sock *inet = inet_sk(sk); struct ipv6_fl_socklist *sfl; if (!rcu_access_pointer(inet->ipv6_fl_list)) return; spin_lock_bh(&ip6_sk_fl_lock); while ((sfl = rcu_dereference_protected(inet->ipv6_fl_list, lockdep_is_held(&ip6_sk_fl_lock))) != NULL) { inet->ipv6_fl_list = sfl->next; spin_unlock_bh(&ip6_sk_fl_lock); fl_release(sfl->fl); kfree_rcu(sfl, rcu); spin_lock_bh(&ip6_sk_fl_lock); } spin_unlock_bh(&ip6_sk_fl_lock); } /* Service routines */ /* It is the only difficult place. flowlabel enforces equal headers before and including routing header, however user may supply options following rthdr. */ struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions *opt_space, struct ip6_flowlabel *fl, struct ipv6_txoptions *fopt) { struct ipv6_txoptions *fl_opt = fl->opt; if (!fopt || fopt->opt_flen == 0) return fl_opt; if (fl_opt) { opt_space->hopopt = fl_opt->hopopt; opt_space->dst0opt = fl_opt->dst0opt; opt_space->srcrt = fl_opt->srcrt; opt_space->opt_nflen = fl_opt->opt_nflen; } else { if (fopt->opt_nflen == 0) return fopt; opt_space->hopopt = NULL; opt_space->dst0opt = NULL; opt_space->srcrt = NULL; opt_space->opt_nflen = 0; } opt_space->dst1opt = fopt->dst1opt; opt_space->opt_flen = fopt->opt_flen; opt_space->tot_len = fopt->tot_len; return opt_space; } EXPORT_SYMBOL_GPL(fl6_merge_options); static unsigned long check_linger(unsigned long ttl) { if (ttl < FL_MIN_LINGER) return FL_MIN_LINGER*HZ; if (ttl > FL_MAX_LINGER && !capable(CAP_NET_ADMIN)) return 0; return ttl*HZ; } static int fl6_renew(struct ip6_flowlabel *fl, unsigned long linger, unsigned long expires) { linger = check_linger(linger); if (!linger) return -EPERM; expires = check_linger(expires); if (!expires) return -EPERM; spin_lock_bh(&ip6_fl_lock); fl->lastuse = jiffies; if (time_before(fl->linger, linger)) fl->linger = linger; if (time_before(expires, fl->linger)) expires = fl->linger; if (time_before(fl->expires, fl->lastuse + expires)) fl->expires = fl->lastuse + expires; spin_unlock_bh(&ip6_fl_lock); return 0; } static struct ip6_flowlabel * fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq, sockptr_t optval, int optlen, int *err_p) { struct ip6_flowlabel *fl = NULL; int olen; int addr_type; int err; olen = optlen - CMSG_ALIGN(sizeof(*freq)); err = -EINVAL; if (olen > 64 * 1024) goto done; err = -ENOMEM; fl = kzalloc_obj(*fl); if (!fl) goto done; if (olen > 0) { struct msghdr msg; struct flowi6 flowi6; struct ipcm6_cookie ipc6; err = -ENOMEM; fl->opt = kmalloc(sizeof(*fl->opt) + olen, GFP_KERNEL); if (!fl->opt) goto done; memset(fl->opt, 0, sizeof(*fl->opt)); fl->opt->tot_len = sizeof(*fl->opt) + olen; err = -EFAULT; if (copy_from_sockptr_offset(fl->opt + 1, optval, CMSG_ALIGN(sizeof(*freq)), olen)) goto done; msg.msg_controllen = olen; msg.msg_control = (void *)(fl->opt+1); memset(&flowi6, 0, sizeof(flowi6)); ipc6.opt = fl->opt; err = ip6_datagram_send_ctl(net, sk, &msg, &flowi6, &ipc6); if (err) goto done; err = -EINVAL; if (fl->opt->opt_flen) goto done; if (fl->opt->opt_nflen == 0) { kfree(fl->opt); fl->opt = NULL; } } fl->fl_net = net; fl->expires = jiffies; err = fl6_renew(fl, freq->flr_linger, freq->flr_expires); if (err) goto done; fl->share = freq->flr_share; addr_type = ipv6_addr_type(&freq->flr_dst); if ((addr_type & IPV6_ADDR_MAPPED) || addr_type == IPV6_ADDR_ANY) { err = -EINVAL; goto done; } fl->dst = freq->flr_dst; atomic_set(&fl->users, 1); switch (fl->share) { case IPV6_FL_S_EXCL: case IPV6_FL_S_ANY: break; case IPV6_FL_S_PROCESS: fl->owner.pid = get_task_pid(current, PIDTYPE_PID); break; case IPV6_FL_S_USER: fl->owner.uid = current_euid(); break; default: err = -EINVAL; goto done; } if (fl_shared_exclusive(fl) || fl->opt) { WRITE_ONCE(sock_net(sk)->ipv6.flowlabel_has_excl, 1); static_branch_deferred_inc(&ipv6_flowlabel_exclusive); } return fl; done: if (fl) { kfree(fl->opt); kfree(fl); } *err_p = err; return NULL; } static int mem_check(struct sock *sk) { int room = FL_MAX_SIZE - atomic_read(&fl_size); struct ipv6_fl_socklist *sfl; int count = 0; if (room > FL_MAX_SIZE - FL_MAX_PER_SOCK) return 0; rcu_read_lock(); for_each_sk_fl_rcu(sk, sfl) count++; rcu_read_unlock(); if (room <= 0 || ((count >= FL_MAX_PER_SOCK || (count > 0 && room < FL_MAX_SIZE/2) || room < FL_MAX_SIZE/4) && !capable(CAP_NET_ADMIN))) return -ENOBUFS; return 0; } static inline void fl_link(struct sock *sk, struct ipv6_fl_socklist *sfl, struct ip6_flowlabel *fl) { struct inet_sock *inet = inet_sk(sk); spin_lock_bh(&ip6_sk_fl_lock); sfl->fl = fl; sfl->next = inet->ipv6_fl_list; rcu_assign_pointer(inet->ipv6_fl_list, sfl); spin_unlock_bh(&ip6_sk_fl_lock); } int ipv6_flowlabel_opt_get(struct sock *sk, struct in6_flowlabel_req *freq, int flags) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_fl_socklist *sfl; if (flags & IPV6_FL_F_REMOTE) { freq->flr_label = np->rcv_flowinfo & IPV6_FLOWLABEL_MASK; return 0; } if (inet6_test_bit(REPFLOW, sk)) { freq->flr_label = np->flow_label; return 0; } rcu_read_lock(); for_each_sk_fl_rcu(sk, sfl) { if (sfl->fl->label == (np->flow_label & IPV6_FLOWLABEL_MASK)) { spin_lock_bh(&ip6_fl_lock); freq->flr_label = sfl->fl->label; freq->flr_dst = sfl->fl->dst; freq->flr_share = sfl->fl->share; freq->flr_expires = (sfl->fl->expires - jiffies) / HZ; freq->flr_linger = sfl->fl->linger / HZ; spin_unlock_bh(&ip6_fl_lock); rcu_read_unlock(); return 0; } } rcu_read_unlock(); return -ENOENT; } #define socklist_dereference(__sflp) \ rcu_dereference_protected(__sflp, lockdep_is_held(&ip6_sk_fl_lock)) static int ipv6_flowlabel_put(struct sock *sk, struct in6_flowlabel_req *freq) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_fl_socklist __rcu **sflp; struct ipv6_fl_socklist *sfl; if (freq->flr_flags & IPV6_FL_F_REFLECT) { if (sk->sk_protocol != IPPROTO_TCP) return -ENOPROTOOPT; if (!inet6_test_bit(REPFLOW, sk)) return -ESRCH; np->flow_label = 0; inet6_clear_bit(REPFLOW, sk); return 0; } spin_lock_bh(&ip6_sk_fl_lock); for (sflp = &inet_sk(sk)->ipv6_fl_list; (sfl = socklist_dereference(*sflp)) != NULL; sflp = &sfl->next) { if (sfl->fl->label == freq->flr_label) goto found; } spin_unlock_bh(&ip6_sk_fl_lock); return -ESRCH; found: if (freq->flr_label == (np->flow_label & IPV6_FLOWLABEL_MASK)) np->flow_label &= ~IPV6_FLOWLABEL_MASK; *sflp = sfl->next; spin_unlock_bh(&ip6_sk_fl_lock); fl_release(sfl->fl); kfree_rcu(sfl, rcu); return 0; } static int ipv6_flowlabel_renew(struct sock *sk, struct in6_flowlabel_req *freq) { struct net *net = sock_net(sk); struct ipv6_fl_socklist *sfl; int err; rcu_read_lock(); for_each_sk_fl_rcu(sk, sfl) { if (sfl->fl->label == freq->flr_label) { err = fl6_renew(sfl->fl, freq->flr_linger, freq->flr_expires); rcu_read_unlock(); return err; } } rcu_read_unlock(); if (freq->flr_share == IPV6_FL_S_NONE && ns_capable(net->user_ns, CAP_NET_ADMIN)) { struct ip6_flowlabel *fl = fl_lookup(net, freq->flr_label); if (fl) { err = fl6_renew(fl, freq->flr_linger, freq->flr_expires); fl_release(fl); return err; } } return -ESRCH; } static int ipv6_flowlabel_get(struct sock *sk, struct in6_flowlabel_req *freq, sockptr_t optval, int optlen) { struct ipv6_fl_socklist *sfl, *sfl1 = NULL; struct ip6_flowlabel *fl, *fl1 = NULL; struct net *net = sock_net(sk); int err; if (freq->flr_flags & IPV6_FL_F_REFLECT) { if (net->ipv6.sysctl.flowlabel_consistency) { net_info_ratelimited("Can not set IPV6_FL_F_REFLECT if flowlabel_consistency sysctl is enable\n"); return -EPERM; } if (sk->sk_protocol != IPPROTO_TCP) return -ENOPROTOOPT; inet6_set_bit(REPFLOW, sk); return 0; } if (freq->flr_label & ~IPV6_FLOWLABEL_MASK) return -EINVAL; if (net->ipv6.sysctl.flowlabel_state_ranges && (freq->flr_label & IPV6_FLOWLABEL_STATELESS_FLAG)) return -ERANGE; fl = fl_create(net, sk, freq, optval, optlen, &err); if (!fl) return err; sfl1 = kmalloc_obj(*sfl1); if (freq->flr_label) { err = -EEXIST; rcu_read_lock(); for_each_sk_fl_rcu(sk, sfl) { if (sfl->fl->label == freq->flr_label) { if (freq->flr_flags & IPV6_FL_F_EXCL) { rcu_read_unlock(); goto done; } fl1 = sfl->fl; if (!atomic_inc_not_zero(&fl1->users)) fl1 = NULL; break; } } rcu_read_unlock(); if (!fl1) fl1 = fl_lookup(net, freq->flr_label); if (fl1) { recheck: err = -EEXIST; if (freq->flr_flags&IPV6_FL_F_EXCL) goto release; err = -EPERM; if (fl1->share == IPV6_FL_S_EXCL || fl1->share != fl->share || ((fl1->share == IPV6_FL_S_PROCESS) && (fl1->owner.pid != fl->owner.pid)) || ((fl1->share == IPV6_FL_S_USER) && !uid_eq(fl1->owner.uid, fl->owner.uid))) goto release; err = -ENOMEM; if (!sfl1) goto release; if (fl->linger > fl1->linger) fl1->linger = fl->linger; if ((long)(fl->expires - fl1->expires) > 0) fl1->expires = fl->expires; fl_link(sk, sfl1, fl1); fl_free(fl); return 0; release: fl_release(fl1); goto done; } } err = -ENOENT; if (!(freq->flr_flags & IPV6_FL_F_CREATE)) goto done; err = -ENOMEM; if (!sfl1) goto done; err = mem_check(sk); if (err != 0) goto done; fl1 = fl_intern(net, fl, freq->flr_label); if (fl1) goto recheck; if (!freq->flr_label) { size_t offset = offsetof(struct in6_flowlabel_req, flr_label); if (copy_to_sockptr_offset(optval, offset, &fl->label, sizeof(fl->label))) { /* Intentionally ignore fault. */ } } fl_link(sk, sfl1, fl); return 0; done: fl_free(fl); kfree(sfl1); return err; } int ipv6_flowlabel_opt(struct sock *sk, sockptr_t optval, int optlen) { struct in6_flowlabel_req freq; if (optlen < sizeof(freq)) return -EINVAL; if (copy_from_sockptr(&freq, optval, sizeof(freq))) return -EFAULT; switch (freq.flr_action) { case IPV6_FL_A_PUT: return ipv6_flowlabel_put(sk, &freq); case IPV6_FL_A_RENEW: return ipv6_flowlabel_renew(sk, &freq); case IPV6_FL_A_GET: return ipv6_flowlabel_get(sk, &freq, optval, optlen); default: return -EINVAL; } } #ifdef CONFIG_PROC_FS struct ip6fl_iter_state { struct seq_net_private p; struct pid_namespace *pid_ns; int bucket; }; #define ip6fl_seq_private(seq) ((struct ip6fl_iter_state *)(seq)->private) static struct ip6_flowlabel *ip6fl_get_first(struct seq_file *seq) { struct ip6_flowlabel *fl = NULL; struct ip6fl_iter_state *state = ip6fl_seq_private(seq); struct net *net = seq_file_net(seq); for (state->bucket = 0; state->bucket <= FL_HASH_MASK; ++state->bucket) { for_each_fl_rcu(state->bucket, fl) { if (net_eq(fl->fl_net, net)) goto out; } } fl = NULL; out: return fl; } static struct ip6_flowlabel *ip6fl_get_next(struct seq_file *seq, struct ip6_flowlabel *fl) { struct ip6fl_iter_state *state = ip6fl_seq_private(seq); struct net *net = seq_file_net(seq); for_each_fl_continue_rcu(fl) { if (net_eq(fl->fl_net, net)) goto out; } try_again: if (++state->bucket <= FL_HASH_MASK) { for_each_fl_rcu(state->bucket, fl) { if (net_eq(fl->fl_net, net)) goto out; } goto try_again; } fl = NULL; out: return fl; } static struct ip6_flowlabel *ip6fl_get_idx(struct seq_file *seq, loff_t pos) { struct ip6_flowlabel *fl = ip6fl_get_first(seq); if (fl) while (pos && (fl = ip6fl_get_next(seq, fl)) != NULL) --pos; return pos ? NULL : fl; } static void *ip6fl_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { struct ip6fl_iter_state *state = ip6fl_seq_private(seq); state->pid_ns = proc_pid_ns(file_inode(seq->file)->i_sb); rcu_read_lock(); return *pos ? ip6fl_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; } static void *ip6fl_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct ip6_flowlabel *fl; if (v == SEQ_START_TOKEN) fl = ip6fl_get_first(seq); else fl = ip6fl_get_next(seq, v); ++*pos; return fl; } static void ip6fl_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { rcu_read_unlock(); } static int ip6fl_seq_show(struct seq_file *seq, void *v) { struct ip6fl_iter_state *state = ip6fl_seq_private(seq); if (v == SEQ_START_TOKEN) { seq_puts(seq, "Label S Owner Users Linger Expires Dst Opt\n"); } else { struct ip6_flowlabel *fl = v; seq_printf(seq, "%05X %-1d %-6d %-6d %-6ld %-8ld %pi6 %-4d\n", (unsigned int)ntohl(fl->label), fl->share, ((fl->share == IPV6_FL_S_PROCESS) ? pid_nr_ns(fl->owner.pid, state->pid_ns) : ((fl->share == IPV6_FL_S_USER) ? from_kuid_munged(seq_user_ns(seq), fl->owner.uid) : 0)), atomic_read(&fl->users), fl->linger/HZ, (long)(fl->expires - jiffies)/HZ, &fl->dst, fl->opt ? fl->opt->opt_nflen : 0); } return 0; } static const struct seq_operations ip6fl_seq_ops = { .start = ip6fl_seq_start, .next = ip6fl_seq_next, .stop = ip6fl_seq_stop, .show = ip6fl_seq_show, }; static int __net_init ip6_flowlabel_proc_init(struct net *net) { if (!proc_create_net("ip6_flowlabel", 0444, net->proc_net, &ip6fl_seq_ops, sizeof(struct ip6fl_iter_state))) return -ENOMEM; return 0; } static void __net_exit ip6_flowlabel_proc_fini(struct net *net) { remove_proc_entry("ip6_flowlabel", net->proc_net); } #else static inline int ip6_flowlabel_proc_init(struct net *net) { return 0; } static inline void ip6_flowlabel_proc_fini(struct net *net) { } #endif static void __net_exit ip6_flowlabel_net_exit(struct net *net) { ip6_fl_purge(net); ip6_flowlabel_proc_fini(net); } static struct pernet_operations ip6_flowlabel_net_ops = { .init = ip6_flowlabel_proc_init, .exit = ip6_flowlabel_net_exit, }; int ip6_flowlabel_init(void) { return register_pernet_subsys(&ip6_flowlabel_net_ops); } void ip6_flowlabel_cleanup(void) { static_key_deferred_flush(&ipv6_flowlabel_exclusive); timer_delete(&ip6_fl_gc_timer); unregister_pernet_subsys(&ip6_flowlabel_net_ops); }
11 2 2 11 11 11 11 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 // SPDX-License-Identifier: GPL-2.0 /* * Key setup for v1 encryption policies * * Copyright 2015, 2019 Google LLC */ /* * This file implements compatibility functions for the original encryption * policy version ("v1"), including: * * - Deriving per-file encryption keys using the AES-128-ECB based KDF * (rather than the new method of using HKDF-SHA512) * * - Retrieving fscrypt master keys from process-subscribed keyrings * (rather than the new method of using a filesystem-level keyring) * * - Handling policies with the DIRECT_KEY flag set using a master key table * (rather than the new method of implementing DIRECT_KEY with per-mode keys * managed alongside the master keys in the filesystem-level keyring) */ #include <crypto/skcipher.h> #include <crypto/utils.h> #include <keys/user-type.h> #include <linux/hashtable.h> #include <linux/scatterlist.h> #include "fscrypt_private.h" /* Table of keys referenced by DIRECT_KEY policies */ static DEFINE_HASHTABLE(fscrypt_direct_keys, 6); /* 6 bits = 64 buckets */ static DEFINE_SPINLOCK(fscrypt_direct_keys_lock); /* * v1 key derivation function. This generates the derived key by encrypting the * master key with AES-128-ECB using the nonce as the AES key. This provides a * unique derived key with sufficient entropy for each inode. However, it's * nonstandard, non-extensible, doesn't evenly distribute the entropy from the * master key, and is trivially reversible: an attacker who compromises a * derived key can "decrypt" it to get back to the master key, then derive any * other key. For all new code, use HKDF instead. * * The master key must be at least as long as the derived key. If the master * key is longer, then only the first 'derived_keysize' bytes are used. */ static int derive_key_aes(const u8 *master_key, const u8 nonce[FSCRYPT_FILE_NONCE_SIZE], u8 *derived_key, unsigned int derived_keysize) { struct crypto_sync_skcipher *tfm; int err; tfm = crypto_alloc_sync_skcipher("ecb(aes)", 0, FSCRYPT_CRYPTOAPI_MASK); if (IS_ERR(tfm)) return PTR_ERR(tfm); err = crypto_sync_skcipher_setkey(tfm, nonce, FSCRYPT_FILE_NONCE_SIZE); if (err == 0) { SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); struct scatterlist src_sg, dst_sg; skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL); sg_init_one(&src_sg, master_key, derived_keysize); sg_init_one(&dst_sg, derived_key, derived_keysize); skcipher_request_set_crypt(req, &src_sg, &dst_sg, derived_keysize, NULL); err = crypto_skcipher_encrypt(req); } crypto_free_sync_skcipher(tfm); return err; } /* * Search the current task's subscribed keyrings for a "logon" key with * description prefix:descriptor, and if found acquire a read lock on it and * return a pointer to its validated payload in *payload_ret. */ static struct key * find_and_lock_process_key(const char *prefix, const u8 descriptor[FSCRYPT_KEY_DESCRIPTOR_SIZE], unsigned int min_keysize, const struct fscrypt_key **payload_ret) { char *description; struct key *key; const struct user_key_payload *ukp; const struct fscrypt_key *payload; description = kasprintf(GFP_KERNEL, "%s%*phN", prefix, FSCRYPT_KEY_DESCRIPTOR_SIZE, descriptor); if (!description) return ERR_PTR(-ENOMEM); key = request_key(&key_type_logon, description, NULL); kfree(description); if (IS_ERR(key)) return key; down_read(&key->sem); ukp = user_key_payload_locked(key); if (!ukp) /* was the key revoked before we acquired its semaphore? */ goto invalid; payload = (const struct fscrypt_key *)ukp->data; if (ukp->datalen != sizeof(struct fscrypt_key) || payload->size < 1 || payload->size > sizeof(payload->raw)) { fscrypt_warn(NULL, "key with description '%s' has invalid payload", key->description); goto invalid; } if (payload->size < min_keysize) { fscrypt_warn(NULL, "key with description '%s' is too short (got %u bytes, need %u+ bytes)", key->description, payload->size, min_keysize); goto invalid; } *payload_ret = payload; return key; invalid: up_read(&key->sem); key_put(key); return ERR_PTR(-ENOKEY); } /* Master key referenced by DIRECT_KEY policy */ struct fscrypt_direct_key { struct super_block *dk_sb; struct hlist_node dk_node; refcount_t dk_refcount; const struct fscrypt_mode *dk_mode; struct fscrypt_prepared_key dk_key; u8 dk_descriptor[FSCRYPT_KEY_DESCRIPTOR_SIZE]; u8 dk_raw[FSCRYPT_MAX_RAW_KEY_SIZE]; }; static void free_direct_key(struct fscrypt_direct_key *dk) { if (dk) { fscrypt_destroy_prepared_key(dk->dk_sb, &dk->dk_key); kfree_sensitive(dk); } } void fscrypt_put_direct_key(struct fscrypt_direct_key *dk) { if (!refcount_dec_and_lock(&dk->dk_refcount, &fscrypt_direct_keys_lock)) return; hash_del(&dk->dk_node); spin_unlock(&fscrypt_direct_keys_lock); free_direct_key(dk); } /* * Find/insert the given key into the fscrypt_direct_keys table. If found, it * is returned with elevated refcount, and 'to_insert' is freed if non-NULL. If * not found, 'to_insert' is inserted and returned if it's non-NULL; otherwise * NULL is returned. */ static struct fscrypt_direct_key * find_or_insert_direct_key(struct fscrypt_direct_key *to_insert, const u8 *raw_key, const struct fscrypt_inode_info *ci) { unsigned long hash_key; struct fscrypt_direct_key *dk; /* * Careful: to avoid potentially leaking secret key bytes via timing * information, we must key the hash table by descriptor rather than by * raw key, and use crypto_memneq() when comparing raw keys. */ BUILD_BUG_ON(sizeof(hash_key) > FSCRYPT_KEY_DESCRIPTOR_SIZE); memcpy(&hash_key, ci->ci_policy.v1.master_key_descriptor, sizeof(hash_key)); spin_lock(&fscrypt_direct_keys_lock); hash_for_each_possible(fscrypt_direct_keys, dk, dk_node, hash_key) { if (memcmp(ci->ci_policy.v1.master_key_descriptor, dk->dk_descriptor, FSCRYPT_KEY_DESCRIPTOR_SIZE) != 0) continue; if (ci->ci_mode != dk->dk_mode) continue; if (!fscrypt_is_key_prepared(&dk->dk_key, ci)) continue; if (crypto_memneq(raw_key, dk->dk_raw, ci->ci_mode->keysize)) continue; /* using existing tfm with same (descriptor, mode, raw_key) */ refcount_inc(&dk->dk_refcount); spin_unlock(&fscrypt_direct_keys_lock); free_direct_key(to_insert); return dk; } if (to_insert) hash_add(fscrypt_direct_keys, &to_insert->dk_node, hash_key); spin_unlock(&fscrypt_direct_keys_lock); return to_insert; } /* Prepare to encrypt directly using the master key in the given mode */ static struct fscrypt_direct_key * fscrypt_get_direct_key(const struct fscrypt_inode_info *ci, const u8 *raw_key) { struct fscrypt_direct_key *dk; int err; /* Is there already a tfm for this key? */ dk = find_or_insert_direct_key(NULL, raw_key, ci); if (dk) return dk; /* Nope, allocate one. */ dk = kzalloc_obj(*dk); if (!dk) return ERR_PTR(-ENOMEM); dk->dk_sb = ci->ci_inode->i_sb; refcount_set(&dk->dk_refcount, 1); dk->dk_mode = ci->ci_mode; err = fscrypt_prepare_key(&dk->dk_key, raw_key, ci); if (err) goto err_free_dk; memcpy(dk->dk_descriptor, ci->ci_policy.v1.master_key_descriptor, FSCRYPT_KEY_DESCRIPTOR_SIZE); memcpy(dk->dk_raw, raw_key, ci->ci_mode->keysize); return find_or_insert_direct_key(dk, raw_key, ci); err_free_dk: free_direct_key(dk); return ERR_PTR(err); } /* v1 policy, DIRECT_KEY: use the master key directly */ static int setup_v1_file_key_direct(struct fscrypt_inode_info *ci, const u8 *raw_master_key) { struct fscrypt_direct_key *dk; dk = fscrypt_get_direct_key(ci, raw_master_key); if (IS_ERR(dk)) return PTR_ERR(dk); ci->ci_direct_key = dk; ci->ci_enc_key = dk->dk_key; return 0; } /* v1 policy, !DIRECT_KEY: derive the file's encryption key */ static int setup_v1_file_key_derived(struct fscrypt_inode_info *ci, const u8 *raw_master_key) { u8 *derived_key; int err; /* * This cannot be a stack buffer because it will be passed to the * scatterlist crypto API during derive_key_aes(). */ derived_key = kmalloc(ci->ci_mode->keysize, GFP_KERNEL); if (!derived_key) return -ENOMEM; err = derive_key_aes(raw_master_key, ci->ci_nonce, derived_key, ci->ci_mode->keysize); if (err) goto out; err = fscrypt_set_per_file_enc_key(ci, derived_key); out: kfree_sensitive(derived_key); return err; } int fscrypt_setup_v1_file_key(struct fscrypt_inode_info *ci, const u8 *raw_master_key) { if (ci->ci_policy.v1.flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) return setup_v1_file_key_direct(ci, raw_master_key); else return setup_v1_file_key_derived(ci, raw_master_key); } int fscrypt_setup_v1_file_key_via_subscribed_keyrings(struct fscrypt_inode_info *ci) { const struct super_block *sb = ci->ci_inode->i_sb; struct key *key; const struct fscrypt_key *payload; int err; key = find_and_lock_process_key(FSCRYPT_KEY_DESC_PREFIX, ci->ci_policy.v1.master_key_descriptor, ci->ci_mode->keysize, &payload); if (key == ERR_PTR(-ENOKEY) && sb->s_cop->legacy_key_prefix) { key = find_and_lock_process_key(sb->s_cop->legacy_key_prefix, ci->ci_policy.v1.master_key_descriptor, ci->ci_mode->keysize, &payload); } if (IS_ERR(key)) return PTR_ERR(key); err = fscrypt_setup_v1_file_key(ci, payload->raw); up_read(&key->sem); key_put(key); return err; }
3 3 3 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 // SPDX-License-Identifier: GPL-2.0-only #include <linux/types.h> #include <linux/skbuff.h> #include <linux/socket.h> #include <linux/sysctl.h> #include <linux/net.h> #include <linux/module.h> #include <linux/if_arp.h> #include <linux/ipv6.h> #include <linux/mpls.h> #include <linux/netconf.h> #include <linux/nospec.h> #include <linux/vmalloc.h> #include <linux/percpu.h> #include <net/gso.h> #include <net/ip.h> #include <net/dst.h> #include <net/sock.h> #include <net/arp.h> #include <net/ip_fib.h> #include <net/netevent.h> #include <net/ip_tunnels.h> #include <net/netns/generic.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/ipv6.h> #endif #include <net/ipv6_stubs.h> #include <net/rtnh.h> #include "internal.h" /* max memory we will use for mpls_route */ #define MAX_MPLS_ROUTE_MEM 4096 /* Maximum number of labels to look ahead at when selecting a path of * a multipath route */ #define MAX_MP_SELECT_LABELS 4 #define MPLS_NEIGH_TABLE_UNSPEC (NEIGH_LINK_TABLE + 1) static int label_limit = (1 << 20) - 1; static int ttl_max = 255; #if IS_ENABLED(CONFIG_NET_IP_TUNNEL) static size_t ipgre_mpls_encap_hlen(struct ip_tunnel_encap *e) { return sizeof(struct mpls_shim_hdr); } static const struct ip_tunnel_encap_ops mpls_iptun_ops = { .encap_hlen = ipgre_mpls_encap_hlen, }; static int ipgre_tunnel_encap_add_mpls_ops(void) { return ip_tunnel_encap_add_ops(&mpls_iptun_ops, TUNNEL_ENCAP_MPLS); } static void ipgre_tunnel_encap_del_mpls_ops(void) { ip_tunnel_encap_del_ops(&mpls_iptun_ops, TUNNEL_ENCAP_MPLS); } #else static int ipgre_tunnel_encap_add_mpls_ops(void) { return 0; } static void ipgre_tunnel_encap_del_mpls_ops(void) { } #endif static void rtmsg_lfib(int event, u32 label, struct mpls_route *rt, struct nlmsghdr *nlh, struct net *net, u32 portid, unsigned int nlm_flags); static struct mpls_route *mpls_route_input(struct net *net, unsigned int index) { struct mpls_route __rcu **platform_label; platform_label = mpls_dereference(net, net->mpls.platform_label); return mpls_dereference(net, platform_label[index]); } static struct mpls_route __rcu **mpls_platform_label_rcu(struct net *net, size_t *platform_labels) { struct mpls_route __rcu **platform_label; unsigned int sequence; do { sequence = read_seqcount_begin(&net->mpls.platform_label_seq); platform_label = rcu_dereference(net->mpls.platform_label); *platform_labels = net->mpls.platform_labels; } while (read_seqcount_retry(&net->mpls.platform_label_seq, sequence)); return platform_label; } static struct mpls_route *mpls_route_input_rcu(struct net *net, unsigned int index) { struct mpls_route __rcu **platform_label; size_t platform_labels; platform_label = mpls_platform_label_rcu(net, &platform_labels); if (index >= platform_labels) return NULL; return rcu_dereference(platform_label[index]); } bool mpls_output_possible(const struct net_device *dev) { return dev && (dev->flags & IFF_UP) && netif_carrier_ok(dev); } EXPORT_SYMBOL_GPL(mpls_output_possible); static u8 *__mpls_nh_via(struct mpls_route *rt, struct mpls_nh *nh) { return (u8 *)nh + rt->rt_via_offset; } static const u8 *mpls_nh_via(const struct mpls_route *rt, const struct mpls_nh *nh) { return __mpls_nh_via((struct mpls_route *)rt, (struct mpls_nh *)nh); } static unsigned int mpls_nh_header_size(const struct mpls_nh *nh) { /* The size of the layer 2.5 labels to be added for this route */ return nh->nh_labels * sizeof(struct mpls_shim_hdr); } unsigned int mpls_dev_mtu(const struct net_device *dev) { /* The amount of data the layer 2 frame can hold */ return dev->mtu; } EXPORT_SYMBOL_GPL(mpls_dev_mtu); bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) { if (skb->len <= mtu) return false; if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu)) return false; return true; } EXPORT_SYMBOL_GPL(mpls_pkt_too_big); void mpls_stats_inc_outucastpkts(struct net *net, struct net_device *dev, const struct sk_buff *skb) { struct mpls_dev *mdev; if (skb->protocol == htons(ETH_P_MPLS_UC)) { mdev = mpls_dev_rcu(dev); if (mdev) MPLS_INC_STATS_LEN(mdev, skb->len, tx_packets, tx_bytes); } else if (skb->protocol == htons(ETH_P_IP)) { IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len); #if IS_ENABLED(CONFIG_IPV6) } else if (skb->protocol == htons(ETH_P_IPV6)) { struct inet6_dev *in6dev = in6_dev_rcu(dev); if (in6dev) IP6_UPD_PO_STATS(net, in6dev, IPSTATS_MIB_OUT, skb->len); #endif } } EXPORT_SYMBOL_GPL(mpls_stats_inc_outucastpkts); static u32 mpls_multipath_hash(struct mpls_route *rt, struct sk_buff *skb) { struct mpls_entry_decoded dec; unsigned int mpls_hdr_len = 0; struct mpls_shim_hdr *hdr; bool eli_seen = false; int label_index; u32 hash = 0; for (label_index = 0; label_index < MAX_MP_SELECT_LABELS; label_index++) { mpls_hdr_len += sizeof(*hdr); if (!pskb_may_pull(skb, mpls_hdr_len)) break; /* Read and decode the current label */ hdr = mpls_hdr(skb) + label_index; dec = mpls_entry_decode(hdr); /* RFC6790 - reserved labels MUST NOT be used as keys * for the load-balancing function */ if (likely(dec.label >= MPLS_LABEL_FIRST_UNRESERVED)) { hash = jhash_1word(dec.label, hash); /* The entropy label follows the entropy label * indicator, so this means that the entropy * label was just added to the hash - no need to * go any deeper either in the label stack or in the * payload */ if (eli_seen) break; } else if (dec.label == MPLS_LABEL_ENTROPY) { eli_seen = true; } if (!dec.bos) continue; /* found bottom label; does skb have room for a header? */ if (pskb_may_pull(skb, mpls_hdr_len + sizeof(struct iphdr))) { const struct iphdr *v4hdr; v4hdr = (const struct iphdr *)(hdr + 1); if (v4hdr->version == 4) { hash = jhash_3words(ntohl(v4hdr->saddr), ntohl(v4hdr->daddr), v4hdr->protocol, hash); } else if (v4hdr->version == 6 && pskb_may_pull(skb, mpls_hdr_len + sizeof(struct ipv6hdr))) { const struct ipv6hdr *v6hdr; v6hdr = (const struct ipv6hdr *)(hdr + 1); hash = __ipv6_addr_jhash(&v6hdr->saddr, hash); hash = __ipv6_addr_jhash(&v6hdr->daddr, hash); hash = jhash_1word(v6hdr->nexthdr, hash); } } break; } return hash; } static struct mpls_nh *mpls_get_nexthop(struct mpls_route *rt, u8 index) { return (struct mpls_nh *)((u8 *)rt->rt_nh + index * rt->rt_nh_size); } /* number of alive nexthops (rt->rt_nhn_alive) and the flags for * a next hop (nh->nh_flags) are modified by netdev event handlers. * Since those fields can change at any moment, use READ_ONCE to * access both. */ static const struct mpls_nh *mpls_select_multipath(struct mpls_route *rt, struct sk_buff *skb) { u32 hash = 0; int nh_index = 0; int n = 0; u8 alive; /* No need to look further into packet if there's only * one path */ if (rt->rt_nhn == 1) return rt->rt_nh; alive = READ_ONCE(rt->rt_nhn_alive); if (alive == 0) return NULL; hash = mpls_multipath_hash(rt, skb); nh_index = hash % alive; if (alive == rt->rt_nhn) goto out; for_nexthops(rt) { unsigned int nh_flags = READ_ONCE(nh->nh_flags); if (nh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN)) continue; if (n == nh_index) return nh; n++; } endfor_nexthops(rt); out: return mpls_get_nexthop(rt, nh_index); } static bool mpls_egress(struct net *net, struct mpls_route *rt, struct sk_buff *skb, struct mpls_entry_decoded dec) { enum mpls_payload_type payload_type; bool success = false; /* The IPv4 code below accesses through the IPv4 header * checksum, which is 12 bytes into the packet. * The IPv6 code below accesses through the IPv6 hop limit * which is 8 bytes into the packet. * * For all supported cases there should always be at least 12 * bytes of packet data present. The IPv4 header is 20 bytes * without options and the IPv6 header is always 40 bytes * long. */ if (!pskb_may_pull(skb, 12)) return false; payload_type = rt->rt_payload_type; if (payload_type == MPT_UNSPEC) payload_type = ip_hdr(skb)->version; switch (payload_type) { case MPT_IPV4: { struct iphdr *hdr4 = ip_hdr(skb); u8 new_ttl; skb->protocol = htons(ETH_P_IP); /* If propagating TTL, take the decremented TTL from * the incoming MPLS header, otherwise decrement the * TTL, but only if not 0 to avoid underflow. */ if (rt->rt_ttl_propagate == MPLS_TTL_PROP_ENABLED || (rt->rt_ttl_propagate == MPLS_TTL_PROP_DEFAULT && net->mpls.ip_ttl_propagate)) new_ttl = dec.ttl; else new_ttl = hdr4->ttl ? hdr4->ttl - 1 : 0; csum_replace2(&hdr4->check, htons(hdr4->ttl << 8), htons(new_ttl << 8)); hdr4->ttl = new_ttl; success = true; break; } case MPT_IPV6: { struct ipv6hdr *hdr6 = ipv6_hdr(skb); skb->protocol = htons(ETH_P_IPV6); /* If propagating TTL, take the decremented TTL from * the incoming MPLS header, otherwise decrement the * hop limit, but only if not 0 to avoid underflow. */ if (rt->rt_ttl_propagate == MPLS_TTL_PROP_ENABLED || (rt->rt_ttl_propagate == MPLS_TTL_PROP_DEFAULT && net->mpls.ip_ttl_propagate)) hdr6->hop_limit = dec.ttl; else if (hdr6->hop_limit) hdr6->hop_limit = hdr6->hop_limit - 1; success = true; break; } case MPT_UNSPEC: /* Should have decided which protocol it is by now */ break; } return success; } static int mpls_forward(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct net *net = dev_net_rcu(dev); struct mpls_shim_hdr *hdr; const struct mpls_nh *nh; struct mpls_route *rt; struct mpls_entry_decoded dec; struct net_device *out_dev; struct mpls_dev *out_mdev; struct mpls_dev *mdev; unsigned int hh_len; unsigned int new_header_size; unsigned int mtu; int err; /* Careful this entire function runs inside of an rcu critical section */ mdev = mpls_dev_rcu(dev); if (!mdev) goto drop; MPLS_INC_STATS_LEN(mdev, skb->len, rx_packets, rx_bytes); if (!mdev->input_enabled) { MPLS_INC_STATS(mdev, rx_dropped); goto drop; } if (skb->pkt_type != PACKET_HOST) goto err; if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) goto err; if (!pskb_may_pull(skb, sizeof(*hdr))) goto err; skb_dst_drop(skb); /* Read and decode the label */ hdr = mpls_hdr(skb); dec = mpls_entry_decode(hdr); rt = mpls_route_input_rcu(net, dec.label); if (!rt) { MPLS_INC_STATS(mdev, rx_noroute); goto drop; } nh = mpls_select_multipath(rt, skb); if (!nh) goto err; /* Pop the label */ skb_pull(skb, sizeof(*hdr)); skb_reset_network_header(skb); skb_orphan(skb); if (skb_warn_if_lro(skb)) goto err; skb_forward_csum(skb); /* Verify ttl is valid */ if (dec.ttl <= 1) goto err; /* Find the output device */ out_dev = nh->nh_dev; if (!mpls_output_possible(out_dev)) goto tx_err; /* Verify the destination can hold the packet */ new_header_size = mpls_nh_header_size(nh); mtu = mpls_dev_mtu(out_dev); if (mpls_pkt_too_big(skb, mtu - new_header_size)) goto tx_err; hh_len = LL_RESERVED_SPACE(out_dev); if (!out_dev->header_ops) hh_len = 0; /* Ensure there is enough space for the headers in the skb */ if (skb_cow(skb, hh_len + new_header_size)) goto tx_err; skb->dev = out_dev; skb->protocol = htons(ETH_P_MPLS_UC); dec.ttl -= 1; if (unlikely(!new_header_size && dec.bos)) { /* Penultimate hop popping */ if (!mpls_egress(net, rt, skb, dec)) goto err; } else { bool bos; int i; skb_push(skb, new_header_size); skb_reset_network_header(skb); /* Push the new labels */ hdr = mpls_hdr(skb); bos = dec.bos; for (i = nh->nh_labels - 1; i >= 0; i--) { hdr[i] = mpls_entry_encode(nh->nh_label[i], dec.ttl, 0, bos); bos = false; } } mpls_stats_inc_outucastpkts(net, out_dev, skb); /* If via wasn't specified then send out using device address */ if (nh->nh_via_table == MPLS_NEIGH_TABLE_UNSPEC) err = neigh_xmit(NEIGH_LINK_TABLE, out_dev, out_dev->dev_addr, skb); else err = neigh_xmit(nh->nh_via_table, out_dev, mpls_nh_via(rt, nh), skb); if (err) net_dbg_ratelimited("%s: packet transmission failed: %d\n", __func__, err); return 0; tx_err: out_mdev = out_dev ? mpls_dev_rcu(out_dev) : NULL; if (out_mdev) MPLS_INC_STATS(out_mdev, tx_errors); goto drop; err: MPLS_INC_STATS(mdev, rx_errors); drop: kfree_skb(skb); return NET_RX_DROP; } static struct packet_type mpls_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_MPLS_UC), .func = mpls_forward, }; static const struct nla_policy rtm_mpls_policy[RTA_MAX+1] = { [RTA_DST] = { .type = NLA_U32 }, [RTA_OIF] = { .type = NLA_U32 }, [RTA_TTL_PROPAGATE] = { .type = NLA_U8 }, }; struct mpls_route_config { u32 rc_protocol; u32 rc_ifindex; u8 rc_via_table; u8 rc_via_alen; u8 rc_via[MAX_VIA_ALEN]; u32 rc_label; u8 rc_ttl_propagate; u8 rc_output_labels; u32 rc_output_label[MAX_NEW_LABELS]; u32 rc_nlflags; enum mpls_payload_type rc_payload_type; struct nl_info rc_nlinfo; struct rtnexthop *rc_mp; int rc_mp_len; }; /* all nexthops within a route have the same size based on max * number of labels and max via length for a hop */ static struct mpls_route *mpls_rt_alloc(u8 num_nh, u8 max_alen, u8 max_labels) { u8 nh_size = MPLS_NH_SIZE(max_labels, max_alen); struct mpls_route *rt; size_t size; size = sizeof(*rt) + num_nh * nh_size; if (size > MAX_MPLS_ROUTE_MEM) return ERR_PTR(-EINVAL); rt = kzalloc(size, GFP_KERNEL); if (!rt) return ERR_PTR(-ENOMEM); rt->rt_nhn = num_nh; rt->rt_nhn_alive = num_nh; rt->rt_nh_size = nh_size; rt->rt_via_offset = MPLS_NH_VIA_OFF(max_labels); return rt; } static void mpls_rt_free_rcu(struct rcu_head *head) { struct mpls_route *rt; rt = container_of(head, struct mpls_route, rt_rcu); change_nexthops(rt) { netdev_put(nh->nh_dev, &nh->nh_dev_tracker); } endfor_nexthops(rt); kfree(rt); } static void mpls_rt_free(struct mpls_route *rt) { if (rt) call_rcu(&rt->rt_rcu, mpls_rt_free_rcu); } static void mpls_notify_route(struct net *net, unsigned index, struct mpls_route *old, struct mpls_route *new, const struct nl_info *info) { struct nlmsghdr *nlh = info ? info->nlh : NULL; unsigned portid = info ? info->portid : 0; int event = new ? RTM_NEWROUTE : RTM_DELROUTE; struct mpls_route *rt = new ? new : old; unsigned nlm_flags = (old && new) ? NLM_F_REPLACE : 0; /* Ignore reserved labels for now */ if (rt && (index >= MPLS_LABEL_FIRST_UNRESERVED)) rtmsg_lfib(event, index, rt, nlh, net, portid, nlm_flags); } static void mpls_route_update(struct net *net, unsigned index, struct mpls_route *new, const struct nl_info *info) { struct mpls_route __rcu **platform_label; struct mpls_route *rt; platform_label = mpls_dereference(net, net->mpls.platform_label); rt = mpls_dereference(net, platform_label[index]); rcu_assign_pointer(platform_label[index], new); mpls_notify_route(net, index, rt, new, info); /* If we removed a route free it now */ mpls_rt_free(rt); } static unsigned int find_free_label(struct net *net) { unsigned int index; for (index = MPLS_LABEL_FIRST_UNRESERVED; index < net->mpls.platform_labels; index++) { if (!mpls_route_input(net, index)) return index; } return LABEL_NOT_SPECIFIED; } #if IS_ENABLED(CONFIG_INET) static struct net_device *inet_fib_lookup_dev(struct net *net, struct mpls_nh *nh, const void *addr) { struct net_device *dev; struct rtable *rt; struct in_addr daddr; memcpy(&daddr, addr, sizeof(struct in_addr)); rt = ip_route_output(net, daddr.s_addr, 0, 0, 0, RT_SCOPE_UNIVERSE); if (IS_ERR(rt)) return ERR_CAST(rt); dev = rt->dst.dev; netdev_hold(dev, &nh->nh_dev_tracker, GFP_KERNEL); ip_rt_put(rt); return dev; } #else static struct net_device *inet_fib_lookup_dev(struct net *net, struct mpls_nh *nh, const void *addr) { return ERR_PTR(-EAFNOSUPPORT); } #endif #if IS_ENABLED(CONFIG_IPV6) static struct net_device *inet6_fib_lookup_dev(struct net *net, struct mpls_nh *nh, const void *addr) { struct net_device *dev; struct dst_entry *dst; struct flowi6 fl6; if (!ipv6_stub) return ERR_PTR(-EAFNOSUPPORT); memset(&fl6, 0, sizeof(fl6)); memcpy(&fl6.daddr, addr, sizeof(struct in6_addr)); dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL); if (IS_ERR(dst)) return ERR_CAST(dst); dev = dst->dev; netdev_hold(dev, &nh->nh_dev_tracker, GFP_KERNEL); dst_release(dst); return dev; } #else static struct net_device *inet6_fib_lookup_dev(struct net *net, struct mpls_nh *nh, const void *addr) { return ERR_PTR(-EAFNOSUPPORT); } #endif static struct net_device *find_outdev(struct net *net, struct mpls_route *rt, struct mpls_nh *nh, int oif) { struct net_device *dev = NULL; if (!oif) { switch (nh->nh_via_table) { case NEIGH_ARP_TABLE: dev = inet_fib_lookup_dev(net, nh, mpls_nh_via(rt, nh)); break; case NEIGH_ND_TABLE: dev = inet6_fib_lookup_dev(net, nh, mpls_nh_via(rt, nh)); break; case NEIGH_LINK_TABLE: break; } } else { dev = netdev_get_by_index(net, oif, &nh->nh_dev_tracker, GFP_KERNEL); } if (!dev) return ERR_PTR(-ENODEV); if (IS_ERR(dev)) return dev; nh->nh_dev = dev; return dev; } static int mpls_nh_assign_dev(struct net *net, struct mpls_route *rt, struct mpls_nh *nh, int oif) { struct net_device *dev = NULL; int err = -ENODEV; dev = find_outdev(net, rt, nh, oif); if (IS_ERR(dev)) { err = PTR_ERR(dev); goto errout; } /* Ensure this is a supported device */ err = -EINVAL; if (!mpls_dev_get(net, dev)) goto errout_put; if ((nh->nh_via_table == NEIGH_LINK_TABLE) && (dev->addr_len != nh->nh_via_alen)) goto errout_put; if (!(dev->flags & IFF_UP)) { nh->nh_flags |= RTNH_F_DEAD; } else { unsigned int flags; flags = netif_get_flags(dev); if (!(flags & (IFF_RUNNING | IFF_LOWER_UP))) nh->nh_flags |= RTNH_F_LINKDOWN; } return 0; errout_put: netdev_put(nh->nh_dev, &nh->nh_dev_tracker); nh->nh_dev = NULL; errout: return err; } static int nla_get_via(const struct nlattr *nla, u8 *via_alen, u8 *via_table, u8 via_addr[], struct netlink_ext_ack *extack) { struct rtvia *via = nla_data(nla); int err = -EINVAL; int alen; if (nla_len(nla) < offsetof(struct rtvia, rtvia_addr)) { NL_SET_ERR_MSG_ATTR(extack, nla, "Invalid attribute length for RTA_VIA"); goto errout; } alen = nla_len(nla) - offsetof(struct rtvia, rtvia_addr); if (alen > MAX_VIA_ALEN) { NL_SET_ERR_MSG_ATTR(extack, nla, "Invalid address length for RTA_VIA"); goto errout; } /* Validate the address family */ switch (via->rtvia_family) { case AF_PACKET: *via_table = NEIGH_LINK_TABLE; break; case AF_INET: *via_table = NEIGH_ARP_TABLE; if (alen != 4) goto errout; break; case AF_INET6: *via_table = NEIGH_ND_TABLE; if (alen != 16) goto errout; break; default: /* Unsupported address family */ goto errout; } memcpy(via_addr, via->rtvia_addr, alen); *via_alen = alen; err = 0; errout: return err; } static int mpls_nh_build_from_cfg(struct mpls_route_config *cfg, struct mpls_route *rt) { struct net *net = cfg->rc_nlinfo.nl_net; struct mpls_nh *nh = rt->rt_nh; int err; int i; if (!nh) return -ENOMEM; nh->nh_labels = cfg->rc_output_labels; for (i = 0; i < nh->nh_labels; i++) nh->nh_label[i] = cfg->rc_output_label[i]; nh->nh_via_table = cfg->rc_via_table; memcpy(__mpls_nh_via(rt, nh), cfg->rc_via, cfg->rc_via_alen); nh->nh_via_alen = cfg->rc_via_alen; err = mpls_nh_assign_dev(net, rt, nh, cfg->rc_ifindex); if (err) goto errout; if (nh->nh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN)) rt->rt_nhn_alive--; return 0; errout: return err; } static int mpls_nh_build(struct net *net, struct mpls_route *rt, struct mpls_nh *nh, int oif, struct nlattr *via, struct nlattr *newdst, u8 max_labels, struct netlink_ext_ack *extack) { int err = -ENOMEM; if (!nh) goto errout; if (newdst) { err = nla_get_labels(newdst, max_labels, &nh->nh_labels, nh->nh_label, extack); if (err) goto errout; } if (via) { err = nla_get_via(via, &nh->nh_via_alen, &nh->nh_via_table, __mpls_nh_via(rt, nh), extack); if (err) goto errout; } else { nh->nh_via_table = MPLS_NEIGH_TABLE_UNSPEC; } err = mpls_nh_assign_dev(net, rt, nh, oif); if (err) goto errout; return 0; errout: return err; } static u8 mpls_count_nexthops(struct rtnexthop *rtnh, int len, u8 cfg_via_alen, u8 *max_via_alen, u8 *max_labels) { int remaining = len; u8 nhs = 0; *max_via_alen = 0; *max_labels = 0; while (rtnh_ok(rtnh, remaining)) { struct nlattr *nla, *attrs = rtnh_attrs(rtnh); int attrlen; u8 n_labels = 0; attrlen = rtnh_attrlen(rtnh); nla = nla_find(attrs, attrlen, RTA_VIA); if (nla && nla_len(nla) >= offsetof(struct rtvia, rtvia_addr)) { int via_alen = nla_len(nla) - offsetof(struct rtvia, rtvia_addr); if (via_alen <= MAX_VIA_ALEN) *max_via_alen = max_t(u16, *max_via_alen, via_alen); } nla = nla_find(attrs, attrlen, RTA_NEWDST); if (nla && nla_get_labels(nla, MAX_NEW_LABELS, &n_labels, NULL, NULL) != 0) return 0; *max_labels = max_t(u8, *max_labels, n_labels); /* number of nexthops is tracked by a u8. * Check for overflow. */ if (nhs == 255) return 0; nhs++; rtnh = rtnh_next(rtnh, &remaining); } /* leftover implies invalid nexthop configuration, discard it */ return remaining > 0 ? 0 : nhs; } static int mpls_nh_build_multi(struct mpls_route_config *cfg, struct mpls_route *rt, u8 max_labels, struct netlink_ext_ack *extack) { struct rtnexthop *rtnh = cfg->rc_mp; struct nlattr *nla_via, *nla_newdst; int remaining = cfg->rc_mp_len; int err = 0; rt->rt_nhn = 0; change_nexthops(rt) { int attrlen; nla_via = NULL; nla_newdst = NULL; err = -EINVAL; if (!rtnh_ok(rtnh, remaining)) goto errout; /* neither weighted multipath nor any flags * are supported */ if (rtnh->rtnh_hops || rtnh->rtnh_flags) goto errout; attrlen = rtnh_attrlen(rtnh); if (attrlen > 0) { struct nlattr *attrs = rtnh_attrs(rtnh); nla_via = nla_find(attrs, attrlen, RTA_VIA); nla_newdst = nla_find(attrs, attrlen, RTA_NEWDST); } err = mpls_nh_build(cfg->rc_nlinfo.nl_net, rt, nh, rtnh->rtnh_ifindex, nla_via, nla_newdst, max_labels, extack); if (err) goto errout; if (nh->nh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN)) rt->rt_nhn_alive--; rtnh = rtnh_next(rtnh, &remaining); rt->rt_nhn++; } endfor_nexthops(rt); return 0; errout: return err; } static bool mpls_label_ok(struct net *net, unsigned int *index, struct netlink_ext_ack *extack) { /* Reserved labels may not be set */ if (*index < MPLS_LABEL_FIRST_UNRESERVED) { NL_SET_ERR_MSG(extack, "Invalid label - must be MPLS_LABEL_FIRST_UNRESERVED or higher"); return false; } /* The full 20 bit range may not be supported. */ if (*index >= net->mpls.platform_labels) { NL_SET_ERR_MSG(extack, "Label >= configured maximum in platform_labels"); return false; } *index = array_index_nospec(*index, net->mpls.platform_labels); return true; } static int mpls_route_add(struct mpls_route_config *cfg, struct netlink_ext_ack *extack) { struct net *net = cfg->rc_nlinfo.nl_net; struct mpls_route *rt, *old; int err = -EINVAL; u8 max_via_alen; unsigned index; u8 max_labels; u8 nhs; index = cfg->rc_label; /* If a label was not specified during insert pick one */ if ((index == LABEL_NOT_SPECIFIED) && (cfg->rc_nlflags & NLM_F_CREATE)) { index = find_free_label(net); } if (!mpls_label_ok(net, &index, extack)) goto errout; /* Append makes no sense with mpls */ err = -EOPNOTSUPP; if (cfg->rc_nlflags & NLM_F_APPEND) { NL_SET_ERR_MSG(extack, "MPLS does not support route append"); goto errout; } err = -EEXIST; old = mpls_route_input(net, index); if ((cfg->rc_nlflags & NLM_F_EXCL) && old) goto errout; err = -EEXIST; if (!(cfg->rc_nlflags & NLM_F_REPLACE) && old) goto errout; err = -ENOENT; if (!(cfg->rc_nlflags & NLM_F_CREATE) && !old) goto errout; err = -EINVAL; if (cfg->rc_mp) { nhs = mpls_count_nexthops(cfg->rc_mp, cfg->rc_mp_len, cfg->rc_via_alen, &max_via_alen, &max_labels); } else { max_via_alen = cfg->rc_via_alen; max_labels = cfg->rc_output_labels; nhs = 1; } if (nhs == 0) { NL_SET_ERR_MSG(extack, "Route does not contain a nexthop"); goto errout; } rt = mpls_rt_alloc(nhs, max_via_alen, max_labels); if (IS_ERR(rt)) { err = PTR_ERR(rt); goto errout; } rt->rt_protocol = cfg->rc_protocol; rt->rt_payload_type = cfg->rc_payload_type; rt->rt_ttl_propagate = cfg->rc_ttl_propagate; if (cfg->rc_mp) err = mpls_nh_build_multi(cfg, rt, max_labels, extack); else err = mpls_nh_build_from_cfg(cfg, rt); if (err) goto freert; mpls_route_update(net, index, rt, &cfg->rc_nlinfo); return 0; freert: mpls_rt_free(rt); errout: return err; } static int mpls_route_del(struct mpls_route_config *cfg, struct netlink_ext_ack *extack) { struct net *net = cfg->rc_nlinfo.nl_net; unsigned index; int err = -EINVAL; index = cfg->rc_label; if (!mpls_label_ok(net, &index, extack)) goto errout; mpls_route_update(net, index, NULL, &cfg->rc_nlinfo); err = 0; errout: return err; } static void mpls_get_stats(struct mpls_dev *mdev, struct mpls_link_stats *stats) { struct mpls_pcpu_stats *p; int i; memset(stats, 0, sizeof(*stats)); for_each_possible_cpu(i) { struct mpls_link_stats local; unsigned int start; p = per_cpu_ptr(mdev->stats, i); do { start = u64_stats_fetch_begin(&p->syncp); local = p->stats; } while (u64_stats_fetch_retry(&p->syncp, start)); stats->rx_packets += local.rx_packets; stats->rx_bytes += local.rx_bytes; stats->tx_packets += local.tx_packets; stats->tx_bytes += local.tx_bytes; stats->rx_errors += local.rx_errors; stats->tx_errors += local.tx_errors; stats->rx_dropped += local.rx_dropped; stats->tx_dropped += local.tx_dropped; stats->rx_noroute += local.rx_noroute; } } static int mpls_fill_stats_af(struct sk_buff *skb, const struct net_device *dev) { struct mpls_link_stats *stats; struct mpls_dev *mdev; struct nlattr *nla; mdev = mpls_dev_rcu(dev); if (!mdev) return -ENODATA; nla = nla_reserve_64bit(skb, MPLS_STATS_LINK, sizeof(struct mpls_link_stats), MPLS_STATS_UNSPEC); if (!nla) return -EMSGSIZE; stats = nla_data(nla); mpls_get_stats(mdev, stats); return 0; } static size_t mpls_get_stats_af_size(const struct net_device *dev) { struct mpls_dev *mdev; mdev = mpls_dev_rcu(dev); if (!mdev) return 0; return nla_total_size_64bit(sizeof(struct mpls_link_stats)); } static int mpls_netconf_fill_devconf(struct sk_buff *skb, struct mpls_dev *mdev, u32 portid, u32 seq, int event, unsigned int flags, int type) { struct nlmsghdr *nlh; struct netconfmsg *ncm; bool all = false; nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct netconfmsg), flags); if (!nlh) return -EMSGSIZE; if (type == NETCONFA_ALL) all = true; ncm = nlmsg_data(nlh); ncm->ncm_family = AF_MPLS; if (nla_put_s32(skb, NETCONFA_IFINDEX, mdev->dev->ifindex) < 0) goto nla_put_failure; if ((all || type == NETCONFA_INPUT) && nla_put_s32(skb, NETCONFA_INPUT, READ_ONCE(mdev->input_enabled)) < 0) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int mpls_netconf_msgsize_devconf(int type) { int size = NLMSG_ALIGN(sizeof(struct netconfmsg)) + nla_total_size(4); /* NETCONFA_IFINDEX */ bool all = false; if (type == NETCONFA_ALL) all = true; if (all || type == NETCONFA_INPUT) size += nla_total_size(4); return size; } static void mpls_netconf_notify_devconf(struct net *net, int event, int type, struct mpls_dev *mdev) { struct sk_buff *skb; int err = -ENOBUFS; skb = nlmsg_new(mpls_netconf_msgsize_devconf(type), GFP_KERNEL); if (!skb) goto errout; err = mpls_netconf_fill_devconf(skb, mdev, 0, 0, event, 0, type); if (err < 0) { /* -EMSGSIZE implies BUG in mpls_netconf_msgsize_devconf() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_MPLS_NETCONF, NULL, GFP_KERNEL); return; errout: rtnl_set_sk_err(net, RTNLGRP_MPLS_NETCONF, err); } static const struct nla_policy devconf_mpls_policy[NETCONFA_MAX + 1] = { [NETCONFA_IFINDEX] = { .len = sizeof(int) }, }; static int mpls_netconf_valid_get_req(struct sk_buff *skb, const struct nlmsghdr *nlh, struct nlattr **tb, struct netlink_ext_ack *extack) { int i, err; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(struct netconfmsg))) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for netconf get request"); return -EINVAL; } if (!netlink_strict_get_check(skb)) return nlmsg_parse_deprecated(nlh, sizeof(struct netconfmsg), tb, NETCONFA_MAX, devconf_mpls_policy, extack); err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct netconfmsg), tb, NETCONFA_MAX, devconf_mpls_policy, extack); if (err) return err; for (i = 0; i <= NETCONFA_MAX; i++) { if (!tb[i]) continue; switch (i) { case NETCONFA_IFINDEX: break; default: NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in netconf get request"); return -EINVAL; } } return 0; } static int mpls_netconf_get_devconf(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); struct nlattr *tb[NETCONFA_MAX + 1]; struct net_device *dev; struct mpls_dev *mdev; struct sk_buff *skb; int ifindex; int err; err = mpls_netconf_valid_get_req(in_skb, nlh, tb, extack); if (err < 0) goto errout; if (!tb[NETCONFA_IFINDEX]) { err = -EINVAL; goto errout; } ifindex = nla_get_s32(tb[NETCONFA_IFINDEX]); skb = nlmsg_new(mpls_netconf_msgsize_devconf(NETCONFA_ALL), GFP_KERNEL); if (!skb) { err = -ENOBUFS; goto errout; } rcu_read_lock(); dev = dev_get_by_index_rcu(net, ifindex); if (!dev) { err = -EINVAL; goto errout_unlock; } mdev = mpls_dev_rcu(dev); if (!mdev) { err = -EINVAL; goto errout_unlock; } err = mpls_netconf_fill_devconf(skb, mdev, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, RTM_NEWNETCONF, 0, NETCONFA_ALL); if (err < 0) { /* -EMSGSIZE implies BUG in mpls_netconf_msgsize_devconf() */ WARN_ON(err == -EMSGSIZE); goto errout_unlock; } err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); rcu_read_unlock(); errout: return err; errout_unlock: rcu_read_unlock(); kfree_skb(skb); goto errout; } static int mpls_netconf_dump_devconf(struct sk_buff *skb, struct netlink_callback *cb) { const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); struct { unsigned long ifindex; } *ctx = (void *)cb->ctx; struct net_device *dev; struct mpls_dev *mdev; int err = 0; if (cb->strict_check) { struct netlink_ext_ack *extack = cb->extack; struct netconfmsg *ncm; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ncm))) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for netconf dump request"); return -EINVAL; } if (nlmsg_attrlen(nlh, sizeof(*ncm))) { NL_SET_ERR_MSG_MOD(extack, "Invalid data after header in netconf dump request"); return -EINVAL; } } rcu_read_lock(); for_each_netdev_dump(net, dev, ctx->ifindex) { mdev = mpls_dev_rcu(dev); if (!mdev) continue; err = mpls_netconf_fill_devconf(skb, mdev, NETLINK_CB(cb->skb).portid, nlh->nlmsg_seq, RTM_NEWNETCONF, NLM_F_MULTI, NETCONFA_ALL); if (err < 0) break; } rcu_read_unlock(); return err; } #define MPLS_PERDEV_SYSCTL_OFFSET(field) \ (&((struct mpls_dev *)0)->field) static int mpls_conf_proc(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int oval = *(int *)ctl->data; int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); if (write) { struct mpls_dev *mdev = ctl->extra1; int i = (int *)ctl->data - (int *)mdev; struct net *net = ctl->extra2; int val = *(int *)ctl->data; if (i == offsetof(struct mpls_dev, input_enabled) && val != oval) { mpls_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_INPUT, mdev); } } return ret; } static const struct ctl_table mpls_dev_table[] = { { .procname = "input", .maxlen = sizeof(int), .mode = 0644, .proc_handler = mpls_conf_proc, .data = MPLS_PERDEV_SYSCTL_OFFSET(input_enabled), }, }; static int mpls_dev_sysctl_register(struct net_device *dev, struct mpls_dev *mdev) { char path[sizeof("net/mpls/conf/") + IFNAMSIZ]; size_t table_size = ARRAY_SIZE(mpls_dev_table); struct net *net = dev_net(dev); struct ctl_table *table; int i; table = kmemdup(&mpls_dev_table, sizeof(mpls_dev_table), GFP_KERNEL); if (!table) goto out; /* Table data contains only offsets relative to the base of * the mdev at this point, so make them absolute. */ for (i = 0; i < table_size; i++) { table[i].data = (char *)mdev + (uintptr_t)table[i].data; table[i].extra1 = mdev; table[i].extra2 = net; } snprintf(path, sizeof(path), "net/mpls/conf/%s", dev->name); mdev->sysctl = register_net_sysctl_sz(net, path, table, table_size); if (!mdev->sysctl) goto free; mpls_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_ALL, mdev); return 0; free: kfree(table); out: mdev->sysctl = NULL; return -ENOBUFS; } static void mpls_dev_sysctl_unregister(struct net_device *dev, struct mpls_dev *mdev) { struct net *net = dev_net(dev); const struct ctl_table *table; if (!mdev->sysctl) return; table = mdev->sysctl->ctl_table_arg; unregister_net_sysctl_table(mdev->sysctl); kfree(table); mpls_netconf_notify_devconf(net, RTM_DELNETCONF, 0, mdev); } static struct mpls_dev *mpls_add_dev(struct net_device *dev) { struct mpls_dev *mdev; int err = -ENOMEM; int i; mdev = kzalloc_obj(*mdev); if (!mdev) return ERR_PTR(err); mdev->stats = alloc_percpu(struct mpls_pcpu_stats); if (!mdev->stats) goto free; for_each_possible_cpu(i) { struct mpls_pcpu_stats *mpls_stats; mpls_stats = per_cpu_ptr(mdev->stats, i); u64_stats_init(&mpls_stats->syncp); } mdev->dev = dev; err = mpls_dev_sysctl_register(dev, mdev); if (err) goto free; rcu_assign_pointer(dev->mpls_ptr, mdev); return mdev; free: free_percpu(mdev->stats); kfree(mdev); return ERR_PTR(err); } static void mpls_dev_destroy_rcu(struct rcu_head *head) { struct mpls_dev *mdev = container_of(head, struct mpls_dev, rcu); free_percpu(mdev->stats); kfree(mdev); } static int mpls_ifdown(struct net_device *dev, int event) { struct net *net = dev_net(dev); unsigned int index; for (index = 0; index < net->mpls.platform_labels; index++) { struct mpls_route *rt; bool nh_del = false; u8 alive = 0; rt = mpls_route_input(net, index); if (!rt) continue; if (event == NETDEV_UNREGISTER) { u8 deleted = 0; for_nexthops(rt) { if (!nh->nh_dev || nh->nh_dev == dev) deleted++; if (nh->nh_dev == dev) nh_del = true; } endfor_nexthops(rt); /* if there are no more nexthops, delete the route */ if (deleted == rt->rt_nhn) { mpls_route_update(net, index, NULL, NULL); continue; } if (nh_del) { size_t size = sizeof(*rt) + rt->rt_nhn * rt->rt_nh_size; struct mpls_route *orig = rt; rt = kmemdup(orig, size, GFP_KERNEL); if (!rt) return -ENOMEM; } } change_nexthops(rt) { unsigned int nh_flags = nh->nh_flags; if (nh->nh_dev != dev) { if (nh_del) netdev_hold(nh->nh_dev, &nh->nh_dev_tracker, GFP_KERNEL); goto next; } switch (event) { case NETDEV_DOWN: case NETDEV_UNREGISTER: nh_flags |= RTNH_F_DEAD; fallthrough; case NETDEV_CHANGE: nh_flags |= RTNH_F_LINKDOWN; break; } if (event == NETDEV_UNREGISTER) nh->nh_dev = NULL; if (nh->nh_flags != nh_flags) WRITE_ONCE(nh->nh_flags, nh_flags); next: if (!(nh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN))) alive++; } endfor_nexthops(rt); WRITE_ONCE(rt->rt_nhn_alive, alive); if (nh_del) mpls_route_update(net, index, rt, NULL); } return 0; } static void mpls_ifup(struct net_device *dev, unsigned int flags) { struct net *net = dev_net(dev); unsigned int index; u8 alive; for (index = 0; index < net->mpls.platform_labels; index++) { struct mpls_route *rt; rt = mpls_route_input(net, index); if (!rt) continue; alive = 0; change_nexthops(rt) { unsigned int nh_flags = nh->nh_flags; if (!(nh_flags & flags)) { alive++; continue; } if (nh->nh_dev != dev) continue; alive++; nh_flags &= ~flags; WRITE_ONCE(nh->nh_flags, nh_flags); } endfor_nexthops(rt); WRITE_ONCE(rt->rt_nhn_alive, alive); } } static int mpls_dev_notify(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); struct mpls_dev *mdev; unsigned int flags; int err; mutex_lock(&net->mpls.platform_mutex); if (event == NETDEV_REGISTER) { mdev = mpls_add_dev(dev); if (IS_ERR(mdev)) { err = PTR_ERR(mdev); goto err; } goto out; } mdev = mpls_dev_get(net, dev); if (!mdev) goto out; switch (event) { case NETDEV_DOWN: err = mpls_ifdown(dev, event); if (err) goto err; break; case NETDEV_UP: flags = netif_get_flags(dev); if (flags & (IFF_RUNNING | IFF_LOWER_UP)) mpls_ifup(dev, RTNH_F_DEAD | RTNH_F_LINKDOWN); else mpls_ifup(dev, RTNH_F_DEAD); break; case NETDEV_CHANGE: flags = netif_get_flags(dev); if (flags & (IFF_RUNNING | IFF_LOWER_UP)) { mpls_ifup(dev, RTNH_F_DEAD | RTNH_F_LINKDOWN); } else { err = mpls_ifdown(dev, event); if (err) goto err; } break; case NETDEV_UNREGISTER: err = mpls_ifdown(dev, event); if (err) goto err; mdev = mpls_dev_get(net, dev); if (mdev) { mpls_dev_sysctl_unregister(dev, mdev); RCU_INIT_POINTER(dev->mpls_ptr, NULL); call_rcu(&mdev->rcu, mpls_dev_destroy_rcu); } break; case NETDEV_CHANGENAME: mdev = mpls_dev_get(net, dev); if (mdev) { mpls_dev_sysctl_unregister(dev, mdev); err = mpls_dev_sysctl_register(dev, mdev); if (err) goto err; } break; } out: mutex_unlock(&net->mpls.platform_mutex); return NOTIFY_OK; err: mutex_unlock(&net->mpls.platform_mutex); return notifier_from_errno(err); } static struct notifier_block mpls_dev_notifier = { .notifier_call = mpls_dev_notify, }; static int nla_put_via(struct sk_buff *skb, u8 table, const void *addr, int alen) { static const int table_to_family[NEIGH_NR_TABLES + 1] = { AF_INET, AF_INET6, AF_PACKET, }; struct nlattr *nla; struct rtvia *via; int family = AF_UNSPEC; nla = nla_reserve(skb, RTA_VIA, alen + 2); if (!nla) return -EMSGSIZE; if (table <= NEIGH_NR_TABLES) family = table_to_family[table]; via = nla_data(nla); via->rtvia_family = family; memcpy(via->rtvia_addr, addr, alen); return 0; } int nla_put_labels(struct sk_buff *skb, int attrtype, u8 labels, const u32 label[]) { struct nlattr *nla; struct mpls_shim_hdr *nla_label; bool bos; int i; nla = nla_reserve(skb, attrtype, labels*4); if (!nla) return -EMSGSIZE; nla_label = nla_data(nla); bos = true; for (i = labels - 1; i >= 0; i--) { nla_label[i] = mpls_entry_encode(label[i], 0, 0, bos); bos = false; } return 0; } EXPORT_SYMBOL_GPL(nla_put_labels); int nla_get_labels(const struct nlattr *nla, u8 max_labels, u8 *labels, u32 label[], struct netlink_ext_ack *extack) { unsigned len = nla_len(nla); struct mpls_shim_hdr *nla_label; u8 nla_labels; bool bos; int i; /* len needs to be an even multiple of 4 (the label size). Number * of labels is a u8 so check for overflow. */ if (len & 3 || len / 4 > 255) { NL_SET_ERR_MSG_ATTR(extack, nla, "Invalid length for labels attribute"); return -EINVAL; } /* Limit the number of new labels allowed */ nla_labels = len/4; if (nla_labels > max_labels) { NL_SET_ERR_MSG(extack, "Too many labels"); return -EINVAL; } /* when label == NULL, caller wants number of labels */ if (!label) goto out; nla_label = nla_data(nla); bos = true; for (i = nla_labels - 1; i >= 0; i--, bos = false) { struct mpls_entry_decoded dec; dec = mpls_entry_decode(nla_label + i); /* Ensure the bottom of stack flag is properly set * and ttl and tc are both clear. */ if (dec.ttl) { NL_SET_ERR_MSG_ATTR(extack, nla, "TTL in label must be 0"); return -EINVAL; } if (dec.tc) { NL_SET_ERR_MSG_ATTR(extack, nla, "Traffic class in label must be 0"); return -EINVAL; } if (dec.bos != bos) { NL_SET_BAD_ATTR(extack, nla); if (bos) { NL_SET_ERR_MSG(extack, "BOS bit must be set in first label"); } else { NL_SET_ERR_MSG(extack, "BOS bit can only be set in first label"); } return -EINVAL; } switch (dec.label) { case MPLS_LABEL_IMPLNULL: /* RFC3032: This is a label that an LSR may * assign and distribute, but which never * actually appears in the encapsulation. */ NL_SET_ERR_MSG_ATTR(extack, nla, "Implicit NULL Label (3) can not be used in encapsulation"); return -EINVAL; } label[i] = dec.label; } out: *labels = nla_labels; return 0; } EXPORT_SYMBOL_GPL(nla_get_labels); static int rtm_to_route_config(struct sk_buff *skb, struct nlmsghdr *nlh, struct mpls_route_config *cfg, struct netlink_ext_ack *extack) { struct rtmsg *rtm; struct nlattr *tb[RTA_MAX+1]; int index; int err; err = nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_mpls_policy, extack); if (err < 0) goto errout; err = -EINVAL; rtm = nlmsg_data(nlh); if (rtm->rtm_family != AF_MPLS) { NL_SET_ERR_MSG(extack, "Invalid address family in rtmsg"); goto errout; } if (rtm->rtm_dst_len != 20) { NL_SET_ERR_MSG(extack, "rtm_dst_len must be 20 for MPLS"); goto errout; } if (rtm->rtm_src_len != 0) { NL_SET_ERR_MSG(extack, "rtm_src_len must be 0 for MPLS"); goto errout; } if (rtm->rtm_tos != 0) { NL_SET_ERR_MSG(extack, "rtm_tos must be 0 for MPLS"); goto errout; } if (rtm->rtm_table != RT_TABLE_MAIN) { NL_SET_ERR_MSG(extack, "MPLS only supports the main route table"); goto errout; } /* Any value is acceptable for rtm_protocol */ /* As mpls uses destination specific addresses * (or source specific address in the case of multicast) * all addresses have universal scope. */ if (rtm->rtm_scope != RT_SCOPE_UNIVERSE) { NL_SET_ERR_MSG(extack, "Invalid route scope - MPLS only supports UNIVERSE"); goto errout; } if (rtm->rtm_type != RTN_UNICAST) { NL_SET_ERR_MSG(extack, "Invalid route type - MPLS only supports UNICAST"); goto errout; } if (rtm->rtm_flags != 0) { NL_SET_ERR_MSG(extack, "rtm_flags must be 0 for MPLS"); goto errout; } cfg->rc_label = LABEL_NOT_SPECIFIED; cfg->rc_protocol = rtm->rtm_protocol; cfg->rc_via_table = MPLS_NEIGH_TABLE_UNSPEC; cfg->rc_ttl_propagate = MPLS_TTL_PROP_DEFAULT; cfg->rc_nlflags = nlh->nlmsg_flags; cfg->rc_nlinfo.portid = NETLINK_CB(skb).portid; cfg->rc_nlinfo.nlh = nlh; cfg->rc_nlinfo.nl_net = sock_net(skb->sk); for (index = 0; index <= RTA_MAX; index++) { struct nlattr *nla = tb[index]; if (!nla) continue; switch (index) { case RTA_OIF: cfg->rc_ifindex = nla_get_u32(nla); break; case RTA_NEWDST: if (nla_get_labels(nla, MAX_NEW_LABELS, &cfg->rc_output_labels, cfg->rc_output_label, extack)) goto errout; break; case RTA_DST: { u8 label_count; if (nla_get_labels(nla, 1, &label_count, &cfg->rc_label, extack)) goto errout; if (!mpls_label_ok(cfg->rc_nlinfo.nl_net, &cfg->rc_label, extack)) goto errout; break; } case RTA_GATEWAY: NL_SET_ERR_MSG(extack, "MPLS does not support RTA_GATEWAY attribute"); goto errout; case RTA_VIA: { if (nla_get_via(nla, &cfg->rc_via_alen, &cfg->rc_via_table, cfg->rc_via, extack)) goto errout; break; } case RTA_MULTIPATH: { cfg->rc_mp = nla_data(nla); cfg->rc_mp_len = nla_len(nla); break; } case RTA_TTL_PROPAGATE: { u8 ttl_propagate = nla_get_u8(nla); if (ttl_propagate > 1) { NL_SET_ERR_MSG_ATTR(extack, nla, "RTA_TTL_PROPAGATE can only be 0 or 1"); goto errout; } cfg->rc_ttl_propagate = ttl_propagate ? MPLS_TTL_PROP_ENABLED : MPLS_TTL_PROP_DISABLED; break; } default: NL_SET_ERR_MSG_ATTR(extack, nla, "Unknown attribute"); /* Unsupported attribute */ goto errout; } } err = 0; errout: return err; } static int mpls_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct mpls_route_config *cfg; int err; cfg = kzalloc_obj(*cfg); if (!cfg) return -ENOMEM; err = rtm_to_route_config(skb, nlh, cfg, extack); if (err < 0) goto out; mutex_lock(&net->mpls.platform_mutex); err = mpls_route_del(cfg, extack); mutex_unlock(&net->mpls.platform_mutex); out: kfree(cfg); return err; } static int mpls_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct mpls_route_config *cfg; int err; cfg = kzalloc_obj(*cfg); if (!cfg) return -ENOMEM; err = rtm_to_route_config(skb, nlh, cfg, extack); if (err < 0) goto out; mutex_lock(&net->mpls.platform_mutex); err = mpls_route_add(cfg, extack); mutex_unlock(&net->mpls.platform_mutex); out: kfree(cfg); return err; } static int mpls_dump_route(struct sk_buff *skb, u32 portid, u32 seq, int event, u32 label, struct mpls_route *rt, int flags) { struct net_device *dev; struct nlmsghdr *nlh; struct rtmsg *rtm; nlh = nlmsg_put(skb, portid, seq, event, sizeof(*rtm), flags); if (nlh == NULL) return -EMSGSIZE; rtm = nlmsg_data(nlh); rtm->rtm_family = AF_MPLS; rtm->rtm_dst_len = 20; rtm->rtm_src_len = 0; rtm->rtm_tos = 0; rtm->rtm_table = RT_TABLE_MAIN; rtm->rtm_protocol = rt->rt_protocol; rtm->rtm_scope = RT_SCOPE_UNIVERSE; rtm->rtm_type = RTN_UNICAST; rtm->rtm_flags = 0; if (nla_put_labels(skb, RTA_DST, 1, &label)) goto nla_put_failure; if (rt->rt_ttl_propagate != MPLS_TTL_PROP_DEFAULT) { bool ttl_propagate = rt->rt_ttl_propagate == MPLS_TTL_PROP_ENABLED; if (nla_put_u8(skb, RTA_TTL_PROPAGATE, ttl_propagate)) goto nla_put_failure; } if (rt->rt_nhn == 1) { const struct mpls_nh *nh = rt->rt_nh; if (nh->nh_labels && nla_put_labels(skb, RTA_NEWDST, nh->nh_labels, nh->nh_label)) goto nla_put_failure; if (nh->nh_via_table != MPLS_NEIGH_TABLE_UNSPEC && nla_put_via(skb, nh->nh_via_table, mpls_nh_via(rt, nh), nh->nh_via_alen)) goto nla_put_failure; dev = nh->nh_dev; if (dev && nla_put_u32(skb, RTA_OIF, dev->ifindex)) goto nla_put_failure; if (nh->nh_flags & RTNH_F_LINKDOWN) rtm->rtm_flags |= RTNH_F_LINKDOWN; if (nh->nh_flags & RTNH_F_DEAD) rtm->rtm_flags |= RTNH_F_DEAD; } else { struct rtnexthop *rtnh; struct nlattr *mp; u8 linkdown = 0; u8 dead = 0; mp = nla_nest_start_noflag(skb, RTA_MULTIPATH); if (!mp) goto nla_put_failure; for_nexthops(rt) { dev = nh->nh_dev; if (!dev) continue; rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh)); if (!rtnh) goto nla_put_failure; rtnh->rtnh_ifindex = dev->ifindex; if (nh->nh_flags & RTNH_F_LINKDOWN) { rtnh->rtnh_flags |= RTNH_F_LINKDOWN; linkdown++; } if (nh->nh_flags & RTNH_F_DEAD) { rtnh->rtnh_flags |= RTNH_F_DEAD; dead++; } if (nh->nh_labels && nla_put_labels(skb, RTA_NEWDST, nh->nh_labels, nh->nh_label)) goto nla_put_failure; if (nh->nh_via_table != MPLS_NEIGH_TABLE_UNSPEC && nla_put_via(skb, nh->nh_via_table, mpls_nh_via(rt, nh), nh->nh_via_alen)) goto nla_put_failure; /* length of rtnetlink header + attributes */ rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh; } endfor_nexthops(rt); if (linkdown == rt->rt_nhn) rtm->rtm_flags |= RTNH_F_LINKDOWN; if (dead == rt->rt_nhn) rtm->rtm_flags |= RTNH_F_DEAD; nla_nest_end(skb, mp); } nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } #if IS_ENABLED(CONFIG_INET) static int mpls_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh, struct fib_dump_filter *filter, struct netlink_callback *cb) { return ip_valid_fib_dump_req(net, nlh, filter, cb); } #else static int mpls_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh, struct fib_dump_filter *filter, struct netlink_callback *cb) { struct netlink_ext_ack *extack = cb->extack; struct nlattr *tb[RTA_MAX + 1]; struct rtmsg *rtm; int err, i; rtm = nlmsg_payload(nlh, sizeof(*rtm)); if (!rtm) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for FIB dump request"); return -EINVAL; } if (rtm->rtm_dst_len || rtm->rtm_src_len || rtm->rtm_tos || rtm->rtm_table || rtm->rtm_scope || rtm->rtm_type || rtm->rtm_flags) { NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for FIB dump request"); return -EINVAL; } if (rtm->rtm_protocol) { filter->protocol = rtm->rtm_protocol; filter->filter_set = 1; cb->answer_flags = NLM_F_DUMP_FILTERED; } err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_mpls_policy, extack); if (err < 0) return err; for (i = 0; i <= RTA_MAX; ++i) { int ifindex; if (i == RTA_OIF) { ifindex = nla_get_u32(tb[i]); filter->dev = dev_get_by_index_rcu(net, ifindex); if (!filter->dev) return -ENODEV; filter->filter_set = 1; } else if (tb[i]) { NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in dump request"); return -EINVAL; } } return 0; } #endif static bool mpls_rt_uses_dev(struct mpls_route *rt, const struct net_device *dev) { if (rt->rt_nhn == 1) { struct mpls_nh *nh = rt->rt_nh; if (nh->nh_dev == dev) return true; } else { for_nexthops(rt) { if (nh->nh_dev == dev) return true; } endfor_nexthops(rt); } return false; } static int mpls_dump_routes(struct sk_buff *skb, struct netlink_callback *cb) { const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); struct mpls_route __rcu **platform_label; struct fib_dump_filter filter = { .rtnl_held = false, }; unsigned int flags = NLM_F_MULTI; size_t platform_labels; unsigned int index; int err; rcu_read_lock(); if (cb->strict_check) { err = mpls_valid_fib_dump_req(net, nlh, &filter, cb); if (err < 0) goto err; /* for MPLS, there is only 1 table with fixed type and flags. * If either are set in the filter then return nothing. */ if ((filter.table_id && filter.table_id != RT_TABLE_MAIN) || (filter.rt_type && filter.rt_type != RTN_UNICAST) || filter.flags) goto unlock; } index = cb->args[0]; if (index < MPLS_LABEL_FIRST_UNRESERVED) index = MPLS_LABEL_FIRST_UNRESERVED; platform_label = mpls_platform_label_rcu(net, &platform_labels); if (filter.filter_set) flags |= NLM_F_DUMP_FILTERED; for (; index < platform_labels; index++) { struct mpls_route *rt; rt = rcu_dereference(platform_label[index]); if (!rt) continue; if ((filter.dev && !mpls_rt_uses_dev(rt, filter.dev)) || (filter.protocol && rt->rt_protocol != filter.protocol)) continue; if (mpls_dump_route(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWROUTE, index, rt, flags) < 0) break; } cb->args[0] = index; unlock: rcu_read_unlock(); return skb->len; err: rcu_read_unlock(); return err; } static inline size_t lfib_nlmsg_size(struct mpls_route *rt) { size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg)) + nla_total_size(4) /* RTA_DST */ + nla_total_size(1); /* RTA_TTL_PROPAGATE */ if (rt->rt_nhn == 1) { struct mpls_nh *nh = rt->rt_nh; if (nh->nh_dev) payload += nla_total_size(4); /* RTA_OIF */ if (nh->nh_via_table != MPLS_NEIGH_TABLE_UNSPEC) /* RTA_VIA */ payload += nla_total_size(2 + nh->nh_via_alen); if (nh->nh_labels) /* RTA_NEWDST */ payload += nla_total_size(nh->nh_labels * 4); } else { /* each nexthop is packed in an attribute */ size_t nhsize = 0; for_nexthops(rt) { if (!nh->nh_dev) continue; nhsize += nla_total_size(sizeof(struct rtnexthop)); /* RTA_VIA */ if (nh->nh_via_table != MPLS_NEIGH_TABLE_UNSPEC) nhsize += nla_total_size(2 + nh->nh_via_alen); if (nh->nh_labels) nhsize += nla_total_size(nh->nh_labels * 4); } endfor_nexthops(rt); /* nested attribute */ payload += nla_total_size(nhsize); } return payload; } static void rtmsg_lfib(int event, u32 label, struct mpls_route *rt, struct nlmsghdr *nlh, struct net *net, u32 portid, unsigned int nlm_flags) { struct sk_buff *skb; u32 seq = nlh ? nlh->nlmsg_seq : 0; int err = -ENOBUFS; skb = nlmsg_new(lfib_nlmsg_size(rt), GFP_KERNEL); if (skb == NULL) goto errout; err = mpls_dump_route(skb, portid, seq, event, label, rt, nlm_flags); if (err < 0) { /* -EMSGSIZE implies BUG in lfib_nlmsg_size */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, net, portid, RTNLGRP_MPLS_ROUTE, nlh, GFP_KERNEL); return; errout: rtnl_set_sk_err(net, RTNLGRP_MPLS_ROUTE, err); } static int mpls_valid_getroute_req(struct sk_buff *skb, const struct nlmsghdr *nlh, struct nlattr **tb, struct netlink_ext_ack *extack) { struct rtmsg *rtm; int i, err; rtm = nlmsg_payload(nlh, sizeof(*rtm)); if (!rtm) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for get route request"); return -EINVAL; } if (!netlink_strict_get_check(skb)) return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_mpls_policy, extack); if ((rtm->rtm_dst_len && rtm->rtm_dst_len != 20) || rtm->rtm_src_len || rtm->rtm_tos || rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope || rtm->rtm_type) { NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request"); return -EINVAL; } if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) { NL_SET_ERR_MSG_MOD(extack, "Invalid flags for get route request"); return -EINVAL; } err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_mpls_policy, extack); if (err) return err; if ((tb[RTA_DST] || tb[RTA_NEWDST]) && !rtm->rtm_dst_len) { NL_SET_ERR_MSG_MOD(extack, "rtm_dst_len must be 20 for MPLS"); return -EINVAL; } for (i = 0; i <= RTA_MAX; i++) { if (!tb[i]) continue; switch (i) { case RTA_DST: case RTA_NEWDST: break; default: NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request"); return -EINVAL; } } return 0; } static int mpls_getroute(struct sk_buff *in_skb, struct nlmsghdr *in_nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); u32 portid = NETLINK_CB(in_skb).portid; u32 in_label = LABEL_NOT_SPECIFIED; struct nlattr *tb[RTA_MAX + 1]; struct mpls_route *rt = NULL; u32 labels[MAX_NEW_LABELS]; struct mpls_shim_hdr *hdr; unsigned int hdr_size = 0; const struct mpls_nh *nh; struct net_device *dev; struct rtmsg *rtm, *r; struct nlmsghdr *nlh; struct sk_buff *skb; u8 n_labels; int err; mutex_lock(&net->mpls.platform_mutex); err = mpls_valid_getroute_req(in_skb, in_nlh, tb, extack); if (err < 0) goto errout; rtm = nlmsg_data(in_nlh); if (tb[RTA_DST]) { u8 label_count; if (nla_get_labels(tb[RTA_DST], 1, &label_count, &in_label, extack)) { err = -EINVAL; goto errout; } if (!mpls_label_ok(net, &in_label, extack)) { err = -EINVAL; goto errout; } } if (in_label < net->mpls.platform_labels) rt = mpls_route_input(net, in_label); if (!rt) { err = -ENETUNREACH; goto errout; } if (rtm->rtm_flags & RTM_F_FIB_MATCH) { skb = nlmsg_new(lfib_nlmsg_size(rt), GFP_KERNEL); if (!skb) { err = -ENOBUFS; goto errout; } err = mpls_dump_route(skb, portid, in_nlh->nlmsg_seq, RTM_NEWROUTE, in_label, rt, 0); if (err < 0) { /* -EMSGSIZE implies BUG in lfib_nlmsg_size */ WARN_ON(err == -EMSGSIZE); goto errout_free; } err = rtnl_unicast(skb, net, portid); goto errout; } if (tb[RTA_NEWDST]) { if (nla_get_labels(tb[RTA_NEWDST], MAX_NEW_LABELS, &n_labels, labels, extack) != 0) { err = -EINVAL; goto errout; } hdr_size = n_labels * sizeof(struct mpls_shim_hdr); } skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) { err = -ENOBUFS; goto errout; } skb->protocol = htons(ETH_P_MPLS_UC); if (hdr_size) { bool bos; int i; if (skb_cow(skb, hdr_size)) { err = -ENOBUFS; goto errout_free; } skb_reserve(skb, hdr_size); skb_push(skb, hdr_size); skb_reset_network_header(skb); /* Push new labels */ hdr = mpls_hdr(skb); bos = true; for (i = n_labels - 1; i >= 0; i--) { hdr[i] = mpls_entry_encode(labels[i], 1, 0, bos); bos = false; } } nh = mpls_select_multipath(rt, skb); if (!nh) { err = -ENETUNREACH; goto errout_free; } if (hdr_size) { skb_pull(skb, hdr_size); skb_reset_network_header(skb); } nlh = nlmsg_put(skb, portid, in_nlh->nlmsg_seq, RTM_NEWROUTE, sizeof(*r), 0); if (!nlh) { err = -EMSGSIZE; goto errout_free; } r = nlmsg_data(nlh); r->rtm_family = AF_MPLS; r->rtm_dst_len = 20; r->rtm_src_len = 0; r->rtm_table = RT_TABLE_MAIN; r->rtm_type = RTN_UNICAST; r->rtm_scope = RT_SCOPE_UNIVERSE; r->rtm_protocol = rt->rt_protocol; r->rtm_flags = 0; if (nla_put_labels(skb, RTA_DST, 1, &in_label)) goto nla_put_failure; if (nh->nh_labels && nla_put_labels(skb, RTA_NEWDST, nh->nh_labels, nh->nh_label)) goto nla_put_failure; if (nh->nh_via_table != MPLS_NEIGH_TABLE_UNSPEC && nla_put_via(skb, nh->nh_via_table, mpls_nh_via(rt, nh), nh->nh_via_alen)) goto nla_put_failure; dev = nh->nh_dev; if (dev && nla_put_u32(skb, RTA_OIF, dev->ifindex)) goto nla_put_failure; nlmsg_end(skb, nlh); err = rtnl_unicast(skb, net, portid); errout: mutex_unlock(&net->mpls.platform_mutex); return err; nla_put_failure: nlmsg_cancel(skb, nlh); err = -EMSGSIZE; errout_free: mutex_unlock(&net->mpls.platform_mutex); kfree_skb(skb); return err; } static int resize_platform_label_table(struct net *net, size_t limit) { size_t size = sizeof(struct mpls_route *) * limit; size_t old_limit; size_t cp_size; struct mpls_route __rcu **labels = NULL, **old; struct mpls_route *rt0 = NULL, *rt2 = NULL; unsigned index; if (size) { labels = kvzalloc(size, GFP_KERNEL); if (!labels) goto nolabels; } /* In case the predefined labels need to be populated */ if (limit > MPLS_LABEL_IPV4NULL) { struct net_device *lo = net->loopback_dev; rt0 = mpls_rt_alloc(1, lo->addr_len, 0); if (IS_ERR(rt0)) goto nort0; rt0->rt_nh->nh_dev = lo; netdev_hold(lo, &rt0->rt_nh->nh_dev_tracker, GFP_KERNEL); rt0->rt_protocol = RTPROT_KERNEL; rt0->rt_payload_type = MPT_IPV4; rt0->rt_ttl_propagate = MPLS_TTL_PROP_DEFAULT; rt0->rt_nh->nh_via_table = NEIGH_LINK_TABLE; rt0->rt_nh->nh_via_alen = lo->addr_len; memcpy(__mpls_nh_via(rt0, rt0->rt_nh), lo->dev_addr, lo->addr_len); } if (limit > MPLS_LABEL_IPV6NULL) { struct net_device *lo = net->loopback_dev; rt2 = mpls_rt_alloc(1, lo->addr_len, 0); if (IS_ERR(rt2)) goto nort2; rt2->rt_nh->nh_dev = lo; netdev_hold(lo, &rt2->rt_nh->nh_dev_tracker, GFP_KERNEL); rt2->rt_protocol = RTPROT_KERNEL; rt2->rt_payload_type = MPT_IPV6; rt2->rt_ttl_propagate = MPLS_TTL_PROP_DEFAULT; rt2->rt_nh->nh_via_table = NEIGH_LINK_TABLE; rt2->rt_nh->nh_via_alen = lo->addr_len; memcpy(__mpls_nh_via(rt2, rt2->rt_nh), lo->dev_addr, lo->addr_len); } mutex_lock(&net->mpls.platform_mutex); /* Remember the original table */ old = mpls_dereference(net, net->mpls.platform_label); old_limit = net->mpls.platform_labels; /* Free any labels beyond the new table */ for (index = limit; index < old_limit; index++) mpls_route_update(net, index, NULL, NULL); /* Copy over the old labels */ cp_size = size; if (old_limit < limit) cp_size = old_limit * sizeof(struct mpls_route *); memcpy(labels, old, cp_size); /* If needed set the predefined labels */ if ((old_limit <= MPLS_LABEL_IPV6NULL) && (limit > MPLS_LABEL_IPV6NULL)) { RCU_INIT_POINTER(labels[MPLS_LABEL_IPV6NULL], rt2); rt2 = NULL; } if ((old_limit <= MPLS_LABEL_IPV4NULL) && (limit > MPLS_LABEL_IPV4NULL)) { RCU_INIT_POINTER(labels[MPLS_LABEL_IPV4NULL], rt0); rt0 = NULL; } /* Update the global pointers */ local_bh_disable(); write_seqcount_begin(&net->mpls.platform_label_seq); net->mpls.platform_labels = limit; rcu_assign_pointer(net->mpls.platform_label, labels); write_seqcount_end(&net->mpls.platform_label_seq); local_bh_enable(); mutex_unlock(&net->mpls.platform_mutex); mpls_rt_free(rt2); mpls_rt_free(rt0); if (old) { synchronize_rcu(); kvfree(old); } return 0; nort2: mpls_rt_free(rt0); nort0: kvfree(labels); nolabels: return -ENOMEM; } static int mpls_platform_labels(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = table->data; int platform_labels = net->mpls.platform_labels; int ret; struct ctl_table tmp = { .procname = table->procname, .data = &platform_labels, .maxlen = sizeof(int), .mode = table->mode, .extra1 = SYSCTL_ZERO, .extra2 = &label_limit, }; ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); if (write && ret == 0) ret = resize_platform_label_table(net, platform_labels); return ret; } #define MPLS_NS_SYSCTL_OFFSET(field) \ (&((struct net *)0)->field) static const struct ctl_table mpls_table[] = { { .procname = "platform_labels", .data = NULL, .maxlen = sizeof(int), .mode = 0644, .proc_handler = mpls_platform_labels, }, { .procname = "ip_ttl_propagate", .data = MPLS_NS_SYSCTL_OFFSET(mpls.ip_ttl_propagate), .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "default_ttl", .data = MPLS_NS_SYSCTL_OFFSET(mpls.default_ttl), .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE, .extra2 = &ttl_max, }, }; static __net_init int mpls_net_init(struct net *net) { size_t table_size = ARRAY_SIZE(mpls_table); struct ctl_table *table; int i; mutex_init(&net->mpls.platform_mutex); seqcount_mutex_init(&net->mpls.platform_label_seq, &net->mpls.platform_mutex); net->mpls.platform_labels = 0; net->mpls.platform_label = NULL; net->mpls.ip_ttl_propagate = 1; net->mpls.default_ttl = 255; table = kmemdup(mpls_table, sizeof(mpls_table), GFP_KERNEL); if (table == NULL) return -ENOMEM; /* Table data contains only offsets relative to the base of * the mdev at this point, so make them absolute. */ for (i = 0; i < table_size; i++) table[i].data = (char *)net + (uintptr_t)table[i].data; net->mpls.ctl = register_net_sysctl_sz(net, "net/mpls", table, table_size); if (net->mpls.ctl == NULL) { kfree(table); return -ENOMEM; } return 0; } static __net_exit void mpls_net_exit(struct net *net) { struct mpls_route __rcu **platform_label; size_t platform_labels; const struct ctl_table *table; unsigned int index; table = net->mpls.ctl->ctl_table_arg; unregister_net_sysctl_table(net->mpls.ctl); kfree(table); /* An rcu grace period has passed since there was a device in * the network namespace (and thus the last in flight packet) * left this network namespace. This is because * unregister_netdevice_many and netdev_run_todo has completed * for each network device that was in this network namespace. * * As such no additional rcu synchronization is necessary when * freeing the platform_label table. */ mutex_lock(&net->mpls.platform_mutex); platform_label = mpls_dereference(net, net->mpls.platform_label); platform_labels = net->mpls.platform_labels; for (index = 0; index < platform_labels; index++) { struct mpls_route *rt; rt = mpls_dereference(net, platform_label[index]); mpls_notify_route(net, index, rt, NULL, NULL); mpls_rt_free(rt); } mutex_unlock(&net->mpls.platform_mutex); kvfree(platform_label); } static struct pernet_operations mpls_net_ops = { .init = mpls_net_init, .exit = mpls_net_exit, }; static struct rtnl_af_ops mpls_af_ops __read_mostly = { .family = AF_MPLS, .fill_stats_af = mpls_fill_stats_af, .get_stats_af_size = mpls_get_stats_af_size, }; static const struct rtnl_msg_handler mpls_rtnl_msg_handlers[] __initdata_or_module = { {THIS_MODULE, PF_MPLS, RTM_NEWROUTE, mpls_rtm_newroute, NULL, RTNL_FLAG_DOIT_UNLOCKED}, {THIS_MODULE, PF_MPLS, RTM_DELROUTE, mpls_rtm_delroute, NULL, RTNL_FLAG_DOIT_UNLOCKED}, {THIS_MODULE, PF_MPLS, RTM_GETROUTE, mpls_getroute, mpls_dump_routes, RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED}, {THIS_MODULE, PF_MPLS, RTM_GETNETCONF, mpls_netconf_get_devconf, mpls_netconf_dump_devconf, RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED}, }; static int __init mpls_init(void) { int err; BUILD_BUG_ON(sizeof(struct mpls_shim_hdr) != 4); err = register_pernet_subsys(&mpls_net_ops); if (err) goto out; err = register_netdevice_notifier(&mpls_dev_notifier); if (err) goto out_unregister_pernet; dev_add_pack(&mpls_packet_type); err = rtnl_af_register(&mpls_af_ops); if (err) goto out_unregister_dev_type; err = rtnl_register_many(mpls_rtnl_msg_handlers); if (err) goto out_unregister_rtnl_af; err = ipgre_tunnel_encap_add_mpls_ops(); if (err) { pr_err("Can't add mpls over gre tunnel ops\n"); goto out_unregister_rtnl; } err = 0; out: return err; out_unregister_rtnl: rtnl_unregister_many(mpls_rtnl_msg_handlers); out_unregister_rtnl_af: rtnl_af_unregister(&mpls_af_ops); out_unregister_dev_type: dev_remove_pack(&mpls_packet_type); unregister_netdevice_notifier(&mpls_dev_notifier); out_unregister_pernet: unregister_pernet_subsys(&mpls_net_ops); goto out; } module_init(mpls_init); static void __exit mpls_exit(void) { rtnl_unregister_all(PF_MPLS); rtnl_af_unregister(&mpls_af_ops); dev_remove_pack(&mpls_packet_type); unregister_netdevice_notifier(&mpls_dev_notifier); unregister_pernet_subsys(&mpls_net_ops); ipgre_tunnel_encap_del_mpls_ops(); } module_exit(mpls_exit); MODULE_DESCRIPTION("MultiProtocol Label Switching"); MODULE_LICENSE("GPL v2"); MODULE_ALIAS_NETPROTO(PF_MPLS);
9 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PKEYS_H #define _ASM_X86_PKEYS_H /* * If more than 16 keys are ever supported, a thorough audit * will be necessary to ensure that the types that store key * numbers and masks have sufficient capacity. */ #define arch_max_pkey() (cpu_feature_enabled(X86_FEATURE_OSPKE) ? 16 : 1) extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val); static inline bool arch_pkeys_enabled(void) { return cpu_feature_enabled(X86_FEATURE_OSPKE); } /* * Try to dedicate one of the protection keys to be used as an * execute-only protection key. */ extern int __execute_only_pkey(struct mm_struct *mm); static inline int execute_only_pkey(struct mm_struct *mm) { if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) return ARCH_DEFAULT_PKEY; return __execute_only_pkey(mm); } extern int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey); static inline int arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey) { if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) return 0; return __arch_override_mprotect_pkey(vma, prot, pkey); } #define ARCH_VM_PKEY_FLAGS (VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2 | VM_PKEY_BIT3) #define mm_pkey_allocation_map(mm) (mm->context.pkey_allocation_map) #define mm_set_pkey_allocated(mm, pkey) do { \ mm_pkey_allocation_map(mm) |= (1U << pkey); \ } while (0) #define mm_set_pkey_free(mm, pkey) do { \ mm_pkey_allocation_map(mm) &= ~(1U << pkey); \ } while (0) static inline bool mm_pkey_is_allocated(struct mm_struct *mm, int pkey) { /* * "Allocated" pkeys are those that have been returned * from pkey_alloc() or pkey 0 which is allocated * implicitly when the mm is created. */ if (pkey < 0) return false; if (pkey >= arch_max_pkey()) return false; /* * The exec-only pkey is set in the allocation map, but * is not available to any of the user interfaces like * mprotect_pkey(). */ if (pkey == mm->context.execute_only_pkey) return false; return mm_pkey_allocation_map(mm) & (1U << pkey); } /* * Returns a positive, 4-bit key on success, or -1 on failure. */ static inline int mm_pkey_alloc(struct mm_struct *mm) { /* * Note: this is the one and only place we make sure * that the pkey is valid as far as the hardware is * concerned. The rest of the kernel trusts that * only good, valid pkeys come out of here. */ u16 all_pkeys_mask = ((1U << arch_max_pkey()) - 1); int ret; /* * Are we out of pkeys? We must handle this specially * because ffz() behavior is undefined if there are no * zeros. */ if (mm_pkey_allocation_map(mm) == all_pkeys_mask) return -1; ret = ffz(mm_pkey_allocation_map(mm)); mm_set_pkey_allocated(mm, ret); return ret; } static inline int mm_pkey_free(struct mm_struct *mm, int pkey) { if (!mm_pkey_is_allocated(mm, pkey)) return -EINVAL; mm_set_pkey_free(mm, pkey); return 0; } static inline int vma_pkey(struct vm_area_struct *vma) { unsigned long vma_pkey_mask = VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2 | VM_PKEY_BIT3; return (vma->vm_flags & vma_pkey_mask) >> VM_PKEY_SHIFT; } #endif /*_ASM_X86_PKEYS_H */
206 207 771 769 770 634 634 105 104 104 103 103 91 73 73 73 91 91 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2011 IBM Corporation * * Author: * Mimi Zohar <zohar@us.ibm.com> */ #include <linux/module.h> #include <linux/init.h> #include <linux/file.h> #include <linux/binfmts.h> #include <linux/fs.h> #include <linux/xattr.h> #include <linux/magic.h> #include <linux/ima.h> #include <linux/evm.h> #include <linux/fsverity.h> #include <keys/system_keyring.h> #include <uapi/linux/fsverity.h> #include "ima.h" #ifdef CONFIG_IMA_APPRAISE_BOOTPARAM static char *ima_appraise_cmdline_default __initdata; core_param(ima_appraise, ima_appraise_cmdline_default, charp, 0); void __init ima_appraise_parse_cmdline(void) { const char *str = ima_appraise_cmdline_default; bool sb_state = arch_ima_get_secureboot(); int appraisal_state = ima_appraise; if (!str) return; if (strncmp(str, "off", 3) == 0) appraisal_state = 0; else if (strncmp(str, "log", 3) == 0) appraisal_state = IMA_APPRAISE_LOG; else if (strncmp(str, "fix", 3) == 0) appraisal_state = IMA_APPRAISE_FIX; else if (strncmp(str, "enforce", 7) == 0) appraisal_state = IMA_APPRAISE_ENFORCE; else pr_err("invalid \"%s\" appraise option", str); /* If appraisal state was changed, but secure boot is enabled, * keep its default */ if (sb_state) { if (!(appraisal_state & IMA_APPRAISE_ENFORCE)) pr_info("Secure boot enabled: ignoring ima_appraise=%s option", str); } else { ima_appraise = appraisal_state; } } #endif /* * is_ima_appraise_enabled - return appraise status * * Only return enabled, if not in ima_appraise="fix" or "log" modes. */ bool is_ima_appraise_enabled(void) { return ima_appraise & IMA_APPRAISE_ENFORCE; } /* * ima_must_appraise - set appraise flag * * Return 1 to appraise or hash */ int ima_must_appraise(struct mnt_idmap *idmap, struct inode *inode, int mask, enum ima_hooks func) { struct lsm_prop prop; if (!ima_appraise) return 0; security_current_getlsmprop_subj(&prop); return ima_match_policy(idmap, inode, current_cred(), &prop, func, mask, IMA_APPRAISE | IMA_HASH, NULL, NULL, NULL, NULL); } static int ima_fix_xattr(struct dentry *dentry, struct ima_iint_cache *iint) { int rc, offset; u8 algo = iint->ima_hash->algo; if (algo <= HASH_ALGO_SHA1) { offset = 1; iint->ima_hash->xattr.sha1.type = IMA_XATTR_DIGEST; } else { offset = 0; iint->ima_hash->xattr.ng.type = IMA_XATTR_DIGEST_NG; iint->ima_hash->xattr.ng.algo = algo; } rc = __vfs_setxattr_noperm(&nop_mnt_idmap, dentry, XATTR_NAME_IMA, &iint->ima_hash->xattr.data[offset], (sizeof(iint->ima_hash->xattr) - offset) + iint->ima_hash->length, 0); return rc; } /* Return specific func appraised cached result */ enum integrity_status ima_get_cache_status(struct ima_iint_cache *iint, enum ima_hooks func) { switch (func) { case MMAP_CHECK: case MMAP_CHECK_REQPROT: return iint->ima_mmap_status; case BPRM_CHECK: return iint->ima_bprm_status; case CREDS_CHECK: return iint->ima_creds_status; case FILE_CHECK: case POST_SETATTR: return iint->ima_file_status; case MODULE_CHECK ... MAX_CHECK - 1: default: return iint->ima_read_status; } } static void ima_set_cache_status(struct ima_iint_cache *iint, enum ima_hooks func, enum integrity_status status) { switch (func) { case MMAP_CHECK: case MMAP_CHECK_REQPROT: iint->ima_mmap_status = status; break; case BPRM_CHECK: iint->ima_bprm_status = status; break; case CREDS_CHECK: iint->ima_creds_status = status; break; case FILE_CHECK: case POST_SETATTR: iint->ima_file_status = status; break; case MODULE_CHECK ... MAX_CHECK - 1: default: iint->ima_read_status = status; break; } } static void ima_cache_flags(struct ima_iint_cache *iint, enum ima_hooks func) { switch (func) { case MMAP_CHECK: case MMAP_CHECK_REQPROT: iint->flags |= (IMA_MMAP_APPRAISED | IMA_APPRAISED); break; case BPRM_CHECK: iint->flags |= (IMA_BPRM_APPRAISED | IMA_APPRAISED); break; case CREDS_CHECK: iint->flags |= (IMA_CREDS_APPRAISED | IMA_APPRAISED); break; case FILE_CHECK: case POST_SETATTR: iint->flags |= (IMA_FILE_APPRAISED | IMA_APPRAISED); break; case MODULE_CHECK ... MAX_CHECK - 1: default: iint->flags |= (IMA_READ_APPRAISED | IMA_APPRAISED); break; } } enum hash_algo ima_get_hash_algo(const struct evm_ima_xattr_data *xattr_value, int xattr_len) { struct signature_v2_hdr *sig; enum hash_algo ret; if (!xattr_value || xattr_len < 2) /* return default hash algo */ return ima_hash_algo; switch (xattr_value->type) { case IMA_VERITY_DIGSIG: sig = (typeof(sig))xattr_value; if (sig->version != 3 || xattr_len <= sizeof(*sig) || sig->hash_algo >= HASH_ALGO__LAST) return ima_hash_algo; return sig->hash_algo; case EVM_IMA_XATTR_DIGSIG: sig = (typeof(sig))xattr_value; if (sig->version != 2 || xattr_len <= sizeof(*sig) || sig->hash_algo >= HASH_ALGO__LAST) return ima_hash_algo; return sig->hash_algo; case IMA_XATTR_DIGEST_NG: /* first byte contains algorithm id */ ret = xattr_value->data[0]; if (ret < HASH_ALGO__LAST) return ret; break; case IMA_XATTR_DIGEST: /* this is for backward compatibility */ if (xattr_len == 21) { unsigned int zero = 0; if (!memcmp(&xattr_value->data[16], &zero, 4)) return HASH_ALGO_MD5; else return HASH_ALGO_SHA1; } else if (xattr_len == 17) return HASH_ALGO_MD5; break; } /* return default hash algo */ return ima_hash_algo; } int ima_read_xattr(struct dentry *dentry, struct evm_ima_xattr_data **xattr_value, int xattr_len) { int ret; ret = vfs_getxattr_alloc(&nop_mnt_idmap, dentry, XATTR_NAME_IMA, (char **)xattr_value, xattr_len, GFP_NOFS); if (ret == -EOPNOTSUPP) ret = 0; return ret; } /* * calc_file_id_hash - calculate the hash of the ima_file_id struct data * @type: xattr type [enum evm_ima_xattr_type] * @algo: hash algorithm [enum hash_algo] * @digest: pointer to the digest to be hashed * @hash: (out) pointer to the hash * * IMA signature version 3 disambiguates the data that is signed by * indirectly signing the hash of the ima_file_id structure data. * * Signing the ima_file_id struct is currently only supported for * IMA_VERITY_DIGSIG type xattrs. * * Return 0 on success, error code otherwise. */ static int calc_file_id_hash(enum evm_ima_xattr_type type, enum hash_algo algo, const u8 *digest, struct ima_digest_data *hash) { struct ima_file_id file_id = { .hash_type = IMA_VERITY_DIGSIG, .hash_algorithm = algo}; unsigned int unused = HASH_MAX_DIGESTSIZE - hash_digest_size[algo]; if (type != IMA_VERITY_DIGSIG) return -EINVAL; memcpy(file_id.hash, digest, hash_digest_size[algo]); hash->algo = algo; hash->length = hash_digest_size[algo]; return ima_calc_buffer_hash(&file_id, sizeof(file_id) - unused, hash); } /* * xattr_verify - verify xattr digest or signature * * Verify whether the hash or signature matches the file contents. * * Return 0 on success, error code otherwise. */ static int xattr_verify(enum ima_hooks func, struct ima_iint_cache *iint, struct evm_ima_xattr_data *xattr_value, int xattr_len, enum integrity_status *status, const char **cause) { struct ima_max_digest_data hash; struct signature_v2_hdr *sig; int rc = -EINVAL, hash_start = 0; int mask; switch (xattr_value->type) { case IMA_XATTR_DIGEST_NG: /* first byte contains algorithm id */ hash_start = 1; fallthrough; case IMA_XATTR_DIGEST: if (*status != INTEGRITY_PASS_IMMUTABLE) { if (iint->flags & IMA_DIGSIG_REQUIRED) { if (iint->flags & IMA_VERITY_REQUIRED) *cause = "verity-signature-required"; else *cause = "IMA-signature-required"; *status = INTEGRITY_FAIL; break; } clear_bit(IMA_DIGSIG, &iint->atomic_flags); } else { set_bit(IMA_DIGSIG, &iint->atomic_flags); } if (xattr_len - sizeof(xattr_value->type) - hash_start >= iint->ima_hash->length) /* * xattr length may be longer. md5 hash in previous * version occupied 20 bytes in xattr, instead of 16 */ rc = memcmp(&xattr_value->data[hash_start], iint->ima_hash->digest, iint->ima_hash->length); else rc = -EINVAL; if (rc) { *cause = "invalid-hash"; *status = INTEGRITY_FAIL; break; } *status = INTEGRITY_PASS; break; case EVM_IMA_XATTR_DIGSIG: set_bit(IMA_DIGSIG, &iint->atomic_flags); mask = IMA_DIGSIG_REQUIRED | IMA_VERITY_REQUIRED; if ((iint->flags & mask) == mask) { *cause = "verity-signature-required"; *status = INTEGRITY_FAIL; break; } sig = (typeof(sig))xattr_value; if (sig->version >= 3) { *cause = "invalid-signature-version"; *status = INTEGRITY_FAIL; break; } rc = integrity_digsig_verify(INTEGRITY_KEYRING_IMA, (const char *)xattr_value, xattr_len, iint->ima_hash->digest, iint->ima_hash->length); if (rc == -EOPNOTSUPP) { *status = INTEGRITY_UNKNOWN; break; } if (IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING) && rc && func == KEXEC_KERNEL_CHECK) rc = integrity_digsig_verify(INTEGRITY_KEYRING_PLATFORM, (const char *)xattr_value, xattr_len, iint->ima_hash->digest, iint->ima_hash->length); if (rc) { *cause = "invalid-signature"; *status = INTEGRITY_FAIL; } else { *status = INTEGRITY_PASS; } break; case IMA_VERITY_DIGSIG: set_bit(IMA_DIGSIG, &iint->atomic_flags); if (iint->flags & IMA_DIGSIG_REQUIRED) { if (!(iint->flags & IMA_VERITY_REQUIRED)) { *cause = "IMA-signature-required"; *status = INTEGRITY_FAIL; break; } } sig = (typeof(sig))xattr_value; if (sig->version != 3) { *cause = "invalid-signature-version"; *status = INTEGRITY_FAIL; break; } rc = calc_file_id_hash(IMA_VERITY_DIGSIG, iint->ima_hash->algo, iint->ima_hash->digest, container_of(&hash.hdr, struct ima_digest_data, hdr)); if (rc) { *cause = "sigv3-hashing-error"; *status = INTEGRITY_FAIL; break; } rc = integrity_digsig_verify(INTEGRITY_KEYRING_IMA, (const char *)xattr_value, xattr_len, hash.digest, hash.hdr.length); if (rc) { *cause = "invalid-verity-signature"; *status = INTEGRITY_FAIL; } else { *status = INTEGRITY_PASS; } break; default: *status = INTEGRITY_UNKNOWN; *cause = "unknown-ima-data"; break; } return rc; } /* * modsig_verify - verify modsig signature * * Verify whether the signature matches the file contents. * * Return 0 on success, error code otherwise. */ static int modsig_verify(enum ima_hooks func, const struct modsig *modsig, enum integrity_status *status, const char **cause) { int rc; rc = integrity_modsig_verify(INTEGRITY_KEYRING_IMA, modsig); if (IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING) && rc && func == KEXEC_KERNEL_CHECK) rc = integrity_modsig_verify(INTEGRITY_KEYRING_PLATFORM, modsig); if (rc) { *cause = "invalid-signature"; *status = INTEGRITY_FAIL; } else { *status = INTEGRITY_PASS; } return rc; } /* * ima_check_blacklist - determine if the binary is blacklisted. * * Add the hash of the blacklisted binary to the measurement list, based * on policy. * * Returns -EPERM if the hash is blacklisted. */ int ima_check_blacklist(struct ima_iint_cache *iint, const struct modsig *modsig, int pcr) { enum hash_algo hash_algo; const u8 *digest = NULL; u32 digestsize = 0; int rc = 0; if (!(iint->flags & IMA_CHECK_BLACKLIST)) return 0; if (iint->flags & IMA_MODSIG_ALLOWED && modsig) { ima_get_modsig_digest(modsig, &hash_algo, &digest, &digestsize); rc = is_binary_blacklisted(digest, digestsize); } else if (iint->flags & IMA_DIGSIG_REQUIRED && iint->ima_hash) rc = is_binary_blacklisted(iint->ima_hash->digest, iint->ima_hash->length); if ((rc == -EPERM) && (iint->flags & IMA_MEASURE)) process_buffer_measurement(&nop_mnt_idmap, NULL, digest, digestsize, "blacklisted-hash", NONE, pcr, NULL, false, NULL, 0); return rc; } /* * ima_appraise_measurement - appraise file measurement * * Call evm_verifyxattr() to verify the integrity of 'security.ima'. * Assuming success, compare the xattr hash with the collected measurement. * * Return 0 on success, error code otherwise */ int ima_appraise_measurement(enum ima_hooks func, struct ima_iint_cache *iint, struct file *file, const unsigned char *filename, struct evm_ima_xattr_data *xattr_value, int xattr_len, const struct modsig *modsig, bool bprm_is_check) { static const char op[] = "appraise_data"; int audit_msgno = AUDIT_INTEGRITY_DATA; const char *cause = "unknown"; struct dentry *dentry = file_dentry(file); struct inode *inode = d_backing_inode(dentry); enum integrity_status status = INTEGRITY_UNKNOWN; int rc = xattr_len; bool try_modsig = iint->flags & IMA_MODSIG_ALLOWED && modsig; /* If not appraising a modsig, we need an xattr. */ if (!(inode->i_opflags & IOP_XATTR) && !try_modsig) return INTEGRITY_UNKNOWN; /* * Unlike any of the other LSM hooks where the kernel enforces file * integrity, enforcing file integrity for the bprm_creds_for_exec() * LSM hook with the AT_EXECVE_CHECK flag is left up to the discretion * of the script interpreter(userspace). Differentiate kernel and * userspace enforced integrity audit messages. */ if (bprm_is_check) audit_msgno = AUDIT_INTEGRITY_USERSPACE; /* If reading the xattr failed and there's no modsig, error out. */ if (rc <= 0 && !try_modsig) { if (rc && rc != -ENODATA) goto out; if (iint->flags & IMA_DIGSIG_REQUIRED) { if (iint->flags & IMA_VERITY_REQUIRED) cause = "verity-signature-required"; else cause = "IMA-signature-required"; } else { cause = "missing-hash"; } status = INTEGRITY_NOLABEL; if (file->f_mode & FMODE_CREATED) iint->flags |= IMA_NEW_FILE; if ((iint->flags & IMA_NEW_FILE) && (!(iint->flags & IMA_DIGSIG_REQUIRED) || (inode->i_size == 0))) status = INTEGRITY_PASS; goto out; } status = evm_verifyxattr(dentry, XATTR_NAME_IMA, xattr_value, rc < 0 ? 0 : rc); switch (status) { case INTEGRITY_PASS: case INTEGRITY_PASS_IMMUTABLE: case INTEGRITY_UNKNOWN: break; case INTEGRITY_NOXATTRS: /* No EVM protected xattrs. */ /* It's fine not to have xattrs when using a modsig. */ if (try_modsig) break; fallthrough; case INTEGRITY_NOLABEL: /* No security.evm xattr. */ cause = "missing-HMAC"; goto out; case INTEGRITY_FAIL_IMMUTABLE: set_bit(IMA_DIGSIG, &iint->atomic_flags); cause = "invalid-fail-immutable"; goto out; case INTEGRITY_FAIL: /* Invalid HMAC/signature. */ cause = "invalid-HMAC"; goto out; default: WARN_ONCE(true, "Unexpected integrity status %d\n", status); } if (xattr_value) rc = xattr_verify(func, iint, xattr_value, xattr_len, &status, &cause); /* * If we have a modsig and either no imasig or the imasig's key isn't * known, then try verifying the modsig. */ if (try_modsig && (!xattr_value || xattr_value->type == IMA_XATTR_DIGEST_NG || rc == -ENOKEY)) rc = modsig_verify(func, modsig, &status, &cause); out: /* * File signatures on some filesystems can not be properly verified. * When such filesystems are mounted by an untrusted mounter or on a * system not willing to accept such a risk, fail the file signature * verification. */ if ((inode->i_sb->s_iflags & SB_I_IMA_UNVERIFIABLE_SIGNATURE) && ((inode->i_sb->s_iflags & SB_I_UNTRUSTED_MOUNTER) || (iint->flags & IMA_FAIL_UNVERIFIABLE_SIGS))) { status = INTEGRITY_FAIL; cause = "unverifiable-signature"; integrity_audit_msg(audit_msgno, inode, filename, op, cause, rc, 0); } else if (status != INTEGRITY_PASS) { /* Fix mode, but don't replace file signatures. */ if ((ima_appraise & IMA_APPRAISE_FIX) && !try_modsig && (!xattr_value || xattr_value->type != EVM_IMA_XATTR_DIGSIG)) { if (!ima_fix_xattr(dentry, iint)) status = INTEGRITY_PASS; } /* * Permit new files with file/EVM portable signatures, but * without data. */ if (inode->i_size == 0 && iint->flags & IMA_NEW_FILE && test_bit(IMA_DIGSIG, &iint->atomic_flags)) { status = INTEGRITY_PASS; } integrity_audit_msg(audit_msgno, inode, filename, op, cause, rc, 0); } else { ima_cache_flags(iint, func); } ima_set_cache_status(iint, func, status); return status; } /* * ima_update_xattr - update 'security.ima' hash value */ void ima_update_xattr(struct ima_iint_cache *iint, struct file *file) { struct dentry *dentry = file_dentry(file); int rc = 0; /* do not collect and update hash for digital signatures */ if (test_bit(IMA_DIGSIG, &iint->atomic_flags)) return; if ((iint->ima_file_status != INTEGRITY_PASS) && !(iint->flags & IMA_HASH)) return; rc = ima_collect_measurement(iint, file, NULL, 0, ima_hash_algo, NULL); if (rc < 0) return; inode_lock(file_inode(file)); ima_fix_xattr(dentry, iint); inode_unlock(file_inode(file)); } /** * ima_inode_post_setattr - reflect file metadata changes * @idmap: idmap of the mount the inode was found from * @dentry: pointer to the affected dentry * @ia_valid: for the UID and GID status * * Changes to a dentry's metadata might result in needing to appraise. * * This function is called from notify_change(), which expects the caller * to lock the inode's i_mutex. */ static void ima_inode_post_setattr(struct mnt_idmap *idmap, struct dentry *dentry, int ia_valid) { struct inode *inode = d_backing_inode(dentry); struct ima_iint_cache *iint; int action; if (!(ima_policy_flag & IMA_APPRAISE) || !S_ISREG(inode->i_mode) || !(inode->i_opflags & IOP_XATTR)) return; action = ima_must_appraise(idmap, inode, MAY_ACCESS, POST_SETATTR); iint = ima_iint_find(inode); if (iint) { set_bit(IMA_CHANGE_ATTR, &iint->atomic_flags); if (!action) clear_bit(IMA_UPDATE_XATTR, &iint->atomic_flags); } } /* * ima_protect_xattr - protect 'security.ima' * * Ensure that not just anyone can modify or remove 'security.ima'. */ static int ima_protect_xattr(struct dentry *dentry, const char *xattr_name, const void *xattr_value, size_t xattr_value_len) { if (strcmp(xattr_name, XATTR_NAME_IMA) == 0) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; return 1; } return 0; } /* * ima_reset_appraise_flags - reset ima_iint_cache flags * * @digsig: whether to clear/set IMA_DIGSIG flag, tristate values * 0: clear IMA_DIGSIG * 1: set IMA_DIGSIG * -1: don't change IMA_DIGSIG * */ static void ima_reset_appraise_flags(struct inode *inode, int digsig) { struct ima_iint_cache *iint; if (!(ima_policy_flag & IMA_APPRAISE) || !S_ISREG(inode->i_mode)) return; iint = ima_iint_find(inode); if (!iint) return; iint->measured_pcrs = 0; set_bit(IMA_CHANGE_XATTR, &iint->atomic_flags); if (digsig == 1) set_bit(IMA_DIGSIG, &iint->atomic_flags); else if (digsig == 0) clear_bit(IMA_DIGSIG, &iint->atomic_flags); } /** * validate_hash_algo() - Block setxattr with unsupported hash algorithms * @dentry: object of the setxattr() * @xattr_value: userland supplied xattr value * @xattr_value_len: length of xattr_value * * The xattr value is mapped to its hash algorithm, and this algorithm * must be built in the kernel for the setxattr to be allowed. * * Emit an audit message when the algorithm is invalid. * * Return: 0 on success, else an error. */ static int validate_hash_algo(struct dentry *dentry, const struct evm_ima_xattr_data *xattr_value, size_t xattr_value_len) { char *path = NULL, *pathbuf = NULL; enum hash_algo xattr_hash_algo; const char *errmsg = "unavailable-hash-algorithm"; unsigned int allowed_hashes; xattr_hash_algo = ima_get_hash_algo(xattr_value, xattr_value_len); allowed_hashes = atomic_read(&ima_setxattr_allowed_hash_algorithms); if (allowed_hashes) { /* success if the algorithm is allowed in the ima policy */ if (allowed_hashes & (1U << xattr_hash_algo)) return 0; /* * We use a different audit message when the hash algorithm * is denied by a policy rule, instead of not being built * in the kernel image */ errmsg = "denied-hash-algorithm"; } else { if (likely(xattr_hash_algo == ima_hash_algo)) return 0; /* allow any xattr using an algorithm built in the kernel */ if (crypto_has_alg(hash_algo_name[xattr_hash_algo], 0, 0)) return 0; } pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); if (!pathbuf) return -EACCES; path = dentry_path(dentry, pathbuf, PATH_MAX); integrity_audit_msg(AUDIT_INTEGRITY_DATA, d_inode(dentry), path, "set_data", errmsg, -EACCES, 0); kfree(pathbuf); return -EACCES; } static int ima_inode_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *xattr_name, const void *xattr_value, size_t xattr_value_len, int flags) { const struct evm_ima_xattr_data *xvalue = xattr_value; int digsig = 0; int result; int err; result = ima_protect_xattr(dentry, xattr_name, xattr_value, xattr_value_len); if (result == 1) { if (!xattr_value_len || (xvalue->type >= IMA_XATTR_LAST)) return -EINVAL; err = validate_hash_algo(dentry, xvalue, xattr_value_len); if (err) return err; digsig = (xvalue->type == EVM_IMA_XATTR_DIGSIG); } else if (!strcmp(xattr_name, XATTR_NAME_EVM) && xattr_value_len > 0) { digsig = (xvalue->type == EVM_XATTR_PORTABLE_DIGSIG); } else { digsig = -1; } if (result == 1 || evm_revalidate_status(xattr_name)) { ima_reset_appraise_flags(d_backing_inode(dentry), digsig); if (result == 1) result = 0; } return result; } static int ima_inode_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl) { if (evm_revalidate_status(acl_name)) ima_reset_appraise_flags(d_backing_inode(dentry), -1); return 0; } static int ima_inode_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *xattr_name) { int result, digsig = -1; result = ima_protect_xattr(dentry, xattr_name, NULL, 0); if (result == 1 || evm_revalidate_status(xattr_name)) { if (!strcmp(xattr_name, XATTR_NAME_IMA)) digsig = 0; ima_reset_appraise_flags(d_backing_inode(dentry), digsig); if (result == 1) result = 0; } return result; } static int ima_inode_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { return ima_inode_set_acl(idmap, dentry, acl_name, NULL); } static struct security_hook_list ima_appraise_hooks[] __ro_after_init = { LSM_HOOK_INIT(inode_post_setattr, ima_inode_post_setattr), LSM_HOOK_INIT(inode_setxattr, ima_inode_setxattr), LSM_HOOK_INIT(inode_set_acl, ima_inode_set_acl), LSM_HOOK_INIT(inode_removexattr, ima_inode_removexattr), LSM_HOOK_INIT(inode_remove_acl, ima_inode_remove_acl), }; void __init init_ima_appraise_lsm(const struct lsm_id *lsmid) { security_add_hooks(ima_appraise_hooks, ARRAY_SIZE(ima_appraise_hooks), lsmid); }
11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 /* SPDX-License-Identifier: GPL-2.0-only */ /* * linux/fs/pnode.h * * (C) Copyright IBM Corporation 2005. */ #ifndef _LINUX_PNODE_H #define _LINUX_PNODE_H #include <linux/list.h> #include "mount.h" #define IS_MNT_SHARED(m) ((m)->mnt_t_flags & T_SHARED) #define IS_MNT_SLAVE(m) ((m)->mnt_master) #define IS_MNT_NEW(m) (!(m)->mnt_ns) #define CLEAR_MNT_SHARED(m) ((m)->mnt_t_flags &= ~T_SHARED) #define IS_MNT_UNBINDABLE(m) ((m)->mnt_t_flags & T_UNBINDABLE) #define IS_MNT_MARKED(m) ((m)->mnt_t_flags & T_MARKED) #define SET_MNT_MARK(m) ((m)->mnt_t_flags |= T_MARKED) #define CLEAR_MNT_MARK(m) ((m)->mnt_t_flags &= ~T_MARKED) #define IS_MNT_LOCKED(m) ((m)->mnt.mnt_flags & MNT_LOCKED) #define CL_EXPIRE 0x01 #define CL_SLAVE 0x02 #define CL_COPY_UNBINDABLE 0x04 #define CL_MAKE_SHARED 0x08 #define CL_PRIVATE 0x10 #define CL_COPY_MNT_NS_FILE 0x40 /* * EXCL[namespace_sem] */ static inline void set_mnt_shared(struct mount *mnt) { mnt->mnt_t_flags &= ~T_SHARED_MASK; mnt->mnt_t_flags |= T_SHARED; } static inline bool peers(const struct mount *m1, const struct mount *m2) { return m1->mnt_group_id == m2->mnt_group_id && m1->mnt_group_id; } void change_mnt_propagation(struct mount *, int); void bulk_make_private(struct list_head *); int propagate_mnt(struct mount *, struct mountpoint *, struct mount *, struct hlist_head *); void propagate_umount(struct list_head *); int propagate_mount_busy(struct mount *, int); void propagate_mount_unlock(struct mount *); void mnt_release_group_id(struct mount *); int get_dominating_id(struct mount *mnt, const struct path *root); int mnt_get_count(struct mount *mnt); void mnt_set_mountpoint(struct mount *, struct mountpoint *, struct mount *); void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt); struct mount *copy_tree(struct mount *, struct dentry *, int); bool is_path_reachable(struct mount *, struct dentry *, const struct path *root); int count_mounts(struct mnt_namespace *ns, struct mount *mnt); bool propagation_would_overmount(const struct mount *from, const struct mount *to, const struct mountpoint *mp); #endif /* _LINUX_PNODE_H */
25 200 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 // SPDX-License-Identifier: GPL-2.0 /* * Copyright IBM Corporation, 2021 * * Author: Mike Rapoport <rppt@linux.ibm.com> */ #include <linux/mm.h> #include <linux/fs.h> #include <linux/swap.h> #include <linux/mount.h> #include <linux/memfd.h> #include <linux/bitops.h> #include <linux/printk.h> #include <linux/pagemap.h> #include <linux/syscalls.h> #include <linux/pseudo_fs.h> #include <linux/secretmem.h> #include <linux/set_memory.h> #include <linux/sched/signal.h> #include <uapi/linux/magic.h> #include <asm/tlbflush.h> #include "internal.h" #undef pr_fmt #define pr_fmt(fmt) "secretmem: " fmt /* * Define mode and flag masks to allow validation of the system call * parameters. */ #define SECRETMEM_MODE_MASK (0x0) #define SECRETMEM_FLAGS_MASK SECRETMEM_MODE_MASK static bool secretmem_enable __ro_after_init = 1; module_param_named(enable, secretmem_enable, bool, 0400); MODULE_PARM_DESC(secretmem_enable, "Enable secretmem and memfd_secret(2) system call"); static atomic_t secretmem_users; bool secretmem_active(void) { return !!atomic_read(&secretmem_users); } static vm_fault_t secretmem_fault(struct vm_fault *vmf) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; struct inode *inode = file_inode(vmf->vma->vm_file); pgoff_t offset = vmf->pgoff; gfp_t gfp = vmf->gfp_mask; unsigned long addr; struct folio *folio; vm_fault_t ret; int err; if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode)) return vmf_error(-EINVAL); filemap_invalidate_lock_shared(mapping); retry: folio = filemap_lock_folio(mapping, offset); if (IS_ERR(folio)) { folio = folio_alloc(gfp | __GFP_ZERO, 0); if (!folio) { ret = VM_FAULT_OOM; goto out; } err = set_direct_map_invalid_noflush(folio_page(folio, 0)); if (err) { folio_put(folio); ret = vmf_error(err); goto out; } __folio_mark_uptodate(folio); err = filemap_add_folio(mapping, folio, offset, gfp); if (unlikely(err)) { /* * If a split of large page was required, it * already happened when we marked the page invalid * which guarantees that this call won't fail */ set_direct_map_default_noflush(folio_page(folio, 0)); folio_put(folio); if (err == -EEXIST) goto retry; ret = vmf_error(err); goto out; } addr = (unsigned long)folio_address(folio); flush_tlb_kernel_range(addr, addr + PAGE_SIZE); } vmf->page = folio_file_page(folio, vmf->pgoff); ret = VM_FAULT_LOCKED; out: filemap_invalidate_unlock_shared(mapping); return ret; } static const struct vm_operations_struct secretmem_vm_ops = { .fault = secretmem_fault, }; static int secretmem_release(struct inode *inode, struct file *file) { atomic_dec(&secretmem_users); return 0; } static int secretmem_mmap_prepare(struct vm_area_desc *desc) { const unsigned long len = vma_desc_size(desc); if (!vma_desc_test_flags(desc, VMA_SHARED_BIT, VMA_MAYSHARE_BIT)) return -EINVAL; vma_desc_set_flags(desc, VMA_LOCKED_BIT, VMA_DONTDUMP_BIT); if (!mlock_future_ok(desc->mm, /*is_vma_locked=*/ true, len)) return -EAGAIN; desc->vm_ops = &secretmem_vm_ops; return 0; } bool vma_is_secretmem(struct vm_area_struct *vma) { return vma->vm_ops == &secretmem_vm_ops; } static const struct file_operations secretmem_fops = { .release = secretmem_release, .mmap_prepare = secretmem_mmap_prepare, }; static int secretmem_migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode) { return -EBUSY; } static void secretmem_free_folio(struct folio *folio) { set_direct_map_default_noflush(folio_page(folio, 0)); folio_zero_segment(folio, 0, folio_size(folio)); } const struct address_space_operations secretmem_aops = { .dirty_folio = noop_dirty_folio, .free_folio = secretmem_free_folio, .migrate_folio = secretmem_migrate_folio, }; static int secretmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { struct inode *inode = d_inode(dentry); struct address_space *mapping = inode->i_mapping; unsigned int ia_valid = iattr->ia_valid; int ret; filemap_invalidate_lock(mapping); if ((ia_valid & ATTR_SIZE) && inode->i_size) ret = -EINVAL; else ret = simple_setattr(idmap, dentry, iattr); filemap_invalidate_unlock(mapping); return ret; } static const struct inode_operations secretmem_iops = { .setattr = secretmem_setattr, }; static struct vfsmount *secretmem_mnt; static struct file *secretmem_file_create(unsigned long flags) { struct file *file; struct inode *inode; const char *anon_name = "[secretmem]"; inode = anon_inode_make_secure_inode(secretmem_mnt->mnt_sb, anon_name, NULL); if (IS_ERR(inode)) return ERR_CAST(inode); file = alloc_file_pseudo(inode, secretmem_mnt, "secretmem", O_RDWR | O_LARGEFILE, &secretmem_fops); if (IS_ERR(file)) goto err_free_inode; mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); mapping_set_unevictable(inode->i_mapping); inode->i_op = &secretmem_iops; inode->i_mapping->a_ops = &secretmem_aops; /* pretend we are a normal file with zero size */ inode->i_mode |= S_IFREG; inode->i_size = 0; atomic_inc(&secretmem_users); return file; err_free_inode: iput(inode); return file; } SYSCALL_DEFINE1(memfd_secret, unsigned int, flags) { /* make sure local flags do not conflict with global fcntl.h */ BUILD_BUG_ON(SECRETMEM_FLAGS_MASK & O_CLOEXEC); if (!secretmem_enable || !can_set_direct_map()) return -ENOSYS; if (flags & ~(SECRETMEM_FLAGS_MASK | O_CLOEXEC)) return -EINVAL; if (atomic_read(&secretmem_users) < 0) return -ENFILE; return FD_ADD(flags & O_CLOEXEC, secretmem_file_create(flags)); } static int secretmem_init_fs_context(struct fs_context *fc) { struct pseudo_fs_context *ctx; ctx = init_pseudo(fc, SECRETMEM_MAGIC); if (!ctx) return -ENOMEM; fc->s_iflags |= SB_I_NOEXEC; fc->s_iflags |= SB_I_NODEV; return 0; } static struct file_system_type secretmem_fs = { .name = "secretmem", .init_fs_context = secretmem_init_fs_context, .kill_sb = kill_anon_super, }; static int __init secretmem_init(void) { if (!secretmem_enable || !can_set_direct_map()) return 0; secretmem_mnt = kern_mount(&secretmem_fs); if (IS_ERR(secretmem_mnt)) return PTR_ERR(secretmem_mnt); return 0; } fs_initcall(secretmem_init);
109 109 109 119 120 120 120 9 106 3 31 31 88 87 109 109 3 106 3 106 3 119 120 4 118 106 97 62 31 31 88 87 64 25 52 64 2 22 40 40 47 47 47 47 3 3 3 3 20 26 4 84 31 30 23 79 43 3 24 9 2 212 12 12 33 209 209 125 121 116 4 117 117 93 24 29 23 7 12 90 120 121 126 126 125 17 16 126 126 110 109 4 24 104 28 2 76 104 5 85 24 104 5 10 80 109 109 69 36 109 104 105 4 84 6 4 68 4 10 58 21 78 13 65 11 84 84 78 65 21 2 18 18 18 4 3 18 12 12 12 11 4 9 8 8 5 10 10 10 2 8 2 2 6 5 3 4 6 5 5 3 2 7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2004, OGAWA Hirofumi */ #include <linux/blkdev.h> #include <linux/sched/signal.h> #include <linux/backing-dev-defs.h> #include "fat.h" struct fatent_operations { void (*ent_blocknr)(struct super_block *, int, int *, sector_t *); void (*ent_set_ptr)(struct fat_entry *, int); int (*ent_bread)(struct super_block *, struct fat_entry *, int, sector_t); int (*ent_get)(struct fat_entry *); void (*ent_put)(struct fat_entry *, int); int (*ent_next)(struct fat_entry *); }; static DEFINE_SPINLOCK(fat12_entry_lock); static void fat12_ent_blocknr(struct super_block *sb, int entry, int *offset, sector_t *blocknr) { struct msdos_sb_info *sbi = MSDOS_SB(sb); int bytes = entry + (entry >> 1); WARN_ON(!fat_valid_entry(sbi, entry)); *offset = bytes & (sb->s_blocksize - 1); *blocknr = sbi->fat_start + (bytes >> sb->s_blocksize_bits); } static void fat_ent_blocknr(struct super_block *sb, int entry, int *offset, sector_t *blocknr) { struct msdos_sb_info *sbi = MSDOS_SB(sb); int bytes = (entry << sbi->fatent_shift); WARN_ON(!fat_valid_entry(sbi, entry)); *offset = bytes & (sb->s_blocksize - 1); *blocknr = sbi->fat_start + (bytes >> sb->s_blocksize_bits); } static void fat12_ent_set_ptr(struct fat_entry *fatent, int offset) { struct buffer_head **bhs = fatent->bhs; if (fatent->nr_bhs == 1) { WARN_ON(offset >= (bhs[0]->b_size - 1)); fatent->u.ent12_p[0] = bhs[0]->b_data + offset; fatent->u.ent12_p[1] = bhs[0]->b_data + (offset + 1); } else { WARN_ON(offset != (bhs[0]->b_size - 1)); fatent->u.ent12_p[0] = bhs[0]->b_data + offset; fatent->u.ent12_p[1] = bhs[1]->b_data; } } static void fat16_ent_set_ptr(struct fat_entry *fatent, int offset) { WARN_ON(offset & (2 - 1)); fatent->u.ent16_p = (__le16 *)(fatent->bhs[0]->b_data + offset); } static void fat32_ent_set_ptr(struct fat_entry *fatent, int offset) { WARN_ON(offset & (4 - 1)); fatent->u.ent32_p = (__le32 *)(fatent->bhs[0]->b_data + offset); } static int fat12_ent_bread(struct super_block *sb, struct fat_entry *fatent, int offset, sector_t blocknr) { struct buffer_head **bhs = fatent->bhs; WARN_ON(blocknr < MSDOS_SB(sb)->fat_start); fatent->fat_inode = MSDOS_SB(sb)->fat_inode; bhs[0] = sb_bread(sb, blocknr); if (!bhs[0]) goto err; if ((offset + 1) < sb->s_blocksize) fatent->nr_bhs = 1; else { /* This entry is block boundary, it needs the next block */ blocknr++; bhs[1] = sb_bread(sb, blocknr); if (!bhs[1]) goto err_brelse; fatent->nr_bhs = 2; } fat12_ent_set_ptr(fatent, offset); return 0; err_brelse: brelse(bhs[0]); err: fat_msg_ratelimit(sb, KERN_ERR, "FAT read failed (blocknr %llu)", (llu)blocknr); return -EIO; } static int fat_ent_bread(struct super_block *sb, struct fat_entry *fatent, int offset, sector_t blocknr) { const struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops; WARN_ON(blocknr < MSDOS_SB(sb)->fat_start); fatent->fat_inode = MSDOS_SB(sb)->fat_inode; fatent->bhs[0] = sb_bread(sb, blocknr); if (!fatent->bhs[0]) { fat_msg_ratelimit(sb, KERN_ERR, "FAT read failed (blocknr %llu)", (llu)blocknr); return -EIO; } fatent->nr_bhs = 1; ops->ent_set_ptr(fatent, offset); return 0; } static int fat12_ent_get(struct fat_entry *fatent) { u8 **ent12_p = fatent->u.ent12_p; int next; spin_lock(&fat12_entry_lock); if (fatent->entry & 1) next = (*ent12_p[0] >> 4) | (*ent12_p[1] << 4); else next = (*ent12_p[1] << 8) | *ent12_p[0]; spin_unlock(&fat12_entry_lock); next &= 0x0fff; if (next >= BAD_FAT12) next = FAT_ENT_EOF; return next; } static int fat16_ent_get(struct fat_entry *fatent) { int next = le16_to_cpu(*fatent->u.ent16_p); WARN_ON((unsigned long)fatent->u.ent16_p & (2 - 1)); if (next >= BAD_FAT16) next = FAT_ENT_EOF; return next; } static int fat32_ent_get(struct fat_entry *fatent) { int next = le32_to_cpu(*fatent->u.ent32_p) & 0x0fffffff; WARN_ON((unsigned long)fatent->u.ent32_p & (4 - 1)); if (next >= BAD_FAT32) next = FAT_ENT_EOF; return next; } static void fat12_ent_put(struct fat_entry *fatent, int new) { u8 **ent12_p = fatent->u.ent12_p; if (new == FAT_ENT_EOF) new = EOF_FAT12; spin_lock(&fat12_entry_lock); if (fatent->entry & 1) { *ent12_p[0] = (new << 4) | (*ent12_p[0] & 0x0f); *ent12_p[1] = new >> 4; } else { *ent12_p[0] = new & 0xff; *ent12_p[1] = (*ent12_p[1] & 0xf0) | (new >> 8); } spin_unlock(&fat12_entry_lock); mark_buffer_dirty_inode(fatent->bhs[0], fatent->fat_inode); if (fatent->nr_bhs == 2) mark_buffer_dirty_inode(fatent->bhs[1], fatent->fat_inode); } static void fat16_ent_put(struct fat_entry *fatent, int new) { if (new == FAT_ENT_EOF) new = EOF_FAT16; *fatent->u.ent16_p = cpu_to_le16(new); mark_buffer_dirty_inode(fatent->bhs[0], fatent->fat_inode); } static void fat32_ent_put(struct fat_entry *fatent, int new) { WARN_ON(new & 0xf0000000); new |= le32_to_cpu(*fatent->u.ent32_p) & ~0x0fffffff; *fatent->u.ent32_p = cpu_to_le32(new); mark_buffer_dirty_inode(fatent->bhs[0], fatent->fat_inode); } static int fat12_ent_next(struct fat_entry *fatent) { u8 **ent12_p = fatent->u.ent12_p; struct buffer_head **bhs = fatent->bhs; u8 *nextp = ent12_p[1] + 1 + (fatent->entry & 1); fatent->entry++; if (fatent->nr_bhs == 1) { WARN_ON(ent12_p[0] > (u8 *)(bhs[0]->b_data + (bhs[0]->b_size - 2))); WARN_ON(ent12_p[1] > (u8 *)(bhs[0]->b_data + (bhs[0]->b_size - 1))); if (nextp < (u8 *)(bhs[0]->b_data + (bhs[0]->b_size - 1))) { ent12_p[0] = nextp - 1; ent12_p[1] = nextp; return 1; } } else { WARN_ON(ent12_p[0] != (u8 *)(bhs[0]->b_data + (bhs[0]->b_size - 1))); WARN_ON(ent12_p[1] != (u8 *)bhs[1]->b_data); ent12_p[0] = nextp - 1; ent12_p[1] = nextp; brelse(bhs[0]); bhs[0] = bhs[1]; fatent->nr_bhs = 1; return 1; } ent12_p[0] = NULL; ent12_p[1] = NULL; return 0; } static int fat16_ent_next(struct fat_entry *fatent) { const struct buffer_head *bh = fatent->bhs[0]; fatent->entry++; if (fatent->u.ent16_p < (__le16 *)(bh->b_data + (bh->b_size - 2))) { fatent->u.ent16_p++; return 1; } fatent->u.ent16_p = NULL; return 0; } static int fat32_ent_next(struct fat_entry *fatent) { const struct buffer_head *bh = fatent->bhs[0]; fatent->entry++; if (fatent->u.ent32_p < (__le32 *)(bh->b_data + (bh->b_size - 4))) { fatent->u.ent32_p++; return 1; } fatent->u.ent32_p = NULL; return 0; } static const struct fatent_operations fat12_ops = { .ent_blocknr = fat12_ent_blocknr, .ent_set_ptr = fat12_ent_set_ptr, .ent_bread = fat12_ent_bread, .ent_get = fat12_ent_get, .ent_put = fat12_ent_put, .ent_next = fat12_ent_next, }; static const struct fatent_operations fat16_ops = { .ent_blocknr = fat_ent_blocknr, .ent_set_ptr = fat16_ent_set_ptr, .ent_bread = fat_ent_bread, .ent_get = fat16_ent_get, .ent_put = fat16_ent_put, .ent_next = fat16_ent_next, }; static const struct fatent_operations fat32_ops = { .ent_blocknr = fat_ent_blocknr, .ent_set_ptr = fat32_ent_set_ptr, .ent_bread = fat_ent_bread, .ent_get = fat32_ent_get, .ent_put = fat32_ent_put, .ent_next = fat32_ent_next, }; static inline void lock_fat(struct msdos_sb_info *sbi) { mutex_lock(&sbi->fat_lock); } static inline void unlock_fat(struct msdos_sb_info *sbi) { mutex_unlock(&sbi->fat_lock); } void fat_ent_access_init(struct super_block *sb) { struct msdos_sb_info *sbi = MSDOS_SB(sb); mutex_init(&sbi->fat_lock); if (is_fat32(sbi)) { sbi->fatent_shift = 2; sbi->fatent_ops = &fat32_ops; } else if (is_fat16(sbi)) { sbi->fatent_shift = 1; sbi->fatent_ops = &fat16_ops; } else if (is_fat12(sbi)) { sbi->fatent_shift = -1; sbi->fatent_ops = &fat12_ops; } else { fat_fs_error(sb, "invalid FAT variant, %u bits", sbi->fat_bits); } } static void mark_fsinfo_dirty(struct super_block *sb) { struct msdos_sb_info *sbi = MSDOS_SB(sb); if (sb_rdonly(sb) || !is_fat32(sbi)) return; __mark_inode_dirty(sbi->fsinfo_inode, I_DIRTY_SYNC); } static inline int fat_ent_update_ptr(struct super_block *sb, struct fat_entry *fatent, int offset, sector_t blocknr) { struct msdos_sb_info *sbi = MSDOS_SB(sb); const struct fatent_operations *ops = sbi->fatent_ops; struct buffer_head **bhs = fatent->bhs; /* Is this fatent's blocks including this entry? */ if (!fatent->nr_bhs || bhs[0]->b_blocknr != blocknr) return 0; if (is_fat12(sbi)) { if ((offset + 1) < sb->s_blocksize) { /* This entry is on bhs[0]. */ if (fatent->nr_bhs == 2) { brelse(bhs[1]); fatent->nr_bhs = 1; } } else { /* This entry needs the next block. */ if (fatent->nr_bhs != 2) return 0; if (bhs[1]->b_blocknr != (blocknr + 1)) return 0; } } ops->ent_set_ptr(fatent, offset); return 1; } int fat_ent_read(struct inode *inode, struct fat_entry *fatent, int entry) { struct super_block *sb = inode->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); const struct fatent_operations *ops = sbi->fatent_ops; int err, offset; sector_t blocknr; if (!fat_valid_entry(sbi, entry)) { fatent_brelse(fatent); fat_fs_error_ratelimit(sb, "invalid access to FAT (entry 0x%08x)", entry); return -EIO; } fatent_set_entry(fatent, entry); ops->ent_blocknr(sb, entry, &offset, &blocknr); if (!fat_ent_update_ptr(sb, fatent, offset, blocknr)) { fatent_brelse(fatent); err = ops->ent_bread(sb, fatent, offset, blocknr); if (err) return err; } return ops->ent_get(fatent); } /* FIXME: We can write the blocks as more big chunk. */ static int fat_mirror_bhs(struct super_block *sb, struct buffer_head **bhs, int nr_bhs) { struct msdos_sb_info *sbi = MSDOS_SB(sb); struct buffer_head *c_bh; int err, n, copy; err = 0; for (copy = 1; copy < sbi->fats; copy++) { sector_t backup_fat = sbi->fat_length * copy; for (n = 0; n < nr_bhs; n++) { c_bh = sb_getblk(sb, backup_fat + bhs[n]->b_blocknr); if (!c_bh) { err = -ENOMEM; goto error; } /* Avoid race with userspace read via bdev */ lock_buffer(c_bh); memcpy(c_bh->b_data, bhs[n]->b_data, sb->s_blocksize); set_buffer_uptodate(c_bh); unlock_buffer(c_bh); mark_buffer_dirty_inode(c_bh, sbi->fat_inode); if (sb->s_flags & SB_SYNCHRONOUS) err = sync_dirty_buffer(c_bh); brelse(c_bh); if (err) goto error; } } error: return err; } int fat_ent_write(struct inode *inode, struct fat_entry *fatent, int new, int wait) { struct super_block *sb = inode->i_sb; const struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops; int err; ops->ent_put(fatent, new); if (wait) { err = fat_sync_bhs(fatent->bhs, fatent->nr_bhs); if (err) return err; } return fat_mirror_bhs(sb, fatent->bhs, fatent->nr_bhs); } static inline int fat_ent_next(struct msdos_sb_info *sbi, struct fat_entry *fatent) { if (sbi->fatent_ops->ent_next(fatent)) { if (fatent->entry < sbi->max_cluster) return 1; } return 0; } static inline int fat_ent_read_block(struct super_block *sb, struct fat_entry *fatent) { const struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops; sector_t blocknr; int offset; fatent_brelse(fatent); ops->ent_blocknr(sb, fatent->entry, &offset, &blocknr); return ops->ent_bread(sb, fatent, offset, blocknr); } static void fat_collect_bhs(struct buffer_head **bhs, int *nr_bhs, struct fat_entry *fatent) { int n, i; for (n = 0; n < fatent->nr_bhs; n++) { for (i = 0; i < *nr_bhs; i++) { if (fatent->bhs[n] == bhs[i]) break; } if (i == *nr_bhs) { get_bh(fatent->bhs[n]); bhs[i] = fatent->bhs[n]; (*nr_bhs)++; } } } int fat_alloc_clusters(struct inode *inode, int *cluster, int nr_cluster) { struct super_block *sb = inode->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); const struct fatent_operations *ops = sbi->fatent_ops; struct fat_entry fatent, prev_ent; struct buffer_head *bhs[MAX_BUF_PER_PAGE]; int i, count, err, nr_bhs, idx_clus; BUG_ON(nr_cluster > (MAX_BUF_PER_PAGE / 2)); /* fixed limit */ lock_fat(sbi); if (sbi->free_clusters != -1 && sbi->free_clus_valid && sbi->free_clusters < nr_cluster) { unlock_fat(sbi); return -ENOSPC; } err = nr_bhs = idx_clus = 0; count = FAT_START_ENT; fatent_init(&prev_ent); fatent_init(&fatent); fatent_set_entry(&fatent, sbi->prev_free + 1); while (count < sbi->max_cluster) { if (fatent.entry >= sbi->max_cluster) fatent.entry = FAT_START_ENT; fatent_set_entry(&fatent, fatent.entry); err = fat_ent_read_block(sb, &fatent); if (err) goto out; /* Find the free entries in a block */ do { if (ops->ent_get(&fatent) == FAT_ENT_FREE) { int entry = fatent.entry; /* make the cluster chain */ ops->ent_put(&fatent, FAT_ENT_EOF); if (prev_ent.nr_bhs) ops->ent_put(&prev_ent, entry); fat_collect_bhs(bhs, &nr_bhs, &fatent); sbi->prev_free = entry; if (sbi->free_clusters != -1) sbi->free_clusters--; cluster[idx_clus] = entry; idx_clus++; if (idx_clus == nr_cluster) goto out; /* * fat_collect_bhs() gets ref-count of bhs, * so we can still use the prev_ent. */ prev_ent = fatent; } count++; if (count == sbi->max_cluster) break; } while (fat_ent_next(sbi, &fatent)); } /* Couldn't allocate the free entries */ sbi->free_clusters = 0; sbi->free_clus_valid = 1; err = -ENOSPC; out: unlock_fat(sbi); mark_fsinfo_dirty(sb); fatent_brelse(&fatent); if (!err) { if (inode_needs_sync(inode)) err = fat_sync_bhs(bhs, nr_bhs); if (!err) err = fat_mirror_bhs(sb, bhs, nr_bhs); } for (i = 0; i < nr_bhs; i++) brelse(bhs[i]); if (err && idx_clus) fat_free_clusters(inode, cluster[0]); return err; } int fat_free_clusters(struct inode *inode, int cluster) { struct super_block *sb = inode->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); const struct fatent_operations *ops = sbi->fatent_ops; struct fat_entry fatent; struct buffer_head *bhs[MAX_BUF_PER_PAGE]; int i, err, nr_bhs; int first_cl = cluster, dirty_fsinfo = 0; nr_bhs = 0; fatent_init(&fatent); lock_fat(sbi); do { cluster = fat_ent_read(inode, &fatent, cluster); if (cluster < 0) { err = cluster; goto error; } else if (cluster == FAT_ENT_FREE) { fat_fs_error(sb, "%s: deleting FAT entry beyond EOF", __func__); err = -EIO; goto error; } if (sbi->options.discard) { /* * Issue discard for the sectors we no longer * care about, batching contiguous clusters * into one request */ if (cluster != fatent.entry + 1) { int nr_clus = fatent.entry - first_cl + 1; sb_issue_discard(sb, fat_clus_to_blknr(sbi, first_cl), nr_clus * sbi->sec_per_clus, GFP_NOFS, 0); first_cl = cluster; } } ops->ent_put(&fatent, FAT_ENT_FREE); if (sbi->free_clusters != -1) { sbi->free_clusters++; dirty_fsinfo = 1; } if (nr_bhs + fatent.nr_bhs > MAX_BUF_PER_PAGE) { if (sb->s_flags & SB_SYNCHRONOUS) { err = fat_sync_bhs(bhs, nr_bhs); if (err) goto error; } err = fat_mirror_bhs(sb, bhs, nr_bhs); if (err) goto error; for (i = 0; i < nr_bhs; i++) brelse(bhs[i]); nr_bhs = 0; } fat_collect_bhs(bhs, &nr_bhs, &fatent); } while (cluster != FAT_ENT_EOF); if (sb->s_flags & SB_SYNCHRONOUS) { err = fat_sync_bhs(bhs, nr_bhs); if (err) goto error; } err = fat_mirror_bhs(sb, bhs, nr_bhs); error: fatent_brelse(&fatent); for (i = 0; i < nr_bhs; i++) brelse(bhs[i]); unlock_fat(sbi); if (dirty_fsinfo) mark_fsinfo_dirty(sb); return err; } EXPORT_SYMBOL_GPL(fat_free_clusters); struct fatent_ra { sector_t cur; sector_t limit; unsigned int ra_blocks; sector_t ra_advance; sector_t ra_next; sector_t ra_limit; }; static void fat_ra_init(struct super_block *sb, struct fatent_ra *ra, struct fat_entry *fatent, int ent_limit) { struct msdos_sb_info *sbi = MSDOS_SB(sb); const struct fatent_operations *ops = sbi->fatent_ops; sector_t blocknr, block_end; int offset; /* * This is the sequential read, so ra_pages * 2 (but try to * align the optimal hardware IO size). * [BTW, 128kb covers the whole sectors for FAT12 and FAT16] */ unsigned long ra_pages = sb->s_bdi->ra_pages; unsigned int reada_blocks; if (fatent->entry >= ent_limit) return; if (ra_pages > sb->s_bdi->io_pages) ra_pages = rounddown(ra_pages, sb->s_bdi->io_pages); reada_blocks = ra_pages << (PAGE_SHIFT - sb->s_blocksize_bits + 1); /* Initialize the range for sequential read */ ops->ent_blocknr(sb, fatent->entry, &offset, &blocknr); ops->ent_blocknr(sb, ent_limit - 1, &offset, &block_end); ra->cur = 0; ra->limit = (block_end + 1) - blocknr; /* Advancing the window at half size */ ra->ra_blocks = reada_blocks >> 1; ra->ra_advance = ra->cur; ra->ra_next = ra->cur; ra->ra_limit = ra->cur + min_t(sector_t, reada_blocks, ra->limit); } /* Assuming to be called before reading a new block (increments ->cur). */ static void fat_ent_reada(struct super_block *sb, struct fatent_ra *ra, struct fat_entry *fatent) { if (ra->ra_next >= ra->ra_limit) return; if (ra->cur >= ra->ra_advance) { struct msdos_sb_info *sbi = MSDOS_SB(sb); const struct fatent_operations *ops = sbi->fatent_ops; struct blk_plug plug; sector_t blocknr, diff; int offset; ops->ent_blocknr(sb, fatent->entry, &offset, &blocknr); diff = blocknr - ra->cur; blk_start_plug(&plug); /* * FIXME: we would want to directly use the bio with * pages to reduce the number of segments. */ for (; ra->ra_next < ra->ra_limit; ra->ra_next++) sb_breadahead(sb, ra->ra_next + diff); blk_finish_plug(&plug); /* Advance the readahead window */ ra->ra_advance += ra->ra_blocks; ra->ra_limit += min_t(sector_t, ra->ra_blocks, ra->limit - ra->ra_limit); } ra->cur++; } int fat_count_free_clusters(struct super_block *sb) { struct msdos_sb_info *sbi = MSDOS_SB(sb); const struct fatent_operations *ops = sbi->fatent_ops; struct fat_entry fatent; struct fatent_ra fatent_ra; int err = 0, free; lock_fat(sbi); if (sbi->free_clusters != -1 && sbi->free_clus_valid) goto out; free = 0; fatent_init(&fatent); fatent_set_entry(&fatent, FAT_START_ENT); fat_ra_init(sb, &fatent_ra, &fatent, sbi->max_cluster); while (fatent.entry < sbi->max_cluster) { /* readahead of fat blocks */ fat_ent_reada(sb, &fatent_ra, &fatent); err = fat_ent_read_block(sb, &fatent); if (err) goto out; do { if (ops->ent_get(&fatent) == FAT_ENT_FREE) free++; } while (fat_ent_next(sbi, &fatent)); cond_resched(); } sbi->free_clusters = free; sbi->free_clus_valid = 1; mark_fsinfo_dirty(sb); fatent_brelse(&fatent); out: unlock_fat(sbi); return err; } static int fat_trim_clusters(struct super_block *sb, u32 clus, u32 nr_clus) { struct msdos_sb_info *sbi = MSDOS_SB(sb); return sb_issue_discard(sb, fat_clus_to_blknr(sbi, clus), nr_clus * sbi->sec_per_clus, GFP_NOFS, 0); } int fat_trim_fs(struct inode *inode, struct fstrim_range *range) { struct super_block *sb = inode->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); const struct fatent_operations *ops = sbi->fatent_ops; struct fat_entry fatent; struct fatent_ra fatent_ra; u64 ent_start, ent_end, minlen, trimmed = 0; u32 free = 0; int err = 0; /* * FAT data is organized as clusters, trim at the granulary of cluster. * * fstrim_range is in byte, convert values to cluster index. * Treat sectors before data region as all used, not to trim them. */ ent_start = max_t(u64, range->start>>sbi->cluster_bits, FAT_START_ENT); ent_end = ent_start + (range->len >> sbi->cluster_bits) - 1; minlen = range->minlen >> sbi->cluster_bits; if (ent_start >= sbi->max_cluster || range->len < sbi->cluster_size) return -EINVAL; if (ent_end >= sbi->max_cluster) ent_end = sbi->max_cluster - 1; fatent_init(&fatent); lock_fat(sbi); fatent_set_entry(&fatent, ent_start); fat_ra_init(sb, &fatent_ra, &fatent, ent_end + 1); while (fatent.entry <= ent_end) { /* readahead of fat blocks */ fat_ent_reada(sb, &fatent_ra, &fatent); err = fat_ent_read_block(sb, &fatent); if (err) goto error; do { if (ops->ent_get(&fatent) == FAT_ENT_FREE) { free++; } else if (free) { if (free >= minlen) { u32 clus = fatent.entry - free; err = fat_trim_clusters(sb, clus, free); if (err && err != -EOPNOTSUPP) goto error; if (!err) trimmed += free; err = 0; } free = 0; } } while (fat_ent_next(sbi, &fatent) && fatent.entry <= ent_end); if (fatal_signal_pending(current)) { err = -ERESTARTSYS; goto error; } if (need_resched()) { fatent_brelse(&fatent); unlock_fat(sbi); cond_resched(); lock_fat(sbi); } } /* handle scenario when tail entries are all free */ if (free && free >= minlen) { u32 clus = fatent.entry - free; err = fat_trim_clusters(sb, clus, free); if (err && err != -EOPNOTSUPP) goto error; if (!err) trimmed += free; err = 0; } error: fatent_brelse(&fatent); unlock_fat(sbi); range->len = trimmed << sbi->cluster_bits; return err; }
232 11 456 232 15 135 135 130 61 139 61 22 147 5 42 81 6 1 2 1 85 220 219 354 153 20 4 69 67 96 2 304 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. * All Rights Reserved. */ #ifndef __XFS_INODE_H__ #define __XFS_INODE_H__ #include "xfs_inode_buf.h" #include "xfs_inode_fork.h" #include "xfs_inode_util.h" /* * Kernel only inode definitions */ struct xfs_dinode; struct xfs_inode; struct xfs_buf; struct xfs_bmbt_irec; struct xfs_inode_log_item; struct xfs_mount; struct xfs_trans; struct xfs_dquot; typedef struct xfs_inode { /* Inode linking and identification information. */ struct xfs_mount *i_mount; /* fs mount struct ptr */ struct xfs_dquot *i_udquot; /* user dquot */ struct xfs_dquot *i_gdquot; /* group dquot */ struct xfs_dquot *i_pdquot; /* project dquot */ /* Inode location stuff */ xfs_ino_t i_ino; /* inode number (agno/agino)*/ struct xfs_imap i_imap; /* location for xfs_imap() */ /* Extent information. */ struct xfs_ifork *i_cowfp; /* copy on write extents */ struct xfs_ifork i_df; /* data fork */ struct xfs_ifork i_af; /* attribute fork */ /* Transaction and locking information. */ struct xfs_inode_log_item *i_itemp; /* logging information */ struct rw_semaphore i_lock; /* inode lock */ atomic_t i_pincount; /* inode pin count */ struct llist_node i_gclist; /* deferred inactivation list */ /* * Bitsets of inode metadata that have been checked and/or are sick. * Callers must hold i_flags_lock before accessing this field. */ uint16_t i_checked; uint16_t i_sick; spinlock_t i_flags_lock; /* inode i_flags lock */ /* Miscellaneous state. */ unsigned long i_flags; /* see defined flags below */ uint64_t i_delayed_blks; /* count of delay alloc blks */ xfs_fsize_t i_disk_size; /* number of bytes in file */ xfs_rfsblock_t i_nblocks; /* # of direct & btree blocks */ prid_t i_projid; /* owner's project id */ xfs_extlen_t i_extsize; /* basic/minimum extent size */ /* * i_used_blocks is used for zoned rtrmap inodes, * i_cowextsize is used for other v3 inodes, * i_flushiter for v1/2 inodes */ union { uint32_t i_used_blocks; /* used blocks in RTG */ xfs_extlen_t i_cowextsize; /* basic cow extent size */ uint16_t i_flushiter; /* incremented on flush */ }; uint8_t i_forkoff; /* attr fork offset >> 3 */ enum xfs_metafile_type i_metatype; /* XFS_METAFILE_* */ uint16_t i_diflags; /* XFS_DIFLAG_... */ uint64_t i_diflags2; /* XFS_DIFLAG2_... */ struct timespec64 i_crtime; /* time created */ /* * Unlinked list pointers. These point to the next and previous inodes * in the AGI unlinked bucket list, respectively. These fields can * only be updated with the AGI locked. * * i_next_unlinked caches di_next_unlinked. */ xfs_agino_t i_next_unlinked; /* * If the inode is not on an unlinked list, this field is zero. If the * inode is the first element in an unlinked list, this field is * NULLAGINO. Otherwise, i_prev_unlinked points to the previous inode * in the unlinked list. */ xfs_agino_t i_prev_unlinked; /* VFS inode */ struct inode i_vnode; /* embedded VFS inode */ /* pending io completions */ spinlock_t i_ioend_lock; struct work_struct i_ioend_work; struct list_head i_ioend_list; } xfs_inode_t; static inline bool xfs_inode_on_unlinked_list(const struct xfs_inode *ip) { return ip->i_prev_unlinked != 0; } static inline bool xfs_inode_has_attr_fork(const struct xfs_inode *ip) { return ip->i_forkoff > 0; } static inline struct xfs_ifork * xfs_ifork_ptr( struct xfs_inode *ip, int whichfork) { switch (whichfork) { case XFS_DATA_FORK: return &ip->i_df; case XFS_ATTR_FORK: if (!xfs_inode_has_attr_fork(ip)) return NULL; return &ip->i_af; case XFS_COW_FORK: return ip->i_cowfp; default: ASSERT(0); return NULL; } } static inline unsigned int xfs_inode_fork_boff(struct xfs_inode *ip) { return ip->i_forkoff << 3; } static inline unsigned int xfs_inode_data_fork_size(struct xfs_inode *ip) { if (xfs_inode_has_attr_fork(ip)) return xfs_inode_fork_boff(ip); return XFS_LITINO(ip->i_mount); } static inline unsigned int xfs_inode_attr_fork_size(struct xfs_inode *ip) { if (xfs_inode_has_attr_fork(ip)) return XFS_LITINO(ip->i_mount) - xfs_inode_fork_boff(ip); return 0; } static inline unsigned int xfs_inode_fork_size( struct xfs_inode *ip, int whichfork) { switch (whichfork) { case XFS_DATA_FORK: return xfs_inode_data_fork_size(ip); case XFS_ATTR_FORK: return xfs_inode_attr_fork_size(ip); default: return 0; } } /* Convert from vfs inode to xfs inode */ static inline struct xfs_inode *XFS_I(struct inode *inode) { return container_of(inode, struct xfs_inode, i_vnode); } /* convert from xfs inode to vfs inode */ static inline struct inode *VFS_I(struct xfs_inode *ip) { return &ip->i_vnode; } /* convert from const xfs inode to const vfs inode */ static inline const struct inode *VFS_IC(const struct xfs_inode *ip) { return &ip->i_vnode; } /* * For regular files we only update the on-disk filesize when actually * writing data back to disk. Until then only the copy in the VFS inode * is uptodate. */ static inline xfs_fsize_t XFS_ISIZE(struct xfs_inode *ip) { if (S_ISREG(VFS_I(ip)->i_mode)) return i_size_read(VFS_I(ip)); return ip->i_disk_size; } /* * If this I/O goes past the on-disk inode size update it unless it would * be past the current in-core inode size. */ static inline xfs_fsize_t xfs_new_eof(struct xfs_inode *ip, xfs_fsize_t new_size) { xfs_fsize_t i_size = i_size_read(VFS_I(ip)); if (new_size > i_size || new_size < 0) new_size = i_size; return new_size > ip->i_disk_size ? new_size : 0; } /* * i_flags helper functions */ static inline void __xfs_iflags_set(xfs_inode_t *ip, unsigned long flags) { ip->i_flags |= flags; } static inline void xfs_iflags_set(xfs_inode_t *ip, unsigned long flags) { spin_lock(&ip->i_flags_lock); __xfs_iflags_set(ip, flags); spin_unlock(&ip->i_flags_lock); } static inline void xfs_iflags_clear(xfs_inode_t *ip, unsigned long flags) { spin_lock(&ip->i_flags_lock); ip->i_flags &= ~flags; spin_unlock(&ip->i_flags_lock); } static inline int __xfs_iflags_test(const struct xfs_inode *ip, unsigned long flags) { return (ip->i_flags & flags); } static inline int xfs_iflags_test(xfs_inode_t *ip, unsigned long flags) { int ret; spin_lock(&ip->i_flags_lock); ret = __xfs_iflags_test(ip, flags); spin_unlock(&ip->i_flags_lock); return ret; } static inline int xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned long flags) { int ret; spin_lock(&ip->i_flags_lock); ret = ip->i_flags & flags; if (ret) ip->i_flags &= ~flags; spin_unlock(&ip->i_flags_lock); return ret; } static inline int xfs_iflags_test_and_set(xfs_inode_t *ip, unsigned long flags) { int ret; spin_lock(&ip->i_flags_lock); ret = ip->i_flags & flags; if (!ret) ip->i_flags |= flags; spin_unlock(&ip->i_flags_lock); return ret; } static inline bool xfs_is_reflink_inode(const struct xfs_inode *ip) { return ip->i_diflags2 & XFS_DIFLAG2_REFLINK; } static inline bool xfs_is_metadir_inode(const struct xfs_inode *ip) { return ip->i_diflags2 & XFS_DIFLAG2_METADATA; } static inline bool xfs_is_internal_inode(const struct xfs_inode *ip) { struct xfs_mount *mp = ip->i_mount; /* Any file in the metadata directory tree is a metadata inode. */ if (xfs_has_metadir(mp)) return xfs_is_metadir_inode(ip); /* * Before metadata directories, the only metadata inodes were the * three quota files, the realtime bitmap, and the realtime summary. */ return ip->i_ino == mp->m_sb.sb_rbmino || ip->i_ino == mp->m_sb.sb_rsumino || xfs_is_quota_inode(&mp->m_sb, ip->i_ino); } static inline bool xfs_is_zoned_inode(const struct xfs_inode *ip) { return xfs_has_zoned(ip->i_mount) && XFS_IS_REALTIME_INODE(ip); } bool xfs_is_always_cow_inode(const struct xfs_inode *ip); static inline bool xfs_is_cow_inode(const struct xfs_inode *ip) { return xfs_is_reflink_inode(ip) || xfs_is_always_cow_inode(ip); } static inline bool xfs_inode_has_filedata(const struct xfs_inode *ip) { return ip->i_df.if_nextents > 0 || ip->i_delayed_blks > 0; } /* * Check if an inode has any data in the COW fork. This might be often false * even for inodes with the reflink flag when there is no pending COW operation. */ static inline bool xfs_inode_has_cow_data(const struct xfs_inode *ip) { return ip->i_cowfp && ip->i_cowfp->if_bytes; } static inline bool xfs_inode_has_bigtime(const struct xfs_inode *ip) { return ip->i_diflags2 & XFS_DIFLAG2_BIGTIME; } static inline bool xfs_inode_has_large_extent_counts(const struct xfs_inode *ip) { return ip->i_diflags2 & XFS_DIFLAG2_NREXT64; } /* * Decide if this file is a realtime file whose data allocation unit is larger * than a single filesystem block. */ static inline bool xfs_inode_has_bigrtalloc(const struct xfs_inode *ip) { return XFS_IS_REALTIME_INODE(ip) && ip->i_mount->m_sb.sb_rextsize > 1; } /* * Return the buftarg used for data allocations on a given inode. */ #define xfs_inode_buftarg(ip) \ (XFS_IS_REALTIME_INODE(ip) ? \ (ip)->i_mount->m_rtdev_targp : (ip)->i_mount->m_ddev_targp) static inline bool xfs_inode_can_hw_atomic_write(const struct xfs_inode *ip) { if (IS_DAX(VFS_IC(ip))) return false; return xfs_inode_buftarg(ip)->bt_awu_max > 0; } static inline bool xfs_inode_can_sw_atomic_write(const struct xfs_inode *ip) { if (IS_DAX(VFS_IC(ip))) return false; return xfs_can_sw_atomic_write(ip->i_mount); } /* * In-core inode flags. */ #define XFS_IRECLAIM (1 << 0) /* started reclaiming this inode */ #define XFS_ISTALE (1 << 1) /* inode has been staled */ #define XFS_IRECLAIMABLE (1 << 2) /* inode can be reclaimed */ #define XFS_INEW (1 << 3) /* inode has just been allocated */ #define XFS_IPRESERVE_DM_FIELDS (1 << 4) /* has legacy DMAPI fields set */ #define XFS_ITRUNCATED (1 << 5) /* truncated down so flush-on-close */ #define XFS_EOFBLOCKS_RELEASED (1 << 6) /* eofblocks were freed in ->release */ #define XFS_IFLUSHING (1 << 7) /* inode is being flushed */ #define __XFS_IPINNED_BIT 8 /* wakeup key for zero pin count */ #define XFS_IPINNED (1 << __XFS_IPINNED_BIT) #define XFS_IEOFBLOCKS (1 << 9) /* has the preallocblocks tag set */ #define XFS_NEED_INACTIVE (1 << 10) /* see XFS_INACTIVATING below */ /* * If this unlinked inode is in the middle of recovery, don't let drop_inode * truncate and free the inode. This can happen if we iget the inode during * log recovery to replay a bmap operation on the inode. */ #define XFS_IRECOVERY (1 << 11) #define XFS_ICOWBLOCKS (1 << 12)/* has the cowblocks tag set */ /* * If we need to update on-disk metadata before this IRECLAIMABLE inode can be * freed, then NEED_INACTIVE will be set. Once we start the updates, the * INACTIVATING bit will be set to keep iget away from this inode. After the * inactivation completes, both flags will be cleared and the inode is a * plain old IRECLAIMABLE inode. */ #define XFS_INACTIVATING (1 << 13) /* Quotacheck is running but inode has not been added to quota counts. */ #define XFS_IQUOTAUNCHECKED (1 << 14) /* * Remap in progress. Callers that wish to update file data while * holding a shared IOLOCK or MMAPLOCK must drop the lock and retake * the lock in exclusive mode. Relocking the file will block until * IREMAPPING is cleared. */ #define XFS_IREMAPPING (1U << 15) /* All inode state flags related to inode reclaim. */ #define XFS_ALL_IRECLAIM_FLAGS (XFS_IRECLAIMABLE | \ XFS_IRECLAIM | \ XFS_NEED_INACTIVE | \ XFS_INACTIVATING) /* * Per-lifetime flags need to be reset when re-using a reclaimable inode during * inode lookup. This prevents unintended behaviour on the new inode from * ocurring. */ #define XFS_IRECLAIM_RESET_FLAGS \ (XFS_IRECLAIMABLE | XFS_IRECLAIM | \ XFS_EOFBLOCKS_RELEASED | XFS_ITRUNCATED | XFS_NEED_INACTIVE | \ XFS_INACTIVATING | XFS_IQUOTAUNCHECKED) /* * Flags for inode locking. * Bit ranges: 1<<1 - 1<<16-1 -- iolock/ilock modes (bitfield) * 1<<16 - 1<<32-1 -- lockdep annotation (integers) */ #define XFS_IOLOCK_EXCL (1u << 0) #define XFS_IOLOCK_SHARED (1u << 1) #define XFS_ILOCK_EXCL (1u << 2) #define XFS_ILOCK_SHARED (1u << 3) #define XFS_MMAPLOCK_EXCL (1u << 4) #define XFS_MMAPLOCK_SHARED (1u << 5) #define XFS_LOCK_MASK (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED \ | XFS_ILOCK_EXCL | XFS_ILOCK_SHARED \ | XFS_MMAPLOCK_EXCL | XFS_MMAPLOCK_SHARED) #define XFS_LOCK_FLAGS \ { XFS_IOLOCK_EXCL, "IOLOCK_EXCL" }, \ { XFS_IOLOCK_SHARED, "IOLOCK_SHARED" }, \ { XFS_ILOCK_EXCL, "ILOCK_EXCL" }, \ { XFS_ILOCK_SHARED, "ILOCK_SHARED" }, \ { XFS_MMAPLOCK_EXCL, "MMAPLOCK_EXCL" }, \ { XFS_MMAPLOCK_SHARED, "MMAPLOCK_SHARED" } /* * Flags for lockdep annotations. * * XFS_LOCK_PARENT - for directory operations that require locking a * parent directory inode and a child entry inode. IOLOCK requires nesting, * MMAPLOCK does not support this class, ILOCK requires a single subclass * to differentiate parent from child. * * XFS_LOCK_RTBITMAP/XFS_LOCK_RTSUM - the realtime device bitmap and summary * inodes do not participate in the normal lock order, and thus have their * own subclasses. * * XFS_LOCK_INUMORDER - for locking several inodes at the some time * with xfs_lock_inodes(). This flag is used as the starting subclass * and each subsequent lock acquired will increment the subclass by one. * However, MAX_LOCKDEP_SUBCLASSES == 8, which means we are greatly * limited to the subclasses we can represent via nesting. We need at least * 5 inodes nest depth for the ILOCK through rename, and we also have to support * XFS_ILOCK_PARENT, which gives 6 subclasses. That's 6 of the 8 subclasses * supported by lockdep. * * This also means we have to number the sub-classes in the lowest bits of * the mask we keep, and we have to ensure we never exceed 3 bits of lockdep * mask and we can't use bit-masking to build the subclasses. What a mess. * * Bit layout: * * Bit Lock Region * 16-19 XFS_IOLOCK_SHIFT dependencies * 20-23 XFS_MMAPLOCK_SHIFT dependencies * 24-31 XFS_ILOCK_SHIFT dependencies * * IOLOCK values * * 0-3 subclass value * 4-7 unused * * MMAPLOCK values * * 0-3 subclass value * 4-7 unused * * ILOCK values * 0-4 subclass values * 5 PARENT subclass (not nestable) * 6 unused * 7 unused * */ #define XFS_IOLOCK_SHIFT 16 #define XFS_IOLOCK_MAX_SUBCLASS 3 #define XFS_IOLOCK_DEP_MASK 0x000f0000u #define XFS_MMAPLOCK_SHIFT 20 #define XFS_MMAPLOCK_NUMORDER 0 #define XFS_MMAPLOCK_MAX_SUBCLASS 3 #define XFS_MMAPLOCK_DEP_MASK 0x00f00000u #define XFS_ILOCK_SHIFT 24 #define XFS_ILOCK_PARENT_VAL 5u #define XFS_ILOCK_MAX_SUBCLASS (XFS_ILOCK_PARENT_VAL - 1) #define XFS_ILOCK_DEP_MASK 0xff000000u #define XFS_ILOCK_PARENT (XFS_ILOCK_PARENT_VAL << XFS_ILOCK_SHIFT) #define XFS_LOCK_SUBCLASS_MASK (XFS_IOLOCK_DEP_MASK | \ XFS_MMAPLOCK_DEP_MASK | \ XFS_ILOCK_DEP_MASK) #define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) \ >> XFS_IOLOCK_SHIFT) #define XFS_MMAPLOCK_DEP(flags) (((flags) & XFS_MMAPLOCK_DEP_MASK) \ >> XFS_MMAPLOCK_SHIFT) #define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) \ >> XFS_ILOCK_SHIFT) /* * Layouts are broken in the BREAK_WRITE case to ensure that * layout-holders do not collide with local writes. Additionally, * layouts are broken in the BREAK_UNMAP case to make sure the * layout-holder has a consistent view of the file's extent map. While * BREAK_WRITE breaks can be satisfied by recalling FL_LAYOUT leases, * BREAK_UNMAP breaks additionally require waiting for busy dax-pages to * go idle. */ enum layout_break_reason { BREAK_WRITE, BREAK_UNMAP, }; /* * For multiple groups support: if S_ISGID bit is set in the parent * directory, group of new file is set to that of the parent, and * new subdirectory gets S_ISGID bit from parent. */ #define XFS_INHERIT_GID(pip) \ (xfs_has_grpid((pip)->i_mount) || (VFS_I(pip)->i_mode & S_ISGID)) int xfs_inactive(struct xfs_inode *ip); int xfs_lookup(struct xfs_inode *dp, const struct xfs_name *name, struct xfs_inode **ipp, struct xfs_name *ci_name); int xfs_create(const struct xfs_icreate_args *iargs, struct xfs_name *name, struct xfs_inode **ipp); int xfs_create_tmpfile(const struct xfs_icreate_args *iargs, struct xfs_inode **ipp); int xfs_remove(struct xfs_inode *dp, struct xfs_name *name, struct xfs_inode *ip); int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip, struct xfs_name *target_name); int xfs_rename(struct mnt_idmap *idmap, struct xfs_inode *src_dp, struct xfs_name *src_name, struct xfs_inode *src_ip, struct xfs_inode *target_dp, struct xfs_name *target_name, struct xfs_inode *target_ip, unsigned int flags); void xfs_ilock(xfs_inode_t *, uint); int xfs_ilock_nowait(xfs_inode_t *, uint); void xfs_iunlock(xfs_inode_t *, uint); void xfs_ilock_demote(xfs_inode_t *, uint); void xfs_assert_ilocked(struct xfs_inode *, uint); uint xfs_ilock_data_map_shared(struct xfs_inode *); uint xfs_ilock_attr_map_shared(struct xfs_inode *); int xfs_ifree(struct xfs_trans *, struct xfs_inode *); int xfs_itruncate_extents_flags(struct xfs_trans **, struct xfs_inode *, int, xfs_fsize_t, int); void xfs_iext_realloc(xfs_inode_t *, int, int); int xfs_log_force_inode(struct xfs_inode *ip); void xfs_iunpin_wait(xfs_inode_t *); #define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount)) int xfs_iflush_cluster(struct xfs_buf *); void xfs_lock_two_inodes(struct xfs_inode *ip0, uint ip0_mode, struct xfs_inode *ip1, uint ip1_mode); int xfs_icreate(struct xfs_trans *tp, xfs_ino_t ino, const struct xfs_icreate_args *args, struct xfs_inode **ipp); static inline int xfs_itruncate_extents( struct xfs_trans **tpp, struct xfs_inode *ip, int whichfork, xfs_fsize_t new_size) { return xfs_itruncate_extents_flags(tpp, ip, whichfork, new_size, 0); } int xfs_break_dax_layouts(struct inode *inode); int xfs_break_layouts(struct inode *inode, uint *iolock, enum layout_break_reason reason); static inline void xfs_update_stable_writes(struct xfs_inode *ip) { if (bdev_stable_writes(xfs_inode_buftarg(ip)->bt_bdev)) mapping_set_stable_writes(VFS_I(ip)->i_mapping); else mapping_clear_stable_writes(VFS_I(ip)->i_mapping); } /* * When setting up a newly allocated inode, we need to call * xfs_finish_inode_setup() once the inode is fully instantiated at * the VFS level to prevent the rest of the world seeing the inode * before we've completed instantiation. Otherwise we can do it * the moment the inode lookup is complete. */ static inline void xfs_finish_inode_setup(struct xfs_inode *ip) { xfs_iflags_clear(ip, XFS_INEW); barrier(); unlock_new_inode(VFS_I(ip)); } static inline void xfs_setup_existing_inode(struct xfs_inode *ip) { xfs_setup_inode(ip); xfs_setup_iops(ip); xfs_finish_inode_setup(ip); } void xfs_irele(struct xfs_inode *ip); extern struct kmem_cache *xfs_inode_cache; /* The default CoW extent size hint. */ #define XFS_DEFAULT_COWEXTSZ_HINT 32 bool xfs_inode_needs_inactive(struct xfs_inode *ip); struct xfs_inode *xfs_iunlink_lookup(struct xfs_perag *pag, xfs_agino_t agino); int xfs_iunlink_reload_next(struct xfs_trans *tp, struct xfs_buf *agibp, xfs_agino_t prev_agino, xfs_agino_t next_agino); void xfs_end_io(struct work_struct *work); int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2); void xfs_iunlock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2); void xfs_iunlock2_remapping(struct xfs_inode *ip1, struct xfs_inode *ip2); void xfs_lock_inodes(struct xfs_inode **ips, int inodes, uint lock_mode); void xfs_sort_inodes(struct xfs_inode **i_tab, unsigned int num_inodes); static inline bool xfs_inode_unlinked_incomplete( const struct xfs_inode *ip) { return VFS_IC(ip)->i_nlink == 0 && !xfs_inode_on_unlinked_list(ip); } int xfs_inode_reload_unlinked_bucket(struct xfs_trans *tp, struct xfs_inode *ip); int xfs_inode_reload_unlinked(struct xfs_inode *ip); bool xfs_ifork_zapped(const struct xfs_inode *ip, int whichfork); void xfs_inode_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip, xfs_filblks_t *dblocks, xfs_filblks_t *rblocks); unsigned int xfs_inode_alloc_unitsize(struct xfs_inode *ip); int xfs_icreate_dqalloc(const struct xfs_icreate_args *args, struct xfs_dquot **udqpp, struct xfs_dquot **gdqpp, struct xfs_dquot **pdqpp); #endif /* __XFS_INODE_H__ */
4 1 3 9 1 1 1 6 1 5 5 1 1 19 2 4 9 1 2 2 1 1 3 4 6 6 1 3 4 10 1 2 6 1 2 2 13 1 1 2 6 3 4 1 1 1 1 35 35 2 32 2 1 2 33 1 1 34 35 20 1 19 19 19 20 5 4 3 3 26 2 6 22 4 2 17 2 19 19 6 2 16 16 33 33 8 10 15 31 31 3 28 23 20 6 13 2 5 6 5 8 11 2 4 4 4 1 1 1 1 5 1 1 1 1 1 4 1 1 5 1 1 1 2 4 1 1 1 1 1 1 1 1 1 1 1 2 143 144 23 14 1 5 2 10 13 4 1 26 1 1 1 4 1 5 4 1 7 1 1 3 1 7 1 3 1 5 2 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. */ #include "xfs_platform.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_rtalloc.h" #include "xfs_iwalk.h" #include "xfs_itable.h" #include "xfs_error.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" #include "xfs_attr.h" #include "xfs_bmap.h" #include "xfs_bmap_util.h" #include "xfs_fsops.h" #include "xfs_discard.h" #include "xfs_quota.h" #include "xfs_trace.h" #include "xfs_icache.h" #include "xfs_trans.h" #include "xfs_btree.h" #include <linux/fsmap.h> #include "xfs_fsmap.h" #include "scrub/xfs_scrub.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_health.h" #include "xfs_reflink.h" #include "xfs_ioctl.h" #include "xfs_xattr.h" #include "xfs_rtbitmap.h" #include "xfs_file.h" #include "xfs_exchrange.h" #include "xfs_handle.h" #include "xfs_rtgroup.h" #include "xfs_healthmon.h" #include "xfs_verify_media.h" #include <linux/mount.h> #include <linux/fileattr.h> /* Return 0 on success or positive error */ int xfs_fsbulkstat_one_fmt( struct xfs_ibulk *breq, const struct xfs_bulkstat *bstat) { struct xfs_bstat bs1; xfs_bulkstat_to_bstat(breq->mp, &bs1, bstat); if (copy_to_user(breq->ubuffer, &bs1, sizeof(bs1))) return -EFAULT; return xfs_ibulk_advance(breq, sizeof(struct xfs_bstat)); } int xfs_fsinumbers_fmt( struct xfs_ibulk *breq, const struct xfs_inumbers *igrp) { struct xfs_inogrp ig1; xfs_inumbers_to_inogrp(&ig1, igrp); if (copy_to_user(breq->ubuffer, &ig1, sizeof(struct xfs_inogrp))) return -EFAULT; return xfs_ibulk_advance(breq, sizeof(struct xfs_inogrp)); } STATIC int xfs_ioc_fsbulkstat( struct file *file, unsigned int cmd, void __user *arg) { struct xfs_mount *mp = XFS_I(file_inode(file))->i_mount; struct xfs_fsop_bulkreq bulkreq; struct xfs_ibulk breq = { .mp = mp, .idmap = file_mnt_idmap(file), .ocount = 0, }; xfs_ino_t lastino; int error; /* done = 1 if there are more stats to get and if bulkstat */ /* should be called again (unused here, but used in dmapi) */ if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (xfs_is_shutdown(mp)) return -EIO; if (copy_from_user(&bulkreq, arg, sizeof(struct xfs_fsop_bulkreq))) return -EFAULT; if (copy_from_user(&lastino, bulkreq.lastip, sizeof(__s64))) return -EFAULT; if (bulkreq.icount <= 0) return -EINVAL; if (bulkreq.ubuffer == NULL) return -EINVAL; breq.ubuffer = bulkreq.ubuffer; breq.icount = bulkreq.icount; /* * FSBULKSTAT_SINGLE expects that *lastip contains the inode number * that we want to stat. However, FSINUMBERS and FSBULKSTAT expect * that *lastip contains either zero or the number of the last inode to * be examined by the previous call and return results starting with * the next inode after that. The new bulk request back end functions * take the inode to start with, so we have to compute the startino * parameter from lastino to maintain correct function. lastino == 0 * is a special case because it has traditionally meant "first inode * in filesystem". */ if (cmd == XFS_IOC_FSINUMBERS) { breq.startino = lastino ? lastino + 1 : 0; error = xfs_inumbers(&breq, xfs_fsinumbers_fmt); lastino = breq.startino - 1; } else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE) { breq.startino = lastino; breq.icount = 1; error = xfs_bulkstat_one(&breq, xfs_fsbulkstat_one_fmt); } else { /* XFS_IOC_FSBULKSTAT */ breq.startino = lastino ? lastino + 1 : 0; error = xfs_bulkstat(&breq, xfs_fsbulkstat_one_fmt); lastino = breq.startino - 1; } if (error) return error; if (bulkreq.lastip != NULL && copy_to_user(bulkreq.lastip, &lastino, sizeof(xfs_ino_t))) return -EFAULT; if (bulkreq.ocount != NULL && copy_to_user(bulkreq.ocount, &breq.ocount, sizeof(__s32))) return -EFAULT; return 0; } /* Return 0 on success or positive error */ static int xfs_bulkstat_fmt( struct xfs_ibulk *breq, const struct xfs_bulkstat *bstat) { if (copy_to_user(breq->ubuffer, bstat, sizeof(struct xfs_bulkstat))) return -EFAULT; return xfs_ibulk_advance(breq, sizeof(struct xfs_bulkstat)); } /* * Check the incoming bulk request @hdr from userspace and initialize the * internal @breq bulk request appropriately. Returns 0 if the bulk request * should proceed; -ECANCELED if there's nothing to do; or the usual * negative error code. */ static int xfs_bulk_ireq_setup( struct xfs_mount *mp, const struct xfs_bulk_ireq *hdr, struct xfs_ibulk *breq, void __user *ubuffer) { if (hdr->icount == 0 || (hdr->flags & ~XFS_BULK_IREQ_FLAGS_ALL) || memchr_inv(hdr->reserved, 0, sizeof(hdr->reserved))) return -EINVAL; breq->startino = hdr->ino; breq->ubuffer = ubuffer; breq->icount = hdr->icount; breq->ocount = 0; breq->flags = 0; /* * The @ino parameter is a special value, so we must look it up here. * We're not allowed to have IREQ_AGNO, and we only return one inode * worth of data. */ if (hdr->flags & XFS_BULK_IREQ_SPECIAL) { if (hdr->flags & XFS_BULK_IREQ_AGNO) return -EINVAL; switch (hdr->ino) { case XFS_BULK_IREQ_SPECIAL_ROOT: breq->startino = mp->m_sb.sb_rootino; break; default: return -EINVAL; } breq->icount = 1; } /* * The IREQ_AGNO flag means that we only want results from a given AG. * If @hdr->ino is zero, we start iterating in that AG. If @hdr->ino is * beyond the specified AG then we return no results. */ if (hdr->flags & XFS_BULK_IREQ_AGNO) { if (hdr->agno >= mp->m_sb.sb_agcount) return -EINVAL; if (breq->startino == 0) breq->startino = XFS_AGINO_TO_INO(mp, hdr->agno, 0); else if (XFS_INO_TO_AGNO(mp, breq->startino) < hdr->agno) return -EINVAL; breq->iwalk_flags |= XFS_IWALK_SAME_AG; /* Asking for an inode past the end of the AG? We're done! */ if (XFS_INO_TO_AGNO(mp, breq->startino) > hdr->agno) return -ECANCELED; } else if (hdr->agno) return -EINVAL; /* Asking for an inode past the end of the FS? We're done! */ if (XFS_INO_TO_AGNO(mp, breq->startino) >= mp->m_sb.sb_agcount) return -ECANCELED; if (hdr->flags & XFS_BULK_IREQ_NREXT64) breq->flags |= XFS_IBULK_NREXT64; /* Caller wants to see metadata directories in bulkstat output. */ if (hdr->flags & XFS_BULK_IREQ_METADIR) breq->flags |= XFS_IBULK_METADIR; return 0; } /* * Update the userspace bulk request @hdr to reflect the end state of the * internal bulk request @breq. */ static void xfs_bulk_ireq_teardown( struct xfs_bulk_ireq *hdr, struct xfs_ibulk *breq) { hdr->ino = breq->startino; hdr->ocount = breq->ocount; } /* Handle the v5 bulkstat ioctl. */ STATIC int xfs_ioc_bulkstat( struct file *file, unsigned int cmd, struct xfs_bulkstat_req __user *arg) { struct xfs_mount *mp = XFS_I(file_inode(file))->i_mount; struct xfs_bulk_ireq hdr; struct xfs_ibulk breq = { .mp = mp, .idmap = file_mnt_idmap(file), }; int error; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (xfs_is_shutdown(mp)) return -EIO; if (copy_from_user(&hdr, &arg->hdr, sizeof(hdr))) return -EFAULT; error = xfs_bulk_ireq_setup(mp, &hdr, &breq, arg->bulkstat); if (error == -ECANCELED) goto out_teardown; if (error < 0) return error; error = xfs_bulkstat(&breq, xfs_bulkstat_fmt); if (error) return error; out_teardown: xfs_bulk_ireq_teardown(&hdr, &breq); if (copy_to_user(&arg->hdr, &hdr, sizeof(hdr))) return -EFAULT; return 0; } STATIC int xfs_inumbers_fmt( struct xfs_ibulk *breq, const struct xfs_inumbers *igrp) { if (copy_to_user(breq->ubuffer, igrp, sizeof(struct xfs_inumbers))) return -EFAULT; return xfs_ibulk_advance(breq, sizeof(struct xfs_inumbers)); } /* Handle the v5 inumbers ioctl. */ STATIC int xfs_ioc_inumbers( struct xfs_mount *mp, unsigned int cmd, struct xfs_inumbers_req __user *arg) { struct xfs_bulk_ireq hdr; struct xfs_ibulk breq = { .mp = mp, }; int error; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (xfs_is_shutdown(mp)) return -EIO; if (copy_from_user(&hdr, &arg->hdr, sizeof(hdr))) return -EFAULT; if (hdr.flags & XFS_BULK_IREQ_METADIR) return -EINVAL; error = xfs_bulk_ireq_setup(mp, &hdr, &breq, arg->inumbers); if (error == -ECANCELED) goto out_teardown; if (error < 0) return error; error = xfs_inumbers(&breq, xfs_inumbers_fmt); if (error) return error; out_teardown: xfs_bulk_ireq_teardown(&hdr, &breq); if (copy_to_user(&arg->hdr, &hdr, sizeof(hdr))) return -EFAULT; return 0; } STATIC int xfs_ioc_fsgeometry( struct xfs_mount *mp, void __user *arg, int struct_version) { struct xfs_fsop_geom fsgeo; size_t len; xfs_fs_geometry(mp, &fsgeo, struct_version); if (struct_version <= 3) len = sizeof(struct xfs_fsop_geom_v1); else if (struct_version == 4) len = sizeof(struct xfs_fsop_geom_v4); else { xfs_fsop_geom_health(mp, &fsgeo); len = sizeof(fsgeo); } if (copy_to_user(arg, &fsgeo, len)) return -EFAULT; return 0; } STATIC int xfs_ioc_ag_geometry( struct xfs_mount *mp, void __user *arg) { struct xfs_perag *pag; struct xfs_ag_geometry ageo; int error; if (copy_from_user(&ageo, arg, sizeof(ageo))) return -EFAULT; if (ageo.ag_flags) return -EINVAL; if (memchr_inv(&ageo.ag_reserved, 0, sizeof(ageo.ag_reserved))) return -EINVAL; pag = xfs_perag_get(mp, ageo.ag_number); if (!pag) return -EINVAL; error = xfs_ag_get_geometry(pag, &ageo); xfs_perag_put(pag); if (error) return error; if (copy_to_user(arg, &ageo, sizeof(ageo))) return -EFAULT; return 0; } STATIC int xfs_ioc_rtgroup_geometry( struct xfs_mount *mp, void __user *arg) { struct xfs_rtgroup *rtg; struct xfs_rtgroup_geometry rgeo; int error; if (copy_from_user(&rgeo, arg, sizeof(rgeo))) return -EFAULT; if (rgeo.rg_flags) return -EINVAL; if (memchr_inv(&rgeo.rg_reserved, 0, sizeof(rgeo.rg_reserved))) return -EINVAL; if (!xfs_has_rtgroups(mp)) return -EINVAL; rtg = xfs_rtgroup_get(mp, rgeo.rg_number); if (!rtg) return -EINVAL; error = xfs_rtgroup_get_geometry(rtg, &rgeo); xfs_rtgroup_put(rtg); if (error) return error; if (copy_to_user(arg, &rgeo, sizeof(rgeo))) return -EFAULT; return 0; } /* * Linux extended inode flags interface. */ static void xfs_fill_fsxattr( struct xfs_inode *ip, int whichfork, struct file_kattr *fa) { struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); fileattr_fill_xflags(fa, xfs_ip2xflags(ip)); if (ip->i_diflags & XFS_DIFLAG_EXTSIZE) { fa->fsx_extsize = XFS_FSB_TO_B(mp, ip->i_extsize); } else if (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) { /* * Don't let a misaligned extent size hint on a directory * escape to userspace if it won't pass the setattr checks * later. */ if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) && xfs_extlen_to_rtxmod(mp, ip->i_extsize) > 0) { fa->fsx_xflags &= ~(FS_XFLAG_EXTSIZE | FS_XFLAG_EXTSZINHERIT); fa->fsx_extsize = 0; } else { fa->fsx_extsize = XFS_FSB_TO_B(mp, ip->i_extsize); } } if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) { /* * Don't let a misaligned CoW extent size hint on a directory * escape to userspace if it won't pass the setattr checks * later. */ if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) && ip->i_cowextsize % mp->m_sb.sb_rextsize > 0) { fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE; fa->fsx_cowextsize = 0; } else { fa->fsx_cowextsize = XFS_FSB_TO_B(mp, ip->i_cowextsize); } } fa->fsx_projid = ip->i_projid; if (ifp && !xfs_need_iread_extents(ifp)) fa->fsx_nextents = xfs_iext_count(ifp); else fa->fsx_nextents = xfs_ifork_nextents(ifp); } STATIC int xfs_ioc_fsgetxattra( xfs_inode_t *ip, void __user *arg) { struct file_kattr fa; xfs_ilock(ip, XFS_ILOCK_SHARED); xfs_fill_fsxattr(ip, XFS_ATTR_FORK, &fa); xfs_iunlock(ip, XFS_ILOCK_SHARED); return copy_fsxattr_to_user(&fa, arg); } int xfs_fileattr_get( struct dentry *dentry, struct file_kattr *fa) { struct xfs_inode *ip = XFS_I(d_inode(dentry)); xfs_ilock(ip, XFS_ILOCK_SHARED); xfs_fill_fsxattr(ip, XFS_DATA_FORK, fa); xfs_iunlock(ip, XFS_ILOCK_SHARED); return 0; } static int xfs_ioctl_setattr_xflags( struct xfs_trans *tp, struct xfs_inode *ip, struct file_kattr *fa) { struct xfs_mount *mp = ip->i_mount; bool rtflag = (fa->fsx_xflags & FS_XFLAG_REALTIME); uint64_t i_flags2; if (rtflag != XFS_IS_REALTIME_INODE(ip)) { /* Can't change realtime flag if any extents are allocated. */ if (xfs_inode_has_filedata(ip)) return -EINVAL; /* * If S_DAX is enabled on this file, we can only switch the * device if both support fsdax. We can't update S_DAX because * there might be other threads walking down the access paths. */ if (IS_DAX(VFS_I(ip)) && (mp->m_ddev_targp->bt_daxdev == NULL || (mp->m_rtdev_targp && mp->m_rtdev_targp->bt_daxdev == NULL))) return -EINVAL; } if (rtflag) { /* If realtime flag is set then must have realtime device */ if (mp->m_sb.sb_rblocks == 0 || mp->m_sb.sb_rextsize == 0 || xfs_extlen_to_rtxmod(mp, ip->i_extsize)) return -EINVAL; } /* diflags2 only valid for v3 inodes. */ i_flags2 = xfs_flags2diflags2(ip, fa->fsx_xflags); if (i_flags2 && !xfs_has_v3inodes(mp)) return -EINVAL; ip->i_diflags = xfs_flags2diflags(ip, fa->fsx_xflags); ip->i_diflags2 = i_flags2; xfs_diflags_to_iflags(ip, false); /* * Make the stable writes flag match that of the device the inode * resides on when flipping the RT flag. */ if (rtflag != XFS_IS_REALTIME_INODE(ip) && S_ISREG(VFS_I(ip)->i_mode)) xfs_update_stable_writes(ip); xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); XFS_STATS_INC(mp, xs_ig_attrchg); return 0; } static void xfs_ioctl_setattr_prepare_dax( struct xfs_inode *ip, struct file_kattr *fa) { struct xfs_mount *mp = ip->i_mount; struct inode *inode = VFS_I(ip); if (S_ISDIR(inode->i_mode)) return; if (xfs_has_dax_always(mp) || xfs_has_dax_never(mp)) return; if (((fa->fsx_xflags & FS_XFLAG_DAX) && !(ip->i_diflags2 & XFS_DIFLAG2_DAX)) || (!(fa->fsx_xflags & FS_XFLAG_DAX) && (ip->i_diflags2 & XFS_DIFLAG2_DAX))) d_mark_dontcache(inode); } /* * Set up the transaction structure for the setattr operation, checking that we * have permission to do so. On success, return a clean transaction and the * inode locked exclusively ready for further operation specific checks. On * failure, return an error without modifying or locking the inode. */ static struct xfs_trans * xfs_ioctl_setattr_get_trans( struct xfs_inode *ip, struct xfs_dquot *pdqp) { struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; int error = -EROFS; if (xfs_is_readonly(mp)) goto out_error; error = -EIO; if (xfs_is_shutdown(mp)) goto out_error; error = xfs_trans_alloc_ichange(ip, NULL, NULL, pdqp, has_capability_noaudit(current, CAP_FOWNER), &tp); if (error) goto out_error; if (xfs_has_wsync(mp)) xfs_trans_set_sync(tp); return tp; out_error: return ERR_PTR(error); } /* * Validate a proposed extent size hint. For regular files, the hint can only * be changed if no extents are allocated. */ static int xfs_ioctl_setattr_check_extsize( struct xfs_inode *ip, struct file_kattr *fa) { struct xfs_mount *mp = ip->i_mount; xfs_failaddr_t failaddr; uint16_t new_diflags; if (!fa->fsx_valid) return 0; if (S_ISREG(VFS_I(ip)->i_mode) && xfs_inode_has_filedata(ip) && XFS_FSB_TO_B(mp, ip->i_extsize) != fa->fsx_extsize) return -EINVAL; if (fa->fsx_extsize & mp->m_blockmask) return -EINVAL; new_diflags = xfs_flags2diflags(ip, fa->fsx_xflags); /* * Inode verifiers do not check that the extent size hint is an integer * multiple of the rt extent size on a directory with both rtinherit * and extszinherit flags set. Don't let sysadmins misconfigure * directories. */ if ((new_diflags & XFS_DIFLAG_RTINHERIT) && (new_diflags & XFS_DIFLAG_EXTSZINHERIT)) { unsigned int rtextsize_bytes; rtextsize_bytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize); if (fa->fsx_extsize % rtextsize_bytes) return -EINVAL; } failaddr = xfs_inode_validate_extsize(ip->i_mount, XFS_B_TO_FSB(mp, fa->fsx_extsize), VFS_I(ip)->i_mode, new_diflags); return failaddr != NULL ? -EINVAL : 0; } static int xfs_ioctl_setattr_check_cowextsize( struct xfs_inode *ip, struct file_kattr *fa) { struct xfs_mount *mp = ip->i_mount; xfs_failaddr_t failaddr; uint64_t new_diflags2; uint16_t new_diflags; if (!fa->fsx_valid) return 0; if (fa->fsx_cowextsize & mp->m_blockmask) return -EINVAL; new_diflags = xfs_flags2diflags(ip, fa->fsx_xflags); new_diflags2 = xfs_flags2diflags2(ip, fa->fsx_xflags); failaddr = xfs_inode_validate_cowextsize(ip->i_mount, XFS_B_TO_FSB(mp, fa->fsx_cowextsize), VFS_I(ip)->i_mode, new_diflags, new_diflags2); return failaddr != NULL ? -EINVAL : 0; } static int xfs_ioctl_setattr_check_projid( struct xfs_inode *ip, struct file_kattr *fa) { if (!fa->fsx_valid) return 0; /* Disallow 32bit project ids if 32bit IDs are not enabled. */ if (fa->fsx_projid > (uint16_t)-1 && !xfs_has_projid32(ip->i_mount)) return -EINVAL; return 0; } int xfs_fileattr_set( struct mnt_idmap *idmap, struct dentry *dentry, struct file_kattr *fa) { struct xfs_inode *ip = XFS_I(d_inode(dentry)); struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; struct xfs_dquot *pdqp = NULL; struct xfs_dquot *olddquot = NULL; int error; trace_xfs_ioctl_setattr(ip); if (!fa->fsx_valid) { if (fa->flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NOATIME_FL | FS_NODUMP_FL | FS_SYNC_FL | FS_DAX_FL | FS_PROJINHERIT_FL)) return -EOPNOTSUPP; } error = xfs_ioctl_setattr_check_projid(ip, fa); if (error) return error; /* * If disk quotas is on, we make sure that the dquots do exist on disk, * before we start any other transactions. Trying to do this later * is messy. We don't care to take a readlock to look at the ids * in inode here, because we can't hold it across the trans_reserve. * If the IDs do change before we take the ilock, we're covered * because the i_*dquot fields will get updated anyway. */ if (fa->fsx_valid && XFS_IS_QUOTA_ON(mp)) { error = xfs_qm_vop_dqalloc(ip, VFS_I(ip)->i_uid, VFS_I(ip)->i_gid, fa->fsx_projid, XFS_QMOPT_PQUOTA, NULL, NULL, &pdqp); if (error) return error; } xfs_ioctl_setattr_prepare_dax(ip, fa); tp = xfs_ioctl_setattr_get_trans(ip, pdqp); if (IS_ERR(tp)) { error = PTR_ERR(tp); goto error_free_dquots; } error = xfs_ioctl_setattr_check_extsize(ip, fa); if (error) goto error_trans_cancel; error = xfs_ioctl_setattr_check_cowextsize(ip, fa); if (error) goto error_trans_cancel; error = xfs_ioctl_setattr_xflags(tp, ip, fa); if (error) goto error_trans_cancel; if (!fa->fsx_valid) goto skip_xattr; /* * Change file ownership. Must be the owner or privileged. CAP_FSETID * overrides the following restrictions: * * The set-user-ID and set-group-ID bits of a file will be cleared upon * successful return from chown() */ if ((VFS_I(ip)->i_mode & (S_ISUID|S_ISGID)) && !capable_wrt_inode_uidgid(idmap, VFS_I(ip), CAP_FSETID)) VFS_I(ip)->i_mode &= ~(S_ISUID|S_ISGID); /* Change the ownerships and register project quota modifications */ if (ip->i_projid != fa->fsx_projid) { if (XFS_IS_PQUOTA_ON(mp)) { olddquot = xfs_qm_vop_chown(tp, ip, &ip->i_pdquot, pdqp); } ip->i_projid = fa->fsx_projid; } /* * Only set the extent size hint if we've already determined that the * extent size hint should be set on the inode. If no extent size flags * are set on the inode then unconditionally clear the extent size hint. */ if (ip->i_diflags & (XFS_DIFLAG_EXTSIZE | XFS_DIFLAG_EXTSZINHERIT)) ip->i_extsize = XFS_B_TO_FSB(mp, fa->fsx_extsize); else ip->i_extsize = 0; if (xfs_has_v3inodes(mp)) { if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) ip->i_cowextsize = XFS_B_TO_FSB(mp, fa->fsx_cowextsize); else ip->i_cowextsize = 0; } skip_xattr: error = xfs_trans_commit(tp); /* * Release any dquot(s) the inode had kept before chown. */ xfs_qm_dqrele(olddquot); xfs_qm_dqrele(pdqp); return error; error_trans_cancel: xfs_trans_cancel(tp); error_free_dquots: xfs_qm_dqrele(pdqp); return error; } static bool xfs_getbmap_format( struct kgetbmap *p, struct getbmapx __user *u, size_t recsize) { if (put_user(p->bmv_offset, &u->bmv_offset) || put_user(p->bmv_block, &u->bmv_block) || put_user(p->bmv_length, &u->bmv_length) || put_user(0, &u->bmv_count) || put_user(0, &u->bmv_entries)) return false; if (recsize < sizeof(struct getbmapx)) return true; if (put_user(0, &u->bmv_iflags) || put_user(p->bmv_oflags, &u->bmv_oflags) || put_user(0, &u->bmv_unused1) || put_user(0, &u->bmv_unused2)) return false; return true; } STATIC int xfs_ioc_getbmap( struct file *file, unsigned int cmd, void __user *arg) { struct getbmapx bmx = { 0 }; struct kgetbmap *buf; size_t recsize; int error, i; switch (cmd) { case XFS_IOC_GETBMAPA: bmx.bmv_iflags = BMV_IF_ATTRFORK; fallthrough; case XFS_IOC_GETBMAP: /* struct getbmap is a strict subset of struct getbmapx. */ recsize = sizeof(struct getbmap); break; case XFS_IOC_GETBMAPX: recsize = sizeof(struct getbmapx); break; default: return -EINVAL; } if (copy_from_user(&bmx, arg, recsize)) return -EFAULT; if (bmx.bmv_count < 2) return -EINVAL; if (bmx.bmv_count >= INT_MAX / recsize) return -ENOMEM; buf = kvzalloc_objs(*buf, bmx.bmv_count); if (!buf) return -ENOMEM; error = xfs_getbmap(XFS_I(file_inode(file)), &bmx, buf); if (error) goto out_free_buf; error = -EFAULT; if (copy_to_user(arg, &bmx, recsize)) goto out_free_buf; arg += recsize; for (i = 0; i < bmx.bmv_entries; i++) { if (!xfs_getbmap_format(buf + i, arg, recsize)) goto out_free_buf; arg += recsize; } error = 0; out_free_buf: kvfree(buf); return error; } int xfs_ioc_swapext( xfs_swapext_t *sxp) { xfs_inode_t *ip, *tip; /* Pull information for the target fd */ CLASS(fd, f)((int)sxp->sx_fdtarget); if (fd_empty(f)) return -EINVAL; if (!(fd_file(f)->f_mode & FMODE_WRITE) || !(fd_file(f)->f_mode & FMODE_READ) || (fd_file(f)->f_flags & O_APPEND)) return -EBADF; CLASS(fd, tmp)((int)sxp->sx_fdtmp); if (fd_empty(tmp)) return -EINVAL; if (!(fd_file(tmp)->f_mode & FMODE_WRITE) || !(fd_file(tmp)->f_mode & FMODE_READ) || (fd_file(tmp)->f_flags & O_APPEND)) return -EBADF; if (IS_SWAPFILE(file_inode(fd_file(f))) || IS_SWAPFILE(file_inode(fd_file(tmp)))) return -EINVAL; /* * We need to ensure that the fds passed in point to XFS inodes * before we cast and access them as XFS structures as we have no * control over what the user passes us here. */ if (fd_file(f)->f_op != &xfs_file_operations || fd_file(tmp)->f_op != &xfs_file_operations) return -EINVAL; ip = XFS_I(file_inode(fd_file(f))); tip = XFS_I(file_inode(fd_file(tmp))); if (ip->i_mount != tip->i_mount) return -EINVAL; if (ip->i_ino == tip->i_ino) return -EINVAL; if (xfs_is_shutdown(ip->i_mount)) return -EIO; return xfs_swap_extents(ip, tip, sxp); } static int xfs_ioc_getlabel( struct xfs_mount *mp, char __user *user_label) { struct xfs_sb *sbp = &mp->m_sb; char label[XFSLABEL_MAX + 1]; /* Paranoia */ BUILD_BUG_ON(sizeof(sbp->sb_fname) > FSLABEL_MAX); /* 1 larger than sb_fname, so this ensures a trailing NUL char */ spin_lock(&mp->m_sb_lock); memtostr_pad(label, sbp->sb_fname); spin_unlock(&mp->m_sb_lock); if (copy_to_user(user_label, label, sizeof(label))) return -EFAULT; return 0; } static int xfs_ioc_setlabel( struct file *filp, struct xfs_mount *mp, char __user *newlabel) { struct xfs_sb *sbp = &mp->m_sb; char label[XFSLABEL_MAX + 1]; size_t len; int error; if (!capable(CAP_SYS_ADMIN)) return -EPERM; /* * The generic ioctl allows up to FSLABEL_MAX chars, but XFS is much * smaller, at 12 bytes. We copy one more to be sure we find the * (required) NULL character to test the incoming label length. * NB: The on disk label doesn't need to be null terminated. */ if (copy_from_user(label, newlabel, XFSLABEL_MAX + 1)) return -EFAULT; len = strnlen(label, XFSLABEL_MAX + 1); if (len > sizeof(sbp->sb_fname)) return -EINVAL; error = mnt_want_write_file(filp); if (error) return error; spin_lock(&mp->m_sb_lock); memset(sbp->sb_fname, 0, sizeof(sbp->sb_fname)); memcpy(sbp->sb_fname, label, len); spin_unlock(&mp->m_sb_lock); /* * Now we do several things to satisfy userspace. * In addition to normal logging of the primary superblock, we also * immediately write these changes to sector zero for the primary, then * update all backup supers (as xfs_db does for a label change), then * invalidate the block device page cache. This is so that any prior * buffered reads from userspace (i.e. from blkid) are invalidated, * and userspace will see the newly-written label. */ error = xfs_sync_sb_buf(mp, true); if (error) goto out; /* * growfs also updates backup supers so lock against that. */ mutex_lock(&mp->m_growlock); error = xfs_update_secondary_sbs(mp); mutex_unlock(&mp->m_growlock); invalidate_bdev(mp->m_ddev_targp->bt_bdev); if (xfs_has_rtsb(mp) && mp->m_rtdev_targp) invalidate_bdev(mp->m_rtdev_targp->bt_bdev); out: mnt_drop_write_file(filp); return error; } static inline int xfs_fs_eofblocks_from_user( struct xfs_fs_eofblocks *src, struct xfs_icwalk *dst) { if (src->eof_version != XFS_EOFBLOCKS_VERSION) return -EINVAL; if (src->eof_flags & ~XFS_EOF_FLAGS_VALID) return -EINVAL; if (memchr_inv(&src->pad32, 0, sizeof(src->pad32)) || memchr_inv(src->pad64, 0, sizeof(src->pad64))) return -EINVAL; dst->icw_flags = 0; if (src->eof_flags & XFS_EOF_FLAGS_SYNC) dst->icw_flags |= XFS_ICWALK_FLAG_SYNC; if (src->eof_flags & XFS_EOF_FLAGS_UID) dst->icw_flags |= XFS_ICWALK_FLAG_UID; if (src->eof_flags & XFS_EOF_FLAGS_GID) dst->icw_flags |= XFS_ICWALK_FLAG_GID; if (src->eof_flags & XFS_EOF_FLAGS_PRID) dst->icw_flags |= XFS_ICWALK_FLAG_PRID; if (src->eof_flags & XFS_EOF_FLAGS_MINFILESIZE) dst->icw_flags |= XFS_ICWALK_FLAG_MINFILESIZE; dst->icw_prid = src->eof_prid; dst->icw_min_file_size = src->eof_min_file_size; dst->icw_uid = INVALID_UID; if (src->eof_flags & XFS_EOF_FLAGS_UID) { dst->icw_uid = make_kuid(current_user_ns(), src->eof_uid); if (!uid_valid(dst->icw_uid)) return -EINVAL; } dst->icw_gid = INVALID_GID; if (src->eof_flags & XFS_EOF_FLAGS_GID) { dst->icw_gid = make_kgid(current_user_ns(), src->eof_gid); if (!gid_valid(dst->icw_gid)) return -EINVAL; } return 0; } static int xfs_ioctl_getset_resblocks( struct file *filp, unsigned int cmd, void __user *arg) { struct xfs_mount *mp = XFS_I(file_inode(filp))->i_mount; struct xfs_fsop_resblks fsop = { }; int error; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (cmd == XFS_IOC_SET_RESBLKS) { if (xfs_is_readonly(mp)) return -EROFS; if (copy_from_user(&fsop, arg, sizeof(fsop))) return -EFAULT; error = mnt_want_write_file(filp); if (error) return error; error = xfs_reserve_blocks(mp, XC_FREE_BLOCKS, fsop.resblks); mnt_drop_write_file(filp); if (error) return error; } spin_lock(&mp->m_sb_lock); fsop.resblks = mp->m_free[XC_FREE_BLOCKS].res_total; fsop.resblks_avail = mp->m_free[XC_FREE_BLOCKS].res_avail; spin_unlock(&mp->m_sb_lock); if (copy_to_user(arg, &fsop, sizeof(fsop))) return -EFAULT; return 0; } static int xfs_ioctl_fs_counts( struct xfs_mount *mp, struct xfs_fsop_counts __user *uarg) { struct xfs_fsop_counts out = { .allocino = percpu_counter_read_positive(&mp->m_icount), .freeino = percpu_counter_read_positive(&mp->m_ifree), .freedata = xfs_estimate_freecounter(mp, XC_FREE_BLOCKS) - xfs_freecounter_unavailable(mp, XC_FREE_BLOCKS), .freertx = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS), }; if (copy_to_user(uarg, &out, sizeof(out))) return -EFAULT; return 0; } /* * These long-unused ioctls were removed from the official ioctl API in 5.17, * but retain these definitions so that we can log warnings about them. */ #define XFS_IOC_ALLOCSP _IOW ('X', 10, struct xfs_flock64) #define XFS_IOC_FREESP _IOW ('X', 11, struct xfs_flock64) #define XFS_IOC_ALLOCSP64 _IOW ('X', 36, struct xfs_flock64) #define XFS_IOC_FREESP64 _IOW ('X', 37, struct xfs_flock64) /* * Note: some of the ioctl's return positive numbers as a * byte count indicating success, such as readlink_by_handle. * So we don't "sign flip" like most other routines. This means * true errors need to be returned as a negative value. */ long xfs_file_ioctl( struct file *filp, unsigned int cmd, unsigned long p) { struct inode *inode = file_inode(filp); struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; void __user *arg = (void __user *)p; int error; trace_xfs_file_ioctl(ip); switch (cmd) { case FITRIM: return xfs_ioc_trim(mp, arg); case FS_IOC_GETFSLABEL: return xfs_ioc_getlabel(mp, arg); case FS_IOC_SETFSLABEL: return xfs_ioc_setlabel(filp, mp, arg); case XFS_IOC_ALLOCSP: case XFS_IOC_FREESP: case XFS_IOC_ALLOCSP64: case XFS_IOC_FREESP64: xfs_warn_once(mp, "%s should use fallocate; XFS_IOC_{ALLOC,FREE}SP ioctl unsupported", current->comm); return -ENOTTY; case XFS_IOC_DIOINFO: { struct kstat st; struct dioattr da; error = vfs_getattr(&filp->f_path, &st, STATX_DIOALIGN, 0); if (error) return error; /* * Some userspace directly feeds the return value to * posix_memalign, which fails for values that are smaller than * the pointer size. Round up the value to not break userspace. */ da.d_mem = roundup(st.dio_mem_align, sizeof(void *)); da.d_miniosz = st.dio_offset_align; da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1); if (copy_to_user(arg, &da, sizeof(da))) return -EFAULT; return 0; } case XFS_IOC_FSBULKSTAT_SINGLE: case XFS_IOC_FSBULKSTAT: case XFS_IOC_FSINUMBERS: return xfs_ioc_fsbulkstat(filp, cmd, arg); case XFS_IOC_BULKSTAT: return xfs_ioc_bulkstat(filp, cmd, arg); case XFS_IOC_INUMBERS: return xfs_ioc_inumbers(mp, cmd, arg); case XFS_IOC_FSGEOMETRY_V1: return xfs_ioc_fsgeometry(mp, arg, 3); case XFS_IOC_FSGEOMETRY_V4: return xfs_ioc_fsgeometry(mp, arg, 4); case XFS_IOC_FSGEOMETRY: return xfs_ioc_fsgeometry(mp, arg, 5); case XFS_IOC_AG_GEOMETRY: return xfs_ioc_ag_geometry(mp, arg); case XFS_IOC_RTGROUP_GEOMETRY: return xfs_ioc_rtgroup_geometry(mp, arg); case XFS_IOC_GETVERSION: return put_user(inode->i_generation, (int __user *)arg); case XFS_IOC_FSGETXATTRA: return xfs_ioc_fsgetxattra(ip, arg); case XFS_IOC_GETPARENTS: return xfs_ioc_getparents(filp, arg); case XFS_IOC_GETPARENTS_BY_HANDLE: return xfs_ioc_getparents_by_handle(filp, arg); case XFS_IOC_GETBMAP: case XFS_IOC_GETBMAPA: case XFS_IOC_GETBMAPX: return xfs_ioc_getbmap(filp, cmd, arg); case FS_IOC_GETFSMAP: return xfs_ioc_getfsmap(ip, arg); case XFS_IOC_SCRUBV_METADATA: return xfs_ioc_scrubv_metadata(filp, arg); case XFS_IOC_SCRUB_METADATA: return xfs_ioc_scrub_metadata(filp, arg); case XFS_IOC_FD_TO_HANDLE: case XFS_IOC_PATH_TO_HANDLE: case XFS_IOC_PATH_TO_FSHANDLE: { xfs_fsop_handlereq_t hreq; if (copy_from_user(&hreq, arg, sizeof(hreq))) return -EFAULT; return xfs_find_handle(cmd, &hreq); } case XFS_IOC_OPEN_BY_HANDLE: { xfs_fsop_handlereq_t hreq; if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t))) return -EFAULT; return xfs_open_by_handle(filp, &hreq); } case XFS_IOC_READLINK_BY_HANDLE: { xfs_fsop_handlereq_t hreq; if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t))) return -EFAULT; return xfs_readlink_by_handle(filp, &hreq); } case XFS_IOC_ATTRLIST_BY_HANDLE: return xfs_attrlist_by_handle(filp, arg); case XFS_IOC_ATTRMULTI_BY_HANDLE: return xfs_attrmulti_by_handle(filp, arg); case XFS_IOC_SWAPEXT: { struct xfs_swapext sxp; if (copy_from_user(&sxp, arg, sizeof(xfs_swapext_t))) return -EFAULT; error = mnt_want_write_file(filp); if (error) return error; error = xfs_ioc_swapext(&sxp); mnt_drop_write_file(filp); return error; } case XFS_IOC_FSCOUNTS: return xfs_ioctl_fs_counts(mp, arg); case XFS_IOC_SET_RESBLKS: case XFS_IOC_GET_RESBLKS: return xfs_ioctl_getset_resblocks(filp, cmd, arg); case XFS_IOC_FSGROWFSDATA: { struct xfs_growfs_data in; if (copy_from_user(&in, arg, sizeof(in))) return -EFAULT; error = mnt_want_write_file(filp); if (error) return error; error = xfs_growfs_data(mp, &in); mnt_drop_write_file(filp); return error; } case XFS_IOC_FSGROWFSLOG: { struct xfs_growfs_log in; if (copy_from_user(&in, arg, sizeof(in))) return -EFAULT; error = mnt_want_write_file(filp); if (error) return error; error = xfs_growfs_log(mp, &in); mnt_drop_write_file(filp); return error; } case XFS_IOC_FSGROWFSRT: { xfs_growfs_rt_t in; if (copy_from_user(&in, arg, sizeof(in))) return -EFAULT; error = mnt_want_write_file(filp); if (error) return error; error = xfs_growfs_rt(mp, &in); mnt_drop_write_file(filp); return error; } case XFS_IOC_GOINGDOWN: { uint32_t in; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (get_user(in, (uint32_t __user *)arg)) return -EFAULT; return xfs_fs_goingdown(mp, in); } case XFS_IOC_ERROR_INJECTION: { xfs_error_injection_t in; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (copy_from_user(&in, arg, sizeof(in))) return -EFAULT; return xfs_errortag_add(mp, in.errtag); } case XFS_IOC_ERROR_CLEARALL: if (!capable(CAP_SYS_ADMIN)) return -EPERM; return xfs_errortag_clearall(mp); case XFS_IOC_FREE_EOFBLOCKS: { struct xfs_fs_eofblocks eofb; struct xfs_icwalk icw; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (xfs_is_readonly(mp)) return -EROFS; if (copy_from_user(&eofb, arg, sizeof(eofb))) return -EFAULT; error = xfs_fs_eofblocks_from_user(&eofb, &icw); if (error) return error; trace_xfs_ioc_free_eofblocks(mp, &icw, _RET_IP_); guard(super_write)(mp->m_super); return xfs_blockgc_free_space(mp, &icw); } case XFS_IOC_EXCHANGE_RANGE: return xfs_ioc_exchange_range(filp, arg); case XFS_IOC_START_COMMIT: return xfs_ioc_start_commit(filp, arg); case XFS_IOC_COMMIT_RANGE: return xfs_ioc_commit_range(filp, arg); case XFS_IOC_HEALTH_MONITOR: return xfs_ioc_health_monitor(filp, arg); case XFS_IOC_VERIFY_MEDIA: return xfs_ioc_verify_media(filp, arg); default: return -ENOTTY; } }
8416 8416 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PAGE_H #define _ASM_X86_PAGE_H #include <linux/types.h> #ifdef __KERNEL__ #include <asm/page_types.h> #ifdef CONFIG_X86_64 #include <asm/page_64.h> #else #include <asm/page_32.h> #endif /* CONFIG_X86_64 */ #ifndef __ASSEMBLER__ struct page; #include <linux/range.h> extern struct range pfn_mapped[]; extern int nr_pfn_mapped; static inline void copy_user_page(void *to, void *from, unsigned long vaddr, struct page *topage) { copy_page(to, from); } #define vma_alloc_zeroed_movable_folio(vma, vaddr) \ vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr) #ifndef __pa #define __pa(x) __phys_addr((unsigned long)(x)) #endif #define __pa_nodebug(x) __phys_addr_nodebug((unsigned long)(x)) /* __pa_symbol should be used for C visible symbols. This seems to be the official gcc blessed way to do such arithmetic. */ /* * We need __phys_reloc_hide() here because gcc may assume that there is no * overflow during __pa() calculation and can optimize it unexpectedly. * Newer versions of gcc provide -fno-strict-overflow switch to handle this * case properly. Once all supported versions of gcc understand it, we can * remove this Voodoo magic stuff. (i.e. once gcc3.x is deprecated) */ #define __pa_symbol(x) \ __phys_addr_symbol(__phys_reloc_hide((unsigned long)(x))) #ifndef __va #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) #endif #define __boot_va(x) __va(x) #define __boot_pa(x) __pa(x) /* * virt_to_page(kaddr) returns a valid pointer if and only if * virt_addr_valid(kaddr) returns true. */ #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) extern bool __virt_addr_valid(unsigned long kaddr); #define virt_addr_valid(kaddr) __virt_addr_valid((unsigned long) (kaddr)) static __always_inline void *pfn_to_kaddr(unsigned long pfn) { return __va(pfn << PAGE_SHIFT); } static __always_inline u64 __canonical_address(u64 vaddr, u8 vaddr_bits) { return ((s64)vaddr << (64 - vaddr_bits)) >> (64 - vaddr_bits); } static __always_inline u64 __is_canonical_address(u64 vaddr, u8 vaddr_bits) { return __canonical_address(vaddr, vaddr_bits) == vaddr; } #endif /* __ASSEMBLER__ */ #include <asm-generic/memory_model.h> #include <asm-generic/getorder.h> #define HAVE_ARCH_HUGETLB_UNMAPPED_AREA #endif /* __KERNEL__ */ #endif /* _ASM_X86_PAGE_H */
2 2 2 2 2 3 2 3 2 2 2 2 2 2 2 216 216 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 // SPDX-License-Identifier: GPL-2.0 /* * drivers/base/devres.c - device resource management * * Copyright (c) 2006 SUSE Linux Products GmbH * Copyright (c) 2006 Tejun Heo <teheo@suse.de> */ #include <linux/device.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/percpu.h> #include <asm/sections.h> #include "base.h" #include "trace.h" struct devres_node { struct list_head entry; dr_release_t release; const char *name; size_t size; }; struct devres { struct devres_node node; /* * Some archs want to perform DMA into kmalloc caches * and need a guaranteed alignment larger than * the alignment of a 64-bit integer. * Thus we use ARCH_DMA_MINALIGN for data[] which will force the same * alignment for struct devres when allocated by kmalloc(). */ u8 __aligned(ARCH_DMA_MINALIGN) data[]; }; struct devres_group { struct devres_node node[2]; void *id; int color; /* -- 8 pointers */ }; static void set_node_dbginfo(struct devres_node *node, const char *name, size_t size) { node->name = name; node->size = size; } #ifdef CONFIG_DEBUG_DEVRES static int log_devres = 0; module_param_named(log, log_devres, int, S_IRUGO | S_IWUSR); static void devres_dbg(struct device *dev, struct devres_node *node, const char *op) { if (unlikely(log_devres)) dev_err(dev, "DEVRES %3s %p %s (%zu bytes)\n", op, node, node->name, node->size); } #else /* CONFIG_DEBUG_DEVRES */ #define devres_dbg(dev, node, op) do {} while (0) #endif /* CONFIG_DEBUG_DEVRES */ static void devres_log(struct device *dev, struct devres_node *node, const char *op) { trace_devres_log(dev, op, node, node->name, node->size); devres_dbg(dev, node, op); } /* * Release functions for devres group. These callbacks are used only * for identification. */ static void group_open_release(struct device *dev, void *res) { /* noop */ } static void group_close_release(struct device *dev, void *res) { /* noop */ } static struct devres_group *node_to_group(struct devres_node *node) { if (node->release == &group_open_release) return container_of(node, struct devres_group, node[0]); if (node->release == &group_close_release) return container_of(node, struct devres_group, node[1]); return NULL; } static bool check_dr_size(size_t size, size_t *tot_size) { /* We must catch any near-SIZE_MAX cases that could overflow. */ if (unlikely(check_add_overflow(sizeof(struct devres), size, tot_size))) return false; /* Actually allocate the full kmalloc bucket size. */ *tot_size = kmalloc_size_roundup(*tot_size); return true; } static __always_inline struct devres *alloc_dr(dr_release_t release, size_t size, gfp_t gfp, int nid) { size_t tot_size; struct devres *dr; if (!check_dr_size(size, &tot_size)) return NULL; dr = kmalloc_node_track_caller(tot_size, gfp, nid); if (unlikely(!dr)) return NULL; /* No need to clear memory twice */ if (!(gfp & __GFP_ZERO)) memset(dr, 0, offsetof(struct devres, data)); INIT_LIST_HEAD(&dr->node.entry); dr->node.release = release; return dr; } static void add_dr(struct device *dev, struct devres_node *node) { devres_log(dev, node, "ADD"); BUG_ON(!list_empty(&node->entry)); list_add_tail(&node->entry, &dev->devres_head); } static void replace_dr(struct device *dev, struct devres_node *old, struct devres_node *new) { devres_log(dev, old, "REPLACE"); BUG_ON(!list_empty(&new->entry)); list_replace(&old->entry, &new->entry); } /** * __devres_alloc_node - Allocate device resource data * @release: Release function devres will be associated with * @size: Allocation size * @gfp: Allocation flags * @nid: NUMA node * @name: Name of the resource * * Allocate devres of @size bytes. The allocated area is zeroed, then * associated with @release. The returned pointer can be passed to * other devres_*() functions. * * RETURNS: * Pointer to allocated devres on success, NULL on failure. */ void *__devres_alloc_node(dr_release_t release, size_t size, gfp_t gfp, int nid, const char *name) { struct devres *dr; dr = alloc_dr(release, size, gfp | __GFP_ZERO, nid); if (unlikely(!dr)) return NULL; set_node_dbginfo(&dr->node, name, size); return dr->data; } EXPORT_SYMBOL_GPL(__devres_alloc_node); /** * devres_for_each_res - Resource iterator * @dev: Device to iterate resource from * @release: Look for resources associated with this release function * @match: Match function (optional) * @match_data: Data for the match function * @fn: Function to be called for each matched resource. * @data: Data for @fn, the 3rd parameter of @fn * * Call @fn for each devres of @dev which is associated with @release * and for which @match returns 1. * * RETURNS: * void */ void devres_for_each_res(struct device *dev, dr_release_t release, dr_match_t match, void *match_data, void (*fn)(struct device *, void *, void *), void *data) { struct devres_node *node; struct devres_node *tmp; unsigned long flags; if (!fn) return; spin_lock_irqsave(&dev->devres_lock, flags); list_for_each_entry_safe_reverse(node, tmp, &dev->devres_head, entry) { struct devres *dr = container_of(node, struct devres, node); if (node->release != release) continue; if (match && !match(dev, dr->data, match_data)) continue; fn(dev, dr->data, data); } spin_unlock_irqrestore(&dev->devres_lock, flags); } EXPORT_SYMBOL_GPL(devres_for_each_res); /** * devres_free - Free device resource data * @res: Pointer to devres data to free * * Free devres created with devres_alloc(). */ void devres_free(void *res) { if (res) { struct devres *dr = container_of(res, struct devres, data); BUG_ON(!list_empty(&dr->node.entry)); kfree(dr); } } EXPORT_SYMBOL_GPL(devres_free); /** * devres_add - Register device resource * @dev: Device to add resource to * @res: Resource to register * * Register devres @res to @dev. @res should have been allocated * using devres_alloc(). On driver detach, the associated release * function will be invoked and devres will be freed automatically. */ void devres_add(struct device *dev, void *res) { struct devres *dr = container_of(res, struct devres, data); unsigned long flags; spin_lock_irqsave(&dev->devres_lock, flags); add_dr(dev, &dr->node); spin_unlock_irqrestore(&dev->devres_lock, flags); } EXPORT_SYMBOL_GPL(devres_add); static struct devres *find_dr(struct device *dev, dr_release_t release, dr_match_t match, void *match_data) { struct devres_node *node; list_for_each_entry_reverse(node, &dev->devres_head, entry) { struct devres *dr = container_of(node, struct devres, node); if (node->release != release) continue; if (match && !match(dev, dr->data, match_data)) continue; return dr; } return NULL; } /** * devres_find - Find device resource * @dev: Device to lookup resource from * @release: Look for resources associated with this release function * @match: Match function (optional) * @match_data: Data for the match function * * Find the latest devres of @dev which is associated with @release * and for which @match returns 1. If @match is NULL, it's considered * to match all. * * RETURNS: * Pointer to found devres, NULL if not found. */ void *devres_find(struct device *dev, dr_release_t release, dr_match_t match, void *match_data) { struct devres *dr; unsigned long flags; spin_lock_irqsave(&dev->devres_lock, flags); dr = find_dr(dev, release, match, match_data); spin_unlock_irqrestore(&dev->devres_lock, flags); if (dr) return dr->data; return NULL; } EXPORT_SYMBOL_GPL(devres_find); /** * devres_get - Find devres, if non-existent, add one atomically * @dev: Device to lookup or add devres for * @new_res: Pointer to new initialized devres to add if not found * @match: Match function (optional) * @match_data: Data for the match function * * Find the latest devres of @dev which has the same release function * as @new_res and for which @match return 1. If found, @new_res is * freed; otherwise, @new_res is added atomically. * * RETURNS: * Pointer to found or added devres. */ void *devres_get(struct device *dev, void *new_res, dr_match_t match, void *match_data) { struct devres *new_dr = container_of(new_res, struct devres, data); struct devres *dr; unsigned long flags; spin_lock_irqsave(&dev->devres_lock, flags); dr = find_dr(dev, new_dr->node.release, match, match_data); if (!dr) { add_dr(dev, &new_dr->node); dr = new_dr; new_res = NULL; } spin_unlock_irqrestore(&dev->devres_lock, flags); devres_free(new_res); return dr->data; } EXPORT_SYMBOL_GPL(devres_get); /** * devres_remove - Find a device resource and remove it * @dev: Device to find resource from * @release: Look for resources associated with this release function * @match: Match function (optional) * @match_data: Data for the match function * * Find the latest devres of @dev associated with @release and for * which @match returns 1. If @match is NULL, it's considered to * match all. If found, the resource is removed atomically and * returned. * * RETURNS: * Pointer to removed devres on success, NULL if not found. */ void *devres_remove(struct device *dev, dr_release_t release, dr_match_t match, void *match_data) { struct devres *dr; unsigned long flags; spin_lock_irqsave(&dev->devres_lock, flags); dr = find_dr(dev, release, match, match_data); if (dr) { list_del_init(&dr->node.entry); devres_log(dev, &dr->node, "REM"); } spin_unlock_irqrestore(&dev->devres_lock, flags); if (dr) return dr->data; return NULL; } EXPORT_SYMBOL_GPL(devres_remove); /** * devres_destroy - Find a device resource and destroy it * @dev: Device to find resource from * @release: Look for resources associated with this release function * @match: Match function (optional) * @match_data: Data for the match function * * Find the latest devres of @dev associated with @release and for * which @match returns 1. If @match is NULL, it's considered to * match all. If found, the resource is removed atomically and freed. * * Note that the release function for the resource will not be called, * only the devres-allocated data will be freed. The caller becomes * responsible for freeing any other data. * * RETURNS: * 0 if devres is found and freed, -ENOENT if not found. */ int devres_destroy(struct device *dev, dr_release_t release, dr_match_t match, void *match_data) { void *res; res = devres_remove(dev, release, match, match_data); if (unlikely(!res)) return -ENOENT; devres_free(res); return 0; } EXPORT_SYMBOL_GPL(devres_destroy); /** * devres_release - Find a device resource and destroy it, calling release * @dev: Device to find resource from * @release: Look for resources associated with this release function * @match: Match function (optional) * @match_data: Data for the match function * * Find the latest devres of @dev associated with @release and for * which @match returns 1. If @match is NULL, it's considered to * match all. If found, the resource is removed atomically, the * release function called and the resource freed. * * RETURNS: * 0 if devres is found and freed, -ENOENT if not found. */ int devres_release(struct device *dev, dr_release_t release, dr_match_t match, void *match_data) { void *res; res = devres_remove(dev, release, match, match_data); if (unlikely(!res)) return -ENOENT; (*release)(dev, res); devres_free(res); return 0; } EXPORT_SYMBOL_GPL(devres_release); static int remove_nodes(struct device *dev, struct list_head *first, struct list_head *end, struct list_head *todo) { struct devres_node *node, *n; int cnt = 0, nr_groups = 0; /* First pass - move normal devres entries to @todo and clear * devres_group colors. */ node = list_entry(first, struct devres_node, entry); list_for_each_entry_safe_from(node, n, end, entry) { struct devres_group *grp; grp = node_to_group(node); if (grp) { /* clear color of group markers in the first pass */ grp->color = 0; nr_groups++; } else { /* regular devres entry */ if (&node->entry == first) first = first->next; list_move_tail(&node->entry, todo); cnt++; } } if (!nr_groups) return cnt; /* Second pass - Scan groups and color them. A group gets * color value of two iff the group is wholly contained in * [current node, end). That is, for a closed group, both opening * and closing markers should be in the range, while just the * opening marker is enough for an open group. */ node = list_entry(first, struct devres_node, entry); list_for_each_entry_safe_from(node, n, end, entry) { struct devres_group *grp; grp = node_to_group(node); BUG_ON(!grp || list_empty(&grp->node[0].entry)); grp->color++; if (list_empty(&grp->node[1].entry)) grp->color++; BUG_ON(grp->color <= 0 || grp->color > 2); if (grp->color == 2) { /* No need to update current node or end. The removed * nodes are always before both. */ list_move_tail(&grp->node[0].entry, todo); list_del_init(&grp->node[1].entry); } } return cnt; } static void release_nodes(struct device *dev, struct list_head *todo) { struct devres *dr, *tmp; /* Release. Note that both devres and devres_group are * handled as devres in the following loop. This is safe. */ list_for_each_entry_safe_reverse(dr, tmp, todo, node.entry) { devres_log(dev, &dr->node, "REL"); dr->node.release(dev, dr->data); kfree(dr); } } /** * devres_release_all - Release all managed resources * @dev: Device to release resources for * * Release all resources associated with @dev. This function is * called on driver detach. */ int devres_release_all(struct device *dev) { unsigned long flags; LIST_HEAD(todo); int cnt; /* Looks like an uninitialized device structure */ if (WARN_ON(dev->devres_head.next == NULL)) return -ENODEV; /* Nothing to release if list is empty */ if (list_empty(&dev->devres_head)) return 0; spin_lock_irqsave(&dev->devres_lock, flags); cnt = remove_nodes(dev, dev->devres_head.next, &dev->devres_head, &todo); spin_unlock_irqrestore(&dev->devres_lock, flags); release_nodes(dev, &todo); return cnt; } /** * devres_open_group - Open a new devres group * @dev: Device to open devres group for * @id: Separator ID * @gfp: Allocation flags * * Open a new devres group for @dev with @id. For @id, using a * pointer to an object which won't be used for another group is * recommended. If @id is NULL, address-wise unique ID is created. * * RETURNS: * ID of the new group, NULL on failure. */ void *devres_open_group(struct device *dev, void *id, gfp_t gfp) { struct devres_group *grp; unsigned long flags; grp = kmalloc_obj(*grp, gfp); if (unlikely(!grp)) return NULL; grp->node[0].release = &group_open_release; grp->node[1].release = &group_close_release; INIT_LIST_HEAD(&grp->node[0].entry); INIT_LIST_HEAD(&grp->node[1].entry); set_node_dbginfo(&grp->node[0], "grp<", 0); set_node_dbginfo(&grp->node[1], "grp>", 0); grp->id = grp; if (id) grp->id = id; grp->color = 0; spin_lock_irqsave(&dev->devres_lock, flags); add_dr(dev, &grp->node[0]); spin_unlock_irqrestore(&dev->devres_lock, flags); return grp->id; } EXPORT_SYMBOL_GPL(devres_open_group); /* * Find devres group with ID @id. If @id is NULL, look for the latest open * group. */ static struct devres_group *find_group(struct device *dev, void *id) { struct devres_node *node; list_for_each_entry_reverse(node, &dev->devres_head, entry) { struct devres_group *grp; if (node->release != &group_open_release) continue; grp = container_of(node, struct devres_group, node[0]); if (id) { if (grp->id == id) return grp; } else if (list_empty(&grp->node[1].entry)) return grp; } return NULL; } /** * devres_close_group - Close a devres group * @dev: Device to close devres group for * @id: ID of target group, can be NULL * * Close the group identified by @id. If @id is NULL, the latest open * group is selected. */ void devres_close_group(struct device *dev, void *id) { struct devres_group *grp; unsigned long flags; spin_lock_irqsave(&dev->devres_lock, flags); grp = find_group(dev, id); if (grp) add_dr(dev, &grp->node[1]); else WARN_ON(1); spin_unlock_irqrestore(&dev->devres_lock, flags); } EXPORT_SYMBOL_GPL(devres_close_group); /** * devres_remove_group - Remove a devres group * @dev: Device to remove group for * @id: ID of target group, can be NULL * * Remove the group identified by @id. If @id is NULL, the latest * open group is selected. Note that removing a group doesn't affect * any other resources. */ void devres_remove_group(struct device *dev, void *id) { struct devres_group *grp; unsigned long flags; spin_lock_irqsave(&dev->devres_lock, flags); grp = find_group(dev, id); if (grp) { list_del_init(&grp->node[0].entry); list_del_init(&grp->node[1].entry); devres_log(dev, &grp->node[0], "REM"); } else WARN_ON(1); spin_unlock_irqrestore(&dev->devres_lock, flags); kfree(grp); } EXPORT_SYMBOL_GPL(devres_remove_group); /** * devres_release_group - Release resources in a devres group * @dev: Device to release group for * @id: ID of target group, can be NULL * * Release all resources in the group identified by @id. If @id is * NULL, the latest open group is selected. The selected group and * groups properly nested inside the selected group are removed. * * RETURNS: * The number of released non-group resources. */ int devres_release_group(struct device *dev, void *id) { struct devres_group *grp; unsigned long flags; LIST_HEAD(todo); int cnt = 0; spin_lock_irqsave(&dev->devres_lock, flags); grp = find_group(dev, id); if (grp) { struct list_head *first = &grp->node[0].entry; struct list_head *end = &dev->devres_head; if (!list_empty(&grp->node[1].entry)) end = grp->node[1].entry.next; cnt = remove_nodes(dev, first, end, &todo); spin_unlock_irqrestore(&dev->devres_lock, flags); release_nodes(dev, &todo); } else if (list_empty(&dev->devres_head)) { /* * dev is probably dying via devres_release_all(): groups * have already been removed and are on the process of * being released - don't touch and don't warn. */ spin_unlock_irqrestore(&dev->devres_lock, flags); } else { WARN_ON(1); spin_unlock_irqrestore(&dev->devres_lock, flags); } return cnt; } EXPORT_SYMBOL_GPL(devres_release_group); /* * Custom devres actions allow inserting a simple function call * into the teardown sequence. */ struct action_devres { void *data; void (*action)(void *); }; static int devm_action_match(struct device *dev, void *res, void *p) { struct action_devres *devres = res; struct action_devres *target = p; return devres->action == target->action && devres->data == target->data; } static void devm_action_release(struct device *dev, void *res) { struct action_devres *devres = res; devres->action(devres->data); } /** * __devm_add_action() - add a custom action to list of managed resources * @dev: Device that owns the action * @action: Function that should be called * @data: Pointer to data passed to @action implementation * @name: Name of the resource (for debugging purposes) * * This adds a custom action to the list of managed resources so that * it gets executed as part of standard resource unwinding. */ int __devm_add_action(struct device *dev, void (*action)(void *), void *data, const char *name) { struct action_devres *devres; devres = __devres_alloc_node(devm_action_release, sizeof(struct action_devres), GFP_KERNEL, NUMA_NO_NODE, name); if (!devres) return -ENOMEM; devres->data = data; devres->action = action; devres_add(dev, devres); return 0; } EXPORT_SYMBOL_GPL(__devm_add_action); bool devm_is_action_added(struct device *dev, void (*action)(void *), void *data) { struct action_devres devres = { .data = data, .action = action, }; return devres_find(dev, devm_action_release, devm_action_match, &devres); } EXPORT_SYMBOL_GPL(devm_is_action_added); /** * devm_remove_action_nowarn() - removes previously added custom action * @dev: Device that owns the action * @action: Function implementing the action * @data: Pointer to data passed to @action implementation * * Removes instance of @action previously added by devm_add_action(). * Both action and data should match one of the existing entries. * * In contrast to devm_remove_action(), this function does not WARN() if no * entry could have been found. * * This should only be used if the action is contained in an object with * independent lifetime management, e.g. the Devres rust abstraction. * * Causing the warning from regular driver code most likely indicates an abuse * of the devres API. * * Returns: 0 on success, -ENOENT if no entry could have been found. */ int devm_remove_action_nowarn(struct device *dev, void (*action)(void *), void *data) { struct action_devres devres = { .data = data, .action = action, }; return devres_destroy(dev, devm_action_release, devm_action_match, &devres); } EXPORT_SYMBOL_GPL(devm_remove_action_nowarn); /** * devm_release_action() - release previously added custom action * @dev: Device that owns the action * @action: Function implementing the action * @data: Pointer to data passed to @action implementation * * Releases and removes instance of @action previously added by * devm_add_action(). Both action and data should match one of the * existing entries. */ void devm_release_action(struct device *dev, void (*action)(void *), void *data) { struct action_devres devres = { .data = data, .action = action, }; WARN_ON(devres_release(dev, devm_action_release, devm_action_match, &devres)); } EXPORT_SYMBOL_GPL(devm_release_action); /* * Managed kmalloc/kfree */ static void devm_kmalloc_release(struct device *dev, void *res) { /* noop */ } static int devm_kmalloc_match(struct device *dev, void *res, void *data) { return res == data; } /** * devm_kmalloc - Resource-managed kmalloc * @dev: Device to allocate memory for * @size: Allocation size * @gfp: Allocation gfp flags * * Managed kmalloc. Memory allocated with this function is * automatically freed on driver detach. Like all other devres * resources, guaranteed alignment is unsigned long long. * * RETURNS: * Pointer to allocated memory on success, NULL on failure. */ void *devm_kmalloc(struct device *dev, size_t size, gfp_t gfp) { struct devres *dr; if (unlikely(!size)) return ZERO_SIZE_PTR; /* use raw alloc_dr for kmalloc caller tracing */ dr = alloc_dr(devm_kmalloc_release, size, gfp, dev_to_node(dev)); if (unlikely(!dr)) return NULL; /* * This is named devm_kzalloc_release for historical reasons * The initial implementation did not support kmalloc, only kzalloc */ set_node_dbginfo(&dr->node, "devm_kzalloc_release", size); devres_add(dev, dr->data); return dr->data; } EXPORT_SYMBOL_GPL(devm_kmalloc); /** * devm_krealloc - Resource-managed krealloc() * @dev: Device to re-allocate memory for * @ptr: Pointer to the memory chunk to re-allocate * @new_size: New allocation size * @gfp: Allocation gfp flags * * Managed krealloc(). Resizes the memory chunk allocated with devm_kmalloc(). * Behaves similarly to regular krealloc(): if @ptr is NULL or ZERO_SIZE_PTR, * it's the equivalent of devm_kmalloc(). If new_size is zero, it frees the * previously allocated memory and returns ZERO_SIZE_PTR. This function doesn't * change the order in which the release callback for the re-alloc'ed devres * will be called (except when falling back to devm_kmalloc() or when freeing * resources when new_size is zero). The contents of the memory are preserved * up to the lesser of new and old sizes. */ void *devm_krealloc(struct device *dev, void *ptr, size_t new_size, gfp_t gfp) { size_t total_new_size, total_old_size; struct devres *old_dr, *new_dr; unsigned long flags; if (unlikely(!new_size)) { devm_kfree(dev, ptr); return ZERO_SIZE_PTR; } if (unlikely(ZERO_OR_NULL_PTR(ptr))) return devm_kmalloc(dev, new_size, gfp); if (WARN_ON(is_kernel_rodata((unsigned long)ptr))) /* * We cannot reliably realloc a const string returned by * devm_kstrdup_const(). */ return NULL; if (!check_dr_size(new_size, &total_new_size)) return NULL; total_old_size = ksize(container_of(ptr, struct devres, data)); if (total_old_size == 0) { WARN(1, "Pointer doesn't point to dynamically allocated memory."); return NULL; } /* * If new size is smaller or equal to the actual number of bytes * allocated previously - just return the same pointer. */ if (total_new_size <= total_old_size) return ptr; /* * Otherwise: allocate new, larger chunk. We need to allocate before * taking the lock as most probably the caller uses GFP_KERNEL. * alloc_dr() will call check_dr_size() to reserve extra memory * for struct devres automatically, so size @new_size user request * is delivered to it directly as devm_kmalloc() does. */ new_dr = alloc_dr(devm_kmalloc_release, new_size, gfp, dev_to_node(dev)); if (!new_dr) return NULL; /* * The spinlock protects the linked list against concurrent * modifications but not the resource itself. */ spin_lock_irqsave(&dev->devres_lock, flags); old_dr = find_dr(dev, devm_kmalloc_release, devm_kmalloc_match, ptr); if (!old_dr) { spin_unlock_irqrestore(&dev->devres_lock, flags); kfree(new_dr); WARN(1, "Memory chunk not managed or managed by a different device."); return NULL; } replace_dr(dev, &old_dr->node, &new_dr->node); spin_unlock_irqrestore(&dev->devres_lock, flags); /* * We can copy the memory contents after releasing the lock as we're * no longer modifying the list links. */ memcpy(new_dr->data, old_dr->data, total_old_size - offsetof(struct devres, data)); /* * Same for releasing the old devres - it's now been removed from the * list. This is also the reason why we must not use devm_kfree() - the * links are no longer valid. */ kfree(old_dr); return new_dr->data; } EXPORT_SYMBOL_GPL(devm_krealloc); /** * devm_kstrdup - Allocate resource managed space and * copy an existing string into that. * @dev: Device to allocate memory for * @s: the string to duplicate * @gfp: the GFP mask used in the devm_kmalloc() call when * allocating memory * RETURNS: * Pointer to allocated string on success, NULL on failure. */ char *devm_kstrdup(struct device *dev, const char *s, gfp_t gfp) { if (!s) return NULL; return devm_kmemdup(dev, s, strlen(s) + 1, gfp); } EXPORT_SYMBOL_GPL(devm_kstrdup); /** * devm_kstrdup_const - resource managed conditional string duplication * @dev: device for which to duplicate the string * @s: the string to duplicate * @gfp: the GFP mask used in the kmalloc() call when allocating memory * * Strings allocated by devm_kstrdup_const will be automatically freed when * the associated device is detached. * * RETURNS: * Source string if it is in .rodata section otherwise it falls back to * devm_kstrdup. */ const char *devm_kstrdup_const(struct device *dev, const char *s, gfp_t gfp) { if (is_kernel_rodata((unsigned long)s)) return s; return devm_kstrdup(dev, s, gfp); } EXPORT_SYMBOL_GPL(devm_kstrdup_const); /** * devm_kvasprintf - Allocate resource managed space and format a string * into that. * @dev: Device to allocate memory for * @gfp: the GFP mask used in the devm_kmalloc() call when * allocating memory * @fmt: The printf()-style format string * @ap: Arguments for the format string * RETURNS: * Pointer to allocated string on success, NULL on failure. */ char *devm_kvasprintf(struct device *dev, gfp_t gfp, const char *fmt, va_list ap) { unsigned int len; char *p; va_list aq; va_copy(aq, ap); len = vsnprintf(NULL, 0, fmt, aq); va_end(aq); p = devm_kmalloc(dev, len+1, gfp); if (!p) return NULL; vsnprintf(p, len+1, fmt, ap); return p; } EXPORT_SYMBOL(devm_kvasprintf); /** * devm_kasprintf - Allocate resource managed space and format a string * into that. * @dev: Device to allocate memory for * @gfp: the GFP mask used in the devm_kmalloc() call when * allocating memory * @fmt: The printf()-style format string * @...: Arguments for the format string * RETURNS: * Pointer to allocated string on success, NULL on failure. */ char *devm_kasprintf(struct device *dev, gfp_t gfp, const char *fmt, ...) { va_list ap; char *p; va_start(ap, fmt); p = devm_kvasprintf(dev, gfp, fmt, ap); va_end(ap); return p; } EXPORT_SYMBOL_GPL(devm_kasprintf); /** * devm_kfree - Resource-managed kfree * @dev: Device this memory belongs to * @p: Memory to free * * Free memory allocated with devm_kmalloc(). */ void devm_kfree(struct device *dev, const void *p) { int rc; /* * Special cases: pointer to a string in .rodata returned by * devm_kstrdup_const() or NULL/ZERO ptr. */ if (unlikely(is_kernel_rodata((unsigned long)p) || ZERO_OR_NULL_PTR(p))) return; rc = devres_destroy(dev, devm_kmalloc_release, devm_kmalloc_match, (void *)p); WARN_ON(rc); } EXPORT_SYMBOL_GPL(devm_kfree); /** * devm_kmemdup - Resource-managed kmemdup * @dev: Device this memory belongs to * @src: Memory region to duplicate * @len: Memory region length * @gfp: GFP mask to use * * Duplicate region of a memory using resource managed kmalloc */ void *devm_kmemdup(struct device *dev, const void *src, size_t len, gfp_t gfp) { void *p; p = devm_kmalloc(dev, len, gfp); if (p) memcpy(p, src, len); return p; } EXPORT_SYMBOL_GPL(devm_kmemdup); /** * devm_kmemdup_const - conditionally duplicate and manage a region of memory * * @dev: Device this memory belongs to * @src: memory region to duplicate * @len: memory region length, * @gfp: GFP mask to use * * Return: source address if it is in .rodata or the return value of kmemdup() * to which the function falls back otherwise. */ const void * devm_kmemdup_const(struct device *dev, const void *src, size_t len, gfp_t gfp) { if (is_kernel_rodata((unsigned long)src)) return src; return devm_kmemdup(dev, src, len, gfp); } EXPORT_SYMBOL_GPL(devm_kmemdup_const); struct pages_devres { unsigned long addr; unsigned int order; }; static int devm_pages_match(struct device *dev, void *res, void *p) { struct pages_devres *devres = res; struct pages_devres *target = p; return devres->addr == target->addr; } static void devm_pages_release(struct device *dev, void *res) { struct pages_devres *devres = res; free_pages(devres->addr, devres->order); } /** * devm_get_free_pages - Resource-managed __get_free_pages * @dev: Device to allocate memory for * @gfp_mask: Allocation gfp flags * @order: Allocation size is (1 << order) pages * * Managed get_free_pages. Memory allocated with this function is * automatically freed on driver detach. * * RETURNS: * Address of allocated memory on success, 0 on failure. */ unsigned long devm_get_free_pages(struct device *dev, gfp_t gfp_mask, unsigned int order) { struct pages_devres *devres; unsigned long addr; addr = __get_free_pages(gfp_mask, order); if (unlikely(!addr)) return 0; devres = devres_alloc(devm_pages_release, sizeof(struct pages_devres), GFP_KERNEL); if (unlikely(!devres)) { free_pages(addr, order); return 0; } devres->addr = addr; devres->order = order; devres_add(dev, devres); return addr; } EXPORT_SYMBOL_GPL(devm_get_free_pages); /** * devm_free_pages - Resource-managed free_pages * @dev: Device this memory belongs to * @addr: Memory to free * * Free memory allocated with devm_get_free_pages(). Unlike free_pages, * there is no need to supply the @order. */ void devm_free_pages(struct device *dev, unsigned long addr) { struct pages_devres devres = { .addr = addr }; WARN_ON(devres_release(dev, devm_pages_release, devm_pages_match, &devres)); } EXPORT_SYMBOL_GPL(devm_free_pages); static void devm_percpu_release(struct device *dev, void *pdata) { void __percpu *p; p = *(void __percpu **)pdata; free_percpu(p); } /** * __devm_alloc_percpu - Resource-managed alloc_percpu * @dev: Device to allocate per-cpu memory for * @size: Size of per-cpu memory to allocate * @align: Alignment of per-cpu memory to allocate * * Managed alloc_percpu. Per-cpu memory allocated with this function is * automatically freed on driver detach. * * RETURNS: * Pointer to allocated memory on success, NULL on failure. */ void __percpu *__devm_alloc_percpu(struct device *dev, size_t size, size_t align) { void *p; void __percpu *pcpu; pcpu = __alloc_percpu(size, align); if (!pcpu) return NULL; p = devres_alloc(devm_percpu_release, sizeof(void *), GFP_KERNEL); if (!p) { free_percpu(pcpu); return NULL; } *(void __percpu **)p = pcpu; devres_add(dev, p); return pcpu; } EXPORT_SYMBOL_GPL(__devm_alloc_percpu);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SWAPOPS_H #define _LINUX_SWAPOPS_H #include <linux/radix-tree.h> #include <linux/bug.h> #include <linux/mm_types.h> #ifdef CONFIG_MMU #ifdef CONFIG_SWAP #include <linux/swapfile.h> #endif /* CONFIG_SWAP */ /* * swapcache pages are stored in the swapper_space radix tree. We want to * get good packing density in that tree, so the index should be dense in * the low-order bits. * * We arrange the `type' and `offset' fields so that `type' is at the six * high-order bits of the swp_entry_t and `offset' is right-aligned in the * remaining bits. Although `type' itself needs only five bits, we allow for * shmem/tmpfs to shift it all up a further one bit: see swp_to_radix_entry(). * * swp_entry_t's are *never* stored anywhere in their arch-dependent format. */ #define SWP_TYPE_SHIFT (BITS_PER_XA_VALUE - MAX_SWAPFILES_SHIFT) #define SWP_OFFSET_MASK ((1UL << SWP_TYPE_SHIFT) - 1) /* * Definitions only for PFN swap entries (see leafeant_has_pfn()). To * store PFN, we only need SWP_PFN_BITS bits. Each of the pfn swap entries * can use the extra bits to store other information besides PFN. */ #ifdef MAX_PHYSMEM_BITS #define SWP_PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT) #else /* MAX_PHYSMEM_BITS */ #define SWP_PFN_BITS min_t(int, \ sizeof(phys_addr_t) * 8 - PAGE_SHIFT, \ SWP_TYPE_SHIFT) #endif /* MAX_PHYSMEM_BITS */ #define SWP_PFN_MASK (BIT(SWP_PFN_BITS) - 1) /** * Migration swap entry specific bitfield definitions. Layout: * * |----------+--------------------| * | swp_type | swp_offset | * |----------+--------+-+-+-------| * | | resv |D|A| PFN | * |----------+--------+-+-+-------| * * @SWP_MIG_YOUNG_BIT: Whether the page used to have young bit set (bit A) * @SWP_MIG_DIRTY_BIT: Whether the page used to have dirty bit set (bit D) * * Note: A/D bits will be stored in migration entries iff there're enough * free bits in arch specific swp offset. By default we'll ignore A/D bits * when migrating a page. Please refer to migration_entry_supports_ad() * for more information. If there're more bits besides PFN and A/D bits, * they should be reserved and always be zeros. */ #define SWP_MIG_YOUNG_BIT (SWP_PFN_BITS) #define SWP_MIG_DIRTY_BIT (SWP_PFN_BITS + 1) #define SWP_MIG_TOTAL_BITS (SWP_PFN_BITS + 2) #define SWP_MIG_YOUNG BIT(SWP_MIG_YOUNG_BIT) #define SWP_MIG_DIRTY BIT(SWP_MIG_DIRTY_BIT) /* Clear all flags but only keep swp_entry_t related information */ static inline pte_t pte_swp_clear_flags(pte_t pte) { if (pte_swp_exclusive(pte)) pte = pte_swp_clear_exclusive(pte); if (pte_swp_soft_dirty(pte)) pte = pte_swp_clear_soft_dirty(pte); if (pte_swp_uffd_wp(pte)) pte = pte_swp_clear_uffd_wp(pte); return pte; } /* * Store a type+offset into a swp_entry_t in an arch-independent format */ static inline swp_entry_t swp_entry(unsigned long type, pgoff_t offset) { swp_entry_t ret; ret.val = (type << SWP_TYPE_SHIFT) | (offset & SWP_OFFSET_MASK); return ret; } /* * Extract the `type' field from a swp_entry_t. The swp_entry_t is in * arch-independent format */ static inline unsigned swp_type(swp_entry_t entry) { return (entry.val >> SWP_TYPE_SHIFT); } /* * Extract the `offset' field from a swp_entry_t. The swp_entry_t is in * arch-independent format */ static inline pgoff_t swp_offset(swp_entry_t entry) { return entry.val & SWP_OFFSET_MASK; } /* * Convert the arch-independent representation of a swp_entry_t into the * arch-dependent pte representation. */ static inline pte_t swp_entry_to_pte(swp_entry_t entry) { swp_entry_t arch_entry; arch_entry = __swp_entry(swp_type(entry), swp_offset(entry)); return __swp_entry_to_pte(arch_entry); } static inline swp_entry_t radix_to_swp_entry(void *arg) { swp_entry_t entry; entry.val = xa_to_value(arg); return entry; } static inline void *swp_to_radix_entry(swp_entry_t entry) { return xa_mk_value(entry.val); } #if IS_ENABLED(CONFIG_DEVICE_PRIVATE) static inline swp_entry_t make_readable_device_private_entry(pgoff_t offset) { return swp_entry(SWP_DEVICE_READ, offset); } static inline swp_entry_t make_writable_device_private_entry(pgoff_t offset) { return swp_entry(SWP_DEVICE_WRITE, offset); } static inline swp_entry_t make_device_exclusive_entry(pgoff_t offset) { return swp_entry(SWP_DEVICE_EXCLUSIVE, offset); } #else /* CONFIG_DEVICE_PRIVATE */ static inline swp_entry_t make_readable_device_private_entry(pgoff_t offset) { return swp_entry(0, 0); } static inline swp_entry_t make_writable_device_private_entry(pgoff_t offset) { return swp_entry(0, 0); } static inline swp_entry_t make_device_exclusive_entry(pgoff_t offset) { return swp_entry(0, 0); } #endif /* CONFIG_DEVICE_PRIVATE */ #ifdef CONFIG_MIGRATION static inline swp_entry_t make_readable_migration_entry(pgoff_t offset) { return swp_entry(SWP_MIGRATION_READ, offset); } static inline swp_entry_t make_readable_exclusive_migration_entry(pgoff_t offset) { return swp_entry(SWP_MIGRATION_READ_EXCLUSIVE, offset); } static inline swp_entry_t make_writable_migration_entry(pgoff_t offset) { return swp_entry(SWP_MIGRATION_WRITE, offset); } /* * Returns whether the host has large enough swap offset field to support * carrying over pgtable A/D bits for page migrations. The result is * pretty much arch specific. */ static inline bool migration_entry_supports_ad(void) { #ifdef CONFIG_SWAP return swap_migration_ad_supported; #else /* CONFIG_SWAP */ return false; #endif /* CONFIG_SWAP */ } static inline swp_entry_t make_migration_entry_young(swp_entry_t entry) { if (migration_entry_supports_ad()) return swp_entry(swp_type(entry), swp_offset(entry) | SWP_MIG_YOUNG); return entry; } static inline swp_entry_t make_migration_entry_dirty(swp_entry_t entry) { if (migration_entry_supports_ad()) return swp_entry(swp_type(entry), swp_offset(entry) | SWP_MIG_DIRTY); return entry; } extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address); extern void migration_entry_wait_huge(struct vm_area_struct *vma, unsigned long addr, pte_t *pte); #else /* CONFIG_MIGRATION */ static inline swp_entry_t make_readable_migration_entry(pgoff_t offset) { return swp_entry(0, 0); } static inline swp_entry_t make_readable_exclusive_migration_entry(pgoff_t offset) { return swp_entry(0, 0); } static inline swp_entry_t make_writable_migration_entry(pgoff_t offset) { return swp_entry(0, 0); } static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address) { } static inline void migration_entry_wait_huge(struct vm_area_struct *vma, unsigned long addr, pte_t *pte) { } static inline swp_entry_t make_migration_entry_young(swp_entry_t entry) { return entry; } static inline swp_entry_t make_migration_entry_dirty(swp_entry_t entry) { return entry; } #endif /* CONFIG_MIGRATION */ #ifdef CONFIG_MEMORY_FAILURE /* * Support for hardware poisoned pages */ static inline swp_entry_t make_hwpoison_entry(struct page *page) { BUG_ON(!PageLocked(page)); return swp_entry(SWP_HWPOISON, page_to_pfn(page)); } static inline int is_hwpoison_entry(swp_entry_t entry) { return swp_type(entry) == SWP_HWPOISON; } #else static inline swp_entry_t make_hwpoison_entry(struct page *page) { return swp_entry(0, 0); } static inline int is_hwpoison_entry(swp_entry_t swp) { return 0; } #endif typedef unsigned long pte_marker; #define PTE_MARKER_UFFD_WP BIT(0) /* * "Poisoned" here is meant in the very general sense of "future accesses are * invalid", instead of referring very specifically to hardware memory errors. * This marker is meant to represent any of various different causes of this. * * Note that, when encountered by the faulting logic, PTEs with this marker will * result in VM_FAULT_HWPOISON and thus regardless trigger hardware memory error * logic. */ #define PTE_MARKER_POISONED BIT(1) /* * Indicates that, on fault, this PTE will case a SIGSEGV signal to be * sent. This means guard markers behave in effect as if the region were mapped * PROT_NONE, rather than if they were a memory hole or equivalent. */ #define PTE_MARKER_GUARD BIT(2) #define PTE_MARKER_MASK (BIT(3) - 1) static inline swp_entry_t make_pte_marker_entry(pte_marker marker) { return swp_entry(SWP_PTE_MARKER, marker); } static inline pte_t make_pte_marker(pte_marker marker) { return swp_entry_to_pte(make_pte_marker_entry(marker)); } static inline swp_entry_t make_poisoned_swp_entry(void) { return make_pte_marker_entry(PTE_MARKER_POISONED); } static inline swp_entry_t make_guard_swp_entry(void) { return make_pte_marker_entry(PTE_MARKER_GUARD); } struct page_vma_mapped_walk; #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION extern int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, struct page *page); extern void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new); extern void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd); static inline pmd_t swp_entry_to_pmd(swp_entry_t entry) { swp_entry_t arch_entry; arch_entry = __swp_entry(swp_type(entry), swp_offset(entry)); return __swp_entry_to_pmd(arch_entry); } #else /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ static inline int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, struct page *page) { BUILD_BUG(); } static inline void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) { BUILD_BUG(); } static inline void pmd_migration_entry_wait(struct mm_struct *m, pmd_t *p) { } static inline pmd_t swp_entry_to_pmd(swp_entry_t entry) { return __pmd(0); } #endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ #endif /* CONFIG_MMU */ #endif /* _LINUX_SWAPOPS_H */
17 17 5 3 10 87 2 1 1 86 87 87 87 2 84 84 84 41 137 211 211 210 211 73 73 73 73 59 59 40 22 101 59 101 65 39 101 39 39 306 269 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 // SPDX-License-Identifier: GPL-2.0-or-later /* * Written 1992,1993 by Werner Almesberger * 22/11/2000 - Fixed fat_date_unix2dos for dates earlier than 01/01/1980 * and date_dos2unix for date==0 by Igor Zhbanov(bsg@uniyar.ac.ru) * Copyright (C) 2012-2013 Samsung Electronics Co., Ltd. */ #include <linux/time.h> #include <linux/fs.h> #include <linux/slab.h> #include <linux/buffer_head.h> #include <linux/blk_types.h> #include "exfat_raw.h" #include "exfat_fs.h" /* * exfat_fs_error reports a file system problem that might indicate fa data * corruption/inconsistency. Depending on 'errors' mount option the * panic() is called, or error message is printed FAT and nothing is done, * or filesystem is remounted read-only (default behavior). * In case the file system is remounted read-only, it can be made writable * again by remounting it. */ void __exfat_fs_error(struct super_block *sb, int report, const char *fmt, ...) { struct exfat_mount_options *opts = &EXFAT_SB(sb)->options; va_list args; struct va_format vaf; if (report) { va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; exfat_err(sb, "error, %pV", &vaf); va_end(args); } if (opts->errors == EXFAT_ERRORS_PANIC) { panic("exFAT-fs (%s): fs panic from previous error\n", sb->s_id); } else if (opts->errors == EXFAT_ERRORS_RO && !sb_rdonly(sb)) { sb->s_flags |= SB_RDONLY; exfat_err(sb, "Filesystem has been set read-only"); } } #define SECS_PER_MIN (60) #define TIMEZONE_SEC(x) ((x) * 15 * SECS_PER_MIN) static void exfat_adjust_tz(struct timespec64 *ts, u8 tz_off) { if (tz_off <= 0x3F) ts->tv_sec -= TIMEZONE_SEC(tz_off); else /* 0x40 <= (tz_off & 0x7F) <=0x7F */ ts->tv_sec += TIMEZONE_SEC(0x80 - tz_off); } static inline int exfat_tz_offset(struct exfat_sb_info *sbi) { if (sbi->options.sys_tz) return -sys_tz.tz_minuteswest; return sbi->options.time_offset; } /* Convert a EXFAT time/date pair to a UNIX date (seconds since 1 1 70). */ void exfat_get_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts, u8 tz, __le16 time, __le16 date, u8 time_cs) { u16 t = le16_to_cpu(time); u16 d = le16_to_cpu(date); ts->tv_sec = mktime64(1980 + (d >> 9), d >> 5 & 0x000F, d & 0x001F, t >> 11, (t >> 5) & 0x003F, (t & 0x001F) << 1); /* time_cs field represent 0 ~ 199cs(1990 ms) */ if (time_cs) { ts->tv_sec += time_cs / 100; ts->tv_nsec = (time_cs % 100) * 10 * NSEC_PER_MSEC; } else ts->tv_nsec = 0; if (tz & EXFAT_TZ_VALID) /* Adjust timezone to UTC0. */ exfat_adjust_tz(ts, tz & ~EXFAT_TZ_VALID); else ts->tv_sec -= exfat_tz_offset(sbi) * SECS_PER_MIN; } /* Convert linear UNIX date to a EXFAT time/date pair. */ void exfat_set_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts, u8 *tz, __le16 *time, __le16 *date, u8 *time_cs) { struct tm tm; u16 t, d; time64_to_tm(ts->tv_sec, 0, &tm); t = (tm.tm_hour << 11) | (tm.tm_min << 5) | (tm.tm_sec >> 1); d = ((tm.tm_year - 80) << 9) | ((tm.tm_mon + 1) << 5) | tm.tm_mday; *time = cpu_to_le16(t); *date = cpu_to_le16(d); /* time_cs field represent 0 ~ 199cs(1990 ms) */ if (time_cs) *time_cs = (tm.tm_sec & 1) * 100 + ts->tv_nsec / (10 * NSEC_PER_MSEC); /* * Record 00h value for OffsetFromUtc field and 1 value for OffsetValid * to indicate that local time and UTC are the same. */ *tz = EXFAT_TZ_VALID; } /* * The timestamp for access_time has double seconds granularity. * (There is no 10msIncrement field for access_time unlike create/modify_time) * atime also has only a 2-second resolution. */ void exfat_truncate_atime(struct timespec64 *ts) { ts->tv_sec = round_down(ts->tv_sec, 2); ts->tv_nsec = 0; } void exfat_truncate_inode_atime(struct inode *inode) { struct timespec64 atime = inode_get_atime(inode); exfat_truncate_atime(&atime); inode_set_atime_to_ts(inode, atime); } u16 exfat_calc_chksum16(void *data, int len, u16 chksum, int type) { int i; u8 *c = (u8 *)data; for (i = 0; i < len; i++, c++) { if (unlikely(type == CS_DIR_ENTRY && (i == 2 || i == 3))) continue; chksum = ((chksum << 15) | (chksum >> 1)) + *c; } return chksum; } u32 exfat_calc_chksum32(void *data, int len, u32 chksum, int type) { int i; u8 *c = (u8 *)data; for (i = 0; i < len; i++, c++) { if (unlikely(type == CS_BOOT_SECTOR && (i == 106 || i == 107 || i == 112))) continue; chksum = ((chksum << 31) | (chksum >> 1)) + *c; } return chksum; } void exfat_update_bh(struct buffer_head *bh, int sync) { set_buffer_uptodate(bh); mark_buffer_dirty(bh); if (sync) sync_dirty_buffer(bh); } int exfat_update_bhs(struct buffer_head **bhs, int nr_bhs, int sync) { int i, err = 0; for (i = 0; i < nr_bhs; i++) { set_buffer_uptodate(bhs[i]); mark_buffer_dirty(bhs[i]); if (sync) write_dirty_buffer(bhs[i], REQ_SYNC); } for (i = 0; i < nr_bhs && sync; i++) { wait_on_buffer(bhs[i]); if (!err && !buffer_uptodate(bhs[i])) err = -EIO; } return err; } void exfat_chain_set(struct exfat_chain *ec, unsigned int dir, unsigned int size, unsigned char flags) { ec->dir = dir; ec->size = size; ec->flags = flags; } void exfat_chain_dup(struct exfat_chain *dup, struct exfat_chain *ec) { return exfat_chain_set(dup, ec->dir, ec->size, ec->flags); }
8 8 7 8 2 2 2 6 6 6 4 8 8 8 8 1 1 13 6 14 5 13 4 10 14 3 10 2 6 8 14 14 8 1 1 1 1 14 14 4 4 10 9 14 14 12 6 7 1 8 14 4 4 4 1 1 1 1 1 1 3 3 3 3 3 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 // SPDX-License-Identifier: LGPL-2.0+ /* * PCM Plug-In shared (kernel/library) code * Copyright (c) 1999 by Jaroslav Kysela <perex@perex.cz> * Copyright (c) 2000 by Abramo Bagnara <abramo@alsa-project.org> */ #if 0 #define PLUGIN_DEBUG #endif #include <linux/slab.h> #include <linux/time.h> #include <linux/vmalloc.h> #include <sound/core.h> #include <sound/pcm.h> #include <sound/pcm_params.h> #include "pcm_plugin.h" #define snd_pcm_plug_first(plug) ((plug)->runtime->oss.plugin_first) #define snd_pcm_plug_last(plug) ((plug)->runtime->oss.plugin_last) /* * because some cards might have rates "very close", we ignore * all "resampling" requests within +-5% */ static int rate_match(unsigned int src_rate, unsigned int dst_rate) { unsigned int low = (src_rate * 95) / 100; unsigned int high = (src_rate * 105) / 100; return dst_rate >= low && dst_rate <= high; } static int snd_pcm_plugin_alloc(struct snd_pcm_plugin *plugin, snd_pcm_uframes_t frames) { struct snd_pcm_plugin_format *format; ssize_t width; size_t size; unsigned int channel; struct snd_pcm_plugin_channel *c; if (plugin->stream == SNDRV_PCM_STREAM_PLAYBACK) { format = &plugin->src_format; } else { format = &plugin->dst_format; } width = snd_pcm_format_physical_width(format->format); if (width < 0) return width; size = array3_size(frames, format->channels, width); /* check for too large period size once again */ if (size > 1024 * 1024) return -ENOMEM; if (snd_BUG_ON(size % 8)) return -ENXIO; size /= 8; if (plugin->buf_frames < frames) { kvfree(plugin->buf); plugin->buf = kvzalloc(size, GFP_KERNEL); plugin->buf_frames = frames; } if (!plugin->buf) { plugin->buf_frames = 0; return -ENOMEM; } c = plugin->buf_channels; if (plugin->access == SNDRV_PCM_ACCESS_RW_INTERLEAVED) { for (channel = 0; channel < format->channels; channel++, c++) { c->frames = frames; c->enabled = 1; c->wanted = 0; c->area.addr = plugin->buf; c->area.first = channel * width; c->area.step = format->channels * width; } } else if (plugin->access == SNDRV_PCM_ACCESS_RW_NONINTERLEAVED) { if (snd_BUG_ON(size % format->channels)) return -EINVAL; size /= format->channels; for (channel = 0; channel < format->channels; channel++, c++) { c->frames = frames; c->enabled = 1; c->wanted = 0; c->area.addr = plugin->buf + (channel * size); c->area.first = 0; c->area.step = width; } } else return -EINVAL; return 0; } int snd_pcm_plug_alloc(struct snd_pcm_substream *plug, snd_pcm_uframes_t frames) { int err; if (snd_BUG_ON(!snd_pcm_plug_first(plug))) return -ENXIO; if (snd_pcm_plug_stream(plug) == SNDRV_PCM_STREAM_PLAYBACK) { struct snd_pcm_plugin *plugin = snd_pcm_plug_first(plug); while (plugin->next) { if (plugin->dst_frames) frames = plugin->dst_frames(plugin, frames); if ((snd_pcm_sframes_t)frames <= 0) return -ENXIO; plugin = plugin->next; err = snd_pcm_plugin_alloc(plugin, frames); if (err < 0) return err; } } else { struct snd_pcm_plugin *plugin = snd_pcm_plug_last(plug); while (plugin->prev) { if (plugin->src_frames) frames = plugin->src_frames(plugin, frames); if ((snd_pcm_sframes_t)frames <= 0) return -ENXIO; plugin = plugin->prev; err = snd_pcm_plugin_alloc(plugin, frames); if (err < 0) return err; } } return 0; } snd_pcm_sframes_t snd_pcm_plugin_client_channels(struct snd_pcm_plugin *plugin, snd_pcm_uframes_t frames, struct snd_pcm_plugin_channel **channels) { *channels = plugin->buf_channels; return frames; } int snd_pcm_plugin_build(struct snd_pcm_substream *plug, const char *name, struct snd_pcm_plugin_format *src_format, struct snd_pcm_plugin_format *dst_format, size_t extra, struct snd_pcm_plugin **ret) { struct snd_pcm_plugin *plugin; unsigned int channels; if (snd_BUG_ON(!plug)) return -ENXIO; if (snd_BUG_ON(!src_format || !dst_format)) return -ENXIO; plugin = kzalloc(sizeof(*plugin) + extra, GFP_KERNEL); if (plugin == NULL) return -ENOMEM; plugin->name = name; plugin->plug = plug; plugin->stream = snd_pcm_plug_stream(plug); plugin->access = SNDRV_PCM_ACCESS_RW_INTERLEAVED; plugin->src_format = *src_format; plugin->src_width = snd_pcm_format_physical_width(src_format->format); snd_BUG_ON(plugin->src_width <= 0); plugin->dst_format = *dst_format; plugin->dst_width = snd_pcm_format_physical_width(dst_format->format); snd_BUG_ON(plugin->dst_width <= 0); if (plugin->stream == SNDRV_PCM_STREAM_PLAYBACK) channels = src_format->channels; else channels = dst_format->channels; plugin->buf_channels = kzalloc_objs(*plugin->buf_channels, channels); if (plugin->buf_channels == NULL) { snd_pcm_plugin_free(plugin); return -ENOMEM; } plugin->client_channels = snd_pcm_plugin_client_channels; *ret = plugin; return 0; } int snd_pcm_plugin_free(struct snd_pcm_plugin *plugin) { if (! plugin) return 0; if (plugin->private_free) plugin->private_free(plugin); kfree(plugin->buf_channels); kvfree(plugin->buf); kfree(plugin); return 0; } static snd_pcm_sframes_t calc_dst_frames(struct snd_pcm_substream *plug, snd_pcm_sframes_t frames, bool check_size) { struct snd_pcm_plugin *plugin, *plugin_next; plugin = snd_pcm_plug_first(plug); while (plugin && frames > 0) { plugin_next = plugin->next; if (check_size && plugin->buf_frames && frames > plugin->buf_frames) frames = plugin->buf_frames; if (plugin->dst_frames) { frames = plugin->dst_frames(plugin, frames); if (frames < 0) return frames; } plugin = plugin_next; } return frames; } static snd_pcm_sframes_t calc_src_frames(struct snd_pcm_substream *plug, snd_pcm_sframes_t frames, bool check_size) { struct snd_pcm_plugin *plugin, *plugin_prev; plugin = snd_pcm_plug_last(plug); while (plugin && frames > 0) { plugin_prev = plugin->prev; if (plugin->src_frames) { frames = plugin->src_frames(plugin, frames); if (frames < 0) return frames; } if (check_size && plugin->buf_frames && frames > plugin->buf_frames) frames = plugin->buf_frames; plugin = plugin_prev; } return frames; } snd_pcm_sframes_t snd_pcm_plug_client_size(struct snd_pcm_substream *plug, snd_pcm_uframes_t drv_frames) { if (snd_BUG_ON(!plug)) return -ENXIO; switch (snd_pcm_plug_stream(plug)) { case SNDRV_PCM_STREAM_PLAYBACK: return calc_src_frames(plug, drv_frames, false); case SNDRV_PCM_STREAM_CAPTURE: return calc_dst_frames(plug, drv_frames, false); default: snd_BUG(); return -EINVAL; } } snd_pcm_sframes_t snd_pcm_plug_slave_size(struct snd_pcm_substream *plug, snd_pcm_uframes_t clt_frames) { if (snd_BUG_ON(!plug)) return -ENXIO; switch (snd_pcm_plug_stream(plug)) { case SNDRV_PCM_STREAM_PLAYBACK: return calc_dst_frames(plug, clt_frames, false); case SNDRV_PCM_STREAM_CAPTURE: return calc_src_frames(plug, clt_frames, false); default: snd_BUG(); return -EINVAL; } } static int snd_pcm_plug_formats(const struct snd_mask *mask, snd_pcm_format_t format) { struct snd_mask formats = *mask; u64 linfmts = (SNDRV_PCM_FMTBIT_U8 | SNDRV_PCM_FMTBIT_S8 | SNDRV_PCM_FMTBIT_U16_LE | SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_U16_BE | SNDRV_PCM_FMTBIT_S16_BE | SNDRV_PCM_FMTBIT_U24_LE | SNDRV_PCM_FMTBIT_S24_LE | SNDRV_PCM_FMTBIT_U24_BE | SNDRV_PCM_FMTBIT_S24_BE | SNDRV_PCM_FMTBIT_U24_3LE | SNDRV_PCM_FMTBIT_S24_3LE | SNDRV_PCM_FMTBIT_U24_3BE | SNDRV_PCM_FMTBIT_S24_3BE | SNDRV_PCM_FMTBIT_U32_LE | SNDRV_PCM_FMTBIT_S32_LE | SNDRV_PCM_FMTBIT_U32_BE | SNDRV_PCM_FMTBIT_S32_BE); snd_mask_set(&formats, (__force int)SNDRV_PCM_FORMAT_MU_LAW); if (formats.bits[0] & lower_32_bits(linfmts)) formats.bits[0] |= lower_32_bits(linfmts); if (formats.bits[1] & upper_32_bits(linfmts)) formats.bits[1] |= upper_32_bits(linfmts); return snd_mask_test(&formats, (__force int)format); } static const snd_pcm_format_t preferred_formats[] = { SNDRV_PCM_FORMAT_S16_LE, SNDRV_PCM_FORMAT_S16_BE, SNDRV_PCM_FORMAT_U16_LE, SNDRV_PCM_FORMAT_U16_BE, SNDRV_PCM_FORMAT_S24_3LE, SNDRV_PCM_FORMAT_S24_3BE, SNDRV_PCM_FORMAT_U24_3LE, SNDRV_PCM_FORMAT_U24_3BE, SNDRV_PCM_FORMAT_S24_LE, SNDRV_PCM_FORMAT_S24_BE, SNDRV_PCM_FORMAT_U24_LE, SNDRV_PCM_FORMAT_U24_BE, SNDRV_PCM_FORMAT_S32_LE, SNDRV_PCM_FORMAT_S32_BE, SNDRV_PCM_FORMAT_U32_LE, SNDRV_PCM_FORMAT_U32_BE, SNDRV_PCM_FORMAT_S8, SNDRV_PCM_FORMAT_U8 }; snd_pcm_format_t snd_pcm_plug_slave_format(snd_pcm_format_t format, const struct snd_mask *format_mask) { int i; if (snd_mask_test(format_mask, (__force int)format)) return format; if (!snd_pcm_plug_formats(format_mask, format)) return (__force snd_pcm_format_t)-EINVAL; if (snd_pcm_format_linear(format)) { unsigned int width = snd_pcm_format_width(format); int unsignd = snd_pcm_format_unsigned(format) > 0; int big = snd_pcm_format_big_endian(format) > 0; unsigned int badness, best = -1; snd_pcm_format_t best_format = (__force snd_pcm_format_t)-1; for (i = 0; i < ARRAY_SIZE(preferred_formats); i++) { snd_pcm_format_t f = preferred_formats[i]; unsigned int w; if (!snd_mask_test(format_mask, (__force int)f)) continue; w = snd_pcm_format_width(f); if (w >= width) badness = w - width; else badness = width - w + 32; badness += snd_pcm_format_unsigned(f) != unsignd; badness += snd_pcm_format_big_endian(f) != big; if (badness < best) { best_format = f; best = badness; } } if ((__force int)best_format >= 0) return best_format; else return (__force snd_pcm_format_t)-EINVAL; } else { switch (format) { case SNDRV_PCM_FORMAT_MU_LAW: for (i = 0; i < ARRAY_SIZE(preferred_formats); ++i) { snd_pcm_format_t format1 = preferred_formats[i]; if (snd_mask_test(format_mask, (__force int)format1)) return format1; } fallthrough; default: return (__force snd_pcm_format_t)-EINVAL; } } } int snd_pcm_plug_format_plugins(struct snd_pcm_substream *plug, struct snd_pcm_hw_params *params, struct snd_pcm_hw_params *slave_params) { struct snd_pcm_plugin_format tmpformat; struct snd_pcm_plugin_format dstformat; struct snd_pcm_plugin_format srcformat; snd_pcm_access_t src_access, dst_access; struct snd_pcm_plugin *plugin = NULL; int err; int stream = snd_pcm_plug_stream(plug); int slave_interleaved = (params_channels(slave_params) == 1 || params_access(slave_params) == SNDRV_PCM_ACCESS_RW_INTERLEAVED); switch (stream) { case SNDRV_PCM_STREAM_PLAYBACK: dstformat.format = params_format(slave_params); dstformat.rate = params_rate(slave_params); dstformat.channels = params_channels(slave_params); srcformat.format = params_format(params); srcformat.rate = params_rate(params); srcformat.channels = params_channels(params); src_access = SNDRV_PCM_ACCESS_RW_INTERLEAVED; dst_access = (slave_interleaved ? SNDRV_PCM_ACCESS_RW_INTERLEAVED : SNDRV_PCM_ACCESS_RW_NONINTERLEAVED); break; case SNDRV_PCM_STREAM_CAPTURE: dstformat.format = params_format(params); dstformat.rate = params_rate(params); dstformat.channels = params_channels(params); srcformat.format = params_format(slave_params); srcformat.rate = params_rate(slave_params); srcformat.channels = params_channels(slave_params); src_access = (slave_interleaved ? SNDRV_PCM_ACCESS_RW_INTERLEAVED : SNDRV_PCM_ACCESS_RW_NONINTERLEAVED); dst_access = SNDRV_PCM_ACCESS_RW_INTERLEAVED; break; default: snd_BUG(); return -EINVAL; } tmpformat = srcformat; pdprintf("srcformat: format=%i, rate=%i, channels=%i\n", srcformat.format, srcformat.rate, srcformat.channels); pdprintf("dstformat: format=%i, rate=%i, channels=%i\n", dstformat.format, dstformat.rate, dstformat.channels); /* Format change (linearization) */ if (! rate_match(srcformat.rate, dstformat.rate) && ! snd_pcm_format_linear(srcformat.format)) { if (srcformat.format != SNDRV_PCM_FORMAT_MU_LAW) return -EINVAL; tmpformat.format = SNDRV_PCM_FORMAT_S16; err = snd_pcm_plugin_build_mulaw(plug, &srcformat, &tmpformat, &plugin); if (err < 0) return err; err = snd_pcm_plugin_append(plugin); if (err < 0) { snd_pcm_plugin_free(plugin); return err; } srcformat = tmpformat; src_access = dst_access; } /* channels reduction */ if (srcformat.channels > dstformat.channels) { tmpformat.channels = dstformat.channels; err = snd_pcm_plugin_build_route(plug, &srcformat, &tmpformat, &plugin); pdprintf("channels reduction: src=%i, dst=%i returns %i\n", srcformat.channels, tmpformat.channels, err); if (err < 0) return err; err = snd_pcm_plugin_append(plugin); if (err < 0) { snd_pcm_plugin_free(plugin); return err; } srcformat = tmpformat; src_access = dst_access; } /* rate resampling */ if (!rate_match(srcformat.rate, dstformat.rate)) { if (srcformat.format != SNDRV_PCM_FORMAT_S16) { /* convert to S16 for resampling */ tmpformat.format = SNDRV_PCM_FORMAT_S16; err = snd_pcm_plugin_build_linear(plug, &srcformat, &tmpformat, &plugin); if (err < 0) return err; err = snd_pcm_plugin_append(plugin); if (err < 0) { snd_pcm_plugin_free(plugin); return err; } srcformat = tmpformat; src_access = dst_access; } tmpformat.rate = dstformat.rate; err = snd_pcm_plugin_build_rate(plug, &srcformat, &tmpformat, &plugin); pdprintf("rate down resampling: src=%i, dst=%i returns %i\n", srcformat.rate, tmpformat.rate, err); if (err < 0) return err; err = snd_pcm_plugin_append(plugin); if (err < 0) { snd_pcm_plugin_free(plugin); return err; } srcformat = tmpformat; src_access = dst_access; } /* format change */ if (srcformat.format != dstformat.format) { tmpformat.format = dstformat.format; if (srcformat.format == SNDRV_PCM_FORMAT_MU_LAW || tmpformat.format == SNDRV_PCM_FORMAT_MU_LAW) { err = snd_pcm_plugin_build_mulaw(plug, &srcformat, &tmpformat, &plugin); } else if (snd_pcm_format_linear(srcformat.format) && snd_pcm_format_linear(tmpformat.format)) { err = snd_pcm_plugin_build_linear(plug, &srcformat, &tmpformat, &plugin); } else return -EINVAL; pdprintf("format change: src=%i, dst=%i returns %i\n", srcformat.format, tmpformat.format, err); if (err < 0) return err; err = snd_pcm_plugin_append(plugin); if (err < 0) { snd_pcm_plugin_free(plugin); return err; } srcformat = tmpformat; src_access = dst_access; } /* channels extension */ if (srcformat.channels < dstformat.channels) { tmpformat.channels = dstformat.channels; err = snd_pcm_plugin_build_route(plug, &srcformat, &tmpformat, &plugin); pdprintf("channels extension: src=%i, dst=%i returns %i\n", srcformat.channels, tmpformat.channels, err); if (err < 0) return err; err = snd_pcm_plugin_append(plugin); if (err < 0) { snd_pcm_plugin_free(plugin); return err; } srcformat = tmpformat; src_access = dst_access; } /* de-interleave */ if (src_access != dst_access) { err = snd_pcm_plugin_build_copy(plug, &srcformat, &tmpformat, &plugin); pdprintf("interleave change (copy: returns %i)\n", err); if (err < 0) return err; err = snd_pcm_plugin_append(plugin); if (err < 0) { snd_pcm_plugin_free(plugin); return err; } } return 0; } snd_pcm_sframes_t snd_pcm_plug_client_channels_buf(struct snd_pcm_substream *plug, char *buf, snd_pcm_uframes_t count, struct snd_pcm_plugin_channel **channels) { struct snd_pcm_plugin *plugin; struct snd_pcm_plugin_channel *v; struct snd_pcm_plugin_format *format; int width, nchannels, channel; int stream = snd_pcm_plug_stream(plug); if (snd_BUG_ON(!buf)) return -ENXIO; if (stream == SNDRV_PCM_STREAM_PLAYBACK) { plugin = snd_pcm_plug_first(plug); format = &plugin->src_format; } else { plugin = snd_pcm_plug_last(plug); format = &plugin->dst_format; } v = plugin->buf_channels; *channels = v; width = snd_pcm_format_physical_width(format->format); if (width < 0) return width; nchannels = format->channels; if (snd_BUG_ON(plugin->access != SNDRV_PCM_ACCESS_RW_INTERLEAVED && format->channels > 1)) return -ENXIO; for (channel = 0; channel < nchannels; channel++, v++) { v->frames = count; v->enabled = 1; v->wanted = (stream == SNDRV_PCM_STREAM_CAPTURE); v->area.addr = buf; v->area.first = channel * width; v->area.step = nchannels * width; } return count; } snd_pcm_sframes_t snd_pcm_plug_write_transfer(struct snd_pcm_substream *plug, struct snd_pcm_plugin_channel *src_channels, snd_pcm_uframes_t size) { struct snd_pcm_plugin *plugin, *next; struct snd_pcm_plugin_channel *dst_channels; int err; snd_pcm_sframes_t frames = size; plugin = snd_pcm_plug_first(plug); while (plugin) { if (frames <= 0) return frames; next = plugin->next; if (next) { snd_pcm_sframes_t frames1 = frames; if (plugin->dst_frames) { frames1 = plugin->dst_frames(plugin, frames); if (frames1 <= 0) return frames1; } err = next->client_channels(next, frames1, &dst_channels); if (err < 0) return err; if (err != frames1) { frames = err; if (plugin->src_frames) { frames = plugin->src_frames(plugin, frames1); if (frames <= 0) return frames; } } } else dst_channels = NULL; pdprintf("write plugin: %s, %li\n", plugin->name, frames); frames = plugin->transfer(plugin, src_channels, dst_channels, frames); if (frames < 0) return frames; src_channels = dst_channels; plugin = next; } return calc_src_frames(plug, frames, true); } snd_pcm_sframes_t snd_pcm_plug_read_transfer(struct snd_pcm_substream *plug, struct snd_pcm_plugin_channel *dst_channels_final, snd_pcm_uframes_t size) { struct snd_pcm_plugin *plugin, *next; struct snd_pcm_plugin_channel *src_channels, *dst_channels; snd_pcm_sframes_t frames = size; int err; frames = calc_src_frames(plug, frames, true); if (frames < 0) return frames; src_channels = NULL; plugin = snd_pcm_plug_first(plug); while (plugin && frames > 0) { next = plugin->next; if (next) { err = plugin->client_channels(plugin, frames, &dst_channels); if (err < 0) return err; frames = err; } else { dst_channels = dst_channels_final; } pdprintf("read plugin: %s, %li\n", plugin->name, frames); frames = plugin->transfer(plugin, src_channels, dst_channels, frames); if (frames < 0) return frames; plugin = next; src_channels = dst_channels; } return frames; } int snd_pcm_area_silence(const struct snd_pcm_channel_area *dst_area, size_t dst_offset, size_t samples, snd_pcm_format_t format) { /* FIXME: sub byte resolution and odd dst_offset */ unsigned char *dst; unsigned int dst_step; int width; const unsigned char *silence; if (!dst_area->addr) return 0; dst = dst_area->addr + (dst_area->first + dst_area->step * dst_offset) / 8; width = snd_pcm_format_physical_width(format); if (width <= 0) return -EINVAL; if (dst_area->step == (unsigned int) width && width >= 8) return snd_pcm_format_set_silence(format, dst, samples); silence = snd_pcm_format_silence_64(format); if (! silence) return -EINVAL; dst_step = dst_area->step / 8; if (width == 4) { /* Ima ADPCM */ int dstbit = dst_area->first % 8; int dstbit_step = dst_area->step % 8; while (samples-- > 0) { if (dstbit) *dst &= 0xf0; else *dst &= 0x0f; dst += dst_step; dstbit += dstbit_step; if (dstbit == 8) { dst++; dstbit = 0; } } } else { width /= 8; while (samples-- > 0) { memcpy(dst, silence, width); dst += dst_step; } } return 0; } int snd_pcm_area_copy(const struct snd_pcm_channel_area *src_area, size_t src_offset, const struct snd_pcm_channel_area *dst_area, size_t dst_offset, size_t samples, snd_pcm_format_t format) { /* FIXME: sub byte resolution and odd dst_offset */ char *src, *dst; int width; int src_step, dst_step; src = src_area->addr + (src_area->first + src_area->step * src_offset) / 8; if (!src_area->addr) return snd_pcm_area_silence(dst_area, dst_offset, samples, format); dst = dst_area->addr + (dst_area->first + dst_area->step * dst_offset) / 8; if (!dst_area->addr) return 0; width = snd_pcm_format_physical_width(format); if (width <= 0) return -EINVAL; if (src_area->step == (unsigned int) width && dst_area->step == (unsigned int) width && width >= 8) { size_t bytes = samples * width / 8; memcpy(dst, src, bytes); return 0; } src_step = src_area->step / 8; dst_step = dst_area->step / 8; if (width == 4) { /* Ima ADPCM */ int srcbit = src_area->first % 8; int srcbit_step = src_area->step % 8; int dstbit = dst_area->first % 8; int dstbit_step = dst_area->step % 8; while (samples-- > 0) { unsigned char srcval; if (srcbit) srcval = *src & 0x0f; else srcval = (*src & 0xf0) >> 4; if (dstbit) *dst = (*dst & 0xf0) | srcval; else *dst = (*dst & 0x0f) | (srcval << 4); src += src_step; srcbit += srcbit_step; if (srcbit == 8) { src++; srcbit = 0; } dst += dst_step; dstbit += dstbit_step; if (dstbit == 8) { dst++; dstbit = 0; } } } else { width /= 8; while (samples-- > 0) { memcpy(dst, src, width); src += src_step; dst += dst_step; } } return 0; }
3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 // SPDX-License-Identifier: GPL-2.0 /* Copyright 2011-2014 Autronica Fire and Security AS * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se * * Event handling for HSR and PRP devices. */ #include <linux/netdevice.h> #include <net/rtnetlink.h> #include <linux/rculist.h> #include <linux/timer.h> #include <linux/etherdevice.h> #include "hsr_main.h" #include "hsr_device.h" #include "hsr_netlink.h" #include "hsr_framereg.h" #include "hsr_slave.h" static bool hsr_slave_empty(struct hsr_priv *hsr) { struct hsr_port *port; hsr_for_each_port_rtnl(hsr, port) if (port->type != HSR_PT_MASTER) return false; return true; } static int hsr_netdev_notify(struct notifier_block *nb, unsigned long event, void *ptr) { struct hsr_port *port, *master; struct net_device *dev; struct hsr_priv *hsr; LIST_HEAD(list_kill); int mtu_max; int res; dev = netdev_notifier_info_to_dev(ptr); port = hsr_port_get_rtnl(dev); if (!port) { if (!is_hsr_master(dev)) return NOTIFY_DONE; /* Not an HSR device */ hsr = netdev_priv(dev); port = hsr_port_get_hsr(hsr, HSR_PT_MASTER); if (!port) { /* Resend of notification concerning removed device? */ return NOTIFY_DONE; } } else { hsr = port->hsr; } switch (event) { case NETDEV_UP: /* Administrative state DOWN */ case NETDEV_DOWN: /* Administrative state UP */ case NETDEV_CHANGE: /* Link (carrier) state changes */ hsr_check_carrier_and_operstate(hsr); break; case NETDEV_CHANGENAME: if (is_hsr_master(dev)) hsr_debugfs_rename(dev); break; case NETDEV_CHANGEADDR: if (port->type == HSR_PT_MASTER) { /* This should not happen since there's no * ndo_set_mac_address() for HSR devices - i.e. not * supported. */ break; } master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); if (port->type == HSR_PT_SLAVE_A) { eth_hw_addr_set(master->dev, dev->dev_addr); call_netdevice_notifiers(NETDEV_CHANGEADDR, master->dev); if (hsr->prot_version == PRP_V1) { port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_B); if (port) { eth_hw_addr_set(port->dev, dev->dev_addr); call_netdevice_notifiers(NETDEV_CHANGEADDR, port->dev); } } } /* Make sure we recognize frames from ourselves in hsr_rcv() */ port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_B); res = hsr_create_self_node(hsr, master->dev->dev_addr, port ? port->dev->dev_addr : master->dev->dev_addr); if (res) netdev_warn(master->dev, "Could not update HSR node address.\n"); break; case NETDEV_CHANGEMTU: if (port->type == HSR_PT_MASTER) break; /* Handled in ndo_change_mtu() */ mtu_max = hsr_get_max_mtu(port->hsr); master = hsr_port_get_hsr(port->hsr, HSR_PT_MASTER); WRITE_ONCE(master->dev->mtu, mtu_max); break; case NETDEV_UNREGISTER: if (!is_hsr_master(dev)) { master = hsr_port_get_hsr(port->hsr, HSR_PT_MASTER); hsr_del_port(port); if (hsr_slave_empty(master->hsr)) { const struct rtnl_link_ops *ops; ops = master->dev->rtnl_link_ops; ops->dellink(master->dev, &list_kill); unregister_netdevice_many(&list_kill); } } break; case NETDEV_PRE_TYPE_CHANGE: /* HSR works only on Ethernet devices. Refuse slave to change * its type. */ return NOTIFY_BAD; } return NOTIFY_DONE; } struct hsr_port *hsr_port_get_hsr(struct hsr_priv *hsr, enum hsr_port_type pt) { struct hsr_port *port; hsr_for_each_port_rtnl(hsr, port) if (port->type == pt) return port; return NULL; } int hsr_get_version(struct net_device *dev, enum hsr_version *ver) { struct hsr_priv *hsr; hsr = netdev_priv(dev); *ver = hsr->prot_version; return 0; } EXPORT_SYMBOL(hsr_get_version); static struct notifier_block hsr_nb = { .notifier_call = hsr_netdev_notify, /* Slave event notifications */ }; static int __init hsr_init(void) { int err; BUILD_BUG_ON(sizeof(struct hsr_tag) != HSR_HLEN); err = register_netdevice_notifier(&hsr_nb); if (err) return err; err = hsr_netlink_init(); if (err) { unregister_netdevice_notifier(&hsr_nb); return err; } return 0; } static void __exit hsr_exit(void) { hsr_netlink_exit(); hsr_debugfs_remove_root(); unregister_netdevice_notifier(&hsr_nb); } module_init(hsr_init); module_exit(hsr_exit); MODULE_DESCRIPTION("High-availability Seamless Redundancy (HSR) driver"); MODULE_LICENSE("GPL");
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_BRIDGE_NETFILTER_H #define __LINUX_BRIDGE_NETFILTER_H #include <uapi/linux/netfilter_bridge.h> #include <linux/skbuff.h> struct nf_bridge_frag_data { char mac[ETH_HLEN]; bool vlan_present; u16 vlan_tci; __be16 vlan_proto; }; #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb); static inline void br_drop_fake_rtable(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); if (dst && (dst->flags & DST_FAKE_RTABLE)) skb_dst_drop(skb); } static inline struct nf_bridge_info * nf_bridge_info_get(const struct sk_buff *skb) { return skb_ext_find(skb, SKB_EXT_BRIDGE_NF); } static inline bool nf_bridge_info_exists(const struct sk_buff *skb) { return skb_ext_exist(skb, SKB_EXT_BRIDGE_NF); } static inline int nf_bridge_get_physinif(const struct sk_buff *skb) { const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); if (!nf_bridge) return 0; return nf_bridge->physinif; } static inline int nf_bridge_get_physoutif(const struct sk_buff *skb) { const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); if (!nf_bridge) return 0; return nf_bridge->physoutdev ? nf_bridge->physoutdev->ifindex : 0; } static inline struct net_device * nf_bridge_get_physindev(const struct sk_buff *skb, struct net *net) { const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); return nf_bridge ? dev_get_by_index_rcu(net, nf_bridge->physinif) : NULL; } static inline struct net_device * nf_bridge_get_physoutdev(const struct sk_buff *skb) { const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); return nf_bridge ? nf_bridge->physoutdev : NULL; } static inline bool nf_bridge_in_prerouting(const struct sk_buff *skb) { const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); return nf_bridge && nf_bridge->in_prerouting; } #else #define br_drop_fake_rtable(skb) do { } while (0) static inline bool nf_bridge_in_prerouting(const struct sk_buff *skb) { return false; } #endif /* CONFIG_BRIDGE_NETFILTER */ #endif
6 6 4 2 1 3 2 4 1 5 3 2 38 23 11 5 5 1 1 5 9 39 1 38 11 1 11 2 23 38 8 26 5 124 1 93 39 14 5 5 2 1 1 2 4 3 2 20 20 2 18 2 2 3 3 3 2 2 14 13 1 14 14 14 14 14 184 63 63 180 104 82 9 21 63 62 1 180 181 184 184 47 139 14 10 8 8 8 8 66 67 137 137 19 211 7 1 1 8 8 106 103 1 7 152 2 153 95 23 46 8 20 76 95 93 95 95 94 95 41 40 62 94 93 8 30 66 1 90 92 92 1 33 63 68 30 62 35 1 91 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 // SPDX-License-Identifier: GPL-2.0-only /* * * Copyright (C) 2011 Novell Inc. */ #include <linux/fs.h> #include <linux/slab.h> #include <linux/cred.h> #include <linux/xattr.h> #include <linux/ratelimit.h> #include <linux/fiemap.h> #include <linux/fileattr.h> #include <linux/security.h> #include <linux/namei.h> #include <linux/posix_acl.h> #include <linux/posix_acl_xattr.h> #include "overlayfs.h" int ovl_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { int err; struct ovl_fs *ofs = OVL_FS(dentry->d_sb); bool full_copy_up = false; struct dentry *upperdentry; err = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (err) return err; if (attr->ia_valid & ATTR_SIZE) { /* Truncate should trigger data copy up as well */ full_copy_up = true; } if (!full_copy_up) err = ovl_copy_up(dentry); else err = ovl_copy_up_with_data(dentry); if (!err) { struct inode *winode = NULL; upperdentry = ovl_dentry_upper(dentry); if (attr->ia_valid & ATTR_SIZE) { winode = d_inode(upperdentry); err = get_write_access(winode); if (err) goto out; } if (attr->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID)) attr->ia_valid &= ~ATTR_MODE; /* * We might have to translate ovl file into real file object * once use cases emerge. For now, simply don't let underlying * filesystem rely on attr->ia_file */ attr->ia_valid &= ~ATTR_FILE; /* * If open(O_TRUNC) is done, VFS calls ->setattr with ATTR_OPEN * set. Overlayfs does not pass O_TRUNC flag to underlying * filesystem during open -> do not pass ATTR_OPEN. This * disables optimization in fuse which assumes open(O_TRUNC) * already set file size to 0. But we never passed O_TRUNC to * fuse. So by clearing ATTR_OPEN, fuse will be forced to send * setattr request to server. */ attr->ia_valid &= ~ATTR_OPEN; err = ovl_want_write(dentry); if (err) goto out_put_write; inode_lock(upperdentry->d_inode); with_ovl_creds(dentry->d_sb) err = ovl_do_notify_change(ofs, upperdentry, attr); if (!err) ovl_copyattr(dentry->d_inode); inode_unlock(upperdentry->d_inode); ovl_drop_write(dentry); out_put_write: if (winode) put_write_access(winode); } out: return err; } static void ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, int fsid) { struct ovl_fs *ofs = OVL_FS(dentry->d_sb); bool samefs = ovl_same_fs(ofs); unsigned int xinobits = ovl_xino_bits(ofs); unsigned int xinoshift = 64 - xinobits; if (samefs) { /* * When all layers are on the same fs, all real inode * number are unique, so we use the overlay st_dev, * which is friendly to du -x. */ stat->dev = dentry->d_sb->s_dev; return; } else if (xinobits) { /* * All inode numbers of underlying fs should not be using the * high xinobits, so we use high xinobits to partition the * overlay st_ino address space. The high bits holds the fsid * (upper fsid is 0). The lowest xinobit is reserved for mapping * the non-persistent inode numbers range in case of overflow. * This way all overlay inode numbers are unique and use the * overlay st_dev. */ if (likely(!(stat->ino >> xinoshift))) { stat->ino |= ((u64)fsid) << (xinoshift + 1); stat->dev = dentry->d_sb->s_dev; return; } else if (ovl_xino_warn(ofs)) { pr_warn_ratelimited("inode number too big (%pd2, ino=%llu, xinobits=%d)\n", dentry, stat->ino, xinobits); } } /* The inode could not be mapped to a unified st_ino address space */ if (S_ISDIR(dentry->d_inode->i_mode)) { /* * Always use the overlay st_dev for directories, so 'find * -xdev' will scan the entire overlay mount and won't cross the * overlay mount boundaries. * * If not all layers are on the same fs the pair {real st_ino; * overlay st_dev} is not unique, so use the non persistent * overlay st_ino for directories. */ stat->dev = dentry->d_sb->s_dev; stat->ino = dentry->d_inode->i_ino; } else { /* * For non-samefs setup, if we cannot map all layers st_ino * to a unified address space, we need to make sure that st_dev * is unique per underlying fs, so we use the unique anonymous * bdev assigned to the underlying fs. */ stat->dev = ofs->fs[fsid].pseudo_dev; } } static inline int ovl_real_getattr_nosec(struct super_block *sb, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { with_ovl_creds(sb) return vfs_getattr_nosec(path, stat, request_mask, flags); } int ovl_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct dentry *dentry = path->dentry; struct super_block *sb = dentry->d_sb; enum ovl_path_type type; struct path realpath; struct inode *inode = d_inode(dentry); bool is_dir = S_ISDIR(inode->i_mode); int fsid = 0; int err; bool metacopy_blocks = false; metacopy_blocks = ovl_is_metacopy_dentry(dentry); type = ovl_path_real(dentry, &realpath); err = ovl_real_getattr_nosec(sb, &realpath, stat, request_mask, flags); if (err) return err; /* Report the effective immutable/append-only STATX flags */ generic_fill_statx_attr(inode, stat); /* * For non-dir or same fs, we use st_ino of the copy up origin. * This guaranties constant st_dev/st_ino across copy up. * With xino feature and non-samefs, we use st_ino of the copy up * origin masked with high bits that represent the layer id. * * If lower filesystem supports NFS file handles, this also guaranties * persistent st_ino across mount cycle. */ if (!is_dir || ovl_same_dev(OVL_FS(dentry->d_sb))) { if (!OVL_TYPE_UPPER(type)) { fsid = ovl_layer_lower(dentry)->fsid; } else if (OVL_TYPE_ORIGIN(type)) { struct kstat lowerstat; u32 lowermask = STATX_INO | STATX_BLOCKS | (!is_dir ? STATX_NLINK : 0); ovl_path_lower(dentry, &realpath); err = ovl_real_getattr_nosec(sb, &realpath, &lowerstat, lowermask, flags); if (err) return err; /* * Lower hardlinks may be broken on copy up to different * upper files, so we cannot use the lower origin st_ino * for those different files, even for the same fs case. * * Similarly, several redirected dirs can point to the * same dir on a lower layer. With the "verify_lower" * feature, we do not use the lower origin st_ino, if * we haven't verified that this redirect is unique. * * With inodes index enabled, it is safe to use st_ino * of an indexed origin. The index validates that the * upper hardlink is not broken and that a redirected * dir is the only redirect to that origin. */ if (ovl_test_flag(OVL_INDEX, d_inode(dentry)) || (!ovl_verify_lower(dentry->d_sb) && (is_dir || lowerstat.nlink == 1))) { fsid = ovl_layer_lower(dentry)->fsid; stat->ino = lowerstat.ino; } /* * If we are querying a metacopy dentry and lower * dentry is data dentry, then use the blocks we * queried just now. We don't have to do additional * vfs_getattr(). If lower itself is metacopy, then * additional vfs_getattr() is unavoidable. */ if (metacopy_blocks && realpath.dentry == ovl_dentry_lowerdata(dentry)) { stat->blocks = lowerstat.blocks; metacopy_blocks = false; } } if (metacopy_blocks) { /* * If lower is not same as lowerdata or if there was * no origin on upper, we can end up here. * With lazy lowerdata lookup, guess lowerdata blocks * from size to avoid lowerdata lookup on stat(2). */ struct kstat lowerdatastat; u32 lowermask = STATX_BLOCKS; ovl_path_lowerdata(dentry, &realpath); if (realpath.dentry) { err = ovl_real_getattr_nosec(sb, &realpath, &lowerdatastat, lowermask, flags); if (err) return err; } else { lowerdatastat.blocks = round_up(stat->size, stat->blksize) >> 9; } stat->blocks = lowerdatastat.blocks; } } ovl_map_dev_ino(dentry, stat, fsid); /* * It's probably not worth it to count subdirs to get the * correct link count. nlink=1 seems to pacify 'find' and * other utilities. */ if (is_dir && OVL_TYPE_MERGE(type)) stat->nlink = 1; /* * Return the overlay inode nlinks for indexed upper inodes. * Overlay inode nlink counts the union of the upper hardlinks * and non-covered lower hardlinks. It does not include the upper * index hardlink. */ if (!is_dir && ovl_test_flag(OVL_INDEX, d_inode(dentry))) stat->nlink = dentry->d_inode->i_nlink; return err; } int ovl_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { struct inode *upperinode = ovl_inode_upper(inode); struct inode *realinode; struct path realpath; int err; /* Careful in RCU walk mode */ realinode = ovl_i_path_real(inode, &realpath); if (!realinode) { WARN_ON(!(mask & MAY_NOT_BLOCK)); return -ECHILD; } /* * Check overlay inode with the creds of task and underlying inode * with creds of mounter */ err = generic_permission(&nop_mnt_idmap, inode, mask); if (err) return err; if (!upperinode && !special_file(realinode->i_mode) && mask & MAY_WRITE) { mask &= ~(MAY_WRITE | MAY_APPEND); /* Make sure mounter can read file for copy up later */ mask |= MAY_READ; } with_ovl_creds(inode->i_sb) return inode_permission(mnt_idmap(realpath.mnt), realinode, mask); } static const char *ovl_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { if (!dentry) return ERR_PTR(-ECHILD); with_ovl_creds(dentry->d_sb) return vfs_get_link(ovl_dentry_real(dentry), done); } #ifdef CONFIG_FS_POSIX_ACL /* * Apply the idmapping of the layer to POSIX ACLs. The caller must pass a clone * of the POSIX ACLs retrieved from the lower layer to this function to not * alter the POSIX ACLs for the underlying filesystem. */ static void ovl_idmap_posix_acl(const struct inode *realinode, struct mnt_idmap *idmap, struct posix_acl *acl) { struct user_namespace *fs_userns = i_user_ns(realinode); for (unsigned int i = 0; i < acl->a_count; i++) { vfsuid_t vfsuid; vfsgid_t vfsgid; struct posix_acl_entry *e = &acl->a_entries[i]; switch (e->e_tag) { case ACL_USER: vfsuid = make_vfsuid(idmap, fs_userns, e->e_uid); e->e_uid = vfsuid_into_kuid(vfsuid); break; case ACL_GROUP: vfsgid = make_vfsgid(idmap, fs_userns, e->e_gid); e->e_gid = vfsgid_into_kgid(vfsgid); break; } } } /* * The @noperm argument is used to skip permission checking and is a temporary * measure. Quoting Miklos from an earlier discussion: * * > So there are two paths to getting an acl: * > 1) permission checking and 2) retrieving the value via getxattr(2). * > This is a similar situation as reading a symlink vs. following it. * > When following a symlink overlayfs always reads the link on the * > underlying fs just as if it was a readlink(2) call, calling * > security_inode_readlink() instead of security_inode_follow_link(). * > This is logical: we are reading the link from the underlying storage, * > and following it on overlayfs. * > * > Applying the same logic to acl: we do need to call the * > security_inode_getxattr() on the underlying fs, even if just want to * > check permissions on overlay. This is currently not done, which is an * > inconsistency. * > * > Maybe adding the check to ovl_get_acl() is the right way to go, but * > I'm a little afraid of a performance regression. Will look into that. * * Until we have made a decision allow this helper to take the @noperm * argument. We should hopefully be able to remove it soon. */ struct posix_acl *ovl_get_acl_path(const struct path *path, const char *acl_name, bool noperm) { struct posix_acl *real_acl, *clone; struct mnt_idmap *idmap; struct inode *realinode = d_inode(path->dentry); idmap = mnt_idmap(path->mnt); if (noperm) real_acl = get_inode_acl(realinode, posix_acl_type(acl_name)); else real_acl = vfs_get_acl(idmap, path->dentry, acl_name); if (IS_ERR_OR_NULL(real_acl)) return real_acl; if (!is_idmapped_mnt(path->mnt)) return real_acl; /* * We cannot alter the ACLs returned from the relevant layer as that * would alter the cached values filesystem wide for the lower * filesystem. Instead we can clone the ACLs and then apply the * relevant idmapping of the layer. */ clone = posix_acl_clone(real_acl, GFP_KERNEL); posix_acl_release(real_acl); /* release original acl */ if (!clone) return ERR_PTR(-ENOMEM); ovl_idmap_posix_acl(realinode, idmap, clone); return clone; } /* * When the relevant layer is an idmapped mount we need to take the idmapping * of the layer into account and translate any ACL_{GROUP,USER} values * according to the idmapped mount. * * We cannot alter the ACLs returned from the relevant layer as that would * alter the cached values filesystem wide for the lower filesystem. Instead we * can clone the ACLs and then apply the relevant idmapping of the layer. * * This is obviously only relevant when idmapped layers are used. */ struct posix_acl *do_ovl_get_acl(struct mnt_idmap *idmap, struct inode *inode, int type, bool rcu, bool noperm) { struct inode *realinode; struct posix_acl *acl; struct path realpath; /* Careful in RCU walk mode */ realinode = ovl_i_path_real(inode, &realpath); if (!realinode) { WARN_ON(!rcu); return ERR_PTR(-ECHILD); } if (!IS_POSIXACL(realinode)) return NULL; if (rcu) { /* * If the layer is idmapped drop out of RCU path walk * so we can clone the ACLs. */ if (is_idmapped_mnt(realpath.mnt)) return ERR_PTR(-ECHILD); acl = get_cached_acl_rcu(realinode, type); } else { with_ovl_creds(inode->i_sb) acl = ovl_get_acl_path(&realpath, posix_acl_xattr_name(type), noperm); } return acl; } static int ovl_set_or_remove_acl(struct dentry *dentry, struct inode *inode, struct posix_acl *acl, int type) { int err; struct path realpath; const char *acl_name; struct ovl_fs *ofs = OVL_FS(dentry->d_sb); struct dentry *upperdentry = ovl_dentry_upper(dentry); struct dentry *realdentry = upperdentry ?: ovl_dentry_lower(dentry); /* * If ACL is to be removed from a lower file, check if it exists in * the first place before copying it up. */ acl_name = posix_acl_xattr_name(type); if (!acl && !upperdentry) { struct posix_acl *real_acl; ovl_path_lower(dentry, &realpath); with_ovl_creds(dentry->d_sb) real_acl = vfs_get_acl(mnt_idmap(realpath.mnt), realdentry, acl_name); if (IS_ERR(real_acl)) { err = PTR_ERR(real_acl); goto out; } posix_acl_release(real_acl); } if (!upperdentry) { err = ovl_copy_up(dentry); if (err) goto out; realdentry = ovl_dentry_upper(dentry); } err = ovl_want_write(dentry); if (err) goto out; with_ovl_creds(dentry->d_sb) { if (acl) err = ovl_do_set_acl(ofs, realdentry, acl_name, acl); else err = ovl_do_remove_acl(ofs, realdentry, acl_name); } ovl_drop_write(dentry); /* copy c/mtime */ ovl_copyattr(inode); out: return err; } int ovl_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { int err; struct inode *inode = d_inode(dentry); struct dentry *workdir = ovl_workdir(dentry); struct inode *realinode = ovl_inode_real(inode); if (!IS_POSIXACL(d_inode(workdir))) return -EOPNOTSUPP; if (!realinode->i_op->set_acl) return -EOPNOTSUPP; if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) return acl ? -EACCES : 0; if (!inode_owner_or_capable(&nop_mnt_idmap, inode)) return -EPERM; /* * Check if sgid bit needs to be cleared (actual setacl operation will * be done with mounter's capabilities and so that won't do it for us). */ if (unlikely(inode->i_mode & S_ISGID) && type == ACL_TYPE_ACCESS && !in_group_p(inode->i_gid) && !capable_wrt_inode_uidgid(&nop_mnt_idmap, inode, CAP_FSETID)) { struct iattr iattr = { .ia_valid = ATTR_KILL_SGID }; err = ovl_setattr(&nop_mnt_idmap, dentry, &iattr); if (err) return err; } return ovl_set_or_remove_acl(dentry, inode, acl, type); } #endif int ovl_update_time(struct inode *inode, enum fs_update_time type, unsigned int flags) { if (type == FS_UPD_ATIME) { struct ovl_fs *ofs = OVL_FS(inode->i_sb); struct path upperpath = { .mnt = ovl_upper_mnt(ofs), .dentry = ovl_upperdentry_dereference(OVL_I(inode)), }; if (upperpath.dentry) { if (flags & IOCB_NOWAIT) return -EAGAIN; touch_atime(&upperpath); inode_set_atime_to_ts(inode, inode_get_atime(d_inode(upperpath.dentry))); } } return 0; } static int ovl_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { struct inode *realinode = ovl_inode_realdata(inode); if (!realinode) return -EIO; if (!realinode->i_op->fiemap) return -EOPNOTSUPP; with_ovl_creds(inode->i_sb) return realinode->i_op->fiemap(realinode, fieinfo, start, len); } /* * Work around the fact that security_file_ioctl() takes a file argument. * Introducing security_inode_fileattr_get/set() hooks would solve this issue * properly. */ static int ovl_security_fileattr(const struct path *realpath, struct file_kattr *fa, bool set) { struct file *file; unsigned int cmd; int err; unsigned int flags; flags = O_RDONLY; if (force_o_largefile()) flags |= O_LARGEFILE; file = dentry_open(realpath, flags, current_cred()); if (IS_ERR(file)) return PTR_ERR(file); if (set) cmd = fa->fsx_valid ? FS_IOC_FSSETXATTR : FS_IOC_SETFLAGS; else cmd = fa->fsx_valid ? FS_IOC_FSGETXATTR : FS_IOC_GETFLAGS; err = security_file_ioctl(file, cmd, 0); fput(file); return err; } int ovl_real_fileattr_set(const struct path *realpath, struct file_kattr *fa) { int err; err = ovl_security_fileattr(realpath, fa, true); if (err) return err; return vfs_fileattr_set(mnt_idmap(realpath->mnt), realpath->dentry, fa); } int ovl_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); struct path upperpath; unsigned int flags; int err; err = ovl_copy_up(dentry); if (!err) { ovl_path_real(dentry, &upperpath); err = ovl_want_write(dentry); if (err) goto out; with_ovl_creds(inode->i_sb) { /* * Store immutable/append-only flags in xattr and clear them * in upper fileattr (in case they were set by older kernel) * so children of "ovl-immutable" directories lower aliases of * "ovl-immutable" hardlinks could be copied up. * Clear xattr when flags are cleared. */ err = ovl_set_protattr(inode, upperpath.dentry, fa); if (!err) err = ovl_real_fileattr_set(&upperpath, fa); } ovl_drop_write(dentry); /* * Merge real inode flags with inode flags read from * overlay.protattr xattr */ flags = ovl_inode_real(inode)->i_flags & OVL_COPY_I_FLAGS_MASK; BUILD_BUG_ON(OVL_PROT_I_FLAGS_MASK & ~OVL_COPY_I_FLAGS_MASK); flags |= inode->i_flags & OVL_PROT_I_FLAGS_MASK; inode_set_flags(inode, flags, OVL_COPY_I_FLAGS_MASK); /* Update ctime */ ovl_copyattr(inode); } out: return err; } /* Convert inode protection flags to fileattr flags */ static void ovl_fileattr_prot_flags(struct inode *inode, struct file_kattr *fa) { BUILD_BUG_ON(OVL_PROT_FS_FLAGS_MASK & ~FS_COMMON_FL); BUILD_BUG_ON(OVL_PROT_FSX_FLAGS_MASK & ~FS_XFLAG_COMMON); if (inode->i_flags & S_APPEND) { fa->flags |= FS_APPEND_FL; fa->fsx_xflags |= FS_XFLAG_APPEND; } if (inode->i_flags & S_IMMUTABLE) { fa->flags |= FS_IMMUTABLE_FL; fa->fsx_xflags |= FS_XFLAG_IMMUTABLE; } } int ovl_real_fileattr_get(const struct path *realpath, struct file_kattr *fa) { int err; err = ovl_security_fileattr(realpath, fa, false); if (err) return err; err = vfs_fileattr_get(realpath->dentry, fa); if (err == -ENOIOCTLCMD) err = -ENOTTY; return err; } int ovl_fileattr_get(struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); struct path realpath; int err; ovl_path_real(dentry, &realpath); with_ovl_creds(inode->i_sb) err = ovl_real_fileattr_get(&realpath, fa); ovl_fileattr_prot_flags(inode, fa); return err; } static const struct inode_operations ovl_file_inode_operations = { .setattr = ovl_setattr, .permission = ovl_permission, .getattr = ovl_getattr, .listxattr = ovl_listxattr, .get_inode_acl = ovl_get_inode_acl, .get_acl = ovl_get_acl, .set_acl = ovl_set_acl, .update_time = ovl_update_time, .fiemap = ovl_fiemap, .fileattr_get = ovl_fileattr_get, .fileattr_set = ovl_fileattr_set, }; static const struct inode_operations ovl_symlink_inode_operations = { .setattr = ovl_setattr, .get_link = ovl_get_link, .getattr = ovl_getattr, .listxattr = ovl_listxattr, .update_time = ovl_update_time, }; static const struct inode_operations ovl_special_inode_operations = { .setattr = ovl_setattr, .permission = ovl_permission, .getattr = ovl_getattr, .listxattr = ovl_listxattr, .get_inode_acl = ovl_get_inode_acl, .get_acl = ovl_get_acl, .set_acl = ovl_set_acl, .update_time = ovl_update_time, }; static const struct address_space_operations ovl_aops = { /* For O_DIRECT dentry_open() checks f_mapping->a_ops->direct_IO */ .direct_IO = noop_direct_IO, }; /* * It is possible to stack overlayfs instance on top of another * overlayfs instance as lower layer. We need to annotate the * stackable i_mutex locks according to stack level of the super * block instance. An overlayfs instance can never be in stack * depth 0 (there is always a real fs below it). An overlayfs * inode lock will use the lockdep annotation ovl_i_mutex_key[depth]. * * For example, here is a snip from /proc/lockdep_chains after * dir_iterate of nested overlayfs: * * [...] &ovl_i_mutex_dir_key[depth] (stack_depth=2) * [...] &ovl_i_mutex_dir_key[depth]#2 (stack_depth=1) * [...] &type->i_mutex_dir_key (stack_depth=0) * * Locking order w.r.t ovl_want_write() is important for nested overlayfs. * * This chain is valid: * - inode->i_rwsem (inode_lock[2]) * - upper_mnt->mnt_sb->s_writers (ovl_want_write[0]) * - OVL_I(inode)->lock (ovl_inode_lock[2]) * - OVL_I(lowerinode)->lock (ovl_inode_lock[1]) * * And this chain is valid: * - inode->i_rwsem (inode_lock[2]) * - OVL_I(inode)->lock (ovl_inode_lock[2]) * - lowerinode->i_rwsem (inode_lock[1]) * - OVL_I(lowerinode)->lock (ovl_inode_lock[1]) * * But lowerinode->i_rwsem SHOULD NOT be acquired while ovl_want_write() is * held, because it is in reverse order of the non-nested case using the same * upper fs: * - inode->i_rwsem (inode_lock[1]) * - upper_mnt->mnt_sb->s_writers (ovl_want_write[0]) * - OVL_I(inode)->lock (ovl_inode_lock[1]) */ #define OVL_MAX_NESTING FILESYSTEM_MAX_STACK_DEPTH static inline void ovl_lockdep_annotate_inode_mutex_key(struct inode *inode) { #ifdef CONFIG_LOCKDEP static struct lock_class_key ovl_i_mutex_key[OVL_MAX_NESTING]; static struct lock_class_key ovl_i_mutex_dir_key[OVL_MAX_NESTING]; static struct lock_class_key ovl_i_lock_key[OVL_MAX_NESTING]; int depth = inode->i_sb->s_stack_depth - 1; if (WARN_ON_ONCE(depth < 0 || depth >= OVL_MAX_NESTING)) depth = 0; if (S_ISDIR(inode->i_mode)) lockdep_set_class(&inode->i_rwsem, &ovl_i_mutex_dir_key[depth]); else lockdep_set_class(&inode->i_rwsem, &ovl_i_mutex_key[depth]); lockdep_set_class(&OVL_I(inode)->lock, &ovl_i_lock_key[depth]); #endif } static void ovl_next_ino(struct inode *inode) { struct ovl_fs *ofs = OVL_FS(inode->i_sb); inode->i_ino = atomic_long_inc_return(&ofs->last_ino); if (unlikely(!inode->i_ino)) inode->i_ino = atomic_long_inc_return(&ofs->last_ino); } static void ovl_map_ino(struct inode *inode, unsigned long ino, int fsid) { struct ovl_fs *ofs = OVL_FS(inode->i_sb); int xinobits = ovl_xino_bits(ofs); unsigned int xinoshift = 64 - xinobits; /* * When d_ino is consistent with st_ino (samefs or i_ino has enough * bits to encode layer), set the same value used for st_ino to i_ino, * so inode number exposed via /proc/locks and a like will be * consistent with d_ino and st_ino values. An i_ino value inconsistent * with d_ino also causes nfsd readdirplus to fail. */ inode->i_ino = ino; if (ovl_same_fs(ofs)) { return; } else if (xinobits && likely(!(ino >> xinoshift))) { inode->i_ino |= (unsigned long)fsid << (xinoshift + 1); return; } /* * For directory inodes on non-samefs with xino disabled or xino * overflow, we allocate a non-persistent inode number, to be used for * resolving st_ino collisions in ovl_map_dev_ino(). * * To avoid ino collision with legitimate xino values from upper * layer (fsid 0), use the lowest xinobit to map the non * persistent inode numbers to the unified st_ino address space. */ if (S_ISDIR(inode->i_mode)) { ovl_next_ino(inode); if (xinobits) { inode->i_ino &= ~0UL >> xinobits; inode->i_ino |= 1UL << xinoshift; } } } void ovl_inode_init(struct inode *inode, struct ovl_inode_params *oip, unsigned long ino, int fsid) { struct inode *realinode; struct ovl_inode *oi = OVL_I(inode); oi->__upperdentry = oip->upperdentry; oi->oe = oip->oe; oi->redirect = oip->redirect; oi->lowerdata_redirect = oip->lowerdata_redirect; realinode = ovl_inode_real(inode); ovl_copyattr(inode); ovl_copyflags(realinode, inode); ovl_map_ino(inode, ino, fsid); } static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev) { inode->i_mode = mode; inode->i_flags |= S_NOCMTIME; #ifdef CONFIG_FS_POSIX_ACL inode->i_acl = inode->i_default_acl = ACL_DONT_CACHE; #endif ovl_lockdep_annotate_inode_mutex_key(inode); switch (mode & S_IFMT) { case S_IFREG: inode->i_op = &ovl_file_inode_operations; inode->i_fop = &ovl_file_operations; inode->i_mapping->a_ops = &ovl_aops; break; case S_IFDIR: inode->i_op = &ovl_dir_inode_operations; inode->i_fop = &ovl_dir_operations; break; case S_IFLNK: inode->i_op = &ovl_symlink_inode_operations; break; default: inode->i_op = &ovl_special_inode_operations; init_special_inode(inode, mode, rdev); break; } } /* * With inodes index enabled, an overlay inode nlink counts the union of upper * hardlinks and non-covered lower hardlinks. During the lifetime of a non-pure * upper inode, the following nlink modifying operations can happen: * * 1. Lower hardlink copy up * 2. Upper hardlink created, unlinked or renamed over * 3. Lower hardlink whiteout or renamed over * * For the first, copy up case, the union nlink does not change, whether the * operation succeeds or fails, but the upper inode nlink may change. * Therefore, before copy up, we store the union nlink value relative to the * lower inode nlink in the index inode xattr .overlay.nlink. * * For the second, upper hardlink case, the union nlink should be incremented * or decremented IFF the operation succeeds, aligned with nlink change of the * upper inode. Therefore, before link/unlink/rename, we store the union nlink * value relative to the upper inode nlink in the index inode. * * For the last, lower cover up case, we simplify things by preceding the * whiteout or cover up with copy up. This makes sure that there is an index * upper inode where the nlink xattr can be stored before the copied up upper * entry is unlink. */ #define OVL_NLINK_ADD_UPPER (1 << 0) /* * On-disk format for indexed nlink: * * nlink relative to the upper inode - "U[+-]NUM" * nlink relative to the lower inode - "L[+-]NUM" */ static int ovl_set_nlink_common(struct dentry *dentry, struct dentry *realdentry, const char *format) { struct inode *inode = d_inode(dentry); struct inode *realinode = d_inode(realdentry); char buf[13]; int len; len = snprintf(buf, sizeof(buf), format, (int) (inode->i_nlink - realinode->i_nlink)); if (WARN_ON(len >= sizeof(buf))) return -EIO; return ovl_setxattr(OVL_FS(inode->i_sb), ovl_dentry_upper(dentry), OVL_XATTR_NLINK, buf, len); } int ovl_set_nlink_upper(struct dentry *dentry) { return ovl_set_nlink_common(dentry, ovl_dentry_upper(dentry), "U%+i"); } int ovl_set_nlink_lower(struct dentry *dentry) { return ovl_set_nlink_common(dentry, ovl_dentry_lower(dentry), "L%+i"); } unsigned int ovl_get_nlink(struct ovl_fs *ofs, struct dentry *lowerdentry, struct dentry *upperdentry, unsigned int fallback) { int nlink_diff; int nlink; char buf[13]; int err; if (!lowerdentry || !upperdentry || d_inode(lowerdentry)->i_nlink == 1) return fallback; err = ovl_getxattr_upper(ofs, upperdentry, OVL_XATTR_NLINK, &buf, sizeof(buf) - 1); if (err < 0) goto fail; buf[err] = '\0'; if ((buf[0] != 'L' && buf[0] != 'U') || (buf[1] != '+' && buf[1] != '-')) goto fail; err = kstrtoint(buf + 1, 10, &nlink_diff); if (err < 0) goto fail; nlink = d_inode(buf[0] == 'L' ? lowerdentry : upperdentry)->i_nlink; nlink += nlink_diff; if (nlink <= 0) goto fail; return nlink; fail: pr_warn_ratelimited("failed to get index nlink (%pd2, err=%i)\n", upperdentry, err); return fallback; } struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev) { struct inode *inode; inode = new_inode(sb); if (inode) ovl_fill_inode(inode, mode, rdev); return inode; } static int ovl_inode_test(struct inode *inode, void *data) { return inode->i_private == data; } static int ovl_inode_set(struct inode *inode, void *data) { inode->i_private = data; return 0; } static bool ovl_verify_inode(struct inode *inode, struct dentry *lowerdentry, struct dentry *upperdentry, bool strict) { /* * For directories, @strict verify from lookup path performs consistency * checks, so NULL lower/upper in dentry must match NULL lower/upper in * inode. Non @strict verify from NFS handle decode path passes NULL for * 'unknown' lower/upper. */ if (S_ISDIR(inode->i_mode) && strict) { /* Real lower dir moved to upper layer under us? */ if (!lowerdentry && ovl_inode_lower(inode)) return false; /* Lookup of an uncovered redirect origin? */ if (!upperdentry && ovl_inode_upper(inode)) return false; } /* * Allow non-NULL lower inode in ovl_inode even if lowerdentry is NULL. * This happens when finding a copied up overlay inode for a renamed * or hardlinked overlay dentry and lower dentry cannot be followed * by origin because lower fs does not support file handles. */ if (lowerdentry && ovl_inode_lower(inode) != d_inode(lowerdentry)) return false; /* * Allow non-NULL __upperdentry in inode even if upperdentry is NULL. * This happens when finding a lower alias for a copied up hard link. */ if (upperdentry && ovl_inode_upper(inode) != d_inode(upperdentry)) return false; return true; } struct inode *ovl_lookup_inode(struct super_block *sb, struct dentry *real, bool is_upper) { struct inode *inode, *key = d_inode(real); inode = ilookup5(sb, (unsigned long) key, ovl_inode_test, key); if (!inode) return NULL; if (!ovl_verify_inode(inode, is_upper ? NULL : real, is_upper ? real : NULL, false)) { iput(inode); return ERR_PTR(-ESTALE); } return inode; } bool ovl_lookup_trap_inode(struct super_block *sb, struct dentry *dir) { struct inode *key = d_inode(dir); struct inode *trap; bool res; trap = ilookup5(sb, (unsigned long) key, ovl_inode_test, key); if (!trap) return false; res = IS_DEADDIR(trap) && !ovl_inode_upper(trap) && !ovl_inode_lower(trap); iput(trap); return res; } /* * Create an inode cache entry for layer root dir, that will intentionally * fail ovl_verify_inode(), so any lookup that will find some layer root * will fail. */ struct inode *ovl_get_trap_inode(struct super_block *sb, struct dentry *dir) { struct inode *key = d_inode(dir); struct inode *trap; if (!d_is_dir(dir)) return ERR_PTR(-ENOTDIR); trap = iget5_locked(sb, (unsigned long) key, ovl_inode_test, ovl_inode_set, key); if (!trap) return ERR_PTR(-ENOMEM); if (!(inode_state_read_once(trap) & I_NEW)) { /* Conflicting layer roots? */ iput(trap); return ERR_PTR(-ELOOP); } trap->i_mode = S_IFDIR; trap->i_flags = S_DEAD; unlock_new_inode(trap); return trap; } /* * Does overlay inode need to be hashed by lower inode? */ static bool ovl_hash_bylower(struct super_block *sb, struct dentry *upper, struct dentry *lower, bool index) { struct ovl_fs *ofs = OVL_FS(sb); /* No, if pure upper */ if (!lower) return false; /* Yes, if already indexed */ if (index) return true; /* Yes, if won't be copied up */ if (!ovl_upper_mnt(ofs)) return true; /* No, if lower hardlink is or will be broken on copy up */ if ((upper || !ovl_indexdir(sb)) && !d_is_dir(lower) && d_inode(lower)->i_nlink > 1) return false; /* No, if non-indexed upper with NFS export */ if (ofs->config.nfs_export && upper) return false; /* Otherwise, hash by lower inode for fsnotify */ return true; } static struct inode *ovl_iget5(struct super_block *sb, struct inode *newinode, struct inode *key) { return newinode ? inode_insert5(newinode, (unsigned long) key, ovl_inode_test, ovl_inode_set, key) : iget5_locked(sb, (unsigned long) key, ovl_inode_test, ovl_inode_set, key); } struct inode *ovl_get_inode(struct super_block *sb, struct ovl_inode_params *oip) { struct ovl_fs *ofs = OVL_FS(sb); struct dentry *upperdentry = oip->upperdentry; struct ovl_path *lowerpath = ovl_lowerpath(oip->oe); struct inode *realinode = upperdentry ? d_inode(upperdentry) : NULL; struct inode *inode; struct dentry *lowerdentry = lowerpath ? lowerpath->dentry : NULL; struct path realpath = { .dentry = upperdentry ?: lowerdentry, .mnt = upperdentry ? ovl_upper_mnt(ofs) : lowerpath->layer->mnt, }; bool bylower = ovl_hash_bylower(sb, upperdentry, lowerdentry, oip->index); int fsid = bylower ? lowerpath->layer->fsid : 0; bool is_dir; unsigned long ino = 0; int err = oip->newinode ? -EEXIST : -ENOMEM; if (!realinode) realinode = d_inode(lowerdentry); /* * Copy up origin (lower) may exist for non-indexed upper, but we must * not use lower as hash key if this is a broken hardlink. */ is_dir = S_ISDIR(realinode->i_mode); if (upperdentry || bylower) { struct inode *key = d_inode(bylower ? lowerdentry : upperdentry); unsigned int nlink = is_dir ? 1 : realinode->i_nlink; inode = ovl_iget5(sb, oip->newinode, key); if (!inode) goto out_err; if (!(inode_state_read_once(inode) & I_NEW)) { /* * Verify that the underlying files stored in the inode * match those in the dentry. */ if (!ovl_verify_inode(inode, lowerdentry, upperdentry, true)) { iput(inode); err = -ESTALE; goto out_err; } dput(upperdentry); ovl_free_entry(oip->oe); kfree(oip->redirect); kfree(oip->lowerdata_redirect); goto out; } /* Recalculate nlink for non-dir due to indexing */ if (!is_dir) nlink = ovl_get_nlink(ofs, lowerdentry, upperdentry, nlink); set_nlink(inode, nlink); ino = key->i_ino; } else { /* Lower hardlink that will be broken on copy up */ inode = new_inode(sb); if (!inode) { err = -ENOMEM; goto out_err; } ino = realinode->i_ino; fsid = lowerpath->layer->fsid; } ovl_fill_inode(inode, realinode->i_mode, realinode->i_rdev); ovl_inode_init(inode, oip, ino, fsid); WARN_ON_ONCE(!!IS_CASEFOLDED(inode) != ofs->casefold); if (upperdentry && ovl_is_impuredir(sb, upperdentry)) ovl_set_flag(OVL_IMPURE, inode); if (oip->index) ovl_set_flag(OVL_INDEX, inode); if (bylower) ovl_set_flag(OVL_CONST_INO, inode); /* Check for non-merge dir that may have whiteouts */ if (is_dir) { if (((upperdentry && lowerdentry) || ovl_numlower(oip->oe) > 1) || ovl_path_check_origin_xattr(ofs, &realpath)) { ovl_set_flag(OVL_WHITEOUTS, inode); } } /* Check for immutable/append-only inode flags in xattr */ if (upperdentry) ovl_check_protattr(inode, upperdentry); if (inode_state_read_once(inode) & I_NEW) unlock_new_inode(inode); out: return inode; out_err: pr_warn_ratelimited("failed to get inode (%i)\n", err); inode = ERR_PTR(err); goto out; }
2 2 2 2 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 // SPDX-License-Identifier: GPL-2.0 #include <linux/ptrace.h> #include <linux/sched.h> #include <linux/sched/task_stack.h> #include <linux/export.h> #include <asm/syscall.h> static int collect_syscall(struct task_struct *target, struct syscall_info *info) { unsigned long args[6] = { }; struct pt_regs *regs; if (!try_get_task_stack(target)) { /* Task has no stack, so the task isn't in a syscall. */ memset(info, 0, sizeof(*info)); info->data.nr = -1; return 0; } regs = task_pt_regs(target); if (unlikely(!regs)) { put_task_stack(target); return -EAGAIN; } info->sp = user_stack_pointer(regs); info->data.instruction_pointer = instruction_pointer(regs); info->data.nr = syscall_get_nr(target, regs); if (info->data.nr != -1L) syscall_get_arguments(target, regs, args); info->data.args[0] = args[0]; info->data.args[1] = args[1]; info->data.args[2] = args[2]; info->data.args[3] = args[3]; info->data.args[4] = args[4]; info->data.args[5] = args[5]; put_task_stack(target); return 0; } /** * task_current_syscall - Discover what a blocked task is doing. * @target: thread to examine * @info: structure with the following fields: * .sp - filled with user stack pointer * .data.nr - filled with system call number or -1 * .data.args - filled with @maxargs system call arguments * .data.instruction_pointer - filled with user PC * * If @target is blocked in a system call, returns zero with @info.data.nr * set to the call's number and @info.data.args filled in with its * arguments. Registers not used for system call arguments may not be available * and it is not kosher to use &struct user_regset calls while the system * call is still in progress. Note we may get this result if @target * has finished its system call but not yet returned to user mode, such * as when it's stopped for signal handling or syscall exit tracing. * * If @target is blocked in the kernel during a fault or exception, * returns zero with *@info.data.nr set to -1 and does not fill in * @info.data.args. If so, it's now safe to examine @target using * &struct user_regset get() calls as long as we're sure @target won't return * to user mode. * * Returns -%EAGAIN if @target does not remain blocked. */ int task_current_syscall(struct task_struct *target, struct syscall_info *info) { unsigned long ncsw; unsigned int state; if (target == current) return collect_syscall(target, info); state = READ_ONCE(target->__state); if (unlikely(!state)) return -EAGAIN; ncsw = wait_task_inactive(target, state); if (unlikely(!ncsw) || unlikely(collect_syscall(target, info)) || unlikely(wait_task_inactive(target, state) != ncsw)) return -EAGAIN; return 0; }
3 3 3 3 3 3 3 7 7 3 3 3 3 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 // SPDX-License-Identifier: GPL-2.0-only /* * stackglue.c * * Code which implements an OCFS2 specific interface to underlying * cluster stacks. * * Copyright (C) 2007, 2009 Oracle. All rights reserved. */ #include <linux/list.h> #include <linux/spinlock.h> #include <linux/string.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/kmod.h> #include <linux/fs.h> #include <linux/kobject.h> #include <linux/sysfs.h> #include <linux/sysctl.h> #include "ocfs2_fs.h" #include "stackglue.h" #define OCFS2_STACK_PLUGIN_O2CB "o2cb" #define OCFS2_STACK_PLUGIN_USER "user" #define OCFS2_MAX_HB_CTL_PATH 256 static struct ocfs2_protocol_version locking_max_version; static DEFINE_SPINLOCK(ocfs2_stack_lock); static LIST_HEAD(ocfs2_stack_list); static char cluster_stack_name[OCFS2_STACK_LABEL_LEN + 1]; static char ocfs2_hb_ctl_path[OCFS2_MAX_HB_CTL_PATH] = "/sbin/ocfs2_hb_ctl"; /* * The stack currently in use. If not null, active_stack->sp_count > 0, * the module is pinned, and the locking protocol cannot be changed. */ static struct ocfs2_stack_plugin *active_stack; static struct ocfs2_stack_plugin *ocfs2_stack_lookup(const char *name) { struct ocfs2_stack_plugin *p; assert_spin_locked(&ocfs2_stack_lock); list_for_each_entry(p, &ocfs2_stack_list, sp_list) { if (!strcmp(p->sp_name, name)) return p; } return NULL; } static int ocfs2_stack_driver_request(const char *stack_name, const char *plugin_name) { int rc; struct ocfs2_stack_plugin *p; spin_lock(&ocfs2_stack_lock); /* * If the stack passed by the filesystem isn't the selected one, * we can't continue. */ if (strcmp(stack_name, cluster_stack_name)) { rc = -EBUSY; goto out; } if (active_stack) { /* * If the active stack isn't the one we want, it cannot * be selected right now. */ if (!strcmp(active_stack->sp_name, plugin_name)) rc = 0; else rc = -EBUSY; goto out; } p = ocfs2_stack_lookup(plugin_name); if (!p || !try_module_get(p->sp_owner)) { rc = -ENOENT; goto out; } active_stack = p; rc = 0; out: /* If we found it, pin it */ if (!rc) active_stack->sp_count++; spin_unlock(&ocfs2_stack_lock); return rc; } /* * This function looks up the appropriate stack and makes it active. If * there is no stack, it tries to load it. It will fail if the stack still * cannot be found. It will also fail if a different stack is in use. */ static int ocfs2_stack_driver_get(const char *stack_name) { int rc; char *plugin_name = OCFS2_STACK_PLUGIN_O2CB; /* * Classic stack does not pass in a stack name. This is * compatible with older tools as well. */ if (!stack_name || !*stack_name) stack_name = OCFS2_STACK_PLUGIN_O2CB; if (strlen(stack_name) != OCFS2_STACK_LABEL_LEN) { printk(KERN_ERR "ocfs2 passed an invalid cluster stack label: \"%s\"\n", stack_name); return -EINVAL; } /* Anything that isn't the classic stack is a user stack */ if (strcmp(stack_name, OCFS2_STACK_PLUGIN_O2CB)) plugin_name = OCFS2_STACK_PLUGIN_USER; rc = ocfs2_stack_driver_request(stack_name, plugin_name); if (rc == -ENOENT) { request_module("ocfs2_stack_%s", plugin_name); rc = ocfs2_stack_driver_request(stack_name, plugin_name); } if (rc == -ENOENT) { printk(KERN_ERR "ocfs2: Cluster stack driver \"%s\" cannot be found\n", plugin_name); } else if (rc == -EBUSY) { printk(KERN_ERR "ocfs2: A different cluster stack is in use\n"); } return rc; } static void ocfs2_stack_driver_put(void) { spin_lock(&ocfs2_stack_lock); BUG_ON(active_stack == NULL); BUG_ON(active_stack->sp_count == 0); active_stack->sp_count--; if (!active_stack->sp_count) { module_put(active_stack->sp_owner); active_stack = NULL; } spin_unlock(&ocfs2_stack_lock); } int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin) { int rc; spin_lock(&ocfs2_stack_lock); if (!ocfs2_stack_lookup(plugin->sp_name)) { plugin->sp_count = 0; plugin->sp_max_proto = locking_max_version; list_add(&plugin->sp_list, &ocfs2_stack_list); printk(KERN_INFO "ocfs2: Registered cluster interface %s\n", plugin->sp_name); rc = 0; } else { printk(KERN_ERR "ocfs2: Stack \"%s\" already registered\n", plugin->sp_name); rc = -EEXIST; } spin_unlock(&ocfs2_stack_lock); return rc; } EXPORT_SYMBOL_GPL(ocfs2_stack_glue_register); void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin) { struct ocfs2_stack_plugin *p; spin_lock(&ocfs2_stack_lock); p = ocfs2_stack_lookup(plugin->sp_name); if (p) { BUG_ON(p != plugin); BUG_ON(plugin == active_stack); BUG_ON(plugin->sp_count != 0); list_del_init(&plugin->sp_list); printk(KERN_INFO "ocfs2: Unregistered cluster interface %s\n", plugin->sp_name); } else { printk(KERN_ERR "Stack \"%s\" is not registered\n", plugin->sp_name); } spin_unlock(&ocfs2_stack_lock); } EXPORT_SYMBOL_GPL(ocfs2_stack_glue_unregister); void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_proto) { struct ocfs2_stack_plugin *p; spin_lock(&ocfs2_stack_lock); if (memcmp(max_proto, &locking_max_version, sizeof(struct ocfs2_protocol_version))) { BUG_ON(locking_max_version.pv_major != 0); locking_max_version = *max_proto; list_for_each_entry(p, &ocfs2_stack_list, sp_list) { p->sp_max_proto = locking_max_version; } } spin_unlock(&ocfs2_stack_lock); } EXPORT_SYMBOL_GPL(ocfs2_stack_glue_set_max_proto_version); /* * The ocfs2_dlm_lock() and ocfs2_dlm_unlock() functions take no argument * for the ast and bast functions. They will pass the lksb to the ast * and bast. The caller can wrap the lksb with their own structure to * get more information. */ int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn, int mode, struct ocfs2_dlm_lksb *lksb, u32 flags, void *name, unsigned int namelen) { if (!lksb->lksb_conn) lksb->lksb_conn = conn; else BUG_ON(lksb->lksb_conn != conn); return active_stack->sp_ops->dlm_lock(conn, mode, lksb, flags, name, namelen); } EXPORT_SYMBOL_GPL(ocfs2_dlm_lock); int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn, struct ocfs2_dlm_lksb *lksb, u32 flags) { BUG_ON(lksb->lksb_conn == NULL); return active_stack->sp_ops->dlm_unlock(conn, lksb, flags); } EXPORT_SYMBOL_GPL(ocfs2_dlm_unlock); int ocfs2_dlm_lock_status(struct ocfs2_dlm_lksb *lksb) { return active_stack->sp_ops->lock_status(lksb); } EXPORT_SYMBOL_GPL(ocfs2_dlm_lock_status); int ocfs2_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb) { return active_stack->sp_ops->lvb_valid(lksb); } EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb_valid); void *ocfs2_dlm_lvb(struct ocfs2_dlm_lksb *lksb) { return active_stack->sp_ops->lock_lvb(lksb); } EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb); void ocfs2_dlm_dump_lksb(struct ocfs2_dlm_lksb *lksb) { active_stack->sp_ops->dump_lksb(lksb); } EXPORT_SYMBOL_GPL(ocfs2_dlm_dump_lksb); int ocfs2_stack_supports_plocks(void) { return active_stack && active_stack->sp_ops->plock; } EXPORT_SYMBOL_GPL(ocfs2_stack_supports_plocks); /* * ocfs2_plock() can only be safely called if * ocfs2_stack_supports_plocks() returned true */ int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino, struct file *file, int cmd, struct file_lock *fl) { WARN_ON_ONCE(active_stack->sp_ops->plock == NULL); if (active_stack->sp_ops->plock) return active_stack->sp_ops->plock(conn, ino, file, cmd, fl); return -EOPNOTSUPP; } EXPORT_SYMBOL_GPL(ocfs2_plock); int ocfs2_cluster_connect(const char *stack_name, const char *cluster_name, int cluster_name_len, const char *group, int grouplen, struct ocfs2_locking_protocol *lproto, void (*recovery_handler)(int node_num, void *recovery_data), void *recovery_data, struct ocfs2_cluster_connection **conn) { int rc = 0; struct ocfs2_cluster_connection *new_conn; BUG_ON(group == NULL); BUG_ON(conn == NULL); BUG_ON(recovery_handler == NULL); if (grouplen > GROUP_NAME_MAX) { rc = -EINVAL; goto out; } if (memcmp(&lproto->lp_max_version, &locking_max_version, sizeof(struct ocfs2_protocol_version))) { rc = -EINVAL; goto out; } new_conn = kzalloc_obj(struct ocfs2_cluster_connection); if (!new_conn) { rc = -ENOMEM; goto out; } strscpy(new_conn->cc_name, group, GROUP_NAME_MAX + 1); new_conn->cc_namelen = grouplen; if (cluster_name_len) strscpy(new_conn->cc_cluster_name, cluster_name, CLUSTER_NAME_MAX + 1); new_conn->cc_cluster_name_len = cluster_name_len; new_conn->cc_recovery_handler = recovery_handler; new_conn->cc_recovery_data = recovery_data; new_conn->cc_proto = lproto; /* Start the new connection at our maximum compatibility level */ new_conn->cc_version = lproto->lp_max_version; /* This will pin the stack driver if successful */ rc = ocfs2_stack_driver_get(stack_name); if (rc) goto out_free; rc = active_stack->sp_ops->connect(new_conn); if (rc) { ocfs2_stack_driver_put(); goto out_free; } *conn = new_conn; out_free: if (rc) kfree(new_conn); out: return rc; } EXPORT_SYMBOL_GPL(ocfs2_cluster_connect); /* The caller will ensure all nodes have the same cluster stack */ int ocfs2_cluster_connect_agnostic(const char *group, int grouplen, struct ocfs2_locking_protocol *lproto, void (*recovery_handler)(int node_num, void *recovery_data), void *recovery_data, struct ocfs2_cluster_connection **conn) { char *stack_name = NULL; if (cluster_stack_name[0]) stack_name = cluster_stack_name; return ocfs2_cluster_connect(stack_name, NULL, 0, group, grouplen, lproto, recovery_handler, recovery_data, conn); } EXPORT_SYMBOL_GPL(ocfs2_cluster_connect_agnostic); /* If hangup_pending is 0, the stack driver will be dropped */ int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn, int hangup_pending) { int ret; BUG_ON(conn == NULL); ret = active_stack->sp_ops->disconnect(conn); /* XXX Should we free it anyway? */ if (!ret) { kfree(conn); if (!hangup_pending) ocfs2_stack_driver_put(); } return ret; } EXPORT_SYMBOL_GPL(ocfs2_cluster_disconnect); /* * Leave the group for this filesystem. This is executed by a userspace * program (stored in ocfs2_hb_ctl_path). */ static void ocfs2_leave_group(const char *group) { int ret; char *argv[5], *envp[3]; argv[0] = ocfs2_hb_ctl_path; argv[1] = "-K"; argv[2] = "-u"; argv[3] = (char *)group; argv[4] = NULL; /* minimal command environment taken from cpu_run_sbin_hotplug */ envp[0] = "HOME=/"; envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; envp[2] = NULL; ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); if (ret < 0) { printk(KERN_ERR "ocfs2: Error %d running user helper " "\"%s %s %s %s\"\n", ret, argv[0], argv[1], argv[2], argv[3]); } } /* * Hangup is a required post-umount. ocfs2-tools software expects the * filesystem to call "ocfs2_hb_ctl" during unmount. This happens * regardless of whether the DLM got started, so we can't do it * in ocfs2_cluster_disconnect(). The ocfs2_leave_group() function does * the actual work. */ void ocfs2_cluster_hangup(const char *group, int grouplen) { BUG_ON(group == NULL); BUG_ON(group[grouplen] != '\0'); ocfs2_leave_group(group); /* cluster_disconnect() was called with hangup_pending==1 */ ocfs2_stack_driver_put(); } EXPORT_SYMBOL_GPL(ocfs2_cluster_hangup); int ocfs2_cluster_this_node(struct ocfs2_cluster_connection *conn, unsigned int *node) { return active_stack->sp_ops->this_node(conn, node); } EXPORT_SYMBOL_GPL(ocfs2_cluster_this_node); /* * Sysfs bits */ static ssize_t ocfs2_max_locking_protocol_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { ssize_t ret = 0; spin_lock(&ocfs2_stack_lock); if (locking_max_version.pv_major) ret = snprintf(buf, PAGE_SIZE, "%u.%u\n", locking_max_version.pv_major, locking_max_version.pv_minor); spin_unlock(&ocfs2_stack_lock); return ret; } static struct kobj_attribute ocfs2_attr_max_locking_protocol = __ATTR(max_locking_protocol, S_IRUGO, ocfs2_max_locking_protocol_show, NULL); static ssize_t ocfs2_loaded_cluster_plugins_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { ssize_t ret = 0, total = 0, remain = PAGE_SIZE; struct ocfs2_stack_plugin *p; spin_lock(&ocfs2_stack_lock); list_for_each_entry(p, &ocfs2_stack_list, sp_list) { ret = snprintf(buf, remain, "%s\n", p->sp_name); if (ret >= remain) { /* snprintf() didn't fit */ total = -E2BIG; break; } total += ret; remain -= ret; } spin_unlock(&ocfs2_stack_lock); return total; } static struct kobj_attribute ocfs2_attr_loaded_cluster_plugins = __ATTR(loaded_cluster_plugins, S_IRUGO, ocfs2_loaded_cluster_plugins_show, NULL); static ssize_t ocfs2_active_cluster_plugin_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { ssize_t ret = 0; spin_lock(&ocfs2_stack_lock); if (active_stack) { ret = snprintf(buf, PAGE_SIZE, "%s\n", active_stack->sp_name); if (ret >= PAGE_SIZE) ret = -E2BIG; } spin_unlock(&ocfs2_stack_lock); return ret; } static struct kobj_attribute ocfs2_attr_active_cluster_plugin = __ATTR(active_cluster_plugin, S_IRUGO, ocfs2_active_cluster_plugin_show, NULL); static ssize_t ocfs2_cluster_stack_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { ssize_t ret; spin_lock(&ocfs2_stack_lock); ret = snprintf(buf, PAGE_SIZE, "%s\n", cluster_stack_name); spin_unlock(&ocfs2_stack_lock); return ret; } static ssize_t ocfs2_cluster_stack_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { size_t len = count; ssize_t ret; if (len == 0) return len; if (buf[len - 1] == '\n') len--; if ((len != OCFS2_STACK_LABEL_LEN) || (strnlen(buf, len) != len)) return -EINVAL; spin_lock(&ocfs2_stack_lock); if (active_stack) { if (!strncmp(buf, cluster_stack_name, len)) ret = count; else ret = -EBUSY; } else { memcpy(cluster_stack_name, buf, len); ret = count; } spin_unlock(&ocfs2_stack_lock); return ret; } static struct kobj_attribute ocfs2_attr_cluster_stack = __ATTR(cluster_stack, S_IRUGO | S_IWUSR, ocfs2_cluster_stack_show, ocfs2_cluster_stack_store); static ssize_t ocfs2_dlm_recover_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return snprintf(buf, PAGE_SIZE, "1\n"); } static struct kobj_attribute ocfs2_attr_dlm_recover_support = __ATTR(dlm_recover_callback_support, S_IRUGO, ocfs2_dlm_recover_show, NULL); static struct attribute *ocfs2_attrs[] = { &ocfs2_attr_max_locking_protocol.attr, &ocfs2_attr_loaded_cluster_plugins.attr, &ocfs2_attr_active_cluster_plugin.attr, &ocfs2_attr_cluster_stack.attr, &ocfs2_attr_dlm_recover_support.attr, NULL, }; static const struct attribute_group ocfs2_attr_group = { .attrs = ocfs2_attrs, }; struct kset *ocfs2_kset; EXPORT_SYMBOL_GPL(ocfs2_kset); static void ocfs2_sysfs_exit(void) { kset_unregister(ocfs2_kset); } static int ocfs2_sysfs_init(void) { int ret; ocfs2_kset = kset_create_and_add("ocfs2", NULL, fs_kobj); if (!ocfs2_kset) return -ENOMEM; ret = sysfs_create_group(&ocfs2_kset->kobj, &ocfs2_attr_group); if (ret) goto error; return 0; error: kset_unregister(ocfs2_kset); return ret; } /* * Sysctl bits * * The sysctl lives at /proc/sys/fs/ocfs2/nm/hb_ctl_path. The 'nm' doesn't * make as much sense in a multiple cluster stack world, but it's safer * and easier to preserve the name. */ static const struct ctl_table ocfs2_nm_table[] = { { .procname = "hb_ctl_path", .data = ocfs2_hb_ctl_path, .maxlen = OCFS2_MAX_HB_CTL_PATH, .mode = 0644, .proc_handler = proc_dostring, }, }; static struct ctl_table_header *ocfs2_table_header; /* * Initialization */ static int __init ocfs2_stack_glue_init(void) { int ret; strscpy(cluster_stack_name, OCFS2_STACK_PLUGIN_O2CB); ocfs2_table_header = register_sysctl("fs/ocfs2/nm", ocfs2_nm_table); if (!ocfs2_table_header) { printk(KERN_ERR "ocfs2 stack glue: unable to register sysctl\n"); return -ENOMEM; /* or something. */ } ret = ocfs2_sysfs_init(); if (ret) unregister_sysctl_table(ocfs2_table_header); return ret; } static void __exit ocfs2_stack_glue_exit(void) { memset(&locking_max_version, 0, sizeof(struct ocfs2_protocol_version)); ocfs2_sysfs_exit(); unregister_sysctl_table(ocfs2_table_header); } MODULE_AUTHOR("Oracle"); MODULE_DESCRIPTION("ocfs2 cluster stack glue layer"); MODULE_LICENSE("GPL"); module_init(ocfs2_stack_glue_init); module_exit(ocfs2_stack_glue_exit);
2 2 2 2 2 9 1 6 2 2 4 1 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 // SPDX-License-Identifier: GPL-2.0 /* * Implement the manual drop-all-pagecache function */ #include <linux/pagemap.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/fs.h> #include <linux/writeback.h> #include <linux/sysctl.h> #include <linux/gfp.h> #include <linux/swap.h> #include "internal.h" /* A global variable is a bit ugly, but it keeps the code simple */ static int sysctl_drop_caches; static void drop_pagecache_sb(struct super_block *sb, void *unused) { struct inode *inode, *toput_inode = NULL; spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { spin_lock(&inode->i_lock); /* * We must skip inodes in unusual state. We may also skip * inodes without pages but we deliberately won't in case * we need to reschedule to avoid softlockups. */ if ((inode_state_read(inode) & (I_FREEING | I_WILL_FREE | I_NEW)) || (mapping_empty(inode->i_mapping) && !need_resched())) { spin_unlock(&inode->i_lock); continue; } __iget(inode); spin_unlock(&inode->i_lock); spin_unlock(&sb->s_inode_list_lock); invalidate_mapping_pages(inode->i_mapping, 0, -1); iput(toput_inode); toput_inode = inode; cond_resched(); spin_lock(&sb->s_inode_list_lock); } spin_unlock(&sb->s_inode_list_lock); iput(toput_inode); } static int drop_caches_sysctl_handler(const struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { int ret; ret = proc_dointvec_minmax(table, write, buffer, length, ppos); if (ret) return ret; if (write) { static int stfu; if (sysctl_drop_caches & 1) { lru_add_drain_all(); iterate_supers(drop_pagecache_sb, NULL); count_vm_event(DROP_PAGECACHE); } if (sysctl_drop_caches & 2) { drop_slab(); count_vm_event(DROP_SLAB); } if (!stfu) { pr_info("%s (%d): drop_caches: %d\n", current->comm, task_pid_nr(current), sysctl_drop_caches); } stfu |= sysctl_drop_caches & 4; } return 0; } static const struct ctl_table drop_caches_table[] = { { .procname = "drop_caches", .data = &sysctl_drop_caches, .maxlen = sizeof(int), .mode = 0200, .proc_handler = drop_caches_sysctl_handler, .extra1 = SYSCTL_ONE, .extra2 = SYSCTL_FOUR, }, }; static int __init init_vm_drop_caches_sysctls(void) { register_sysctl_init("vm", drop_caches_table); return 0; } fs_initcall(init_vm_drop_caches_sysctls);
1 1 1 3 3 3 7 1 4 4 4 4 8 18 19 18 16 4 18 18 18 6 13 13 12 1 10 3 3 12 4 8 12 9 9 3 6 5 1 1 2 6 8 8 8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 // SPDX-License-Identifier: GPL-2.0 /* * Waiting for completion events */ #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/io_uring.h> #include <trace/events/io_uring.h> #include <uapi/linux/io_uring.h> #include "io_uring.h" #include "napi.h" #include "wait.h" static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, int wake_flags, void *key) { struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue, wq); /* * Cannot safely flush overflowed CQEs from here, ensure we wake up * the task, and the next invocation will do it. */ if (io_should_wake(iowq) || io_has_work(iowq->ctx)) return autoremove_wake_function(curr, mode, wake_flags, key); return -1; } int io_run_task_work_sig(struct io_ring_ctx *ctx) { if (io_local_work_pending(ctx)) { __set_current_state(TASK_RUNNING); if (io_run_local_work(ctx, INT_MAX, IO_LOCAL_TW_DEFAULT_MAX) > 0) return 0; } if (io_run_task_work() > 0) return 0; if (task_sigpending(current)) return -EINTR; return 0; } static bool current_pending_io(void) { struct io_uring_task *tctx = current->io_uring; if (!tctx) return false; return percpu_counter_read_positive(&tctx->inflight); } static enum hrtimer_restart io_cqring_timer_wakeup(struct hrtimer *timer) { struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t); WRITE_ONCE(iowq->hit_timeout, 1); iowq->min_timeout = 0; wake_up_process(iowq->wq.private); return HRTIMER_NORESTART; } /* * Doing min_timeout portion. If we saw any timeouts, events, or have work, * wake up. If not, and we have a normal timeout, switch to that and keep * sleeping. */ static enum hrtimer_restart io_cqring_min_timer_wakeup(struct hrtimer *timer) { struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t); struct io_ring_ctx *ctx = iowq->ctx; /* no general timeout, or shorter (or equal), we are done */ if (iowq->timeout == KTIME_MAX || ktime_compare(iowq->min_timeout, iowq->timeout) >= 0) goto out_wake; /* work we may need to run, wake function will see if we need to wake */ if (io_has_work(ctx)) goto out_wake; /* got events since we started waiting, min timeout is done */ scoped_guard(rcu) { struct io_rings *rings = io_get_rings(ctx); if (iowq->cq_min_tail != READ_ONCE(rings->cq.tail)) goto out_wake; /* if we have any events and min timeout expired, we're done */ if (io_cqring_events(ctx)) goto out_wake; } /* * If using deferred task_work running and application is waiting on * more than one request, ensure we reset it now where we are switching * to normal sleeps. Any request completion post min_wait should wake * the task and return. */ if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { atomic_set(&ctx->cq_wait_nr, 1); smp_mb(); if (!llist_empty(&ctx->work_llist)) goto out_wake; } /* any generated CQE posted past this time should wake us up */ iowq->cq_tail = iowq->cq_min_tail; hrtimer_update_function(&iowq->t, io_cqring_timer_wakeup); hrtimer_set_expires(timer, iowq->timeout); return HRTIMER_RESTART; out_wake: return io_cqring_timer_wakeup(timer); } static int io_cqring_schedule_timeout(struct io_wait_queue *iowq, clockid_t clock_id, ktime_t start_time) { ktime_t timeout; if (iowq->min_timeout) { timeout = ktime_add_ns(iowq->min_timeout, start_time); hrtimer_setup_on_stack(&iowq->t, io_cqring_min_timer_wakeup, clock_id, HRTIMER_MODE_ABS); } else { timeout = iowq->timeout; hrtimer_setup_on_stack(&iowq->t, io_cqring_timer_wakeup, clock_id, HRTIMER_MODE_ABS); } hrtimer_set_expires_range_ns(&iowq->t, timeout, 0); hrtimer_start_expires(&iowq->t, HRTIMER_MODE_ABS); if (!READ_ONCE(iowq->hit_timeout)) schedule(); hrtimer_cancel(&iowq->t); destroy_hrtimer_on_stack(&iowq->t); __set_current_state(TASK_RUNNING); return READ_ONCE(iowq->hit_timeout) ? -ETIME : 0; } static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx, struct io_wait_queue *iowq, struct ext_arg *ext_arg, ktime_t start_time) { int ret = 0; /* * Mark us as being in io_wait if we have pending requests, so cpufreq * can take into account that the task is waiting for IO - turns out * to be important for low QD IO. */ if (ext_arg->iowait && current_pending_io()) current->in_iowait = 1; if (iowq->timeout != KTIME_MAX || iowq->min_timeout) ret = io_cqring_schedule_timeout(iowq, ctx->clockid, start_time); else schedule(); current->in_iowait = 0; return ret; } /* If this returns > 0, the caller should retry */ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, struct io_wait_queue *iowq, struct ext_arg *ext_arg, ktime_t start_time) { if (unlikely(READ_ONCE(ctx->check_cq))) return 1; if (unlikely(io_local_work_pending(ctx))) return 1; if (unlikely(task_work_pending(current))) return 1; if (unlikely(task_sigpending(current))) return -EINTR; if (unlikely(io_should_wake(iowq))) return 0; return __io_cqring_wait_schedule(ctx, iowq, ext_arg, start_time); } /* * Wait until events become available, if we don't already have some. The * application must reap them itself, as they reside on the shared cq ring. */ int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags, struct ext_arg *ext_arg) { struct io_wait_queue iowq; struct io_rings *rings; ktime_t start_time; int ret, nr_wait; min_events = min_t(int, min_events, ctx->cq_entries); if (!io_allowed_run_tw(ctx)) return -EEXIST; if (io_local_work_pending(ctx)) io_run_local_work(ctx, min_events, max(IO_LOCAL_TW_DEFAULT_MAX, min_events)); io_run_task_work(); if (unlikely(test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq))) io_cqring_do_overflow_flush(ctx); rcu_read_lock(); rings = io_get_rings(ctx); if (__io_cqring_events_user(ctx) >= min_events) { rcu_read_unlock(); return 0; } init_waitqueue_func_entry(&iowq.wq, io_wake_function); iowq.wq.private = current; INIT_LIST_HEAD(&iowq.wq.entry); iowq.ctx = ctx; iowq.cq_tail = READ_ONCE(rings->cq.head) + min_events; iowq.cq_min_tail = READ_ONCE(rings->cq.tail); nr_wait = (int) iowq.cq_tail - READ_ONCE(rings->cq.tail); rcu_read_unlock(); rings = NULL; iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); iowq.hit_timeout = 0; iowq.min_timeout = ext_arg->min_time; iowq.timeout = KTIME_MAX; start_time = io_get_time(ctx); if (ext_arg->ts_set) { iowq.timeout = timespec64_to_ktime(ext_arg->ts); if (!(flags & IORING_ENTER_ABS_TIMER)) iowq.timeout = ktime_add(iowq.timeout, start_time); } if (ext_arg->sig) { #ifdef CONFIG_COMPAT if (in_compat_syscall()) ret = set_compat_user_sigmask((const compat_sigset_t __user *)ext_arg->sig, ext_arg->argsz); else #endif ret = set_user_sigmask(ext_arg->sig, ext_arg->argsz); if (ret) return ret; } io_napi_busy_loop(ctx, &iowq); trace_io_uring_cqring_wait(ctx, min_events); do { unsigned long check_cq; if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { atomic_set(&ctx->cq_wait_nr, nr_wait); set_current_state(TASK_INTERRUPTIBLE); } else { prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq, TASK_INTERRUPTIBLE); } ret = io_cqring_wait_schedule(ctx, &iowq, ext_arg, start_time); __set_current_state(TASK_RUNNING); atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT); /* * Run task_work after scheduling and before io_should_wake(). * If we got woken because of task_work being processed, run it * now rather than let the caller do another wait loop. */ if (io_local_work_pending(ctx)) io_run_local_work(ctx, nr_wait, nr_wait); io_run_task_work(); /* * Non-local task_work will be run on exit to userspace, but * if we're using DEFER_TASKRUN, then we could have waited * with a timeout for a number of requests. If the timeout * hits, we could have some requests ready to process. Ensure * this break is _after_ we have run task_work, to avoid * deferring running potentially pending requests until the * next time we wait for events. */ if (ret < 0) break; check_cq = READ_ONCE(ctx->check_cq); if (unlikely(check_cq)) { /* let the caller flush overflows, retry */ if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)) io_cqring_do_overflow_flush(ctx); if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)) { ret = -EBADR; break; } } if (io_should_wake(&iowq)) { ret = 0; break; } cond_resched(); /* if min timeout has been hit, don't reset wait count */ if (!iowq.hit_timeout) scoped_guard(rcu) nr_wait = (int) iowq.cq_tail - READ_ONCE(io_get_rings(ctx)->cq.tail); else nr_wait = 1; } while (1); if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) finish_wait(&ctx->cq_wait, &iowq.wq); restore_saved_sigmask_unless(ret == -EINTR); guard(rcu)(); return READ_ONCE(io_get_rings(ctx)->cq.head) == READ_ONCE(io_get_rings(ctx)->cq.tail) ? ret : 0; }
2 2 3 3 3 3 3 3 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ /* ****************************************************************** * bitstream * Part of FSE library * Copyright (c) Meta Platforms, Inc. and affiliates. * * You can contact the author at : * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy * * This source code is licensed under both the BSD-style license (found in the * LICENSE file in the root directory of this source tree) and the GPLv2 (found * in the COPYING file in the root directory of this source tree). * You may select, at your option, one of the above-listed licenses. ****************************************************************** */ #ifndef BITSTREAM_H_MODULE #define BITSTREAM_H_MODULE /* * This API consists of small unitary functions, which must be inlined for best performance. * Since link-time-optimization is not available for all compilers, * these functions are defined into a .h to be included. */ /*-**************************************** * Dependencies ******************************************/ #include "mem.h" /* unaligned access routines */ #include "compiler.h" /* UNLIKELY() */ #include "debug.h" /* assert(), DEBUGLOG(), RAWLOG() */ #include "error_private.h" /* error codes and messages */ #include "bits.h" /* ZSTD_highbit32 */ /*========================================= * Target specific =========================================*/ #define STREAM_ACCUMULATOR_MIN_32 25 #define STREAM_ACCUMULATOR_MIN_64 57 #define STREAM_ACCUMULATOR_MIN ((U32)(MEM_32bits() ? STREAM_ACCUMULATOR_MIN_32 : STREAM_ACCUMULATOR_MIN_64)) /*-****************************************** * bitStream encoding API (write forward) ********************************************/ typedef size_t BitContainerType; /* bitStream can mix input from multiple sources. * A critical property of these streams is that they encode and decode in **reverse** direction. * So the first bit sequence you add will be the last to be read, like a LIFO stack. */ typedef struct { BitContainerType bitContainer; unsigned bitPos; char* startPtr; char* ptr; char* endPtr; } BIT_CStream_t; MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC, void* dstBuffer, size_t dstCapacity); MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC, BitContainerType value, unsigned nbBits); MEM_STATIC void BIT_flushBits(BIT_CStream_t* bitC); MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC); /* Start with initCStream, providing the size of buffer to write into. * bitStream will never write outside of this buffer. * `dstCapacity` must be >= sizeof(bitD->bitContainer), otherwise @return will be an error code. * * bits are first added to a local register. * Local register is BitContainerType, 64-bits on 64-bits systems, or 32-bits on 32-bits systems. * Writing data into memory is an explicit operation, performed by the flushBits function. * Hence keep track how many bits are potentially stored into local register to avoid register overflow. * After a flushBits, a maximum of 7 bits might still be stored into local register. * * Avoid storing elements of more than 24 bits if you want compatibility with 32-bits bitstream readers. * * Last operation is to close the bitStream. * The function returns the final size of CStream in bytes. * If data couldn't fit into `dstBuffer`, it will return a 0 ( == not storable) */ /*-******************************************** * bitStream decoding API (read backward) **********************************************/ typedef struct { BitContainerType bitContainer; unsigned bitsConsumed; const char* ptr; const char* start; const char* limitPtr; } BIT_DStream_t; typedef enum { BIT_DStream_unfinished = 0, /* fully refilled */ BIT_DStream_endOfBuffer = 1, /* still some bits left in bitstream */ BIT_DStream_completed = 2, /* bitstream entirely consumed, bit-exact */ BIT_DStream_overflow = 3 /* user requested more bits than present in bitstream */ } BIT_DStream_status; /* result of BIT_reloadDStream() */ MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, size_t srcSize); MEM_STATIC BitContainerType BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits); MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD); MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* bitD); /* Start by invoking BIT_initDStream(). * A chunk of the bitStream is then stored into a local register. * Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (BitContainerType). * You can then retrieve bitFields stored into the local register, **in reverse order**. * Local register is explicitly reloaded from memory by the BIT_reloadDStream() method. * A reload guarantee a minimum of ((8*sizeof(bitD->bitContainer))-7) bits when its result is BIT_DStream_unfinished. * Otherwise, it can be less than that, so proceed accordingly. * Checking if DStream has reached its end can be performed with BIT_endOfDStream(). */ /*-**************************************** * unsafe API ******************************************/ MEM_STATIC void BIT_addBitsFast(BIT_CStream_t* bitC, BitContainerType value, unsigned nbBits); /* faster, but works only if value is "clean", meaning all high bits above nbBits are 0 */ MEM_STATIC void BIT_flushBitsFast(BIT_CStream_t* bitC); /* unsafe version; does not check buffer overflow */ MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits); /* faster, but works only if nbBits >= 1 */ /*===== Local Constants =====*/ static const unsigned BIT_mask[] = { 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, 0x1FF, 0x3FF, 0x7FF, 0xFFF, 0x1FFF, 0x3FFF, 0x7FFF, 0xFFFF, 0x1FFFF, 0x3FFFF, 0x7FFFF, 0xFFFFF, 0x1FFFFF, 0x3FFFFF, 0x7FFFFF, 0xFFFFFF, 0x1FFFFFF, 0x3FFFFFF, 0x7FFFFFF, 0xFFFFFFF, 0x1FFFFFFF, 0x3FFFFFFF, 0x7FFFFFFF}; /* up to 31 bits */ #define BIT_MASK_SIZE (sizeof(BIT_mask) / sizeof(BIT_mask[0])) /*-************************************************************** * bitStream encoding ****************************************************************/ /*! BIT_initCStream() : * `dstCapacity` must be > sizeof(size_t) * @return : 0 if success, * otherwise an error code (can be tested using ERR_isError()) */ MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC, void* startPtr, size_t dstCapacity) { bitC->bitContainer = 0; bitC->bitPos = 0; bitC->startPtr = (char*)startPtr; bitC->ptr = bitC->startPtr; bitC->endPtr = bitC->startPtr + dstCapacity - sizeof(bitC->bitContainer); if (dstCapacity <= sizeof(bitC->bitContainer)) return ERROR(dstSize_tooSmall); return 0; } FORCE_INLINE_TEMPLATE BitContainerType BIT_getLowerBits(BitContainerType bitContainer, U32 const nbBits) { assert(nbBits < BIT_MASK_SIZE); return bitContainer & BIT_mask[nbBits]; } /*! BIT_addBits() : * can add up to 31 bits into `bitC`. * Note : does not check for register overflow ! */ MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC, BitContainerType value, unsigned nbBits) { DEBUG_STATIC_ASSERT(BIT_MASK_SIZE == 32); assert(nbBits < BIT_MASK_SIZE); assert(nbBits + bitC->bitPos < sizeof(bitC->bitContainer) * 8); bitC->bitContainer |= BIT_getLowerBits(value, nbBits) << bitC->bitPos; bitC->bitPos += nbBits; } /*! BIT_addBitsFast() : * works only if `value` is _clean_, * meaning all high bits above nbBits are 0 */ MEM_STATIC void BIT_addBitsFast(BIT_CStream_t* bitC, BitContainerType value, unsigned nbBits) { assert((value>>nbBits) == 0); assert(nbBits + bitC->bitPos < sizeof(bitC->bitContainer) * 8); bitC->bitContainer |= value << bitC->bitPos; bitC->bitPos += nbBits; } /*! BIT_flushBitsFast() : * assumption : bitContainer has not overflowed * unsafe version; does not check buffer overflow */ MEM_STATIC void BIT_flushBitsFast(BIT_CStream_t* bitC) { size_t const nbBytes = bitC->bitPos >> 3; assert(bitC->bitPos < sizeof(bitC->bitContainer) * 8); assert(bitC->ptr <= bitC->endPtr); MEM_writeLEST(bitC->ptr, bitC->bitContainer); bitC->ptr += nbBytes; bitC->bitPos &= 7; bitC->bitContainer >>= nbBytes*8; } /*! BIT_flushBits() : * assumption : bitContainer has not overflowed * safe version; check for buffer overflow, and prevents it. * note : does not signal buffer overflow. * overflow will be revealed later on using BIT_closeCStream() */ MEM_STATIC void BIT_flushBits(BIT_CStream_t* bitC) { size_t const nbBytes = bitC->bitPos >> 3; assert(bitC->bitPos < sizeof(bitC->bitContainer) * 8); assert(bitC->ptr <= bitC->endPtr); MEM_writeLEST(bitC->ptr, bitC->bitContainer); bitC->ptr += nbBytes; if (bitC->ptr > bitC->endPtr) bitC->ptr = bitC->endPtr; bitC->bitPos &= 7; bitC->bitContainer >>= nbBytes*8; } /*! BIT_closeCStream() : * @return : size of CStream, in bytes, * or 0 if it could not fit into dstBuffer */ MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC) { BIT_addBitsFast(bitC, 1, 1); /* endMark */ BIT_flushBits(bitC); if (bitC->ptr >= bitC->endPtr) return 0; /* overflow detected */ return (size_t)(bitC->ptr - bitC->startPtr) + (bitC->bitPos > 0); } /*-******************************************************** * bitStream decoding **********************************************************/ /*! BIT_initDStream() : * Initialize a BIT_DStream_t. * `bitD` : a pointer to an already allocated BIT_DStream_t structure. * `srcSize` must be the *exact* size of the bitStream, in bytes. * @return : size of stream (== srcSize), or an errorCode if a problem is detected */ MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, size_t srcSize) { if (srcSize < 1) { ZSTD_memset(bitD, 0, sizeof(*bitD)); return ERROR(srcSize_wrong); } bitD->start = (const char*)srcBuffer; bitD->limitPtr = bitD->start + sizeof(bitD->bitContainer); if (srcSize >= sizeof(bitD->bitContainer)) { /* normal case */ bitD->ptr = (const char*)srcBuffer + srcSize - sizeof(bitD->bitContainer); bitD->bitContainer = MEM_readLEST(bitD->ptr); { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1]; bitD->bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0; /* ensures bitsConsumed is always set */ if (lastByte == 0) return ERROR(GENERIC); /* endMark not present */ } } else { bitD->ptr = bitD->start; bitD->bitContainer = *(const BYTE*)(bitD->start); switch(srcSize) { case 7: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[6]) << (sizeof(bitD->bitContainer)*8 - 16); ZSTD_FALLTHROUGH; case 6: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[5]) << (sizeof(bitD->bitContainer)*8 - 24); ZSTD_FALLTHROUGH; case 5: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[4]) << (sizeof(bitD->bitContainer)*8 - 32); ZSTD_FALLTHROUGH; case 4: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[3]) << 24; ZSTD_FALLTHROUGH; case 3: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[2]) << 16; ZSTD_FALLTHROUGH; case 2: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[1]) << 8; ZSTD_FALLTHROUGH; default: break; } { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1]; bitD->bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0; if (lastByte == 0) return ERROR(corruption_detected); /* endMark not present */ } bitD->bitsConsumed += (U32)(sizeof(bitD->bitContainer) - srcSize)*8; } return srcSize; } FORCE_INLINE_TEMPLATE BitContainerType BIT_getUpperBits(BitContainerType bitContainer, U32 const start) { return bitContainer >> start; } FORCE_INLINE_TEMPLATE BitContainerType BIT_getMiddleBits(BitContainerType bitContainer, U32 const start, U32 const nbBits) { U32 const regMask = sizeof(bitContainer)*8 - 1; /* if start > regMask, bitstream is corrupted, and result is undefined */ assert(nbBits < BIT_MASK_SIZE); /* x86 transform & ((1 << nbBits) - 1) to bzhi instruction, it is better * than accessing memory. When bmi2 instruction is not present, we consider * such cpus old (pre-Haswell, 2013) and their performance is not of that * importance. */ #if defined(__x86_64__) || defined(_M_X64) return (bitContainer >> (start & regMask)) & ((((U64)1) << nbBits) - 1); #else return (bitContainer >> (start & regMask)) & BIT_mask[nbBits]; #endif } /*! BIT_lookBits() : * Provides next n bits from local register. * local register is not modified. * On 32-bits, maxNbBits==24. * On 64-bits, maxNbBits==56. * @return : value extracted */ FORCE_INLINE_TEMPLATE BitContainerType BIT_lookBits(const BIT_DStream_t* bitD, U32 nbBits) { /* arbitrate between double-shift and shift+mask */ #if 1 /* if bitD->bitsConsumed + nbBits > sizeof(bitD->bitContainer)*8, * bitstream is likely corrupted, and result is undefined */ return BIT_getMiddleBits(bitD->bitContainer, (sizeof(bitD->bitContainer)*8) - bitD->bitsConsumed - nbBits, nbBits); #else /* this code path is slower on my os-x laptop */ U32 const regMask = sizeof(bitD->bitContainer)*8 - 1; return ((bitD->bitContainer << (bitD->bitsConsumed & regMask)) >> 1) >> ((regMask-nbBits) & regMask); #endif } /*! BIT_lookBitsFast() : * unsafe version; only works if nbBits >= 1 */ MEM_STATIC BitContainerType BIT_lookBitsFast(const BIT_DStream_t* bitD, U32 nbBits) { U32 const regMask = sizeof(bitD->bitContainer)*8 - 1; assert(nbBits >= 1); return (bitD->bitContainer << (bitD->bitsConsumed & regMask)) >> (((regMask+1)-nbBits) & regMask); } FORCE_INLINE_TEMPLATE void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits) { bitD->bitsConsumed += nbBits; } /*! BIT_readBits() : * Read (consume) next n bits from local register and update. * Pay attention to not read more than nbBits contained into local register. * @return : extracted value. */ FORCE_INLINE_TEMPLATE BitContainerType BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits) { BitContainerType const value = BIT_lookBits(bitD, nbBits); BIT_skipBits(bitD, nbBits); return value; } /*! BIT_readBitsFast() : * unsafe version; only works if nbBits >= 1 */ MEM_STATIC BitContainerType BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits) { BitContainerType const value = BIT_lookBitsFast(bitD, nbBits); assert(nbBits >= 1); BIT_skipBits(bitD, nbBits); return value; } /*! BIT_reloadDStream_internal() : * Simple variant of BIT_reloadDStream(), with two conditions: * 1. bitstream is valid : bitsConsumed <= sizeof(bitD->bitContainer)*8 * 2. look window is valid after shifted down : bitD->ptr >= bitD->start */ MEM_STATIC BIT_DStream_status BIT_reloadDStream_internal(BIT_DStream_t* bitD) { assert(bitD->bitsConsumed <= sizeof(bitD->bitContainer)*8); bitD->ptr -= bitD->bitsConsumed >> 3; assert(bitD->ptr >= bitD->start); bitD->bitsConsumed &= 7; bitD->bitContainer = MEM_readLEST(bitD->ptr); return BIT_DStream_unfinished; } /*! BIT_reloadDStreamFast() : * Similar to BIT_reloadDStream(), but with two differences: * 1. bitsConsumed <= sizeof(bitD->bitContainer)*8 must hold! * 2. Returns BIT_DStream_overflow when bitD->ptr < bitD->limitPtr, at this * point you must use BIT_reloadDStream() to reload. */ MEM_STATIC BIT_DStream_status BIT_reloadDStreamFast(BIT_DStream_t* bitD) { if (UNLIKELY(bitD->ptr < bitD->limitPtr)) return BIT_DStream_overflow; return BIT_reloadDStream_internal(bitD); } /*! BIT_reloadDStream() : * Refill `bitD` from buffer previously set in BIT_initDStream() . * This function is safe, it guarantees it will not never beyond src buffer. * @return : status of `BIT_DStream_t` internal register. * when status == BIT_DStream_unfinished, internal register is filled with at least 25 or 57 bits */ FORCE_INLINE_TEMPLATE BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD) { /* note : once in overflow mode, a bitstream remains in this mode until it's reset */ if (UNLIKELY(bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8))) { static const BitContainerType zeroFilled = 0; bitD->ptr = (const char*)&zeroFilled; /* aliasing is allowed for char */ /* overflow detected, erroneous scenario or end of stream: no update */ return BIT_DStream_overflow; } assert(bitD->ptr >= bitD->start); if (bitD->ptr >= bitD->limitPtr) { return BIT_reloadDStream_internal(bitD); } if (bitD->ptr == bitD->start) { /* reached end of bitStream => no update */ if (bitD->bitsConsumed < sizeof(bitD->bitContainer)*8) return BIT_DStream_endOfBuffer; return BIT_DStream_completed; } /* start < ptr < limitPtr => cautious update */ { U32 nbBytes = bitD->bitsConsumed >> 3; BIT_DStream_status result = BIT_DStream_unfinished; if (bitD->ptr - nbBytes < bitD->start) { nbBytes = (U32)(bitD->ptr - bitD->start); /* ptr > start */ result = BIT_DStream_endOfBuffer; } bitD->ptr -= nbBytes; bitD->bitsConsumed -= nbBytes*8; bitD->bitContainer = MEM_readLEST(bitD->ptr); /* reminder : srcSize > sizeof(bitD->bitContainer), otherwise bitD->ptr == bitD->start */ return result; } } /*! BIT_endOfDStream() : * @return : 1 if DStream has _exactly_ reached its end (all bits consumed). */ MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* DStream) { return ((DStream->ptr == DStream->start) && (DStream->bitsConsumed == sizeof(DStream->bitContainer)*8)); } #endif /* BITSTREAM_H_MODULE */
8535 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM x86_fpu #if !defined(_TRACE_FPU_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_FPU_H #include <linux/tracepoint.h> DECLARE_EVENT_CLASS(x86_fpu, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu), TP_STRUCT__entry( __field(struct fpu *, fpu) __field(bool, load_fpu) __field(u64, xfeatures) __field(u64, xcomp_bv) ), TP_fast_assign( __entry->fpu = fpu; __entry->load_fpu = test_thread_flag(TIF_NEED_FPU_LOAD); if (boot_cpu_has(X86_FEATURE_OSXSAVE)) { __entry->xfeatures = fpu->fpstate->regs.xsave.header.xfeatures; __entry->xcomp_bv = fpu->fpstate->regs.xsave.header.xcomp_bv; } ), TP_printk("x86/fpu: %p load: %d xfeatures: %llx xcomp_bv: %llx", __entry->fpu, __entry->load_fpu, __entry->xfeatures, __entry->xcomp_bv ) ); DEFINE_EVENT(x86_fpu, x86_fpu_before_save, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_after_save, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_regs_activated, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_regs_deactivated, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_dropped, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_copy_dst, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_xstate_check_failed, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH asm/trace/ #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_FILE fpu #endif /* _TRACE_FPU_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
2 3 3 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved * Copyright 2001-2006 Ian Kent <raven@themaw.net> */ #include <linux/sched/signal.h> #include "autofs_i.h" /* We make this a static variable rather than a part of the superblock; it * is better if we don't reassign numbers easily even across filesystems */ static autofs_wqt_t autofs_next_wait_queue = 1; void autofs_catatonic_mode(struct autofs_sb_info *sbi) { struct autofs_wait_queue *wq, *nwq; mutex_lock(&sbi->wq_mutex); if (sbi->flags & AUTOFS_SBI_CATATONIC) { mutex_unlock(&sbi->wq_mutex); return; } pr_debug("entering catatonic mode\n"); sbi->flags |= AUTOFS_SBI_CATATONIC; wq = sbi->queues; sbi->queues = NULL; /* Erase all wait queues */ while (wq) { nwq = wq->next; wq->status = -ENOENT; /* Magic is gone - report failure */ kfree(wq->name.name - wq->offset); wq->name.name = NULL; wake_up(&wq->queue); if (!--wq->wait_ctr) kfree(wq); wq = nwq; } fput(sbi->pipe); /* Close the pipe */ sbi->pipe = NULL; sbi->pipefd = -1; mutex_unlock(&sbi->wq_mutex); } static int autofs_write(struct autofs_sb_info *sbi, struct file *file, const void *addr, int bytes) { unsigned long sigpipe, flags; const char *data = (const char *)addr; ssize_t wr = 0; sigpipe = sigismember(&current->pending.signal, SIGPIPE); mutex_lock(&sbi->pipe_mutex); while (bytes) { wr = __kernel_write(file, data, bytes, NULL); if (wr <= 0) break; data += wr; bytes -= wr; } mutex_unlock(&sbi->pipe_mutex); /* Keep the currently executing process from receiving a * SIGPIPE unless it was already supposed to get one */ if (wr == -EPIPE && !sigpipe) { spin_lock_irqsave(&current->sighand->siglock, flags); sigdelset(&current->pending.signal, SIGPIPE); recalc_sigpending(); spin_unlock_irqrestore(&current->sighand->siglock, flags); } /* if 'wr' returned 0 (impossible) we assume -EIO (safe) */ return bytes == 0 ? 0 : wr < 0 ? wr : -EIO; } static void autofs_notify_daemon(struct autofs_sb_info *sbi, struct autofs_wait_queue *wq, int type) { union { struct autofs_packet_hdr hdr; union autofs_packet_union v4_pkt; union autofs_v5_packet_union v5_pkt; } pkt; struct file *pipe = NULL; size_t pktsz; int ret; pr_debug("wait id = 0x%08lx, name = %.*s, type=%d\n", (unsigned long) wq->wait_queue_token, wq->name.len, wq->name.name, type); memset(&pkt, 0, sizeof(pkt)); /* For security reasons */ pkt.hdr.proto_version = sbi->version; pkt.hdr.type = type; switch (type) { /* Kernel protocol v4 missing and expire packets */ case autofs_ptype_missing: { struct autofs_packet_missing *mp = &pkt.v4_pkt.missing; pktsz = sizeof(*mp); mp->wait_queue_token = wq->wait_queue_token; mp->len = wq->name.len; memcpy(mp->name, wq->name.name, wq->name.len); mp->name[wq->name.len] = '\0'; break; } case autofs_ptype_expire_multi: { struct autofs_packet_expire_multi *ep = &pkt.v4_pkt.expire_multi; pktsz = sizeof(*ep); ep->wait_queue_token = wq->wait_queue_token; ep->len = wq->name.len; memcpy(ep->name, wq->name.name, wq->name.len); ep->name[wq->name.len] = '\0'; break; } /* * Kernel protocol v5 packet for handling indirect and direct * mount missing and expire requests */ case autofs_ptype_missing_indirect: case autofs_ptype_expire_indirect: case autofs_ptype_missing_direct: case autofs_ptype_expire_direct: { struct autofs_v5_packet *packet = &pkt.v5_pkt.v5_packet; struct user_namespace *user_ns = sbi->pipe->f_cred->user_ns; pktsz = sizeof(*packet); packet->wait_queue_token = wq->wait_queue_token; packet->len = wq->name.len; memcpy(packet->name, wq->name.name, wq->name.len); packet->name[wq->name.len] = '\0'; packet->dev = wq->dev; packet->ino = wq->ino; packet->uid = from_kuid_munged(user_ns, wq->uid); packet->gid = from_kgid_munged(user_ns, wq->gid); packet->pid = wq->pid; packet->tgid = wq->tgid; break; } default: pr_warn("bad type %d!\n", type); mutex_unlock(&sbi->wq_mutex); return; } pipe = get_file(sbi->pipe); mutex_unlock(&sbi->wq_mutex); switch (ret = autofs_write(sbi, pipe, &pkt, pktsz)) { case 0: break; case -ENOMEM: case -ERESTARTSYS: /* Just fail this one */ autofs_wait_release(sbi, wq->wait_queue_token, ret); break; default: autofs_catatonic_mode(sbi); break; } fput(pipe); } static struct autofs_wait_queue * autofs_find_wait(struct autofs_sb_info *sbi, const struct qstr *qstr) { struct autofs_wait_queue *wq; for (wq = sbi->queues; wq; wq = wq->next) { if (wq->name.hash == qstr->hash && wq->name.len == qstr->len && wq->name.name && !memcmp(wq->name.name, qstr->name, qstr->len)) break; } return wq; } /* * Check if we have a valid request. * Returns * 1 if the request should continue. * In this case we can return an autofs_wait_queue entry if one is * found or NULL to idicate a new wait needs to be created. * 0 or a negative errno if the request shouldn't continue. */ static int validate_request(struct autofs_wait_queue **wait, struct autofs_sb_info *sbi, const struct qstr *qstr, const struct path *path, enum autofs_notify notify) { struct dentry *dentry = path->dentry; struct autofs_wait_queue *wq; struct autofs_info *ino; if (sbi->flags & AUTOFS_SBI_CATATONIC) return -ENOENT; /* Wait in progress, continue; */ wq = autofs_find_wait(sbi, qstr); if (wq) { *wait = wq; return 1; } *wait = NULL; /* If we don't yet have any info this is a new request */ ino = autofs_dentry_ino(dentry); if (!ino) return 1; /* * If we've been asked to wait on an existing expire (NFY_NONE) * but there is no wait in the queue ... */ if (notify == NFY_NONE) { /* * Either we've betean the pending expire to post it's * wait or it finished while we waited on the mutex. * So we need to wait till either, the wait appears * or the expire finishes. */ while (ino->flags & AUTOFS_INF_EXPIRING) { mutex_unlock(&sbi->wq_mutex); schedule_timeout_interruptible(HZ/10); if (mutex_lock_interruptible(&sbi->wq_mutex)) return -EINTR; if (sbi->flags & AUTOFS_SBI_CATATONIC) return -ENOENT; wq = autofs_find_wait(sbi, qstr); if (wq) { *wait = wq; return 1; } } /* * Not ideal but the status has already gone. Of the two * cases where we wait on NFY_NONE neither depend on the * return status of the wait. */ return 0; } /* * If we've been asked to trigger a mount and the request * completed while we waited on the mutex ... */ if (notify == NFY_MOUNT) { struct dentry *new = NULL; struct path this; int valid = 1; /* * If the dentry was successfully mounted while we slept * on the wait queue mutex we can return success. If it * isn't mounted (doesn't have submounts for the case of * a multi-mount with no mount at it's base) we can * continue on and create a new request. */ if (!IS_ROOT(dentry)) { if (d_unhashed(dentry) && d_really_is_positive(dentry)) { struct dentry *parent = dentry->d_parent; new = d_lookup(parent, &dentry->d_name); if (new) dentry = new; } } this.mnt = path->mnt; this.dentry = dentry; if (path_has_submounts(&this)) valid = 0; if (new) dput(new); return valid; } return 1; } int autofs_wait(struct autofs_sb_info *sbi, const struct path *path, enum autofs_notify notify) { struct dentry *dentry = path->dentry; struct autofs_wait_queue *wq; struct qstr qstr; char *name; int status, ret, type; unsigned int offset = 0; pid_t pid; pid_t tgid; /* In catatonic mode, we don't wait for nobody */ if (sbi->flags & AUTOFS_SBI_CATATONIC) return -ENOENT; /* * Try translating pids to the namespace of the daemon. * * Zero means failure: we are in an unrelated pid namespace. */ pid = task_pid_nr_ns(current, ns_of_pid(sbi->oz_pgrp)); tgid = task_tgid_nr_ns(current, ns_of_pid(sbi->oz_pgrp)); if (pid == 0 || tgid == 0) return -ENOENT; if (d_really_is_negative(dentry)) { /* * A wait for a negative dentry is invalid for certain * cases. A direct or offset mount "always" has its mount * point directory created and so the request dentry must * be positive or the map key doesn't exist. The situation * is very similar for indirect mounts except only dentrys * in the root of the autofs file system may be negative. */ if (autofs_type_trigger(sbi->type)) return -ENOENT; else if (!IS_ROOT(dentry->d_parent)) return -ENOENT; } name = kmalloc(NAME_MAX + 1, GFP_KERNEL); if (!name) return -ENOMEM; /* If this is a direct mount request create a dummy name */ if (IS_ROOT(dentry) && autofs_type_trigger(sbi->type)) { qstr.name = name; qstr.len = sprintf(name, "%p", dentry); } else { char *p = dentry_path_raw(dentry, name, NAME_MAX); if (IS_ERR(p)) { kfree(name); return -ENOENT; } qstr.name = ++p; // skip the leading slash qstr.len = strlen(p); offset = p - name; } qstr.hash = full_name_hash(dentry, qstr.name, qstr.len); if (mutex_lock_interruptible(&sbi->wq_mutex)) { kfree(name); return -EINTR; } ret = validate_request(&wq, sbi, &qstr, path, notify); if (ret <= 0) { if (ret != -EINTR) mutex_unlock(&sbi->wq_mutex); kfree(name); return ret; } if (!wq) { /* Create a new wait queue */ wq = kmalloc_obj(struct autofs_wait_queue); if (!wq) { kfree(name); mutex_unlock(&sbi->wq_mutex); return -ENOMEM; } wq->wait_queue_token = autofs_next_wait_queue; if (++autofs_next_wait_queue == 0) autofs_next_wait_queue = 1; wq->next = sbi->queues; sbi->queues = wq; init_waitqueue_head(&wq->queue); memcpy(&wq->name, &qstr, sizeof(struct qstr)); wq->offset = offset; wq->dev = autofs_get_dev(sbi); wq->ino = autofs_get_ino(sbi); wq->uid = current_uid(); wq->gid = current_gid(); wq->pid = pid; wq->tgid = tgid; wq->status = -EINTR; /* Status return if interrupted */ wq->wait_ctr = 2; if (sbi->version < 5) { if (notify == NFY_MOUNT) type = autofs_ptype_missing; else type = autofs_ptype_expire_multi; } else { if (notify == NFY_MOUNT) type = autofs_type_trigger(sbi->type) ? autofs_ptype_missing_direct : autofs_ptype_missing_indirect; else type = autofs_type_trigger(sbi->type) ? autofs_ptype_expire_direct : autofs_ptype_expire_indirect; } pr_debug("new wait id = 0x%08lx, name = %.*s, nfy=%d\n", (unsigned long) wq->wait_queue_token, wq->name.len, wq->name.name, notify); /* * autofs_notify_daemon() may block; it will unlock ->wq_mutex */ autofs_notify_daemon(sbi, wq, type); } else { wq->wait_ctr++; pr_debug("existing wait id = 0x%08lx, name = %.*s, nfy=%d\n", (unsigned long) wq->wait_queue_token, wq->name.len, wq->name.name, notify); mutex_unlock(&sbi->wq_mutex); kfree(name); } /* * wq->name.name is NULL iff the lock is already released * or the mount has been made catatonic. */ wait_event_killable(wq->queue, wq->name.name == NULL); status = wq->status; /* * For direct and offset mounts we need to track the requester's * uid and gid in the dentry info struct. This is so it can be * supplied, on request, by the misc device ioctl interface. * This is needed during daemon resatart when reconnecting * to existing, active, autofs mounts. The uid and gid (and * related string values) may be used for macro substitution * in autofs mount maps. */ if (!status) { struct autofs_info *ino; struct dentry *de = NULL; /* direct mount or browsable map */ ino = autofs_dentry_ino(dentry); if (!ino) { /* If not lookup actual dentry used */ de = d_lookup(dentry->d_parent, &dentry->d_name); if (de) ino = autofs_dentry_ino(de); } /* Set mount requester */ if (ino) { spin_lock(&sbi->fs_lock); ino->uid = wq->uid; ino->gid = wq->gid; spin_unlock(&sbi->fs_lock); } if (de) dput(de); } /* Are we the last process to need status? */ mutex_lock(&sbi->wq_mutex); if (!--wq->wait_ctr) kfree(wq); mutex_unlock(&sbi->wq_mutex); return status; } int autofs_wait_release(struct autofs_sb_info *sbi, autofs_wqt_t wait_queue_token, int status) { struct autofs_wait_queue *wq, **wql; mutex_lock(&sbi->wq_mutex); for (wql = &sbi->queues; (wq = *wql) != NULL; wql = &wq->next) { if (wq->wait_queue_token == wait_queue_token) break; } if (!wq) { mutex_unlock(&sbi->wq_mutex); return -EINVAL; } *wql = wq->next; /* Unlink from chain */ kfree(wq->name.name - wq->offset); wq->name.name = NULL; /* Do not wait on this queue */ wq->status = status; wake_up(&wq->queue); if (!--wq->wait_ctr) kfree(wq); mutex_unlock(&sbi->wq_mutex); return 0; }
24 263 263 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2008 Oracle. All rights reserved. */ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/mm.h> #include <linux/init.h> #include <linux/err.h> #include <linux/sched.h> #include <linux/pagemap.h> #include <linux/bio.h> #include <linux/lzo.h> #include <linux/refcount.h> #include "messages.h" #include "compression.h" #include "ctree.h" #include "super.h" #include "btrfs_inode.h" #define LZO_LEN 4 /* * Btrfs LZO compression format * * Regular and inlined LZO compressed data extents consist of: * * 1. Header * Fixed size. LZO_LEN (4) bytes long, LE32. * Records the total size (including the header) of compressed data. * * 2. Segment(s) * Variable size. Each segment includes one segment header, followed by data * payload. * One regular LZO compressed extent can have one or more segments. * For inlined LZO compressed extent, only one segment is allowed. * One segment represents at most one sector of uncompressed data. * * 2.1 Segment header * Fixed size. LZO_LEN (4) bytes long, LE32. * Records the total size of the segment (not including the header). * Segment header never crosses sector boundary, thus it's possible to * have at most 3 padding zeros at the end of the sector. * * 2.2 Data Payload * Variable size. Size up limit should be lzo1x_worst_compress(sectorsize) * which is 4419 for a 4KiB sectorsize. * * Example with 4K sectorsize: * Page 1: * 0 0x2 0x4 0x6 0x8 0xa 0xc 0xe 0x10 * 0x0000 | Header | SegHdr 01 | Data payload 01 ... | * ... * 0x0ff0 | SegHdr N | Data payload N ... |00| * ^^ padding zeros * Page 2: * 0x1000 | SegHdr N+1| Data payload N+1 ... | */ struct workspace { void *mem; void *buf; /* where decompressed data goes */ void *cbuf; /* where compressed data goes */ struct list_head list; }; static u32 workspace_buf_length(const struct btrfs_fs_info *fs_info) { return lzo1x_worst_compress(fs_info->sectorsize); } static u32 workspace_cbuf_length(const struct btrfs_fs_info *fs_info) { return lzo1x_worst_compress(fs_info->sectorsize); } void lzo_free_workspace(struct list_head *ws) { struct workspace *workspace = list_entry(ws, struct workspace, list); kvfree(workspace->buf); kvfree(workspace->cbuf); kvfree(workspace->mem); kfree(workspace); } struct list_head *lzo_alloc_workspace(struct btrfs_fs_info *fs_info) { struct workspace *workspace; workspace = kzalloc_obj(*workspace); if (!workspace) return ERR_PTR(-ENOMEM); workspace->mem = kvmalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL | __GFP_NOWARN); workspace->buf = kvmalloc(workspace_buf_length(fs_info), GFP_KERNEL | __GFP_NOWARN); workspace->cbuf = kvmalloc(workspace_cbuf_length(fs_info), GFP_KERNEL | __GFP_NOWARN); if (!workspace->mem || !workspace->buf || !workspace->cbuf) goto fail; INIT_LIST_HEAD(&workspace->list); return &workspace->list; fail: lzo_free_workspace(&workspace->list); return ERR_PTR(-ENOMEM); } static inline void write_compress_length(char *buf, size_t len) { __le32 dlen; dlen = cpu_to_le32(len); memcpy(buf, &dlen, LZO_LEN); } static inline size_t read_compress_length(const char *buf) { __le32 dlen; memcpy(&dlen, buf, LZO_LEN); return le32_to_cpu(dlen); } /* * Write data into @out_folio and queue it into @out_bio. * * Return 0 if everything is fine and @total_out will be increased. * Return <0 for error. * * The @out_folio can be NULL after a full folio is queued. * Thus the caller should check and allocate a new folio when needed. */ static int write_and_queue_folio(struct bio *out_bio, struct folio **out_folio, u32 *total_out, u32 write_len) { const u32 fsize = folio_size(*out_folio); const u32 foffset = offset_in_folio(*out_folio, *total_out); ASSERT(out_folio && *out_folio); /* Should not cross folio boundary. */ ASSERT(foffset + write_len <= fsize); /* We can not use bio_add_folio_nofail() which doesn't do any merge. */ if (!bio_add_folio(out_bio, *out_folio, write_len, foffset)) { /* * We have allocated a bio that havs BTRFS_MAX_COMPRESSED_PAGES * vecs, and all ranges inside the same folio should have been * merged. If bio_add_folio() still failed, that means we have * reached the bvec limits. * * This should only happen at the beginning of a folio, and * caller is responsible for releasing the folio, since it's * not yet queued into the bio. */ ASSERT(IS_ALIGNED(*total_out, fsize)); return -E2BIG; } *total_out += write_len; /* * The full folio has been filled and queued, reset @out_folio to NULL, * so that error handling is fully handled by the bio. */ if (IS_ALIGNED(*total_out, fsize)) *out_folio = NULL; return 0; } /* * Copy compressed data to bio. * * @out_bio: The bio that will contain all the compressed data. * @compressed_data: The compressed data of this segment. * @compressed_size: The size of the compressed data. * @out_folio: The current output folio, will be updated if a new * folio is allocated. * @total_out: The total bytes of current output. * @max_out: The maximum size of the compressed data. * * Will do: * * - Write a segment header into the destination * - Copy the compressed buffer into the destination * - Make sure we have enough space in the last sector to fit a segment header * If not, we will pad at most (LZO_LEN (4)) - 1 bytes of zeros. * - If a full folio is filled, it will be queued into @out_bio, and @out_folio * will be updated. * * Will allocate new pages when needed. */ static int copy_compressed_data_to_bio(struct btrfs_fs_info *fs_info, struct bio *out_bio, const char *compressed_data, size_t compressed_size, struct folio **out_folio, u32 *total_out, u32 max_out) { const u32 sectorsize = fs_info->sectorsize; const u32 sectorsize_bits = fs_info->sectorsize_bits; const u32 fsize = btrfs_min_folio_size(fs_info); const u32 old_size = out_bio->bi_iter.bi_size; u32 copy_start; u32 sector_bytes_left; char *kaddr; int ret; ASSERT(out_folio); /* There should be at least a lzo header queued. */ ASSERT(old_size); ASSERT(old_size == *total_out); /* * We never allow a segment header crossing sector boundary, previous * run should ensure we have enough space left inside the sector. */ ASSERT((old_size >> sectorsize_bits) == (old_size + LZO_LEN - 1) >> sectorsize_bits); if (!*out_folio) { *out_folio = btrfs_alloc_compr_folio(fs_info); if (!*out_folio) return -ENOMEM; } /* Write the segment header first. */ kaddr = kmap_local_folio(*out_folio, offset_in_folio(*out_folio, *total_out)); write_compress_length(kaddr, compressed_size); kunmap_local(kaddr); ret = write_and_queue_folio(out_bio, out_folio, total_out, LZO_LEN); if (ret < 0) return ret; copy_start = *total_out; /* Copy compressed data. */ while (*total_out - copy_start < compressed_size) { u32 copy_len = min_t(u32, sectorsize - *total_out % sectorsize, copy_start + compressed_size - *total_out); u32 foffset = *total_out & (fsize - 1); /* With the range copied, we're larger than the original range. */ if (((*total_out + copy_len) >> sectorsize_bits) >= max_out >> sectorsize_bits) return -E2BIG; if (!*out_folio) { *out_folio = btrfs_alloc_compr_folio(fs_info); if (!*out_folio) return -ENOMEM; } kaddr = kmap_local_folio(*out_folio, foffset); memcpy(kaddr, compressed_data + *total_out - copy_start, copy_len); kunmap_local(kaddr); ret = write_and_queue_folio(out_bio, out_folio, total_out, copy_len); if (ret < 0) return ret; } /* * Check if we can fit the next segment header into the remaining space * of the sector. */ sector_bytes_left = round_up(*total_out, sectorsize) - *total_out; if (sector_bytes_left >= LZO_LEN || sector_bytes_left == 0) return 0; ASSERT(*out_folio); /* The remaining size is not enough, pad it with zeros */ folio_zero_range(*out_folio, offset_in_folio(*out_folio, *total_out), sector_bytes_left); return write_and_queue_folio(out_bio, out_folio, total_out, sector_bytes_left); } int lzo_compress_bio(struct list_head *ws, struct compressed_bio *cb) { struct btrfs_inode *inode = cb->bbio.inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; struct workspace *workspace = list_entry(ws, struct workspace, list); struct bio *bio = &cb->bbio.bio; const u64 start = cb->start; const u32 len = cb->len; const u32 sectorsize = fs_info->sectorsize; const u32 min_folio_size = btrfs_min_folio_size(fs_info); struct address_space *mapping = inode->vfs_inode.i_mapping; struct folio *folio_in = NULL; struct folio *folio_out = NULL; char *sizes_ptr; int ret = 0; /* Points to the file offset of input data. */ u64 cur_in = start; /* Points to the current output byte. */ u32 total_out = 0; ASSERT(bio->bi_iter.bi_size == 0); ASSERT(len); folio_out = btrfs_alloc_compr_folio(fs_info); if (!folio_out) return -ENOMEM; /* Queue a segment header first. */ ret = write_and_queue_folio(bio, &folio_out, &total_out, LZO_LEN); /* The first header should not fail. */ ASSERT(ret == 0); while (cur_in < start + len) { char *data_in; const u32 sectorsize_mask = sectorsize - 1; u32 sector_off = (cur_in - start) & sectorsize_mask; u32 in_len; size_t out_len; /* Get the input page first. */ if (!folio_in) { ret = btrfs_compress_filemap_get_folio(mapping, cur_in, &folio_in); if (ret < 0) goto out; } /* Compress at most one sector of data each time. */ in_len = min_t(u32, start + len - cur_in, sectorsize - sector_off); ASSERT(in_len); data_in = kmap_local_folio(folio_in, offset_in_folio(folio_in, cur_in)); ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf, &out_len, workspace->mem); kunmap_local(data_in); if (unlikely(ret < 0)) { /* lzo1x_1_compress never fails. */ ret = -EIO; goto out; } ret = copy_compressed_data_to_bio(fs_info, bio, workspace->cbuf, out_len, &folio_out, &total_out, len); if (ret < 0) goto out; cur_in += in_len; /* * Check if we're making it bigger after two sectors. And if * it is so, give up. */ if (cur_in - start > sectorsize * 2 && cur_in - start < total_out) { ret = -E2BIG; goto out; } /* Check if we have reached input folio boundary. */ if (IS_ALIGNED(cur_in, min_folio_size)) { folio_put(folio_in); folio_in = NULL; } } /* * The last folio is already queued. Bio is responsible for freeing * those folios now. */ folio_out = NULL; /* Store the size of all chunks of compressed data */ sizes_ptr = kmap_local_folio(bio_first_folio_all(bio), 0); write_compress_length(sizes_ptr, total_out); kunmap_local(sizes_ptr); out: /* * We can only free the folio that has no part queued into the bio. * * As any folio that is already queued into bio will be released by * the endio function of bio. */ if (folio_out && IS_ALIGNED(total_out, min_folio_size)) { btrfs_free_compr_folio(folio_out); folio_out = NULL; } if (folio_in) folio_put(folio_in); return ret; } static struct folio *get_current_folio(struct compressed_bio *cb, struct folio_iter *fi, u32 *cur_folio_index, u32 cur_in) { struct btrfs_fs_info *fs_info = cb_to_fs_info(cb); const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; ASSERT(cur_folio_index); /* Need to switch to the next folio. */ if (cur_in >> min_folio_shift != *cur_folio_index) { /* We can only do the switch one folio a time. */ ASSERT(cur_in >> min_folio_shift == *cur_folio_index + 1); bio_next_folio(fi, &cb->bbio.bio); (*cur_folio_index)++; } return fi->folio; } /* * Copy the compressed segment payload into @dest. * * For the payload there will be no padding, just need to do page switching. */ static void copy_compressed_segment(struct compressed_bio *cb, struct folio_iter *fi, u32 *cur_folio_index, char *dest, u32 len, u32 *cur_in) { u32 orig_in = *cur_in; while (*cur_in < orig_in + len) { struct folio *cur_folio = get_current_folio(cb, fi, cur_folio_index, *cur_in); u32 copy_len; ASSERT(cur_folio); copy_len = min_t(u32, orig_in + len - *cur_in, folio_size(cur_folio) - offset_in_folio(cur_folio, *cur_in)); ASSERT(copy_len); memcpy_from_folio(dest + *cur_in - orig_in, cur_folio, offset_in_folio(cur_folio, *cur_in), copy_len); *cur_in += copy_len; } } int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) { struct workspace *workspace = list_entry(ws, struct workspace, list); struct btrfs_fs_info *fs_info = cb->bbio.inode->root->fs_info; const u32 sectorsize = fs_info->sectorsize; struct folio_iter fi; char *kaddr; int ret; /* Compressed data length, can be unaligned */ u32 len_in; /* Offset inside the compressed data */ u32 cur_in = 0; /* Bytes decompressed so far */ u32 cur_out = 0; /* The current folio index number inside the bio. */ u32 cur_folio_index = 0; bio_first_folio(&fi, &cb->bbio.bio, 0); /* There must be a compressed folio and matches the sectorsize. */ if (unlikely(!fi.folio)) return -EINVAL; ASSERT(folio_size(fi.folio) == btrfs_min_folio_size(fs_info)); kaddr = kmap_local_folio(fi.folio, 0); len_in = read_compress_length(kaddr); kunmap_local(kaddr); cur_in += LZO_LEN; /* * LZO header length check * * The total length should not exceed the maximum extent length, * and all sectors should be used. * If this happens, it means the compressed extent is corrupted. */ if (unlikely(len_in > min_t(size_t, BTRFS_MAX_COMPRESSED, cb->compressed_len) || round_up(len_in, sectorsize) < cb->compressed_len)) { struct btrfs_inode *inode = cb->bbio.inode; btrfs_err(fs_info, "lzo header invalid, root %llu inode %llu offset %llu lzo len %u compressed len %u", btrfs_root_id(inode->root), btrfs_ino(inode), cb->start, len_in, cb->compressed_len); return -EUCLEAN; } /* Go through each lzo segment */ while (cur_in < len_in) { struct folio *cur_folio; /* Length of the compressed segment */ u32 seg_len; u32 sector_bytes_left; size_t out_len = lzo1x_worst_compress(sectorsize); /* * We should always have enough space for one segment header * inside current sector. */ ASSERT(cur_in / sectorsize == (cur_in + LZO_LEN - 1) / sectorsize); cur_folio = get_current_folio(cb, &fi, &cur_folio_index, cur_in); ASSERT(cur_folio); kaddr = kmap_local_folio(cur_folio, 0); seg_len = read_compress_length(kaddr + offset_in_folio(cur_folio, cur_in)); kunmap_local(kaddr); cur_in += LZO_LEN; if (unlikely(seg_len > workspace_cbuf_length(fs_info))) { struct btrfs_inode *inode = cb->bbio.inode; /* * seg_len shouldn't be larger than we have allocated * for workspace->cbuf */ btrfs_err(fs_info, "lzo segment too big, root %llu inode %llu offset %llu len %u", btrfs_root_id(inode->root), btrfs_ino(inode), cb->start, seg_len); return -EIO; } /* Copy the compressed segment payload into workspace */ copy_compressed_segment(cb, &fi, &cur_folio_index, workspace->cbuf, seg_len, &cur_in); /* Decompress the data */ ret = lzo1x_decompress_safe(workspace->cbuf, seg_len, workspace->buf, &out_len); if (unlikely(ret != LZO_E_OK)) { struct btrfs_inode *inode = cb->bbio.inode; btrfs_err(fs_info, "lzo decompression failed, error %d root %llu inode %llu offset %llu", ret, btrfs_root_id(inode->root), btrfs_ino(inode), cb->start); return -EIO; } /* Copy the data into inode pages */ ret = btrfs_decompress_buf2page(workspace->buf, out_len, cb, cur_out); cur_out += out_len; /* All data read, exit */ if (ret == 0) return 0; ret = 0; /* Check if the sector has enough space for a segment header */ sector_bytes_left = sectorsize - (cur_in % sectorsize); if (sector_bytes_left >= LZO_LEN) continue; /* Skip the padding zeros */ cur_in += sector_bytes_left; } return 0; } int lzo_decompress(struct list_head *ws, const u8 *data_in, struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen) { struct workspace *workspace = list_entry(ws, struct workspace, list); struct btrfs_fs_info *fs_info = folio_to_fs_info(dest_folio); const u32 sectorsize = fs_info->sectorsize; size_t in_len; size_t out_len; size_t max_segment_len = workspace_buf_length(fs_info); int ret; if (unlikely(srclen < LZO_LEN || srclen > max_segment_len + LZO_LEN * 2)) return -EUCLEAN; in_len = read_compress_length(data_in); if (unlikely(in_len != srclen)) return -EUCLEAN; data_in += LZO_LEN; in_len = read_compress_length(data_in); if (unlikely(in_len != srclen - LZO_LEN * 2)) return -EUCLEAN; data_in += LZO_LEN; out_len = sectorsize; ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len); if (unlikely(ret != LZO_E_OK)) { struct btrfs_inode *inode = folio_to_inode(dest_folio); btrfs_err(fs_info, "lzo decompression failed, error %d root %llu inode %llu offset %llu", ret, btrfs_root_id(inode->root), btrfs_ino(inode), folio_pos(dest_folio)); return -EIO; } ASSERT(out_len <= sectorsize); memcpy_to_folio(dest_folio, dest_pgoff, workspace->buf, out_len); /* Early end, considered as an error. */ if (unlikely(out_len < destlen)) { folio_zero_range(dest_folio, dest_pgoff + out_len, destlen - out_len); return -EIO; } return 0; } const struct btrfs_compress_levels btrfs_lzo_compress = { .max_level = 1, .default_level = 1, };
62 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2014 Facebook. All rights reserved. */ #ifndef BTRFS_QGROUP_H #define BTRFS_QGROUP_H #include <linux/types.h> #include <linux/spinlock.h> #include <linux/rbtree.h> #include <linux/kobject.h> #include <linux/list.h> #include <uapi/linux/btrfs_tree.h> struct extent_buffer; struct extent_changeset; struct btrfs_delayed_extent_op; struct btrfs_fs_info; struct btrfs_root; struct btrfs_ioctl_quota_ctl_args; struct btrfs_trans_handle; struct btrfs_delayed_ref_root; struct btrfs_inode; struct btrfs_transaction; struct btrfs_block_group; struct btrfs_qgroup_swapped_blocks; /* * Btrfs qgroup overview * * Btrfs qgroup splits into 3 main part: * 1) Reserve * Reserve metadata/data space for incoming operations * Affect how qgroup limit works * * 2) Trace * Tell btrfs qgroup to trace dirty extents. * * Dirty extents including: * - Newly allocated extents * - Extents going to be deleted (in this trans) * - Extents whose owner is going to be modified * * This is the main part affects whether qgroup numbers will stay * consistent. * Btrfs qgroup can trace clean extents and won't cause any problem, * but it will consume extra CPU time, it should be avoided if possible. * * 3) Account * Btrfs qgroup will updates its numbers, based on dirty extents traced * in previous step. * * Normally at qgroup rescan and transaction commit time. */ /* * Special performance optimization for balance. * * For balance, we need to swap subtree of subvolume and reloc trees. * In theory, we need to trace all subtree blocks of both subvolume and reloc * trees, since their owner has changed during such swap. * * However since balance has ensured that both subtrees are containing the * same contents and have the same tree structures, such swap won't cause * qgroup number change. * * But there is a race window between subtree swap and transaction commit, * during that window, if we increase/decrease tree level or merge/split tree * blocks, we still need to trace the original subtrees. * * So for balance, we use a delayed subtree tracing, whose workflow is: * * 1) Record the subtree root block get swapped. * * During subtree swap: * O = Old tree blocks * N = New tree blocks * reloc tree subvolume tree X * Root Root * / \ / \ * NA OB OA OB * / | | \ / | | \ * NC ND OE OF OC OD OE OF * * In this case, NA and OA are going to be swapped, record (NA, OA) into * subvolume tree X. * * 2) After subtree swap. * reloc tree subvolume tree X * Root Root * / \ / \ * OA OB NA OB * / | | \ / | | \ * OC OD OE OF NC ND OE OF * * 3a) COW happens for OB * If we are going to COW tree block OB, we check OB's bytenr against * tree X's swapped_blocks structure. * If it doesn't fit any, nothing will happen. * * 3b) COW happens for NA * Check NA's bytenr against tree X's swapped_blocks, and get a hit. * Then we do subtree scan on both subtrees OA and NA. * Resulting 6 tree blocks to be scanned (OA, OC, OD, NA, NC, ND). * * Then no matter what we do to subvolume tree X, qgroup numbers will * still be correct. * Then NA's record gets removed from X's swapped_blocks. * * 4) Transaction commit * Any record in X's swapped_blocks gets removed, since there is no * modification to the swapped subtrees, no need to trigger heavy qgroup * subtree rescan for them. */ /* * These flags share the flags field of the btrfs_qgroup_status_item with the * persisted flags defined in btrfs_tree.h. * * To minimize the chance of collision with new persisted status flags, these * count backwards from the MSB. */ #define BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN (1ULL << 63) #define BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING (1ULL << 62) #define BTRFS_QGROUP_DROP_SUBTREE_THRES_DEFAULT (3) /* * Record a dirty extent, and info qgroup to update quota on it */ struct btrfs_qgroup_extent_record { /* * The bytenr of the extent is given by its index in the dirty_extents * xarray of struct btrfs_delayed_ref_root left shifted by * fs_info->sectorsize_bits. */ u64 num_bytes; /* * For qgroup reserved data space freeing. * * @data_rsv_refroot and @data_rsv will be recorded after * BTRFS_ADD_DELAYED_EXTENT is called. * And will be used to free reserved qgroup space at * transaction commit time. */ u32 data_rsv; /* reserved data space needs to be freed */ u64 data_rsv_refroot; /* which root the reserved data belongs to */ struct ulist *old_roots; }; struct btrfs_qgroup_swapped_block { struct rb_node node; int level; bool trace_leaf; /* bytenr/generation of the tree block in subvolume tree after swap */ u64 subvol_bytenr; u64 subvol_generation; /* bytenr/generation of the tree block in reloc tree after swap */ u64 reloc_bytenr; u64 reloc_generation; u64 last_snapshot; struct btrfs_key first_key; }; /* * Qgroup reservation types: * * DATA: * space reserved for data * * META_PERTRANS: * Space reserved for metadata (per-transaction) * Due to the fact that qgroup data is only updated at transaction commit * time, reserved space for metadata must be kept until transaction * commits. * Any metadata reserved that are used in btrfs_start_transaction() should * be of this type. * * META_PREALLOC: * There are cases where metadata space is reserved before starting * transaction, and then btrfs_join_transaction() to get a trans handle. * Any metadata reserved for such usage should be of this type. * And after join_transaction() part (or all) of such reservation should * be converted into META_PERTRANS. */ enum btrfs_qgroup_rsv_type { BTRFS_QGROUP_RSV_DATA, BTRFS_QGROUP_RSV_META_PERTRANS, BTRFS_QGROUP_RSV_META_PREALLOC, BTRFS_QGROUP_RSV_LAST, }; /* * Represents how many bytes we have reserved for this qgroup. * * Each type should have different reservation behavior. * E.g, data follows its io_tree flag modification, while * *currently* meta is just reserve-and-clear during transaction. * * TODO: Add new type for reservation which can survive transaction commit. * Current metadata reservation behavior is not suitable for such case. */ struct btrfs_qgroup_rsv { u64 values[BTRFS_QGROUP_RSV_LAST]; }; /* * one struct for each qgroup, organized in fs_info->qgroup_tree. */ struct btrfs_qgroup { u64 qgroupid; /* * state */ u64 rfer; /* referenced */ u64 rfer_cmpr; /* referenced compressed */ u64 excl; /* exclusive */ u64 excl_cmpr; /* exclusive compressed */ /* * limits */ u64 lim_flags; /* which limits are set */ u64 max_rfer; u64 max_excl; u64 rsv_rfer; u64 rsv_excl; /* * reservation tracking */ struct btrfs_qgroup_rsv rsv; /* * lists */ struct list_head groups; /* groups this group is member of */ struct list_head members; /* groups that are members of this group */ struct list_head dirty; /* dirty groups */ /* * For qgroup iteration usage. * * The iteration list should always be empty until qgroup_iterator_add() * is called. And should be reset to empty after the iteration is * finished. */ struct list_head iterator; /* * For nested iterator usage. * * Here we support at most one level of nested iterator calls like: * * LIST_HEAD(all_qgroups); * { * LIST_HEAD(local_qgroups); * qgroup_iterator_add(local_qgroups, qg); * qgroup_iterator_nested_add(all_qgroups, qg); * do_some_work(local_qgroups); * qgroup_iterator_clean(local_qgroups); * } * do_some_work(all_qgroups); * qgroup_iterator_nested_clean(all_qgroups); */ struct list_head nested_iterator; struct rb_node node; /* tree of qgroups */ /* * temp variables for accounting operations * Refer to qgroup_shared_accounting() for details. */ u64 old_refcnt; u64 new_refcnt; /* * Sysfs kobjectid */ struct kobject kobj; }; /* Glue structure to represent the relations between qgroups. */ struct btrfs_qgroup_list { struct list_head next_group; struct list_head next_member; struct btrfs_qgroup *group; struct btrfs_qgroup *member; }; struct btrfs_squota_delta { /* The fstree root this delta counts against. */ u64 root; /* The number of bytes in the extent being counted. */ u64 num_bytes; /* The generation the extent was created in. */ u64 generation; /* Whether we are using or freeing the extent. */ bool is_inc; /* Whether the extent is data or metadata. */ bool is_data; }; static inline u64 btrfs_qgroup_subvolid(u64 qgroupid) { return (qgroupid & ((1ULL << BTRFS_QGROUP_LEVEL_SHIFT) - 1)); } /* * For qgroup event trace points only */ enum { ENUM_BIT(QGROUP_RESERVE), ENUM_BIT(QGROUP_RELEASE), ENUM_BIT(QGROUP_FREE), }; enum btrfs_qgroup_mode { BTRFS_QGROUP_MODE_DISABLED, BTRFS_QGROUP_MODE_FULL, BTRFS_QGROUP_MODE_SIMPLE }; enum btrfs_qgroup_mode btrfs_qgroup_mode(const struct btrfs_fs_info *fs_info); bool btrfs_qgroup_enabled(const struct btrfs_fs_info *fs_info); bool btrfs_qgroup_full_accounting(const struct btrfs_fs_info *fs_info); int btrfs_quota_enable(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_quota_ctl_args *quota_ctl_args); int btrfs_quota_disable(struct btrfs_fs_info *fs_info); int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info); void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info); int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info, bool interruptible); int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst, struct btrfs_qgroup_list *prealloc); int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst); int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid); int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid); int btrfs_qgroup_cleanup_dropped_subvolume(struct btrfs_fs_info *fs_info, u64 subvolid); int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid, struct btrfs_qgroup_limit *limit); int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info); void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info); int btrfs_qgroup_trace_extent_nolock( struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_qgroup_extent_record *record, u64 bytenr); int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, struct btrfs_qgroup_extent_record *qrecord, u64 bytenr); int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes); int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans, struct extent_buffer *eb); int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, struct extent_buffer *root_eb, u64 root_gen, int root_level); int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, struct ulist *old_roots, struct ulist *new_roots); int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans); int btrfs_run_qgroups(struct btrfs_trans_handle *trans); int btrfs_qgroup_check_inherit(struct btrfs_fs_info *fs_info, struct btrfs_qgroup_inherit *inherit, size_t size); int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, u64 objectid, u64 inode_rootid, struct btrfs_qgroup_inherit *inherit); void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, u64 ref_root, u64 num_bytes, enum btrfs_qgroup_rsv_type type); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS int btrfs_verify_qgroup_counts(const struct btrfs_fs_info *fs_info, u64 qgroupid, u64 rfer, u64 excl); #endif /* New io_tree based accurate qgroup reserve API */ int btrfs_qgroup_reserve_data(struct btrfs_inode *inode, struct extent_changeset **reserved, u64 start, u64 len); int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len, u64 *released); int btrfs_qgroup_free_data(struct btrfs_inode *inode, struct extent_changeset *reserved, u64 start, u64 len, u64 *freed); int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, enum btrfs_qgroup_rsv_type type, bool enforce); int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, enum btrfs_qgroup_rsv_type type, bool enforce, bool noflush); /* Reserve metadata space for pertrans and prealloc type */ static inline int btrfs_qgroup_reserve_meta_pertrans(struct btrfs_root *root, int num_bytes, bool enforce) { return __btrfs_qgroup_reserve_meta(root, num_bytes, BTRFS_QGROUP_RSV_META_PERTRANS, enforce, false); } static inline int btrfs_qgroup_reserve_meta_prealloc(struct btrfs_root *root, int num_bytes, bool enforce, bool noflush) { return __btrfs_qgroup_reserve_meta(root, num_bytes, BTRFS_QGROUP_RSV_META_PREALLOC, enforce, noflush); } void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes, enum btrfs_qgroup_rsv_type type); /* Free per-transaction meta reservation for error handling */ static inline void btrfs_qgroup_free_meta_pertrans(struct btrfs_root *root, int num_bytes) { __btrfs_qgroup_free_meta(root, num_bytes, BTRFS_QGROUP_RSV_META_PERTRANS); } /* Pre-allocated meta reservation can be freed at need */ static inline void btrfs_qgroup_free_meta_prealloc(struct btrfs_root *root, int num_bytes) { __btrfs_qgroup_free_meta(root, num_bytes, BTRFS_QGROUP_RSV_META_PREALLOC); } void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root); void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes); void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode); /* btrfs_qgroup_swapped_blocks related functions */ void btrfs_qgroup_init_swapped_blocks( struct btrfs_qgroup_swapped_blocks *swapped_blocks); void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root); int btrfs_qgroup_add_swapped_blocks(struct btrfs_root *subvol_root, struct btrfs_block_group *bg, struct extent_buffer *subvol_parent, int subvol_slot, struct extent_buffer *reloc_parent, int reloc_slot, u64 last_snapshot); int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *eb); void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans); bool btrfs_check_quota_leak(const struct btrfs_fs_info *fs_info); int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info, const struct btrfs_squota_delta *delta); #endif
3 3 3 1 6 2 1 1 1 1 3 4 165 163 1 2 164 164 3 3 3 8 10 10 1 1 3 3 3 1 3 3 3 3 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 // SPDX-License-Identifier: GPL-2.0-or-later /* -*- linux-c -*- --------------------------------------------------------- * * * linux/fs/devpts/inode.c * * Copyright 1998-2004 H. Peter Anvin -- All Rights Reserved * * ------------------------------------------------------------------------- */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/init.h> #include <linux/fs.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/sched.h> #include <linux/namei.h> #include <linux/slab.h> #include <linux/mount.h> #include <linux/tty.h> #include <linux/mutex.h> #include <linux/magic.h> #include <linux/idr.h> #include <linux/devpts_fs.h> #include <linux/fsnotify.h> #include <linux/seq_file.h> #define DEVPTS_DEFAULT_MODE 0600 /* * ptmx is a new node in /dev/pts and will be unused in legacy (single- * instance) mode. To prevent surprises in user space, set permissions of * ptmx to 0. Use 'chmod' or remount with '-o ptmxmode' to set meaningful * permissions. */ #define DEVPTS_DEFAULT_PTMX_MODE 0000 #define PTMX_MINOR 2 /* * sysctl support for setting limits on the number of Unix98 ptys allocated. * Otherwise one can eat up all kernel memory by opening /dev/ptmx repeatedly. */ static int pty_limit = NR_UNIX98_PTY_DEFAULT; static int pty_reserve = NR_UNIX98_PTY_RESERVE; static int pty_limit_min; static int pty_limit_max = INT_MAX; static atomic_t pty_count = ATOMIC_INIT(0); static const struct ctl_table pty_table[] = { { .procname = "max", .maxlen = sizeof(int), .mode = 0644, .data = &pty_limit, .proc_handler = proc_dointvec_minmax, .extra1 = &pty_limit_min, .extra2 = &pty_limit_max, }, { .procname = "reserve", .maxlen = sizeof(int), .mode = 0644, .data = &pty_reserve, .proc_handler = proc_dointvec_minmax, .extra1 = &pty_limit_min, .extra2 = &pty_limit_max, }, { .procname = "nr", .maxlen = sizeof(int), .mode = 0444, .data = &pty_count, .proc_handler = proc_dointvec, }, }; struct pts_mount_opts { int setuid; int setgid; kuid_t uid; kgid_t gid; umode_t mode; umode_t ptmxmode; int reserve; int max; }; enum { Opt_uid, Opt_gid, Opt_mode, Opt_ptmxmode, Opt_newinstance, Opt_max, Opt_err }; static const struct fs_parameter_spec devpts_param_specs[] = { fsparam_gid ("gid", Opt_gid), fsparam_s32 ("max", Opt_max), fsparam_u32oct ("mode", Opt_mode), fsparam_flag ("newinstance", Opt_newinstance), fsparam_u32oct ("ptmxmode", Opt_ptmxmode), fsparam_uid ("uid", Opt_uid), {} }; struct pts_fs_info { struct ida allocated_ptys; struct pts_mount_opts mount_opts; struct super_block *sb; struct inode *ptmx_inode; // borrowed }; static inline struct pts_fs_info *DEVPTS_SB(struct super_block *sb) { return sb->s_fs_info; } static int devpts_ptmx_path(struct path *path) { struct super_block *sb; int err; /* Is a devpts filesystem at "pts" in the same directory? */ err = path_pts(path); if (err) return err; /* Is the path the root of a devpts filesystem? */ sb = path->mnt->mnt_sb; if ((sb->s_magic != DEVPTS_SUPER_MAGIC) || (path->mnt->mnt_root != sb->s_root)) return -ENODEV; return 0; } /* * Try to find a suitable devpts filesystem. We support the following * scenarios: * - The ptmx device node is located in the same directory as the devpts * mount where the pts device nodes are located. * This is e.g. the case when calling open on the /dev/pts/ptmx device * node when the devpts filesystem is mounted at /dev/pts. * - The ptmx device node is located outside the devpts filesystem mount * where the pts device nodes are located. For example, the ptmx device * is a symlink, separate device node, or bind-mount. * A supported scenario is bind-mounting /dev/pts/ptmx to /dev/ptmx and * then calling open on /dev/ptmx. In this case a suitable pts * subdirectory can be found in the common parent directory /dev of the * devpts mount and the ptmx bind-mount, after resolving the /dev/ptmx * bind-mount. * If no suitable pts subdirectory can be found this function will fail. * This is e.g. the case when bind-mounting /dev/pts/ptmx to /ptmx. */ struct vfsmount *devpts_mntget(struct file *filp, struct pts_fs_info *fsi) { struct path path; int err = 0; path = filp->f_path; path_get(&path); /* Walk upward while the start point is a bind mount of * a single file. */ while (path.mnt->mnt_root == path.dentry) if (follow_up(&path) == 0) break; /* devpts_ptmx_path() finds a devpts fs or returns an error. */ if ((path.mnt->mnt_sb->s_magic != DEVPTS_SUPER_MAGIC) || (DEVPTS_SB(path.mnt->mnt_sb) != fsi)) err = devpts_ptmx_path(&path); dput(path.dentry); if (!err) { if (DEVPTS_SB(path.mnt->mnt_sb) == fsi) return path.mnt; err = -ENODEV; } mntput(path.mnt); return ERR_PTR(err); } struct pts_fs_info *devpts_acquire(struct file *filp) { struct pts_fs_info *result; struct path path; struct super_block *sb; path = filp->f_path; path_get(&path); /* Has the devpts filesystem already been found? */ if (path.mnt->mnt_sb->s_magic != DEVPTS_SUPER_MAGIC) { int err; err = devpts_ptmx_path(&path); if (err) { result = ERR_PTR(err); goto out; } } /* * pty code needs to hold extra references in case of last /dev/tty close */ sb = path.mnt->mnt_sb; atomic_inc(&sb->s_active); result = DEVPTS_SB(sb); out: path_put(&path); return result; } void devpts_release(struct pts_fs_info *fsi) { deactivate_super(fsi->sb); } /* * devpts_parse_param - Parse mount parameters */ static int devpts_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct pts_fs_info *fsi = fc->s_fs_info; struct pts_mount_opts *opts = &fsi->mount_opts; struct fs_parse_result result; int opt; opt = fs_parse(fc, devpts_param_specs, param, &result); if (opt < 0) return opt; switch (opt) { case Opt_uid: opts->uid = result.uid; opts->setuid = 1; break; case Opt_gid: opts->gid = result.gid; opts->setgid = 1; break; case Opt_mode: opts->mode = result.uint_32 & S_IALLUGO; break; case Opt_ptmxmode: opts->ptmxmode = result.uint_32 & S_IALLUGO; break; case Opt_newinstance: break; case Opt_max: if (result.uint_32 > NR_UNIX98_PTY_MAX) return invalf(fc, "max out of range"); opts->max = result.uint_32; break; } return 0; } static int mknod_ptmx(struct super_block *sb, struct fs_context *fc) { int mode; struct dentry *dentry; struct inode *inode; struct dentry *root = sb->s_root; struct pts_fs_info *fsi = DEVPTS_SB(sb); struct pts_mount_opts *opts = &fsi->mount_opts; kuid_t ptmx_uid = current_fsuid(); kgid_t ptmx_gid = current_fsgid(); dentry = simple_start_creating(root, "ptmx"); if (IS_ERR(dentry)) { pr_err("Unable to alloc dentry for ptmx node\n"); return PTR_ERR(dentry); } /* * Create a new 'ptmx' node in this mount of devpts. */ inode = new_inode(sb); if (!inode) { simple_done_creating(dentry); pr_err("Unable to alloc inode for ptmx node\n"); return -ENOMEM; } inode->i_ino = 2; simple_inode_init_ts(inode); mode = S_IFCHR|opts->ptmxmode; init_special_inode(inode, mode, MKDEV(TTYAUX_MAJOR, 2)); inode->i_uid = ptmx_uid; inode->i_gid = ptmx_gid; fsi->ptmx_inode = inode; d_make_persistent(dentry, inode); simple_done_creating(dentry); return 0; } static void update_ptmx_mode(struct pts_fs_info *fsi) { fsi->ptmx_inode->i_mode = S_IFCHR|fsi->mount_opts.ptmxmode; } static int devpts_reconfigure(struct fs_context *fc) { struct pts_fs_info *fsi = DEVPTS_SB(fc->root->d_sb); struct pts_fs_info *new = fc->s_fs_info; /* Apply the revised options. We don't want to change ->reserve. * Ideally, we'd update each option conditionally on it having been * explicitly changed, but the default is to reset everything so that * would break UAPI... */ fsi->mount_opts.setuid = new->mount_opts.setuid; fsi->mount_opts.setgid = new->mount_opts.setgid; fsi->mount_opts.uid = new->mount_opts.uid; fsi->mount_opts.gid = new->mount_opts.gid; fsi->mount_opts.mode = new->mount_opts.mode; fsi->mount_opts.ptmxmode = new->mount_opts.ptmxmode; fsi->mount_opts.max = new->mount_opts.max; /* * parse_mount_options() restores options to default values * before parsing and may have changed ptmxmode. So, update the * mode in the inode too. Bogus options don't fail the remount, * so do this even on error return. */ update_ptmx_mode(fsi); return 0; } static int devpts_show_options(struct seq_file *seq, struct dentry *root) { struct pts_fs_info *fsi = DEVPTS_SB(root->d_sb); struct pts_mount_opts *opts = &fsi->mount_opts; if (opts->setuid) seq_printf(seq, ",uid=%u", from_kuid_munged(&init_user_ns, opts->uid)); if (opts->setgid) seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, opts->gid)); seq_printf(seq, ",mode=%03o", opts->mode); seq_printf(seq, ",ptmxmode=%03o", opts->ptmxmode); if (opts->max < NR_UNIX98_PTY_MAX) seq_printf(seq, ",max=%d", opts->max); return 0; } static const struct super_operations devpts_sops = { .statfs = simple_statfs, .show_options = devpts_show_options, }; static int devpts_fill_super(struct super_block *s, struct fs_context *fc) { struct pts_fs_info *fsi = DEVPTS_SB(s); struct inode *inode; s->s_iflags &= ~SB_I_NODEV; s->s_blocksize = 1024; s->s_blocksize_bits = 10; s->s_magic = DEVPTS_SUPER_MAGIC; s->s_op = &devpts_sops; s->s_d_flags = DCACHE_DONTCACHE; s->s_time_gran = 1; fsi->sb = s; inode = new_inode(s); if (!inode) return -ENOMEM; inode->i_ino = 1; simple_inode_init_ts(inode); inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR; inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; set_nlink(inode, 2); s->s_root = d_make_root(inode); if (!s->s_root) { pr_err("get root dentry failed\n"); return -ENOMEM; } return mknod_ptmx(s, fc); } /* * devpts_get_tree() * * Mount a new (private) instance of devpts. PTYs created in this * instance are independent of the PTYs in other devpts instances. */ static int devpts_get_tree(struct fs_context *fc) { return get_tree_nodev(fc, devpts_fill_super); } static void devpts_free_fc(struct fs_context *fc) { kfree(fc->s_fs_info); } static const struct fs_context_operations devpts_context_ops = { .free = devpts_free_fc, .parse_param = devpts_parse_param, .get_tree = devpts_get_tree, .reconfigure = devpts_reconfigure, }; /* * Set up the filesystem mount context. */ static int devpts_init_fs_context(struct fs_context *fc) { struct pts_fs_info *fsi; fsi = kzalloc_obj(struct pts_fs_info); if (!fsi) return -ENOMEM; ida_init(&fsi->allocated_ptys); fsi->mount_opts.uid = GLOBAL_ROOT_UID; fsi->mount_opts.gid = GLOBAL_ROOT_GID; fsi->mount_opts.mode = DEVPTS_DEFAULT_MODE; fsi->mount_opts.ptmxmode = DEVPTS_DEFAULT_PTMX_MODE; fsi->mount_opts.max = NR_UNIX98_PTY_MAX; if (fc->purpose == FS_CONTEXT_FOR_MOUNT && current->nsproxy->mnt_ns == init_task.nsproxy->mnt_ns) fsi->mount_opts.reserve = true; fc->s_fs_info = fsi; fc->ops = &devpts_context_ops; return 0; } static void devpts_kill_sb(struct super_block *sb) { struct pts_fs_info *fsi = DEVPTS_SB(sb); if (fsi) ida_destroy(&fsi->allocated_ptys); kfree(fsi); kill_anon_super(sb); } static struct file_system_type devpts_fs_type = { .name = "devpts", .init_fs_context = devpts_init_fs_context, .parameters = devpts_param_specs, .kill_sb = devpts_kill_sb, .fs_flags = FS_USERNS_MOUNT, }; /* * The normal naming convention is simply /dev/pts/<number>; this conforms * to the System V naming convention */ int devpts_new_index(struct pts_fs_info *fsi) { int index = -ENOSPC; if (atomic_inc_return(&pty_count) >= (pty_limit - (fsi->mount_opts.reserve ? 0 : pty_reserve))) goto out; index = ida_alloc_max(&fsi->allocated_ptys, fsi->mount_opts.max - 1, GFP_KERNEL); out: if (index < 0) atomic_dec(&pty_count); return index; } void devpts_kill_index(struct pts_fs_info *fsi, int idx) { ida_free(&fsi->allocated_ptys, idx); atomic_dec(&pty_count); } /** * devpts_pty_new -- create a new inode in /dev/pts/ * @fsi: Filesystem info for this instance. * @index: used as a name of the node * @priv: what's given back by devpts_get_priv * * The dentry for the created inode is returned. * Remove it from /dev/pts/ with devpts_pty_kill(). */ struct dentry *devpts_pty_new(struct pts_fs_info *fsi, int index, void *priv) { struct dentry *dentry; struct super_block *sb = fsi->sb; struct inode *inode; struct dentry *root; struct pts_mount_opts *opts; char s[12]; root = sb->s_root; opts = &fsi->mount_opts; inode = new_inode(sb); if (!inode) return ERR_PTR(-ENOMEM); inode->i_ino = index + 3; inode->i_uid = opts->setuid ? opts->uid : current_fsuid(); inode->i_gid = opts->setgid ? opts->gid : current_fsgid(); simple_inode_init_ts(inode); init_special_inode(inode, S_IFCHR|opts->mode, MKDEV(UNIX98_PTY_SLAVE_MAJOR, index)); sprintf(s, "%d", index); dentry = d_alloc_name(root, s); if (!dentry) { iput(inode); return ERR_PTR(-ENOMEM); } dentry->d_fsdata = priv; d_make_persistent(dentry, inode); fsnotify_create(d_inode(root), dentry); dput(dentry); return dentry; // borrowed } /** * devpts_get_priv -- get private data for a slave * @dentry: dentry of the slave * * Returns whatever was passed as priv in devpts_pty_new for a given inode. */ void *devpts_get_priv(struct dentry *dentry) { if (dentry->d_sb->s_magic != DEVPTS_SUPER_MAGIC) return NULL; return dentry->d_fsdata; } /** * devpts_pty_kill -- remove inode form /dev/pts/ * @dentry: dentry of the slave to be removed * * This is an inverse operation of devpts_pty_new. */ void devpts_pty_kill(struct dentry *dentry) { WARN_ON_ONCE(dentry->d_sb->s_magic != DEVPTS_SUPER_MAGIC); dentry->d_fsdata = NULL; drop_nlink(dentry->d_inode); d_drop(dentry); fsnotify_unlink(d_inode(dentry->d_parent), dentry); d_make_discardable(dentry); } static int __init init_devpts_fs(void) { int err = register_filesystem(&devpts_fs_type); if (!err) { register_sysctl("kernel/pty", pty_table); } return err; } module_init(init_devpts_fs)
8413 6766 7806 60 4656 7806 10 767 579 767 233 4 232 5 472 6009 3522 2061 4146 4039 791 221 4211 1653 4828 69 579 1773 23 501 1 2 5680 7 3435 4079 1883 1658 2558 168 4298 3920 99 1 2505 366 5905 7 1551 5905 14 1202 8 1732 996 129 129 9 1658 1666 1659 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 /* SPDX-License-Identifier: GPL-2.0 */ /* * Macros for manipulating and testing page->flags */ #ifndef PAGE_FLAGS_H #define PAGE_FLAGS_H #include <linux/types.h> #include <linux/bug.h> #include <linux/mmdebug.h> #ifndef __GENERATING_BOUNDS_H #include <linux/mm_types.h> #include <generated/bounds.h> #endif /* !__GENERATING_BOUNDS_H */ /* * Various page->flags bits: * * PG_reserved is set for special pages. The "struct page" of such a page * should in general not be touched (e.g. set dirty) except by its owner. * Pages marked as PG_reserved include: * - Pages part of the kernel image (including vDSO) and similar (e.g. BIOS, * initrd, HW tables) * - Pages reserved or allocated early during boot (before the page allocator * was initialized). This includes (depending on the architecture) the * initial vmemmap, initial page tables, crashkernel, elfcorehdr, and much * much more. Once (if ever) freed, PG_reserved is cleared and they will * be given to the page allocator. * - Pages falling into physical memory gaps - not IORESOURCE_SYSRAM. Trying * to read/write these pages might end badly. Don't touch! * - The zero page(s) * - Pages allocated in the context of kexec/kdump (loaded kernel image, * control pages, vmcoreinfo) * - MMIO/DMA pages. Some architectures don't allow to ioremap pages that are * not marked PG_reserved (as they might be in use by somebody else who does * not respect the caching strategy). * - MCA pages on ia64 * - Pages holding CPU notes for POWER Firmware Assisted Dump * - Device memory (e.g. PMEM, DAX, HMM) * Some PG_reserved pages will be excluded from the hibernation image. * PG_reserved does in general not hinder anybody from dumping or swapping * and is no longer required for remap_pfn_range(). ioremap might require it. * Consequently, PG_reserved for a page mapped into user space can indicate * the zero page, the vDSO, MMIO pages or device memory. * * The PG_private bitflag is set on pagecache pages if they contain filesystem * specific data (which is normally at page->private). It can be used by * private allocations for its own usage. * * During initiation of disk I/O, PG_locked is set. This bit is set before I/O * and cleared when writeback _starts_ or when read _completes_. PG_writeback * is set before writeback starts and cleared when it finishes. * * PG_locked also pins a page in pagecache, and blocks truncation of the file * while it is held. * * page_waitqueue(page) is a wait queue of all tasks waiting for the page * to become unlocked. * * PG_swapbacked is set when a page uses swap as a backing storage. This are * usually PageAnon or shmem pages but please note that even anonymous pages * might lose their PG_swapbacked flag when they simply can be dropped (e.g. as * a result of MADV_FREE). * * PG_referenced, PG_reclaim are used for page reclaim for anonymous and * file-backed pagecache (see mm/vmscan.c). * * PG_arch_1 is an architecture specific page state bit. The generic code * guarantees that this bit is cleared for a page when it first is entered into * the page cache. * * PG_hwpoison indicates that a page got corrupted in hardware and contains * data with incorrect ECC bits that triggered a machine check. Accessing is * not safe since it may cause another machine check. Don't touch! */ /* * Don't use the pageflags directly. Use the PageFoo macros. * * The page flags field is split into two parts, the main flags area * which extends from the low bits upwards, and the fields area which * extends from the high bits downwards. * * | FIELD | ... | FLAGS | * N-1 ^ 0 * (NR_PAGEFLAGS) * * The fields area is reserved for fields mapping zone, node (for NUMA) and * SPARSEMEM section (for variants of SPARSEMEM that require section ids like * SPARSEMEM_EXTREME with !SPARSEMEM_VMEMMAP). */ enum pageflags { PG_locked, /* Page is locked. Don't touch. */ PG_writeback, /* Page is under writeback */ PG_referenced, PG_uptodate, PG_dirty, PG_lru, PG_head, /* Must be in bit 6 */ PG_waiters, /* Page has waiters, check its waitqueue. Must be bit #7 and in the same byte as "PG_locked" */ PG_active, PG_workingset, PG_owner_priv_1, /* Owner use. If pagecache, fs may use */ PG_owner_2, /* Owner use. If pagecache, fs may use */ PG_arch_1, PG_reserved, PG_private, /* If pagecache, has fs-private data */ PG_private_2, /* If pagecache, has fs aux data */ PG_reclaim, /* To be reclaimed asap */ PG_swapbacked, /* Page is backed by RAM/swap */ PG_unevictable, /* Page is "unevictable" */ PG_dropbehind, /* drop pages on IO completion */ #ifdef CONFIG_MMU PG_mlocked, /* Page is vma mlocked */ #endif #ifdef CONFIG_MEMORY_FAILURE PG_hwpoison, /* hardware poisoned page. Don't touch */ #endif #if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT) PG_young, PG_idle, #endif #ifdef CONFIG_ARCH_USES_PG_ARCH_2 PG_arch_2, #endif #ifdef CONFIG_ARCH_USES_PG_ARCH_3 PG_arch_3, #endif __NR_PAGEFLAGS, PG_readahead = PG_reclaim, /* Anonymous memory (and shmem) */ PG_swapcache = PG_owner_priv_1, /* Swap page: swp_entry_t in private */ /* Some filesystems */ PG_checked = PG_owner_priv_1, /* * Depending on the way an anonymous folio can be mapped into a page * table (e.g., single PMD/PUD/CONT of the head page vs. PTE-mapped * THP), PG_anon_exclusive may be set only for the head page or for * tail pages of an anonymous folio. For now, we only expect it to be * set on tail pages for PTE-mapped THP. */ PG_anon_exclusive = PG_owner_2, /* * Set if all buffer heads in the folio are mapped. * Filesystems which do not use BHs can use it for their own purpose. */ PG_mappedtodisk = PG_owner_2, /* Two page bits are conscripted by FS-Cache to maintain local caching * state. These bits are set on pages belonging to the netfs's inodes * when those inodes are being locally cached. */ PG_fscache = PG_private_2, /* page backed by cache */ /* XEN */ /* Pinned in Xen as a read-only pagetable page. */ PG_pinned = PG_owner_priv_1, /* Pinned as part of domain save (see xen_mm_pin_all()). */ PG_savepinned = PG_dirty, /* Has a grant mapping of another (foreign) domain's page. */ PG_foreign = PG_owner_priv_1, /* Remapped by swiotlb-xen. */ PG_xen_remapped = PG_owner_priv_1, #ifdef CONFIG_MIGRATION /* movable_ops page that is isolated for migration */ PG_movable_ops_isolated = PG_reclaim, /* this is a movable_ops page (for selected typed pages only) */ PG_movable_ops = PG_uptodate, #endif /* Only valid for buddy pages. Used to track pages that are reported */ PG_reported = PG_uptodate, #ifdef CONFIG_MEMORY_HOTPLUG /* For self-hosted memmap pages */ PG_vmemmap_self_hosted = PG_owner_priv_1, #endif /* * Flags only valid for compound pages. Stored in first tail page's * flags word. Cannot use the first 8 flags or any flag marked as * PF_ANY. */ /* At least one page in this folio has the hwpoison flag set */ PG_has_hwpoisoned = PG_active, PG_large_rmappable = PG_workingset, /* anon or file-backed */ PG_partially_mapped = PG_reclaim, /* was identified to be partially mapped */ }; #define PAGEFLAGS_MASK ((1UL << NR_PAGEFLAGS) - 1) #ifndef __GENERATING_BOUNDS_H #ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP DECLARE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key); /* * Return the real head page struct iff the @page is a fake head page, otherwise * return the @page itself. See Documentation/mm/vmemmap_dedup.rst. */ static __always_inline const struct page *page_fixed_fake_head(const struct page *page) { if (!static_branch_unlikely(&hugetlb_optimize_vmemmap_key)) return page; /* * Only addresses aligned with PAGE_SIZE of struct page may be fake head * struct page. The alignment check aims to avoid access the fields ( * e.g. compound_head) of the @page[1]. It can avoid touch a (possibly) * cold cacheline in some cases. */ if (IS_ALIGNED((unsigned long)page, PAGE_SIZE) && test_bit(PG_head, &page->flags.f)) { /* * We can safely access the field of the @page[1] with PG_head * because the @page is a compound page composed with at least * two contiguous pages. */ unsigned long head = READ_ONCE(page[1].compound_head); if (likely(head & 1)) return (const struct page *)(head - 1); } return page; } static __always_inline bool page_count_writable(const struct page *page, int u) { if (!static_branch_unlikely(&hugetlb_optimize_vmemmap_key)) return true; /* * The refcount check is ordered before the fake-head check to prevent * the following race: * CPU 1 (HVO) CPU 2 (speculative PFN walker) * * page_ref_freeze() * synchronize_rcu() * rcu_read_lock() * page_is_fake_head() is false * vmemmap_remap_pte() * XXX: struct page[] becomes r/o * * page_ref_unfreeze() * page_ref_count() is not zero * * atomic_add_unless(&page->_refcount) * XXX: try to modify r/o struct page[] * * The refcount check also prevents modification attempts to other (r/o) * tail pages that are not fake heads. */ if (atomic_read_acquire(&page->_refcount) == u) return false; return page_fixed_fake_head(page) == page; } #else static inline const struct page *page_fixed_fake_head(const struct page *page) { return page; } static inline bool page_count_writable(const struct page *page, int u) { return true; } #endif static __always_inline int page_is_fake_head(const struct page *page) { return page_fixed_fake_head(page) != page; } static __always_inline unsigned long _compound_head(const struct page *page) { unsigned long head = READ_ONCE(page->compound_head); if (unlikely(head & 1)) return head - 1; return (unsigned long)page_fixed_fake_head(page); } #define compound_head(page) ((typeof(page))_compound_head(page)) /** * page_folio - Converts from page to folio. * @p: The page. * * Every page is part of a folio. This function cannot be called on a * NULL pointer. * * Context: No reference, nor lock is required on @page. If the caller * does not hold a reference, this call may race with a folio split, so * it should re-check the folio still contains this page after gaining * a reference on the folio. * Return: The folio which contains this page. */ #define page_folio(p) (_Generic((p), \ const struct page *: (const struct folio *)_compound_head(p), \ struct page *: (struct folio *)_compound_head(p))) /** * folio_page - Return a page from a folio. * @folio: The folio. * @n: The page number to return. * * @n is relative to the start of the folio. This function does not * check that the page number lies within @folio; the caller is presumed * to have a reference to the page. */ #define folio_page(folio, n) (&(folio)->page + (n)) static __always_inline int PageTail(const struct page *page) { return READ_ONCE(page->compound_head) & 1 || page_is_fake_head(page); } static __always_inline int PageCompound(const struct page *page) { return test_bit(PG_head, &page->flags.f) || READ_ONCE(page->compound_head) & 1; } #define PAGE_POISON_PATTERN -1l static inline int PagePoisoned(const struct page *page) { return READ_ONCE(page->flags.f) == PAGE_POISON_PATTERN; } #ifdef CONFIG_DEBUG_VM void page_init_poison(struct page *page, size_t size); #else static inline void page_init_poison(struct page *page, size_t size) { } #endif static const unsigned long *const_folio_flags(const struct folio *folio, unsigned n) { const struct page *page = &folio->page; VM_BUG_ON_PGFLAGS(page->compound_head & 1, page); VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags.f), page); return &page[n].flags.f; } static unsigned long *folio_flags(struct folio *folio, unsigned n) { struct page *page = &folio->page; VM_BUG_ON_PGFLAGS(page->compound_head & 1, page); VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags.f), page); return &page[n].flags.f; } /* * Page flags policies wrt compound pages * * PF_POISONED_CHECK * check if this struct page poisoned/uninitialized * * PF_ANY: * the page flag is relevant for small, head and tail pages. * * PF_HEAD: * for compound page all operations related to the page flag applied to * head page. * * PF_NO_TAIL: * modifications of the page flag must be done on small or head pages, * checks can be done on tail pages too. * * PF_NO_COMPOUND: * the page flag is not relevant for compound pages. * * PF_SECOND: * the page flag is stored in the first tail page. */ #define PF_POISONED_CHECK(page) ({ \ VM_BUG_ON_PGFLAGS(PagePoisoned(page), page); \ page; }) #define PF_ANY(page, enforce) PF_POISONED_CHECK(page) #define PF_HEAD(page, enforce) PF_POISONED_CHECK(compound_head(page)) #define PF_NO_TAIL(page, enforce) ({ \ VM_BUG_ON_PGFLAGS(enforce && PageTail(page), page); \ PF_POISONED_CHECK(compound_head(page)); }) #define PF_NO_COMPOUND(page, enforce) ({ \ VM_BUG_ON_PGFLAGS(enforce && PageCompound(page), page); \ PF_POISONED_CHECK(page); }) #define PF_SECOND(page, enforce) ({ \ VM_BUG_ON_PGFLAGS(!PageHead(page), page); \ PF_POISONED_CHECK(&page[1]); }) /* Which page is the flag stored in */ #define FOLIO_PF_ANY 0 #define FOLIO_PF_HEAD 0 #define FOLIO_PF_NO_TAIL 0 #define FOLIO_PF_NO_COMPOUND 0 #define FOLIO_PF_SECOND 1 #define FOLIO_HEAD_PAGE 0 #define FOLIO_SECOND_PAGE 1 /* * Macros to create function definitions for page flags */ #define FOLIO_TEST_FLAG(name, page) \ static __always_inline bool folio_test_##name(const struct folio *folio) \ { return test_bit(PG_##name, const_folio_flags(folio, page)); } #define FOLIO_SET_FLAG(name, page) \ static __always_inline void folio_set_##name(struct folio *folio) \ { set_bit(PG_##name, folio_flags(folio, page)); } #define FOLIO_CLEAR_FLAG(name, page) \ static __always_inline void folio_clear_##name(struct folio *folio) \ { clear_bit(PG_##name, folio_flags(folio, page)); } #define __FOLIO_SET_FLAG(name, page) \ static __always_inline void __folio_set_##name(struct folio *folio) \ { __set_bit(PG_##name, folio_flags(folio, page)); } #define __FOLIO_CLEAR_FLAG(name, page) \ static __always_inline void __folio_clear_##name(struct folio *folio) \ { __clear_bit(PG_##name, folio_flags(folio, page)); } #define FOLIO_TEST_SET_FLAG(name, page) \ static __always_inline bool folio_test_set_##name(struct folio *folio) \ { return test_and_set_bit(PG_##name, folio_flags(folio, page)); } #define FOLIO_TEST_CLEAR_FLAG(name, page) \ static __always_inline bool folio_test_clear_##name(struct folio *folio) \ { return test_and_clear_bit(PG_##name, folio_flags(folio, page)); } #define FOLIO_FLAG(name, page) \ FOLIO_TEST_FLAG(name, page) \ FOLIO_SET_FLAG(name, page) \ FOLIO_CLEAR_FLAG(name, page) #define TESTPAGEFLAG(uname, lname, policy) \ FOLIO_TEST_FLAG(lname, FOLIO_##policy) \ static __always_inline int Page##uname(const struct page *page) \ { return test_bit(PG_##lname, &policy(page, 0)->flags.f); } #define SETPAGEFLAG(uname, lname, policy) \ FOLIO_SET_FLAG(lname, FOLIO_##policy) \ static __always_inline void SetPage##uname(struct page *page) \ { set_bit(PG_##lname, &policy(page, 1)->flags.f); } #define CLEARPAGEFLAG(uname, lname, policy) \ FOLIO_CLEAR_FLAG(lname, FOLIO_##policy) \ static __always_inline void ClearPage##uname(struct page *page) \ { clear_bit(PG_##lname, &policy(page, 1)->flags.f); } #define __SETPAGEFLAG(uname, lname, policy) \ __FOLIO_SET_FLAG(lname, FOLIO_##policy) \ static __always_inline void __SetPage##uname(struct page *page) \ { __set_bit(PG_##lname, &policy(page, 1)->flags.f); } #define __CLEARPAGEFLAG(uname, lname, policy) \ __FOLIO_CLEAR_FLAG(lname, FOLIO_##policy) \ static __always_inline void __ClearPage##uname(struct page *page) \ { __clear_bit(PG_##lname, &policy(page, 1)->flags.f); } #define TESTSETFLAG(uname, lname, policy) \ FOLIO_TEST_SET_FLAG(lname, FOLIO_##policy) \ static __always_inline int TestSetPage##uname(struct page *page) \ { return test_and_set_bit(PG_##lname, &policy(page, 1)->flags.f); } #define TESTCLEARFLAG(uname, lname, policy) \ FOLIO_TEST_CLEAR_FLAG(lname, FOLIO_##policy) \ static __always_inline int TestClearPage##uname(struct page *page) \ { return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags.f); } #define PAGEFLAG(uname, lname, policy) \ TESTPAGEFLAG(uname, lname, policy) \ SETPAGEFLAG(uname, lname, policy) \ CLEARPAGEFLAG(uname, lname, policy) #define __PAGEFLAG(uname, lname, policy) \ TESTPAGEFLAG(uname, lname, policy) \ __SETPAGEFLAG(uname, lname, policy) \ __CLEARPAGEFLAG(uname, lname, policy) #define TESTSCFLAG(uname, lname, policy) \ TESTSETFLAG(uname, lname, policy) \ TESTCLEARFLAG(uname, lname, policy) #define FOLIO_TEST_FLAG_FALSE(name) \ static inline bool folio_test_##name(const struct folio *folio) \ { return false; } #define FOLIO_SET_FLAG_NOOP(name) \ static inline void folio_set_##name(struct folio *folio) { } #define FOLIO_CLEAR_FLAG_NOOP(name) \ static inline void folio_clear_##name(struct folio *folio) { } #define __FOLIO_SET_FLAG_NOOP(name) \ static inline void __folio_set_##name(struct folio *folio) { } #define __FOLIO_CLEAR_FLAG_NOOP(name) \ static inline void __folio_clear_##name(struct folio *folio) { } #define FOLIO_TEST_SET_FLAG_FALSE(name) \ static inline bool folio_test_set_##name(struct folio *folio) \ { return false; } #define FOLIO_TEST_CLEAR_FLAG_FALSE(name) \ static inline bool folio_test_clear_##name(struct folio *folio) \ { return false; } #define FOLIO_FLAG_FALSE(name) \ FOLIO_TEST_FLAG_FALSE(name) \ FOLIO_SET_FLAG_NOOP(name) \ FOLIO_CLEAR_FLAG_NOOP(name) #define TESTPAGEFLAG_FALSE(uname, lname) \ FOLIO_TEST_FLAG_FALSE(lname) \ static inline int Page##uname(const struct page *page) { return 0; } #define SETPAGEFLAG_NOOP(uname, lname) \ FOLIO_SET_FLAG_NOOP(lname) \ static inline void SetPage##uname(struct page *page) { } #define CLEARPAGEFLAG_NOOP(uname, lname) \ FOLIO_CLEAR_FLAG_NOOP(lname) \ static inline void ClearPage##uname(struct page *page) { } #define __CLEARPAGEFLAG_NOOP(uname, lname) \ __FOLIO_CLEAR_FLAG_NOOP(lname) \ static inline void __ClearPage##uname(struct page *page) { } #define TESTSETFLAG_FALSE(uname, lname) \ FOLIO_TEST_SET_FLAG_FALSE(lname) \ static inline int TestSetPage##uname(struct page *page) { return 0; } #define TESTCLEARFLAG_FALSE(uname, lname) \ FOLIO_TEST_CLEAR_FLAG_FALSE(lname) \ static inline int TestClearPage##uname(struct page *page) { return 0; } #define PAGEFLAG_FALSE(uname, lname) TESTPAGEFLAG_FALSE(uname, lname) \ SETPAGEFLAG_NOOP(uname, lname) CLEARPAGEFLAG_NOOP(uname, lname) #define TESTSCFLAG_FALSE(uname, lname) \ TESTSETFLAG_FALSE(uname, lname) TESTCLEARFLAG_FALSE(uname, lname) __PAGEFLAG(Locked, locked, PF_NO_TAIL) FOLIO_FLAG(waiters, FOLIO_HEAD_PAGE) FOLIO_FLAG(referenced, FOLIO_HEAD_PAGE) FOLIO_TEST_CLEAR_FLAG(referenced, FOLIO_HEAD_PAGE) __FOLIO_SET_FLAG(referenced, FOLIO_HEAD_PAGE) PAGEFLAG(Dirty, dirty, PF_HEAD) TESTSCFLAG(Dirty, dirty, PF_HEAD) __CLEARPAGEFLAG(Dirty, dirty, PF_HEAD) PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD) TESTCLEARFLAG(LRU, lru, PF_HEAD) FOLIO_FLAG(active, FOLIO_HEAD_PAGE) __FOLIO_CLEAR_FLAG(active, FOLIO_HEAD_PAGE) FOLIO_TEST_CLEAR_FLAG(active, FOLIO_HEAD_PAGE) PAGEFLAG(Workingset, workingset, PF_HEAD) TESTCLEARFLAG(Workingset, workingset, PF_HEAD) PAGEFLAG(Checked, checked, PF_NO_COMPOUND) /* Used by some filesystems */ /* Xen */ PAGEFLAG(Pinned, pinned, PF_NO_COMPOUND) TESTSCFLAG(Pinned, pinned, PF_NO_COMPOUND) PAGEFLAG(SavePinned, savepinned, PF_NO_COMPOUND); PAGEFLAG(Foreign, foreign, PF_NO_COMPOUND); PAGEFLAG(XenRemapped, xen_remapped, PF_NO_COMPOUND) TESTCLEARFLAG(XenRemapped, xen_remapped, PF_NO_COMPOUND) PAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) __CLEARPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) __SETPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) FOLIO_FLAG(swapbacked, FOLIO_HEAD_PAGE) __FOLIO_CLEAR_FLAG(swapbacked, FOLIO_HEAD_PAGE) __FOLIO_SET_FLAG(swapbacked, FOLIO_HEAD_PAGE) /* * Private page markings that may be used by the filesystem that owns the page * for its own purposes. * - PG_private and PG_private_2 cause release_folio() and co to be invoked */ PAGEFLAG(Private, private, PF_ANY) FOLIO_FLAG(private_2, FOLIO_HEAD_PAGE) /* owner_2 can be set on tail pages for anon memory */ FOLIO_FLAG(owner_2, FOLIO_HEAD_PAGE) /* * Only test-and-set exist for PG_writeback. The unconditional operators are * risky: they bypass page accounting. */ TESTPAGEFLAG(Writeback, writeback, PF_NO_TAIL) TESTSCFLAG(Writeback, writeback, PF_NO_TAIL) FOLIO_FLAG(mappedtodisk, FOLIO_HEAD_PAGE) /* PG_readahead is only used for reads; PG_reclaim is only for writes */ PAGEFLAG(Reclaim, reclaim, PF_NO_TAIL) TESTCLEARFLAG(Reclaim, reclaim, PF_NO_TAIL) FOLIO_FLAG(readahead, FOLIO_HEAD_PAGE) FOLIO_TEST_CLEAR_FLAG(readahead, FOLIO_HEAD_PAGE) FOLIO_FLAG(dropbehind, FOLIO_HEAD_PAGE) FOLIO_TEST_CLEAR_FLAG(dropbehind, FOLIO_HEAD_PAGE) __FOLIO_SET_FLAG(dropbehind, FOLIO_HEAD_PAGE) #ifdef CONFIG_HIGHMEM /* * Must use a macro here due to header dependency issues. page_zone() is not * available at this point. */ #define PageHighMem(__p) is_highmem_idx(page_zonenum(__p)) #define folio_test_highmem(__f) is_highmem_idx(folio_zonenum(__f)) #else PAGEFLAG_FALSE(HighMem, highmem) #endif #define PhysHighMem(__p) (PageHighMem(phys_to_page(__p))) /* Does kmap_local_folio() only allow access to one page of the folio? */ #ifdef CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP #define folio_test_partial_kmap(f) true #else #define folio_test_partial_kmap(f) folio_test_highmem(f) #endif #ifdef CONFIG_SWAP static __always_inline bool folio_test_swapcache(const struct folio *folio) { return folio_test_swapbacked(folio) && test_bit(PG_swapcache, const_folio_flags(folio, 0)); } FOLIO_SET_FLAG(swapcache, FOLIO_HEAD_PAGE) FOLIO_CLEAR_FLAG(swapcache, FOLIO_HEAD_PAGE) #else FOLIO_FLAG_FALSE(swapcache) #endif FOLIO_FLAG(unevictable, FOLIO_HEAD_PAGE) __FOLIO_CLEAR_FLAG(unevictable, FOLIO_HEAD_PAGE) FOLIO_TEST_CLEAR_FLAG(unevictable, FOLIO_HEAD_PAGE) #ifdef CONFIG_MMU FOLIO_FLAG(mlocked, FOLIO_HEAD_PAGE) __FOLIO_CLEAR_FLAG(mlocked, FOLIO_HEAD_PAGE) FOLIO_TEST_CLEAR_FLAG(mlocked, FOLIO_HEAD_PAGE) FOLIO_TEST_SET_FLAG(mlocked, FOLIO_HEAD_PAGE) #else FOLIO_FLAG_FALSE(mlocked) __FOLIO_CLEAR_FLAG_NOOP(mlocked) FOLIO_TEST_CLEAR_FLAG_FALSE(mlocked) FOLIO_TEST_SET_FLAG_FALSE(mlocked) #endif #ifdef CONFIG_MEMORY_FAILURE PAGEFLAG(HWPoison, hwpoison, PF_ANY) TESTSCFLAG(HWPoison, hwpoison, PF_ANY) #define __PG_HWPOISON (1UL << PG_hwpoison) #else PAGEFLAG_FALSE(HWPoison, hwpoison) #define __PG_HWPOISON 0 #endif #ifdef CONFIG_PAGE_IDLE_FLAG #ifdef CONFIG_64BIT FOLIO_TEST_FLAG(young, FOLIO_HEAD_PAGE) FOLIO_SET_FLAG(young, FOLIO_HEAD_PAGE) FOLIO_TEST_CLEAR_FLAG(young, FOLIO_HEAD_PAGE) FOLIO_FLAG(idle, FOLIO_HEAD_PAGE) #endif /* See page_idle.h for !64BIT workaround */ #else /* !CONFIG_PAGE_IDLE_FLAG */ FOLIO_FLAG_FALSE(young) FOLIO_TEST_CLEAR_FLAG_FALSE(young) FOLIO_FLAG_FALSE(idle) #endif /* * PageReported() is used to track reported free pages within the Buddy * allocator. We can use the non-atomic version of the test and set * operations as both should be shielded with the zone lock to prevent * any possible races on the setting or clearing of the bit. */ __PAGEFLAG(Reported, reported, PF_NO_COMPOUND) #ifdef CONFIG_MEMORY_HOTPLUG PAGEFLAG(VmemmapSelfHosted, vmemmap_self_hosted, PF_ANY) #else PAGEFLAG_FALSE(VmemmapSelfHosted, vmemmap_self_hosted) #endif /* * On an anonymous folio mapped into a user virtual memory area, * folio->mapping points to its anon_vma, not to a struct address_space; * with the FOLIO_MAPPING_ANON bit set to distinguish it. See rmap.h. * * On an anonymous folio in a VM_MERGEABLE area, if CONFIG_KSM is enabled, * the FOLIO_MAPPING_ANON_KSM bit may be set along with the FOLIO_MAPPING_ANON * bit; and then folio->mapping points, not to an anon_vma, but to a private * structure which KSM associates with that merged folio. See ksm.h. * * Please note that, confusingly, "folio_mapping" refers to the inode * address_space which maps the folio from disk; whereas "folio_mapped" * refers to user virtual address space into which the folio is mapped. * * For slab pages, since slab reuses the bits in struct page to store its * internal states, the folio->mapping does not exist as such, nor do * these flags below. So in order to avoid testing non-existent bits, * please make sure that folio_test_slab(folio) actually evaluates to * false before calling the following functions (e.g., folio_test_anon). * See mm/slab.h. */ #define FOLIO_MAPPING_ANON 0x1 #define FOLIO_MAPPING_ANON_KSM 0x2 #define FOLIO_MAPPING_KSM (FOLIO_MAPPING_ANON | FOLIO_MAPPING_ANON_KSM) #define FOLIO_MAPPING_FLAGS (FOLIO_MAPPING_ANON | FOLIO_MAPPING_ANON_KSM) static __always_inline bool folio_test_anon(const struct folio *folio) { return ((unsigned long)folio->mapping & FOLIO_MAPPING_ANON) != 0; } static __always_inline bool PageAnonNotKsm(const struct page *page) { unsigned long flags = (unsigned long)page_folio(page)->mapping; return (flags & FOLIO_MAPPING_FLAGS) == FOLIO_MAPPING_ANON; } static __always_inline bool PageAnon(const struct page *page) { return folio_test_anon(page_folio(page)); } #ifdef CONFIG_KSM /* * A KSM page is one of those write-protected "shared pages" or "merged pages" * which KSM maps into multiple mms, wherever identical anonymous page content * is found in VM_MERGEABLE vmas. It's a PageAnon page, pointing not to any * anon_vma, but to that page's node of the stable tree. */ static __always_inline bool folio_test_ksm(const struct folio *folio) { return ((unsigned long)folio->mapping & FOLIO_MAPPING_FLAGS) == FOLIO_MAPPING_KSM; } #else FOLIO_TEST_FLAG_FALSE(ksm) #endif u64 stable_page_flags(const struct page *page); /** * folio_xor_flags_has_waiters - Change some folio flags. * @folio: The folio. * @mask: Bits set in this word will be changed. * * This must only be used for flags which are changed with the folio * lock held. For example, it is unsafe to use for PG_dirty as that * can be set without the folio lock held. It can also only be used * on flags which are in the range 0-6 as some of the implementations * only affect those bits. * * Return: Whether there are tasks waiting on the folio. */ static inline bool folio_xor_flags_has_waiters(struct folio *folio, unsigned long mask) { return xor_unlock_is_negative_byte(mask, folio_flags(folio, 0)); } /** * folio_test_uptodate - Is this folio up to date? * @folio: The folio. * * The uptodate flag is set on a folio when every byte in the folio is * at least as new as the corresponding bytes on storage. Anonymous * and CoW folios are always uptodate. If the folio is not uptodate, * some of the bytes in it may be; see the is_partially_uptodate() * address_space operation. */ static inline bool folio_test_uptodate(const struct folio *folio) { bool ret = test_bit(PG_uptodate, const_folio_flags(folio, 0)); /* * Must ensure that the data we read out of the folio is loaded * _after_ we've loaded folio->flags to check the uptodate bit. * We can skip the barrier if the folio is not uptodate, because * we wouldn't be reading anything from it. * * See folio_mark_uptodate() for the other side of the story. */ if (ret) smp_rmb(); return ret; } static inline bool PageUptodate(const struct page *page) { return folio_test_uptodate(page_folio(page)); } static __always_inline void __folio_mark_uptodate(struct folio *folio) { smp_wmb(); __set_bit(PG_uptodate, folio_flags(folio, 0)); } static __always_inline void folio_mark_uptodate(struct folio *folio) { /* * Memory barrier must be issued before setting the PG_uptodate bit, * so that all previous stores issued in order to bring the folio * uptodate are actually visible before folio_test_uptodate becomes true. */ smp_wmb(); set_bit(PG_uptodate, folio_flags(folio, 0)); } static __always_inline void __SetPageUptodate(struct page *page) { __folio_mark_uptodate((struct folio *)page); } static __always_inline void SetPageUptodate(struct page *page) { folio_mark_uptodate((struct folio *)page); } CLEARPAGEFLAG(Uptodate, uptodate, PF_NO_TAIL) void __folio_start_writeback(struct folio *folio, bool keep_write); void set_page_writeback(struct page *page); #define folio_start_writeback(folio) \ __folio_start_writeback(folio, false) static __always_inline bool folio_test_head(const struct folio *folio) { return test_bit(PG_head, const_folio_flags(folio, FOLIO_PF_ANY)); } static __always_inline int PageHead(const struct page *page) { PF_POISONED_CHECK(page); return test_bit(PG_head, &page->flags.f) && !page_is_fake_head(page); } __SETPAGEFLAG(Head, head, PF_ANY) __CLEARPAGEFLAG(Head, head, PF_ANY) CLEARPAGEFLAG(Head, head, PF_ANY) /** * folio_test_large() - Does this folio contain more than one page? * @folio: The folio to test. * * Return: True if the folio is larger than one page. */ static inline bool folio_test_large(const struct folio *folio) { return folio_test_head(folio); } static __always_inline void set_compound_head(struct page *page, struct page *head) { WRITE_ONCE(page->compound_head, (unsigned long)head + 1); } static __always_inline void clear_compound_head(struct page *page) { WRITE_ONCE(page->compound_head, 0); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline void ClearPageCompound(struct page *page) { BUG_ON(!PageHead(page)); ClearPageHead(page); } FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE) FOLIO_FLAG(partially_mapped, FOLIO_SECOND_PAGE) #else FOLIO_FLAG_FALSE(large_rmappable) FOLIO_FLAG_FALSE(partially_mapped) #endif #define PG_head_mask ((1UL << PG_head)) #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * PageTransCompound returns true for both transparent huge pages * and hugetlbfs pages, so it should only be called when it's known * that hugetlbfs pages aren't involved. */ static inline int PageTransCompound(const struct page *page) { return PageCompound(page); } #else TESTPAGEFLAG_FALSE(TransCompound, transcompound) #endif #if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_TRANSPARENT_HUGEPAGE) /* * PageHasHWPoisoned indicates that at least one subpage is hwpoisoned in the * compound page. * * This flag is set by hwpoison handler. Cleared by THP split or free page. */ FOLIO_FLAG(has_hwpoisoned, FOLIO_SECOND_PAGE) #else FOLIO_FLAG_FALSE(has_hwpoisoned) #endif /* * For pages that do not use mapcount, page_type may be used. * The low 24 bits of pagetype may be used for your own purposes, as long * as you are careful to not affect the top 8 bits. The low bits of * pagetype will be overwritten when you clear the page_type from the page. */ enum pagetype { /* 0x00-0x7f are positive numbers, ie mapcount */ /* Reserve 0x80-0xef for mapcount overflow. */ PGTY_buddy = 0xf0, PGTY_offline = 0xf1, PGTY_table = 0xf2, PGTY_guard = 0xf3, PGTY_hugetlb = 0xf4, PGTY_slab = 0xf5, PGTY_zsmalloc = 0xf6, PGTY_unaccepted = 0xf7, PGTY_large_kmalloc = 0xf8, PGTY_mapcount_underflow = 0xff }; static inline bool page_type_has_type(int page_type) { return page_type < (PGTY_mapcount_underflow << 24); } /* This takes a mapcount which is one more than page->_mapcount */ static inline bool page_mapcount_is_type(unsigned int mapcount) { return page_type_has_type(mapcount - 1); } static inline bool page_has_type(const struct page *page) { return page_type_has_type(data_race(page->page_type)); } #define FOLIO_TYPE_OPS(lname, fname) \ static __always_inline bool folio_test_##fname(const struct folio *folio) \ { \ return data_race(folio->page.page_type >> 24) == PGTY_##lname; \ } \ static __always_inline void __folio_set_##fname(struct folio *folio) \ { \ if (folio_test_##fname(folio)) \ return; \ VM_BUG_ON_FOLIO(data_race(folio->page.page_type) != UINT_MAX, \ folio); \ folio->page.page_type = (unsigned int)PGTY_##lname << 24; \ } \ static __always_inline void __folio_clear_##fname(struct folio *folio) \ { \ if (folio->page.page_type == UINT_MAX) \ return; \ VM_BUG_ON_FOLIO(!folio_test_##fname(folio), folio); \ folio->page.page_type = UINT_MAX; \ } #define PAGE_TYPE_OPS(uname, lname, fname) \ FOLIO_TYPE_OPS(lname, fname) \ static __always_inline int Page##uname(const struct page *page) \ { \ return data_race(page->page_type >> 24) == PGTY_##lname; \ } \ static __always_inline void __SetPage##uname(struct page *page) \ { \ if (Page##uname(page)) \ return; \ VM_BUG_ON_PAGE(data_race(page->page_type) != UINT_MAX, page); \ page->page_type = (unsigned int)PGTY_##lname << 24; \ } \ static __always_inline void __ClearPage##uname(struct page *page) \ { \ if (page->page_type == UINT_MAX) \ return; \ VM_BUG_ON_PAGE(!Page##uname(page), page); \ page->page_type = UINT_MAX; \ } /* * PageBuddy() indicates that the page is free and in the buddy system * (see mm/page_alloc.c). */ PAGE_TYPE_OPS(Buddy, buddy, buddy) /* * PageOffline() indicates that the page is logically offline although the * containing section is online. (e.g. inflated in a balloon driver or * not onlined when onlining the section). * The content of these pages is effectively stale. Such pages should not * be touched (read/write/dump/save) except by their owner. * * When a memory block gets onlined, all pages are initialized with a * refcount of 1 and PageOffline(). generic_online_page() will * take care of clearing PageOffline(). * * If a driver wants to allow to offline unmovable PageOffline() pages without * putting them back to the buddy, it can do so via the memory notifier by * decrementing the reference count in MEM_GOING_OFFLINE and incrementing the * reference count in MEM_CANCEL_OFFLINE. When offlining, the PageOffline() * pages (now with a reference count of zero) are treated like free (unmanaged) * pages, allowing the containing memory block to get offlined. A driver that * relies on this feature is aware that re-onlining the memory block will * require not giving them to the buddy via generic_online_page(). * * Memory offlining code will not adjust the managed page count for any * PageOffline() pages, treating them like they were never exposed to the * buddy using generic_online_page(). * * There are drivers that mark a page PageOffline() and expect there won't be * any further access to page content. PFN walkers that read content of random * pages should check PageOffline() and synchronize with such drivers using * page_offline_freeze()/page_offline_thaw(). */ PAGE_TYPE_OPS(Offline, offline, offline) extern void page_offline_freeze(void); extern void page_offline_thaw(void); extern void page_offline_begin(void); extern void page_offline_end(void); /* * Marks pages in use as page tables. */ PAGE_TYPE_OPS(Table, table, pgtable) /* * Marks guardpages used with debug_pagealloc. */ PAGE_TYPE_OPS(Guard, guard, guard) PAGE_TYPE_OPS(Slab, slab, slab) #ifdef CONFIG_HUGETLB_PAGE FOLIO_TYPE_OPS(hugetlb, hugetlb) #else FOLIO_TEST_FLAG_FALSE(hugetlb) #endif PAGE_TYPE_OPS(Zsmalloc, zsmalloc, zsmalloc) /* * Mark pages that has to be accepted before touched for the first time. * * Serialized with zone lock. */ PAGE_TYPE_OPS(Unaccepted, unaccepted, unaccepted) PAGE_TYPE_OPS(LargeKmalloc, large_kmalloc, large_kmalloc) /** * PageHuge - Determine if the page belongs to hugetlbfs * @page: The page to test. * * Context: Any context. * Return: True for hugetlbfs pages, false for anon pages or pages * belonging to other filesystems. */ static inline bool PageHuge(const struct page *page) { return folio_test_hugetlb(page_folio(page)); } /* * Check if a page is currently marked HWPoisoned. Note that this check is * best effort only and inherently racy: there is no way to synchronize with * failing hardware. */ static inline bool is_page_hwpoison(const struct page *page) { const struct folio *folio; if (PageHWPoison(page)) return true; folio = page_folio(page); return folio_test_hugetlb(folio) && PageHWPoison(&folio->page); } static inline bool folio_contain_hwpoisoned_page(struct folio *folio) { return folio_test_hwpoison(folio) || (folio_test_large(folio) && folio_test_has_hwpoisoned(folio)); } bool is_free_buddy_page(const struct page *page); #ifdef CONFIG_MIGRATION /* * This page is migratable through movable_ops (for selected typed pages * only). * * Page migration of such pages might fail, for example, if the page is * already isolated by somebody else, or if the page is about to get freed. * * While a subsystem might set selected typed pages that support page migration * as being movable through movable_ops, it must never clear this flag. * * This flag is only cleared when the page is freed back to the buddy. * * Only selected page types support this flag (see page_movable_ops()) and * the flag might be used in other context for other pages. Always use * page_has_movable_ops() instead. */ TESTPAGEFLAG(MovableOps, movable_ops, PF_NO_TAIL); SETPAGEFLAG(MovableOps, movable_ops, PF_NO_TAIL); /* * A movable_ops page has this flag set while it is isolated for migration. * This flag primarily protects against concurrent migration attempts. * * Once migration ended (success or failure), the flag is cleared. The * flag is managed by the migration core. */ PAGEFLAG(MovableOpsIsolated, movable_ops_isolated, PF_NO_TAIL); #else /* !CONFIG_MIGRATION */ TESTPAGEFLAG_FALSE(MovableOps, movable_ops); SETPAGEFLAG_NOOP(MovableOps, movable_ops); PAGEFLAG_FALSE(MovableOpsIsolated, movable_ops_isolated); #endif /* CONFIG_MIGRATION */ /** * page_has_movable_ops - test for a movable_ops page * @page: The page to test. * * Test whether this is a movable_ops page. Such pages will stay that * way until freed. * * Returns true if this is a movable_ops page, otherwise false. */ static inline bool page_has_movable_ops(const struct page *page) { return PageMovableOps(page) && (PageOffline(page) || PageZsmalloc(page)); } static __always_inline int PageAnonExclusive(const struct page *page) { VM_BUG_ON_PGFLAGS(!PageAnon(page), page); /* * HugeTLB stores this information on the head page; THP keeps it per * page */ if (PageHuge(page)) page = compound_head(page); return test_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags.f); } static __always_inline void SetPageAnonExclusive(struct page *page) { VM_BUG_ON_PGFLAGS(!PageAnonNotKsm(page), page); VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page); set_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags.f); } static __always_inline void ClearPageAnonExclusive(struct page *page) { VM_BUG_ON_PGFLAGS(!PageAnonNotKsm(page), page); VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page); clear_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags.f); } static __always_inline void __ClearPageAnonExclusive(struct page *page) { VM_BUG_ON_PGFLAGS(!PageAnon(page), page); VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page); __clear_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags.f); } #ifdef CONFIG_MMU #define __PG_MLOCKED (1UL << PG_mlocked) #else #define __PG_MLOCKED 0 #endif /* * Flags checked when a page is freed. Pages being freed should not have * these flags set. If they are, there is a problem. */ #define PAGE_FLAGS_CHECK_AT_FREE \ (1UL << PG_lru | 1UL << PG_locked | \ 1UL << PG_private | 1UL << PG_private_2 | \ 1UL << PG_writeback | 1UL << PG_reserved | \ 1UL << PG_active | \ 1UL << PG_unevictable | __PG_MLOCKED | LRU_GEN_MASK) /* * Flags checked when a page is prepped for return by the page allocator. * Pages being prepped should not have these flags set. If they are set, * there has been a kernel bug or struct page corruption. * * __PG_HWPOISON is exceptional because it needs to be kept beyond page's * alloc-free cycle to prevent from reusing the page. */ #define PAGE_FLAGS_CHECK_AT_PREP \ ((PAGEFLAGS_MASK & ~__PG_HWPOISON) | LRU_GEN_MASK | LRU_REFS_MASK) /* * Flags stored in the second page of a compound page. They may overlap * the CHECK_AT_FREE flags above, so need to be cleared. */ #define PAGE_FLAGS_SECOND \ (0xffUL /* order */ | 1UL << PG_has_hwpoisoned | \ 1UL << PG_large_rmappable | 1UL << PG_partially_mapped) #define PAGE_FLAGS_PRIVATE \ (1UL << PG_private | 1UL << PG_private_2) /** * folio_has_private - Determine if folio has private stuff * @folio: The folio to be checked * * Determine if a folio has private stuff, indicating that release routines * should be invoked upon it. */ static inline int folio_has_private(const struct folio *folio) { return !!(folio->flags.f & PAGE_FLAGS_PRIVATE); } #undef PF_ANY #undef PF_HEAD #undef PF_NO_TAIL #undef PF_NO_COMPOUND #undef PF_SECOND #endif /* !__GENERATING_BOUNDS_H */ #endif /* PAGE_FLAGS_H */
2693 2693 2706 131 3 4172 2216 2701 2422 1551 30 2 868 1116 1117 35 29 3 863 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PGTABLE_H #define _LINUX_PGTABLE_H #include <linux/pfn.h> #include <asm/pgtable.h> #define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT) #define PUD_ORDER (PUD_SHIFT - PAGE_SHIFT) #ifndef __ASSEMBLY__ #ifdef CONFIG_MMU #include <linux/mm_types.h> #include <linux/bug.h> #include <linux/errno.h> #include <asm-generic/pgtable_uffd.h> #include <linux/page_table_check.h> #if 5 - defined(__PAGETABLE_P4D_FOLDED) - defined(__PAGETABLE_PUD_FOLDED) - \ defined(__PAGETABLE_PMD_FOLDED) != CONFIG_PGTABLE_LEVELS #error CONFIG_PGTABLE_LEVELS is not consistent with __PAGETABLE_{P4D,PUD,PMD}_FOLDED #endif /* * This defines the generic helper for accessing PMD page * table page. Although platforms can still override this * via their respective <asm/pgtable.h>. */ #ifndef pmd_pgtable #define pmd_pgtable(pmd) pmd_page(pmd) #endif #define pmd_folio(pmd) page_folio(pmd_page(pmd)) /* * A page table page can be thought of an array like this: pXd_t[PTRS_PER_PxD] * * The pXx_index() functions return the index of the entry in the page * table page which would control the given virtual address * * As these functions may be used by the same code for different levels of * the page table folding, they are always available, regardless of * CONFIG_PGTABLE_LEVELS value. For the folded levels they simply return 0 * because in such cases PTRS_PER_PxD equals 1. */ static inline unsigned long pte_index(unsigned long address) { return (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); } #ifndef pmd_index static inline unsigned long pmd_index(unsigned long address) { return (address >> PMD_SHIFT) & (PTRS_PER_PMD - 1); } #define pmd_index pmd_index #endif #ifndef pud_index static inline unsigned long pud_index(unsigned long address) { return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1); } #define pud_index pud_index #endif #ifndef pgd_index /* Must be a compile-time constant, so implement it as a macro */ #define pgd_index(a) (((a) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1)) #endif #ifndef kernel_pte_init static inline void kernel_pte_init(void *addr) { } #define kernel_pte_init kernel_pte_init #endif #ifndef pmd_init static inline void pmd_init(void *addr) { } #define pmd_init pmd_init #endif #ifndef pud_init static inline void pud_init(void *addr) { } #define pud_init pud_init #endif #ifndef pte_offset_kernel static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address) { return (pte_t *)pmd_page_vaddr(*pmd) + pte_index(address); } #define pte_offset_kernel pte_offset_kernel #endif #ifdef CONFIG_HIGHPTE #define __pte_map(pmd, address) \ ((pte_t *)kmap_local_page(pmd_page(*(pmd))) + pte_index((address))) #define pte_unmap(pte) do { \ kunmap_local((pte)); \ rcu_read_unlock(); \ } while (0) #else static inline pte_t *__pte_map(pmd_t *pmd, unsigned long address) { return pte_offset_kernel(pmd, address); } static inline void pte_unmap(pte_t *pte) { rcu_read_unlock(); } #endif void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable); /* Find an entry in the second-level page table.. */ #ifndef pmd_offset static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) { return pud_pgtable(*pud) + pmd_index(address); } #define pmd_offset pmd_offset #endif #ifndef pud_offset static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address) { return p4d_pgtable(*p4d) + pud_index(address); } #define pud_offset pud_offset #endif static inline pgd_t *pgd_offset_pgd(pgd_t *pgd, unsigned long address) { return (pgd + pgd_index(address)); }; /* * a shortcut to get a pgd_t in a given mm */ #ifndef pgd_offset #define pgd_offset(mm, address) pgd_offset_pgd((mm)->pgd, (address)) #endif /* * a shortcut which implies the use of the kernel's pgd, instead * of a process's */ #define pgd_offset_k(address) pgd_offset(&init_mm, (address)) /* * In many cases it is known that a virtual address is mapped at PMD or PTE * level, so instead of traversing all the page table levels, we can get a * pointer to the PMD entry in user or kernel page table or translate a virtual * address to the pointer in the PTE in the kernel page tables with simple * helpers. */ static inline pmd_t *pmd_off(struct mm_struct *mm, unsigned long va) { return pmd_offset(pud_offset(p4d_offset(pgd_offset(mm, va), va), va), va); } static inline pmd_t *pmd_off_k(unsigned long va) { return pmd_offset(pud_offset(p4d_offset(pgd_offset_k(va), va), va), va); } static inline pte_t *virt_to_kpte(unsigned long vaddr) { pmd_t *pmd = pmd_off_k(vaddr); return pmd_none(*pmd) ? NULL : pte_offset_kernel(pmd, vaddr); } #ifndef pmd_young static inline int pmd_young(pmd_t pmd) { return 0; } #endif #ifndef pmd_dirty static inline int pmd_dirty(pmd_t pmd) { return 0; } #endif /* * A facility to provide lazy MMU batching. This allows PTE updates and * page invalidations to be delayed until a call to leave lazy MMU mode * is issued. Some architectures may benefit from doing this, and it is * beneficial for both shadow and direct mode hypervisors, which may batch * the PTE updates which happen during this window. Note that using this * interface requires that read hazards be removed from the code. A read * hazard could result in the direct mode hypervisor case, since the actual * write to the page tables may not yet have taken place, so reads though * a raw PTE pointer after it has been modified are not guaranteed to be * up to date. * * In the general case, no lock is guaranteed to be held between entry and exit * of the lazy mode. (In practice, for user PTE updates, the appropriate page * table lock(s) are held, but for kernel PTE updates, no lock is held). * The implementation must therefore assume preemption may be enabled upon * entry to the mode and cpu migration is possible; it must take steps to be * robust against this. An implementation may handle this by disabling * preemption, as a consequence generic code may not sleep while the lazy MMU * mode is active. * * The mode is disabled in interrupt context and calls to the lazy_mmu API have * no effect. * * The lazy MMU mode is enabled for a given block of code using: * * lazy_mmu_mode_enable(); * <code> * lazy_mmu_mode_disable(); * * Nesting is permitted: <code> may itself use an enable()/disable() pair. * A nested call to enable() has no functional effect; however disable() causes * any batched architectural state to be flushed regardless of nesting. After a * call to disable(), the caller can therefore rely on all previous page table * modifications to have taken effect, but the lazy MMU mode may still be * enabled. * * In certain cases, it may be desirable to temporarily pause the lazy MMU mode. * This can be done using: * * lazy_mmu_mode_pause(); * <code> * lazy_mmu_mode_resume(); * * pause() ensures that the mode is exited regardless of the nesting level; * resume() re-enters the mode at the same nesting level. Any call to the * lazy_mmu_mode_* API between those two calls has no effect. In particular, * this means that pause()/resume() pairs may nest. * * is_lazy_mmu_mode_active() can be used to check whether the lazy MMU mode is * currently enabled. */ #ifdef CONFIG_ARCH_HAS_LAZY_MMU_MODE /** * lazy_mmu_mode_enable() - Enable the lazy MMU mode. * * Enters a new lazy MMU mode section; if the mode was not already enabled, * enables it and calls arch_enter_lazy_mmu_mode(). * * Must be paired with a call to lazy_mmu_mode_disable(). * * Has no effect if called: * - While paused - see lazy_mmu_mode_pause() * - In interrupt context */ static inline void lazy_mmu_mode_enable(void) { struct lazy_mmu_state *state = &current->lazy_mmu_state; if (in_interrupt() || state->pause_count > 0) return; VM_WARN_ON_ONCE(state->enable_count == U8_MAX); if (state->enable_count++ == 0) arch_enter_lazy_mmu_mode(); } /** * lazy_mmu_mode_disable() - Disable the lazy MMU mode. * * Exits the current lazy MMU mode section. If it is the outermost section, * disables the mode and calls arch_leave_lazy_mmu_mode(). Otherwise (nested * section), calls arch_flush_lazy_mmu_mode(). * * Must match a call to lazy_mmu_mode_enable(). * * Has no effect if called: * - While paused - see lazy_mmu_mode_pause() * - In interrupt context */ static inline void lazy_mmu_mode_disable(void) { struct lazy_mmu_state *state = &current->lazy_mmu_state; if (in_interrupt() || state->pause_count > 0) return; VM_WARN_ON_ONCE(state->enable_count == 0); if (--state->enable_count == 0) arch_leave_lazy_mmu_mode(); else /* Exiting a nested section */ arch_flush_lazy_mmu_mode(); } /** * lazy_mmu_mode_pause() - Pause the lazy MMU mode. * * Pauses the lazy MMU mode; if it is currently active, disables it and calls * arch_leave_lazy_mmu_mode(). * * Must be paired with a call to lazy_mmu_mode_resume(). Calls to the * lazy_mmu_mode_* API have no effect until the matching resume() call. * * Has no effect if called: * - While paused (inside another pause()/resume() pair) * - In interrupt context */ static inline void lazy_mmu_mode_pause(void) { struct lazy_mmu_state *state = &current->lazy_mmu_state; if (in_interrupt()) return; VM_WARN_ON_ONCE(state->pause_count == U8_MAX); if (state->pause_count++ == 0 && state->enable_count > 0) arch_leave_lazy_mmu_mode(); } /** * lazy_mmu_mode_resume() - Resume the lazy MMU mode. * * Resumes the lazy MMU mode; if it was active at the point where the matching * call to lazy_mmu_mode_pause() was made, re-enables it and calls * arch_enter_lazy_mmu_mode(). * * Must match a call to lazy_mmu_mode_pause(). * * Has no effect if called: * - While paused (inside another pause()/resume() pair) * - In interrupt context */ static inline void lazy_mmu_mode_resume(void) { struct lazy_mmu_state *state = &current->lazy_mmu_state; if (in_interrupt()) return; VM_WARN_ON_ONCE(state->pause_count == 0); if (--state->pause_count == 0 && state->enable_count > 0) arch_enter_lazy_mmu_mode(); } #else static inline void lazy_mmu_mode_enable(void) {} static inline void lazy_mmu_mode_disable(void) {} static inline void lazy_mmu_mode_pause(void) {} static inline void lazy_mmu_mode_resume(void) {} #endif #ifndef pte_batch_hint /** * pte_batch_hint - Number of pages that can be added to batch without scanning. * @ptep: Page table pointer for the entry. * @pte: Page table entry. * * Some architectures know that a set of contiguous ptes all map the same * contiguous memory with the same permissions. In this case, it can provide a * hint to aid pte batching without the core code needing to scan every pte. * * An architecture implementation may ignore the PTE accessed state. Further, * the dirty state must apply atomically to all the PTEs described by the hint. * * May be overridden by the architecture, else pte_batch_hint is always 1. */ static inline unsigned int pte_batch_hint(pte_t *ptep, pte_t pte) { return 1; } #endif #ifndef pte_advance_pfn static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr) { return __pte(pte_val(pte) + (nr << PFN_PTE_SHIFT)); } #endif #define pte_next_pfn(pte) pte_advance_pfn(pte, 1) #ifndef set_ptes /** * set_ptes - Map consecutive pages to a contiguous range of addresses. * @mm: Address space to map the pages into. * @addr: Address to map the first page at. * @ptep: Page table pointer for the first entry. * @pte: Page table entry for the first page. * @nr: Number of pages to map. * * When nr==1, initial state of pte may be present or not present, and new state * may be present or not present. When nr>1, initial state of all ptes must be * not present, and new state must be present. * * May be overridden by the architecture, or the architecture can define * set_pte() and PFN_PTE_SHIFT. * * Context: The caller holds the page table lock. The pages all belong * to the same folio. The PTEs are all in the same PMD. */ static inline void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned int nr) { page_table_check_ptes_set(mm, addr, ptep, pte, nr); for (;;) { set_pte(ptep, pte); if (--nr == 0) break; ptep++; pte = pte_next_pfn(pte); } } #endif #define set_pte_at(mm, addr, ptep, pte) set_ptes(mm, addr, ptep, pte, 1) #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS extern int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t entry, int dirty); #endif #ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t entry, int dirty); extern int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, pud_t entry, int dirty); #else static inline int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t entry, int dirty) { BUILD_BUG(); return 0; } static inline int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, pud_t entry, int dirty) { BUILD_BUG(); return 0; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef ptep_get static inline pte_t ptep_get(pte_t *ptep) { return READ_ONCE(*ptep); } #endif #ifndef pmdp_get static inline pmd_t pmdp_get(pmd_t *pmdp) { return READ_ONCE(*pmdp); } #endif #ifndef pudp_get static inline pud_t pudp_get(pud_t *pudp) { return READ_ONCE(*pudp); } #endif #ifndef p4dp_get static inline p4d_t p4dp_get(p4d_t *p4dp) { return READ_ONCE(*p4dp); } #endif #ifndef pgdp_get static inline pgd_t pgdp_get(pgd_t *pgdp) { return READ_ONCE(*pgdp); } #endif #ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { pte_t pte = ptep_get(ptep); int r = 1; if (!pte_young(pte)) r = 0; else set_pte_at(vma->vm_mm, address, ptep, pte_mkold(pte)); return r; } #endif #ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { pmd_t pmd = *pmdp; int r = 1; if (!pmd_young(pmd)) r = 0; else set_pmd_at(vma->vm_mm, address, pmdp, pmd_mkold(pmd)); return r; } #else static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { BUILD_BUG(); return 0; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG */ #endif #ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH int ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep); #endif #ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #else /* * Despite relevant to THP only, this API is called from generic rmap code * under PageTransHuge(), hence needs a dummy implementation for !THP */ static inline int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { BUILD_BUG(); return 0; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef arch_has_hw_nonleaf_pmd_young /* * Return whether the accessed bit in non-leaf PMD entries is supported on the * local CPU. */ static inline bool arch_has_hw_nonleaf_pmd_young(void) { return IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG); } #endif #ifndef arch_has_hw_pte_young /* * Return whether the accessed bit is supported on the local CPU. * * This stub assumes accessing through an old PTE triggers a page fault. * Architectures that automatically set the access bit should overwrite it. */ static inline bool arch_has_hw_pte_young(void) { return IS_ENABLED(CONFIG_ARCH_HAS_HW_PTE_YOUNG); } #endif #ifndef exec_folio_order /* * Returns preferred minimum folio order for executable file-backed memory. Must * be in range [0, PMD_ORDER). Default to order-0. */ static inline unsigned int exec_folio_order(void) { return 0; } #endif #ifndef arch_check_zapped_pte static inline void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte) { } #endif #ifndef arch_check_zapped_pmd static inline void arch_check_zapped_pmd(struct vm_area_struct *vma, pmd_t pmd) { } #endif #ifndef arch_check_zapped_pud static inline void arch_check_zapped_pud(struct vm_area_struct *vma, pud_t pud) { } #endif #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long address, pte_t *ptep) { pte_t pte = ptep_get(ptep); pte_clear(mm, address, ptep); page_table_check_pte_clear(mm, address, pte); return pte; } #endif #ifndef clear_young_dirty_ptes /** * clear_young_dirty_ptes - Mark PTEs that map consecutive pages of the * same folio as old/clean. * @mm: Address space the pages are mapped into. * @addr: Address the first page is mapped at. * @ptep: Page table pointer for the first entry. * @nr: Number of entries to mark old/clean. * @flags: Flags to modify the PTE batch semantics. * * May be overridden by the architecture; otherwise, implemented by * get_and_clear/modify/set for each pte in the range. * * Note that PTE bits in the PTE range besides the PFN can differ. For example, * some PTEs might be write-protected. * * Context: The caller holds the page table lock. The PTEs map consecutive * pages that belong to the same folio. The PTEs are all in the same PMD. */ static inline void clear_young_dirty_ptes(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, unsigned int nr, cydp_t flags) { pte_t pte; for (;;) { if (flags == CYDP_CLEAR_YOUNG) ptep_test_and_clear_young(vma, addr, ptep); else { pte = ptep_get_and_clear(vma->vm_mm, addr, ptep); if (flags & CYDP_CLEAR_YOUNG) pte = pte_mkold(pte); if (flags & CYDP_CLEAR_DIRTY) pte = pte_mkclean(pte); set_pte_at(vma->vm_mm, addr, ptep, pte); } if (--nr == 0) break; ptep++; addr += PAGE_SIZE; } } #endif static inline void ptep_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { pte_t pte = ptep_get(ptep); pte_clear(mm, addr, ptep); /* * No need for ptep_get_and_clear(): page table check doesn't care about * any bits that could have been set by HW concurrently. */ page_table_check_pte_clear(mm, addr, pte); } #ifdef CONFIG_GUP_GET_PXX_LOW_HIGH /* * For walking the pagetables without holding any locks. Some architectures * (eg x86-32 PAE) cannot load the entries atomically without using expensive * instructions. We are guaranteed that a PTE will only either go from not * present to present, or present to not present -- it will not switch to a * completely different present page without a TLB flush inbetween; which we * are blocking by holding interrupts off. * * Setting ptes from not present to present goes: * * ptep->pte_high = h; * smp_wmb(); * ptep->pte_low = l; * * And present to not present goes: * * ptep->pte_low = 0; * smp_wmb(); * ptep->pte_high = 0; * * We must ensure here that the load of pte_low sees 'l' IFF pte_high sees 'h'. * We load pte_high *after* loading pte_low, which ensures we don't see an older * value of pte_high. *Then* we recheck pte_low, which ensures that we haven't * picked up a changed pte high. We might have gotten rubbish values from * pte_low and pte_high, but we are guaranteed that pte_low will not have the * present bit set *unless* it is 'l'. Because get_user_pages_fast() only * operates on present ptes we're safe. */ static inline pte_t ptep_get_lockless(pte_t *ptep) { pte_t pte; do { pte.pte_low = ptep->pte_low; smp_rmb(); pte.pte_high = ptep->pte_high; smp_rmb(); } while (unlikely(pte.pte_low != ptep->pte_low)); return pte; } #define ptep_get_lockless ptep_get_lockless #if CONFIG_PGTABLE_LEVELS > 2 static inline pmd_t pmdp_get_lockless(pmd_t *pmdp) { pmd_t pmd; do { pmd.pmd_low = pmdp->pmd_low; smp_rmb(); pmd.pmd_high = pmdp->pmd_high; smp_rmb(); } while (unlikely(pmd.pmd_low != pmdp->pmd_low)); return pmd; } #define pmdp_get_lockless pmdp_get_lockless #define pmdp_get_lockless_sync() tlb_remove_table_sync_one() #endif /* CONFIG_PGTABLE_LEVELS > 2 */ #endif /* CONFIG_GUP_GET_PXX_LOW_HIGH */ /* * We require that the PTE can be read atomically. */ #ifndef ptep_get_lockless static inline pte_t ptep_get_lockless(pte_t *ptep) { return ptep_get(ptep); } #endif #ifndef pmdp_get_lockless static inline pmd_t pmdp_get_lockless(pmd_t *pmdp) { return pmdp_get(pmdp); } static inline void pmdp_get_lockless_sync(void) { } #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE #ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long address, pmd_t *pmdp) { pmd_t pmd = *pmdp; pmd_clear(pmdp); page_table_check_pmd_clear(mm, address, pmd); return pmd; } #endif /* __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR */ #ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, unsigned long address, pud_t *pudp) { pud_t pud = *pudp; pud_clear(pudp); page_table_check_pud_clear(mm, address, pud); return pud; } #endif /* __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR */ #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE #ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL static inline pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, int full) { return pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp); } #endif #ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR_FULL static inline pud_t pudp_huge_get_and_clear_full(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, int full) { return pudp_huge_get_and_clear(vma->vm_mm, address, pudp); } #endif #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long address, pte_t *ptep, int full) { return ptep_get_and_clear(mm, address, ptep); } #endif #ifndef get_and_clear_full_ptes /** * get_and_clear_full_ptes - Clear present PTEs that map consecutive pages of * the same folio, collecting dirty/accessed bits. * @mm: Address space the pages are mapped into. * @addr: Address the first page is mapped at. * @ptep: Page table pointer for the first entry. * @nr: Number of entries to clear. * @full: Whether we are clearing a full mm. * * May be overridden by the architecture; otherwise, implemented as a simple * loop over ptep_get_and_clear_full(), merging dirty/accessed bits into the * returned PTE. * * Note that PTE bits in the PTE range besides the PFN can differ. For example, * some PTEs might be write-protected. * * Context: The caller holds the page table lock. The PTEs map consecutive * pages that belong to the same folio. The PTEs are all in the same PMD. */ static inline pte_t get_and_clear_full_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr, int full) { pte_t pte, tmp_pte; pte = ptep_get_and_clear_full(mm, addr, ptep, full); while (--nr) { ptep++; addr += PAGE_SIZE; tmp_pte = ptep_get_and_clear_full(mm, addr, ptep, full); if (pte_dirty(tmp_pte)) pte = pte_mkdirty(pte); if (pte_young(tmp_pte)) pte = pte_mkyoung(pte); } return pte; } #endif /** * get_and_clear_ptes - Clear present PTEs that map consecutive pages of * the same folio, collecting dirty/accessed bits. * @mm: Address space the pages are mapped into. * @addr: Address the first page is mapped at. * @ptep: Page table pointer for the first entry. * @nr: Number of entries to clear. * * Use this instead of get_and_clear_full_ptes() if it is known that we don't * need to clear the full mm, which is mostly the case. * * Note that PTE bits in the PTE range besides the PFN can differ. For example, * some PTEs might be write-protected. * * Context: The caller holds the page table lock. The PTEs map consecutive * pages that belong to the same folio. The PTEs are all in the same PMD. */ static inline pte_t get_and_clear_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr) { return get_and_clear_full_ptes(mm, addr, ptep, nr, 0); } #ifndef clear_full_ptes /** * clear_full_ptes - Clear present PTEs that map consecutive pages of the same * folio. * @mm: Address space the pages are mapped into. * @addr: Address the first page is mapped at. * @ptep: Page table pointer for the first entry. * @nr: Number of entries to clear. * @full: Whether we are clearing a full mm. * * May be overridden by the architecture; otherwise, implemented as a simple * loop over ptep_get_and_clear_full(). * * Note that PTE bits in the PTE range besides the PFN can differ. For example, * some PTEs might be write-protected. * * Context: The caller holds the page table lock. The PTEs map consecutive * pages that belong to the same folio. The PTEs are all in the same PMD. */ static inline void clear_full_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr, int full) { for (;;) { ptep_get_and_clear_full(mm, addr, ptep, full); if (--nr == 0) break; ptep++; addr += PAGE_SIZE; } } #endif /** * clear_ptes - Clear present PTEs that map consecutive pages of the same folio. * @mm: Address space the pages are mapped into. * @addr: Address the first page is mapped at. * @ptep: Page table pointer for the first entry. * @nr: Number of entries to clear. * * Use this instead of clear_full_ptes() if it is known that we don't need to * clear the full mm, which is mostly the case. * * Note that PTE bits in the PTE range besides the PFN can differ. For example, * some PTEs might be write-protected. * * Context: The caller holds the page table lock. The PTEs map consecutive * pages that belong to the same folio. The PTEs are all in the same PMD. */ static inline void clear_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr) { clear_full_ptes(mm, addr, ptep, nr, 0); } /* * If two threads concurrently fault at the same page, the thread that * won the race updates the PTE and its local TLB/Cache. The other thread * gives up, simply does nothing, and continues; on architectures where * software can update TLB, local TLB can be updated here to avoid next page * fault. This function updates TLB only, do nothing with cache or others. * It is the difference with function update_mmu_cache. */ #ifndef update_mmu_tlb_range static inline void update_mmu_tlb_range(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, unsigned int nr) { } #endif static inline void update_mmu_tlb(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { update_mmu_tlb_range(vma, address, ptep, 1); } /* * Some architectures may be able to avoid expensive synchronization * primitives when modifications are made to PTE's which are already * not present, or in the process of an address space destruction. */ #ifndef __HAVE_ARCH_PTE_CLEAR_NOT_PRESENT_FULL static inline void pte_clear_not_present_full(struct mm_struct *mm, unsigned long address, pte_t *ptep, int full) { pte_clear(mm, address, ptep); } #endif #ifndef clear_not_present_full_ptes /** * clear_not_present_full_ptes - Clear multiple not present PTEs which are * consecutive in the pgtable. * @mm: Address space the ptes represent. * @addr: Address of the first pte. * @ptep: Page table pointer for the first entry. * @nr: Number of entries to clear. * @full: Whether we are clearing a full mm. * * May be overridden by the architecture; otherwise, implemented as a simple * loop over pte_clear_not_present_full(). * * Context: The caller holds the page table lock. The PTEs are all not present. * The PTEs are all in the same PMD. */ static inline void clear_not_present_full_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr, int full) { for (;;) { pte_clear_not_present_full(mm, addr, ptep, full); if (--nr == 0) break; ptep++; addr += PAGE_SIZE; } } #endif #ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH extern pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, pte_t *ptep); #endif #ifndef __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH extern pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); extern pud_t pudp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, pud_t *pudp); #endif #ifndef pte_mkwrite static inline pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma) { return pte_mkwrite_novma(pte); } #endif #if defined(CONFIG_ARCH_WANT_PMD_MKWRITE) && !defined(pmd_mkwrite) static inline pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) { return pmd_mkwrite_novma(pmd); } #endif #ifndef __HAVE_ARCH_PTEP_SET_WRPROTECT struct mm_struct; static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long address, pte_t *ptep) { pte_t old_pte = ptep_get(ptep); set_pte_at(mm, address, ptep, pte_wrprotect(old_pte)); } #endif #ifndef wrprotect_ptes /** * wrprotect_ptes - Write-protect PTEs that map consecutive pages of the same * folio. * @mm: Address space the pages are mapped into. * @addr: Address the first page is mapped at. * @ptep: Page table pointer for the first entry. * @nr: Number of entries to write-protect. * * May be overridden by the architecture; otherwise, implemented as a simple * loop over ptep_set_wrprotect(). * * Note that PTE bits in the PTE range besides the PFN can differ. For example, * some PTEs might be write-protected. * * Context: The caller holds the page table lock. The PTEs map consecutive * pages that belong to the same folio. The PTEs are all in the same PMD. */ static inline void wrprotect_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr) { for (;;) { ptep_set_wrprotect(mm, addr, ptep); if (--nr == 0) break; ptep++; addr += PAGE_SIZE; } } #endif #ifndef clear_flush_young_ptes /** * clear_flush_young_ptes - Mark PTEs that map consecutive pages of the same * folio as old and flush the TLB. * @vma: The virtual memory area the pages are mapped into. * @addr: Address the first page is mapped at. * @ptep: Page table pointer for the first entry. * @nr: Number of entries to clear access bit. * * May be overridden by the architecture; otherwise, implemented as a simple * loop over ptep_clear_flush_young(). * * Note that PTE bits in the PTE range besides the PFN can differ. For example, * some PTEs might be write-protected. * * Context: The caller holds the page table lock. The PTEs map consecutive * pages that belong to the same folio. The PTEs are all in the same PMD. */ static inline int clear_flush_young_ptes(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, unsigned int nr) { int young = 0; for (;;) { young |= ptep_clear_flush_young(vma, addr, ptep); if (--nr == 0) break; ptep++; addr += PAGE_SIZE; } return young; } #endif /* * On some architectures hardware does not set page access bit when accessing * memory page, it is responsibility of software setting this bit. It brings * out extra page fault penalty to track page access bit. For optimization page * access bit can be set during all page fault flow on these arches. * To be differentiate with macro pte_mkyoung, this macro is used on platforms * where software maintains page access bit. */ #ifndef pte_sw_mkyoung static inline pte_t pte_sw_mkyoung(pte_t pte) { return pte; } #define pte_sw_mkyoung pte_sw_mkyoung #endif #ifndef __HAVE_ARCH_PMDP_SET_WRPROTECT #ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long address, pmd_t *pmdp) { pmd_t old_pmd = *pmdp; set_pmd_at(mm, address, pmdp, pmd_wrprotect(old_pmd)); } #else static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long address, pmd_t *pmdp) { BUILD_BUG(); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef __HAVE_ARCH_PUDP_SET_WRPROTECT #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD #ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline void pudp_set_wrprotect(struct mm_struct *mm, unsigned long address, pud_t *pudp) { pud_t old_pud = *pudp; set_pud_at(mm, address, pudp, pud_wrprotect(old_pud)); } #else static inline void pudp_set_wrprotect(struct mm_struct *mm, unsigned long address, pud_t *pudp) { BUILD_BUG(); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ #endif #ifndef pmdp_collapse_flush #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #else static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { BUILD_BUG(); return *pmdp; } #define pmdp_collapse_flush pmdp_collapse_flush #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef __HAVE_ARCH_PGTABLE_DEPOSIT extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, pgtable_t pgtable); #endif #ifndef __HAVE_ARCH_PGTABLE_WITHDRAW extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); #endif #ifndef arch_needs_pgtable_deposit #define arch_needs_pgtable_deposit() (false) #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * This is an implementation of pmdp_establish() that is only suitable for an * architecture that doesn't have hardware dirty/accessed bits. In this case we * can't race with CPU which sets these bits and non-atomic approach is fine. */ static inline pmd_t generic_pmdp_establish(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t pmd) { pmd_t old_pmd = *pmdp; set_pmd_at(vma->vm_mm, address, pmdp, pmd); return old_pmd; } #endif #ifndef __HAVE_ARCH_PMDP_INVALIDATE extern pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #endif #ifndef __HAVE_ARCH_PMDP_INVALIDATE_AD /* * pmdp_invalidate_ad() invalidates the PMD while changing a transparent * hugepage mapping in the page tables. This function is similar to * pmdp_invalidate(), but should only be used if the access and dirty bits would * not be cleared by the software in the new PMD value. The function ensures * that hardware changes of the access and dirty bits updates would not be lost. * * Doing so can allow in certain architectures to avoid a TLB flush in most * cases. Yet, another TLB flush might be necessary later if the PMD update * itself requires such flush (e.g., if protection was set to be stricter). Yet, * even when a TLB flush is needed because of the update, the caller may be able * to batch these TLB flushing operations, so fewer TLB flush operations are * needed. */ extern pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #endif #ifndef __HAVE_ARCH_PTE_SAME static inline int pte_same(pte_t pte_a, pte_t pte_b) { return pte_val(pte_a) == pte_val(pte_b); } #endif #ifndef __HAVE_ARCH_PTE_UNUSED /* * Some architectures provide facilities to virtualization guests * so that they can flag allocated pages as unused. This allows the * host to transparently reclaim unused pages. This function returns * whether the pte's page is unused. */ static inline int pte_unused(pte_t pte) { return 0; } #endif #ifndef pte_access_permitted #define pte_access_permitted(pte, write) \ (pte_present(pte) && (!(write) || pte_write(pte))) #endif #ifndef pmd_access_permitted #define pmd_access_permitted(pmd, write) \ (pmd_present(pmd) && (!(write) || pmd_write(pmd))) #endif #ifndef pud_access_permitted #define pud_access_permitted(pud, write) \ (pud_present(pud) && (!(write) || pud_write(pud))) #endif #ifndef p4d_access_permitted #define p4d_access_permitted(p4d, write) \ (p4d_present(p4d) && (!(write) || p4d_write(p4d))) #endif #ifndef pgd_access_permitted #define pgd_access_permitted(pgd, write) \ (pgd_present(pgd) && (!(write) || pgd_write(pgd))) #endif #ifndef __HAVE_ARCH_PMD_SAME static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) { return pmd_val(pmd_a) == pmd_val(pmd_b); } #endif #ifndef pud_same static inline int pud_same(pud_t pud_a, pud_t pud_b) { return pud_val(pud_a) == pud_val(pud_b); } #define pud_same pud_same #endif #ifndef __HAVE_ARCH_P4D_SAME static inline int p4d_same(p4d_t p4d_a, p4d_t p4d_b) { return p4d_val(p4d_a) == p4d_val(p4d_b); } #endif #ifndef __HAVE_ARCH_PGD_SAME static inline int pgd_same(pgd_t pgd_a, pgd_t pgd_b) { return pgd_val(pgd_a) == pgd_val(pgd_b); } #endif #ifndef __HAVE_ARCH_DO_SWAP_PAGE static inline void arch_do_swap_page_nr(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t pte, pte_t oldpte, int nr) { } #else /* * Some architectures support metadata associated with a page. When a * page is being swapped out, this metadata must be saved so it can be * restored when the page is swapped back in. SPARC M7 and newer * processors support an ADI (Application Data Integrity) tag for the * page as metadata for the page. arch_do_swap_page() can restore this * metadata when a page is swapped back in. */ static inline void arch_do_swap_page_nr(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t pte, pte_t oldpte, int nr) { for (int i = 0; i < nr; i++) { arch_do_swap_page(vma->vm_mm, vma, addr + i * PAGE_SIZE, pte_advance_pfn(pte, i), pte_advance_pfn(oldpte, i)); } } #endif #ifndef __HAVE_ARCH_UNMAP_ONE /* * Some architectures support metadata associated with a page. When a * page is being swapped out, this metadata must be saved so it can be * restored when the page is swapped back in. SPARC M7 and newer * processors support an ADI (Application Data Integrity) tag for the * page as metadata for the page. arch_unmap_one() can save this * metadata on a swap-out of a page. */ static inline int arch_unmap_one(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t orig_pte) { return 0; } #endif /* * Allow architectures to preserve additional metadata associated with * swapped-out pages. The corresponding __HAVE_ARCH_SWAP_* macros and function * prototypes must be defined in the arch-specific asm/pgtable.h file. */ #ifndef __HAVE_ARCH_PREPARE_TO_SWAP static inline int arch_prepare_to_swap(struct folio *folio) { return 0; } #endif #ifndef __HAVE_ARCH_SWAP_INVALIDATE static inline void arch_swap_invalidate_page(int type, pgoff_t offset) { } static inline void arch_swap_invalidate_area(int type) { } #endif #ifndef __HAVE_ARCH_SWAP_RESTORE static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio) { } #endif #ifndef __HAVE_ARCH_MOVE_PTE #define move_pte(pte, old_addr, new_addr) (pte) #endif #ifndef pte_accessible # define pte_accessible(mm, pte) ((void)(pte), 1) #endif #ifndef flush_tlb_fix_spurious_fault #define flush_tlb_fix_spurious_fault(vma, address, ptep) flush_tlb_page(vma, address) #endif #ifndef flush_tlb_fix_spurious_fault_pmd #define flush_tlb_fix_spurious_fault_pmd(vma, address, pmdp) do { } while (0) #endif /* * When walking page tables, get the address of the next boundary, * or the end address of the range if that comes earlier. Although no * vma end wraps to 0, rounded up __boundary may wrap to 0 throughout. */ #define pgd_addr_end(addr, end) \ ({ unsigned long __boundary = ((addr) + PGDIR_SIZE) & PGDIR_MASK; \ (__boundary - 1 < (end) - 1)? __boundary: (end); \ }) #ifndef p4d_addr_end #define p4d_addr_end(addr, end) \ ({ unsigned long __boundary = ((addr) + P4D_SIZE) & P4D_MASK; \ (__boundary - 1 < (end) - 1)? __boundary: (end); \ }) #endif #ifndef pud_addr_end #define pud_addr_end(addr, end) \ ({ unsigned long __boundary = ((addr) + PUD_SIZE) & PUD_MASK; \ (__boundary - 1 < (end) - 1)? __boundary: (end); \ }) #endif #ifndef pmd_addr_end #define pmd_addr_end(addr, end) \ ({ unsigned long __boundary = ((addr) + PMD_SIZE) & PMD_MASK; \ (__boundary - 1 < (end) - 1)? __boundary: (end); \ }) #endif /* * When walking page tables, we usually want to skip any p?d_none entries; * and any p?d_bad entries - reporting the error before resetting to none. * Do the tests inline, but report and clear the bad entry in mm/memory.c. */ void pgd_clear_bad(pgd_t *); #ifndef __PAGETABLE_P4D_FOLDED void p4d_clear_bad(p4d_t *); #else #define p4d_clear_bad(p4d) do { } while (0) #endif #ifndef __PAGETABLE_PUD_FOLDED void pud_clear_bad(pud_t *); #else #define pud_clear_bad(p4d) do { } while (0) #endif void pmd_clear_bad(pmd_t *); static inline int pgd_none_or_clear_bad(pgd_t *pgd) { if (pgd_none(*pgd)) return 1; if (unlikely(pgd_bad(*pgd))) { pgd_clear_bad(pgd); return 1; } return 0; } static inline int p4d_none_or_clear_bad(p4d_t *p4d) { if (p4d_none(*p4d)) return 1; if (unlikely(p4d_bad(*p4d))) { p4d_clear_bad(p4d); return 1; } return 0; } static inline int pud_none_or_clear_bad(pud_t *pud) { if (pud_none(*pud)) return 1; if (unlikely(pud_bad(*pud))) { pud_clear_bad(pud); return 1; } return 0; } static inline int pmd_none_or_clear_bad(pmd_t *pmd) { if (pmd_none(*pmd)) return 1; if (unlikely(pmd_bad(*pmd))) { pmd_clear_bad(pmd); return 1; } return 0; } static inline pte_t __ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { /* * Get the current pte state, but zero it out to make it * non-present, preventing the hardware from asynchronously * updating it. */ return ptep_get_and_clear(vma->vm_mm, addr, ptep); } static inline void __ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte) { /* * The pte is non-present, so there's no hardware state to * preserve. */ set_pte_at(vma->vm_mm, addr, ptep, pte); } #ifndef __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION /* * Start a pte protection read-modify-write transaction, which * protects against asynchronous hardware modifications to the pte. * The intention is not to prevent the hardware from making pte * updates, but to prevent any updates it may make from being lost. * * This does not protect against other software modifications of the * pte; the appropriate pte lock must be held over the transaction. * * Note that this interface is intended to be batchable, meaning that * ptep_modify_prot_commit may not actually update the pte, but merely * queue the update to be done at some later time. The update must be * actually committed before the pte lock is released, however. */ static inline pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { return __ptep_modify_prot_start(vma, addr, ptep); } /* * Commit an update to a pte, leaving any hardware-controlled bits in * the PTE unmodified. The pte returned from ptep_modify_prot_start() may * additionally have young and/or dirty bits set where previously they were not, * so the updated pte may have these additional changes. */ static inline void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t old_pte, pte_t pte) { __ptep_modify_prot_commit(vma, addr, ptep, pte); } #endif /* __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION */ /** * modify_prot_start_ptes - Start a pte protection read-modify-write transaction * over a batch of ptes, which protects against asynchronous hardware * modifications to the ptes. The intention is not to prevent the hardware from * making pte updates, but to prevent any updates it may make from being lost. * Please see the comment above ptep_modify_prot_start() for full description. * * @vma: The virtual memory area the pages are mapped into. * @addr: Address the first page is mapped at. * @ptep: Page table pointer for the first entry. * @nr: Number of entries. * * May be overridden by the architecture; otherwise, implemented as a simple * loop over ptep_modify_prot_start(), collecting the a/d bits from each pte * in the batch. * * Note that PTE bits in the PTE batch besides the PFN can differ. * * Context: The caller holds the page table lock. The PTEs map consecutive * pages that belong to the same folio. All other PTE bits must be identical for * all PTEs in the batch except for young and dirty bits. The PTEs are all in * the same PMD. */ #ifndef modify_prot_start_ptes static inline pte_t modify_prot_start_ptes(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, unsigned int nr) { pte_t pte, tmp_pte; pte = ptep_modify_prot_start(vma, addr, ptep); while (--nr) { ptep++; addr += PAGE_SIZE; tmp_pte = ptep_modify_prot_start(vma, addr, ptep); if (pte_dirty(tmp_pte)) pte = pte_mkdirty(pte); if (pte_young(tmp_pte)) pte = pte_mkyoung(pte); } return pte; } #endif /** * modify_prot_commit_ptes - Commit an update to a batch of ptes, leaving any * hardware-controlled bits in the PTE unmodified. * * @vma: The virtual memory area the pages are mapped into. * @addr: Address the first page is mapped at. * @ptep: Page table pointer for the first entry. * @old_pte: Old page table entry (for the first entry) which is now cleared. * @pte: New page table entry to be set. * @nr: Number of entries. * * May be overridden by the architecture; otherwise, implemented as a simple * loop over ptep_modify_prot_commit(). * * Context: The caller holds the page table lock. The PTEs are all in the same * PMD. On exit, the set ptes in the batch map the same folio. The ptes set by * ptep_modify_prot_start() may additionally have young and/or dirty bits set * where previously they were not, so the updated ptes may have these * additional changes. */ #ifndef modify_prot_commit_ptes static inline void modify_prot_commit_ptes(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t old_pte, pte_t pte, unsigned int nr) { int i; for (i = 0; i < nr; ++i, ++ptep, addr += PAGE_SIZE) { ptep_modify_prot_commit(vma, addr, ptep, old_pte, pte); /* Advance PFN only, set same prot */ old_pte = pte_next_pfn(old_pte); pte = pte_next_pfn(pte); } } #endif /* * Architectures can set this mask to a combination of PGTBL_P?D_MODIFIED values * and let generic vmalloc, ioremap and page table update code know when * arch_sync_kernel_mappings() needs to be called. */ #ifndef ARCH_PAGE_TABLE_SYNC_MASK #define ARCH_PAGE_TABLE_SYNC_MASK 0 #endif /* * There is no default implementation for arch_sync_kernel_mappings(). It is * relied upon the compiler to optimize calls out if ARCH_PAGE_TABLE_SYNC_MASK * is 0. */ void arch_sync_kernel_mappings(unsigned long start, unsigned long end); #endif /* CONFIG_MMU */ /* * On almost all architectures and configurations, 0 can be used as the * upper ceiling to free_pgtables(): on many architectures it has the same * effect as using TASK_SIZE. However, there is one configuration which * must impose a more careful limit, to avoid freeing kernel pgtables. */ #ifndef USER_PGTABLES_CEILING #define USER_PGTABLES_CEILING 0UL #endif /* * This defines the first usable user address. Platforms * can override its value with custom FIRST_USER_ADDRESS * defined in their respective <asm/pgtable.h>. */ #ifndef FIRST_USER_ADDRESS #define FIRST_USER_ADDRESS 0UL #endif /* * No-op macros that just return the current protection value. Defined here * because these macros can be used even if CONFIG_MMU is not defined. */ #ifndef pgprot_nx #define pgprot_nx(prot) (prot) #endif #ifndef pgprot_noncached #define pgprot_noncached(prot) (prot) #endif #ifndef pgprot_writecombine #define pgprot_writecombine pgprot_noncached #endif #ifndef pgprot_writethrough #define pgprot_writethrough pgprot_noncached #endif #ifndef pgprot_device #define pgprot_device pgprot_noncached #endif #ifndef pgprot_mhp #define pgprot_mhp(prot) (prot) #endif #ifdef CONFIG_MMU #ifndef pgprot_modify #define pgprot_modify pgprot_modify static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) { if (pgprot_val(oldprot) == pgprot_val(pgprot_noncached(oldprot))) newprot = pgprot_noncached(newprot); if (pgprot_val(oldprot) == pgprot_val(pgprot_writecombine(oldprot))) newprot = pgprot_writecombine(newprot); if (pgprot_val(oldprot) == pgprot_val(pgprot_device(oldprot))) newprot = pgprot_device(newprot); return newprot; } #endif #endif /* CONFIG_MMU */ #ifndef pgprot_encrypted #define pgprot_encrypted(prot) (prot) #endif #ifndef pgprot_decrypted #define pgprot_decrypted(prot) (prot) #endif /* * A facility to provide batching of the reload of page tables and * other process state with the actual context switch code for * paravirtualized guests. By convention, only one of the batched * update (lazy) modes (CPU, MMU) should be active at any given time, * entry should never be nested, and entry and exits should always be * paired. This is for sanity of maintaining and reasoning about the * kernel code. In this case, the exit (end of the context switch) is * in architecture-specific code, and so doesn't need a generic * definition. */ #ifndef __HAVE_ARCH_START_CONTEXT_SWITCH #define arch_start_context_switch(prev) do {} while (0) #endif /* * Some platforms can customize the PTE soft-dirty bit making it unavailable * even if the architecture provides the resource. * Adding this API allows architectures to add their own checks for the * devices on which the kernel is running. * Note: When overriding it, please make sure the CONFIG_MEM_SOFT_DIRTY * is part of this macro. */ #ifndef pgtable_supports_soft_dirty #define pgtable_supports_soft_dirty() IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) #endif #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY #ifndef CONFIG_ARCH_ENABLE_THP_MIGRATION static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd) { return pmd; } static inline int pmd_swp_soft_dirty(pmd_t pmd) { return 0; } static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd) { return pmd; } #endif #else /* !CONFIG_HAVE_ARCH_SOFT_DIRTY */ static inline int pte_soft_dirty(pte_t pte) { return 0; } static inline int pmd_soft_dirty(pmd_t pmd) { return 0; } static inline pte_t pte_mksoft_dirty(pte_t pte) { return pte; } static inline pmd_t pmd_mksoft_dirty(pmd_t pmd) { return pmd; } static inline pte_t pte_clear_soft_dirty(pte_t pte) { return pte; } static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd) { return pmd; } static inline pte_t pte_swp_mksoft_dirty(pte_t pte) { return pte; } static inline int pte_swp_soft_dirty(pte_t pte) { return 0; } static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) { return pte; } static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd) { return pmd; } static inline int pmd_swp_soft_dirty(pmd_t pmd) { return 0; } static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd) { return pmd; } #endif #ifndef __HAVE_PFNMAP_TRACKING /* * Interfaces that can be used by architecture code to keep track of * memory type of pfn mappings specified by the remap_pfn_range, * vmf_insert_pfn. */ static inline int pfnmap_setup_cachemode(unsigned long pfn, unsigned long size, pgprot_t *prot) { return 0; } static inline int pfnmap_track(unsigned long pfn, unsigned long size, pgprot_t *prot) { return 0; } static inline void pfnmap_untrack(unsigned long pfn, unsigned long size) { } #else /** * pfnmap_setup_cachemode - setup the cachemode in the pgprot for a pfn range * @pfn: the start of the pfn range * @size: the size of the pfn range in bytes * @prot: the pgprot to modify * * Lookup the cachemode for the pfn range starting at @pfn with the size * @size and store it in @prot, leaving other data in @prot unchanged. * * This allows for a hardware implementation to have fine-grained control of * memory cache behavior at page level granularity. Without a hardware * implementation, this function does nothing. * * Currently there is only one implementation for this - x86 Page Attribute * Table (PAT). See Documentation/arch/x86/pat.rst for more details. * * This function can fail if the pfn range spans pfns that require differing * cachemodes. If the pfn range was previously verified to have a single * cachemode, it is sufficient to query only a single pfn. The assumption is * that this is the case for drivers using the vmf_insert_pfn*() interface. * * Returns 0 on success and -EINVAL on error. */ int pfnmap_setup_cachemode(unsigned long pfn, unsigned long size, pgprot_t *prot); /** * pfnmap_track - track a pfn range * @pfn: the start of the pfn range * @size: the size of the pfn range in bytes * @prot: the pgprot to track * * Requested the pfn range to be 'tracked' by a hardware implementation and * setup the cachemode in @prot similar to pfnmap_setup_cachemode(). * * This allows for fine-grained control of memory cache behaviour at page * level granularity. Tracking memory this way is persisted across VMA splits * (VMA merging does not apply for VM_PFNMAP). * * Currently, there is only one implementation for this - x86 Page Attribute * Table (PAT). See Documentation/arch/x86/pat.rst for more details. * * Returns 0 on success and -EINVAL on error. */ int pfnmap_track(unsigned long pfn, unsigned long size, pgprot_t *prot); /** * pfnmap_untrack - untrack a pfn range * @pfn: the start of the pfn range * @size: the size of the pfn range in bytes * * Untrack a pfn range previously tracked through pfnmap_track(). */ void pfnmap_untrack(unsigned long pfn, unsigned long size); #endif /** * pfnmap_setup_cachemode_pfn - setup the cachemode in the pgprot for a pfn * @pfn: the pfn * @prot: the pgprot to modify * * Lookup the cachemode for @pfn and store it in @prot, leaving other * data in @prot unchanged. * * See pfnmap_setup_cachemode() for details. */ static inline void pfnmap_setup_cachemode_pfn(unsigned long pfn, pgprot_t *prot) { pfnmap_setup_cachemode(pfn, PAGE_SIZE, prot); } #ifdef CONFIG_MMU #ifdef __HAVE_COLOR_ZERO_PAGE static inline int is_zero_pfn(unsigned long pfn) { extern unsigned long zero_pfn; unsigned long offset_from_zero_pfn = pfn - zero_pfn; return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT); } #define my_zero_pfn(addr) page_to_pfn(ZERO_PAGE(addr)) #else static inline int is_zero_pfn(unsigned long pfn) { extern unsigned long zero_pfn; return pfn == zero_pfn; } static inline unsigned long my_zero_pfn(unsigned long addr) { extern unsigned long zero_pfn; return zero_pfn; } #endif #else static inline int is_zero_pfn(unsigned long pfn) { return 0; } static inline unsigned long my_zero_pfn(unsigned long addr) { return 0; } #endif /* CONFIG_MMU */ #ifdef CONFIG_MMU #ifndef CONFIG_TRANSPARENT_HUGEPAGE static inline int pmd_trans_huge(pmd_t pmd) { return 0; } #ifndef pmd_write static inline int pmd_write(pmd_t pmd) { BUG(); return 0; } #endif /* pmd_write */ #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifndef pud_write static inline int pud_write(pud_t pud) { BUG(); return 0; } #endif /* pud_write */ #if !defined(CONFIG_TRANSPARENT_HUGEPAGE) || \ !defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) static inline int pud_trans_huge(pud_t pud) { return 0; } #endif static inline int pud_trans_unstable(pud_t *pud) { #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) pud_t pudval = READ_ONCE(*pud); if (pud_none(pudval) || pud_trans_huge(pudval)) return 1; if (unlikely(pud_bad(pudval))) { pud_clear_bad(pud); return 1; } #endif return 0; } #ifndef CONFIG_NUMA_BALANCING /* * In an inaccessible (PROT_NONE) VMA, pte_protnone() may indicate "yes". It is * perfectly valid to indicate "no" in that case, which is why our default * implementation defaults to "always no". * * In an accessible VMA, however, pte_protnone() reliably indicates PROT_NONE * page protection due to NUMA hinting. NUMA hinting faults only apply in * accessible VMAs. * * So, to reliably identify PROT_NONE PTEs that require a NUMA hinting fault, * looking at the VMA accessibility is sufficient. */ static inline int pte_protnone(pte_t pte) { return 0; } static inline int pmd_protnone(pmd_t pmd) { return 0; } #endif /* CONFIG_NUMA_BALANCING */ #endif /* CONFIG_MMU */ #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP #ifndef __PAGETABLE_P4D_FOLDED int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot); void p4d_clear_huge(p4d_t *p4d); #else static inline int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot) { return 0; } static inline void p4d_clear_huge(p4d_t *p4d) { } #endif /* !__PAGETABLE_P4D_FOLDED */ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot); int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot); int pud_clear_huge(pud_t *pud); int pmd_clear_huge(pmd_t *pmd); int p4d_free_pud_page(p4d_t *p4d, unsigned long addr); int pud_free_pmd_page(pud_t *pud, unsigned long addr); int pmd_free_pte_page(pmd_t *pmd, unsigned long addr); #else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */ static inline int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot) { return 0; } static inline int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) { return 0; } static inline int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) { return 0; } static inline void p4d_clear_huge(p4d_t *p4d) { } static inline int pud_clear_huge(pud_t *pud) { return 0; } static inline int pmd_clear_huge(pmd_t *pmd) { return 0; } static inline int p4d_free_pud_page(p4d_t *p4d, unsigned long addr) { return 0; } static inline int pud_free_pmd_page(pud_t *pud, unsigned long addr) { return 0; } static inline int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) { return 0; } #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ #ifndef __HAVE_ARCH_FLUSH_PMD_TLB_RANGE #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * ARCHes with special requirements for evicting THP backing TLB entries can * implement this. Otherwise also, it can help optimize normal TLB flush in * THP regime. Stock flush_tlb_range() typically has optimization to nuke the * entire TLB if flush span is greater than a threshold, which will * likely be true for a single huge page. Thus a single THP flush will * invalidate the entire TLB which is not desirable. * e.g. see arch/arc: flush_pmd_tlb_range */ #define flush_pmd_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) #define flush_pud_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) #else #define flush_pmd_tlb_range(vma, addr, end) BUILD_BUG() #define flush_pud_tlb_range(vma, addr, end) BUILD_BUG() #endif #endif struct file; int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, unsigned long size, pgprot_t *vma_prot); #ifndef CONFIG_X86_ESPFIX64 static inline void init_espfix_bsp(void) { } #endif extern void __init pgtable_cache_init(void); #ifndef __HAVE_ARCH_PFN_MODIFY_ALLOWED static inline bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot) { return true; } static inline bool arch_has_pfn_modify_check(void) { return false; } #endif /* !_HAVE_ARCH_PFN_MODIFY_ALLOWED */ /* * Architecture PAGE_KERNEL_* fallbacks * * Some architectures don't define certain PAGE_KERNEL_* flags. This is either * because they really don't support them, or the port needs to be updated to * reflect the required functionality. Below are a set of relatively safe * fallbacks, as best effort, which we can count on in lieu of the architectures * not defining them on their own yet. */ #ifndef PAGE_KERNEL_RO # define PAGE_KERNEL_RO PAGE_KERNEL #endif #ifndef PAGE_KERNEL_EXEC # define PAGE_KERNEL_EXEC PAGE_KERNEL #endif /* * Page Table Modification bits for pgtbl_mod_mask. * * These are used by the p?d_alloc_track*() and p*d_populate_kernel() * functions in the generic vmalloc, ioremap and page table update code * to track at which page-table levels entries have been modified. * Based on that the code can better decide when page table changes need * to be synchronized to other page-tables in the system. */ #define __PGTBL_PGD_MODIFIED 0 #define __PGTBL_P4D_MODIFIED 1 #define __PGTBL_PUD_MODIFIED 2 #define __PGTBL_PMD_MODIFIED 3 #define __PGTBL_PTE_MODIFIED 4 #define PGTBL_PGD_MODIFIED BIT(__PGTBL_PGD_MODIFIED) #define PGTBL_P4D_MODIFIED BIT(__PGTBL_P4D_MODIFIED) #define PGTBL_PUD_MODIFIED BIT(__PGTBL_PUD_MODIFIED) #define PGTBL_PMD_MODIFIED BIT(__PGTBL_PMD_MODIFIED) #define PGTBL_PTE_MODIFIED BIT(__PGTBL_PTE_MODIFIED) /* Page-Table Modification Mask */ typedef unsigned int pgtbl_mod_mask; enum pgtable_level { PGTABLE_LEVEL_PTE = 0, PGTABLE_LEVEL_PMD, PGTABLE_LEVEL_PUD, PGTABLE_LEVEL_P4D, PGTABLE_LEVEL_PGD, }; static inline const char *pgtable_level_to_str(enum pgtable_level level) { switch (level) { case PGTABLE_LEVEL_PTE: return "pte"; case PGTABLE_LEVEL_PMD: return "pmd"; case PGTABLE_LEVEL_PUD: return "pud"; case PGTABLE_LEVEL_P4D: return "p4d"; case PGTABLE_LEVEL_PGD: return "pgd"; default: return "unknown"; } } #endif /* !__ASSEMBLY__ */ #if !defined(MAX_POSSIBLE_PHYSMEM_BITS) && !defined(CONFIG_64BIT) #ifdef CONFIG_PHYS_ADDR_T_64BIT /* * ZSMALLOC needs to know the highest PFN on 32-bit architectures * with physical address space extension, but falls back to * BITS_PER_LONG otherwise. */ #error Missing MAX_POSSIBLE_PHYSMEM_BITS definition #else #define MAX_POSSIBLE_PHYSMEM_BITS 32 #endif #endif #ifndef has_transparent_hugepage #define has_transparent_hugepage() IS_BUILTIN(CONFIG_TRANSPARENT_HUGEPAGE) #endif #ifndef has_transparent_pud_hugepage #define has_transparent_pud_hugepage() IS_BUILTIN(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) #endif /* * On some architectures it depends on the mm if the p4d/pud or pmd * layer of the page table hierarchy is folded or not. */ #ifndef mm_p4d_folded #define mm_p4d_folded(mm) __is_defined(__PAGETABLE_P4D_FOLDED) #endif #ifndef mm_pud_folded #define mm_pud_folded(mm) __is_defined(__PAGETABLE_PUD_FOLDED) #endif #ifndef mm_pmd_folded #define mm_pmd_folded(mm) __is_defined(__PAGETABLE_PMD_FOLDED) #endif #ifndef p4d_offset_lockless #define p4d_offset_lockless(pgdp, pgd, address) p4d_offset(&(pgd), address) #endif #ifndef pud_offset_lockless #define pud_offset_lockless(p4dp, p4d, address) pud_offset(&(p4d), address) #endif #ifndef pmd_offset_lockless #define pmd_offset_lockless(pudp, pud, address) pmd_offset(&(pud), address) #endif /* * pXd_leaf() is the API to check whether a pgtable entry is a huge page * mapping. It should work globally across all archs, without any * dependency on CONFIG_* options. For architectures that do not support * huge mappings on specific levels, below fallbacks will be used. * * A leaf pgtable entry should always imply the following: * * - It is a "present" entry. IOW, before using this API, please check it * with pXd_present() first. NOTE: it may not always mean the "present * bit" is set. For example, PROT_NONE entries are always "present". * * - It should _never_ be a swap entry of any type. Above "present" check * should have guarded this, but let's be crystal clear on this. * * - It should contain a huge PFN, which points to a huge page larger than * PAGE_SIZE of the platform. The PFN format isn't important here. * * - It should cover all kinds of huge mappings (i.e. pXd_trans_huge() * or hugetlb mappings). */ #ifndef pgd_leaf #define pgd_leaf(x) false #endif #ifndef p4d_leaf #define p4d_leaf(x) false #endif #ifndef pud_leaf #define pud_leaf(x) false #endif #ifndef pmd_leaf #define pmd_leaf(x) false #endif #ifndef pgd_leaf_size #define pgd_leaf_size(x) (1ULL << PGDIR_SHIFT) #endif #ifndef p4d_leaf_size #define p4d_leaf_size(x) P4D_SIZE #endif #ifndef pud_leaf_size #define pud_leaf_size(x) PUD_SIZE #endif #ifndef pmd_leaf_size #define pmd_leaf_size(x) PMD_SIZE #endif #ifndef __pte_leaf_size #ifndef pte_leaf_size #define pte_leaf_size(x) PAGE_SIZE #endif #define __pte_leaf_size(x,y) pte_leaf_size(y) #endif /* * We always define pmd_pfn for all archs as it's used in lots of generic * code. Now it happens too for pud_pfn (and can happen for larger * mappings too in the future; we're not there yet). Instead of defining * it for all archs (like pmd_pfn), provide a fallback. * * Note that returning 0 here means any arch that didn't define this can * get severely wrong when it hits a real pud leaf. It's arch's * responsibility to properly define it when a huge pud is possible. */ #ifndef pud_pfn #define pud_pfn(x) 0 #endif /* * Some architectures have MMUs that are configurable or selectable at boot * time. These lead to variable PTRS_PER_x. For statically allocated arrays it * helps to have a static maximum value. */ #ifndef MAX_PTRS_PER_PTE #define MAX_PTRS_PER_PTE PTRS_PER_PTE #endif #ifndef MAX_PTRS_PER_PMD #define MAX_PTRS_PER_PMD PTRS_PER_PMD #endif #ifndef MAX_PTRS_PER_PUD #define MAX_PTRS_PER_PUD PTRS_PER_PUD #endif #ifndef MAX_PTRS_PER_P4D #define MAX_PTRS_PER_P4D PTRS_PER_P4D #endif #ifndef pte_pgprot #define pte_pgprot(x) ((pgprot_t) {0}) #endif #ifndef pmd_pgprot #define pmd_pgprot(x) ((pgprot_t) {0}) #endif #ifndef pud_pgprot #define pud_pgprot(x) ((pgprot_t) {0}) #endif /* description of effects of mapping type and prot in current implementation. * this is due to the limited x86 page protection hardware. The expected * behavior is in parens: * * map_type prot * PROT_NONE PROT_READ PROT_WRITE PROT_EXEC * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes * w: (no) no w: (no) no w: (yes) yes w: (no) no * x: (no) no x: (no) yes x: (no) yes x: (yes) yes * * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes * w: (no) no w: (no) no w: (copy) copy w: (no) no * x: (no) no x: (no) yes x: (no) yes x: (yes) yes * * On arm64, PROT_EXEC has the following behaviour for both MAP_SHARED and * MAP_PRIVATE (with Enhanced PAN supported): * r: (no) no * w: (no) no * x: (yes) yes */ #define DECLARE_VM_GET_PAGE_PROT \ pgprot_t vm_get_page_prot(vm_flags_t vm_flags) \ { \ return protection_map[vm_flags & \ (VM_READ | VM_WRITE | VM_EXEC | VM_SHARED)]; \ } \ EXPORT_SYMBOL(vm_get_page_prot); #endif /* _LINUX_PGTABLE_H */
54 209 3 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2007 Oracle. All rights reserved. */ #ifndef BTRFS_TRANSACTION_H #define BTRFS_TRANSACTION_H #include <linux/atomic.h> #include <linux/refcount.h> #include <linux/list.h> #include <linux/time64.h> #include <linux/mutex.h> #include <linux/wait.h> #include "btrfs_inode.h" #include "delayed-ref.h" struct dentry; struct inode; struct btrfs_pending_snapshot; struct btrfs_fs_info; struct btrfs_root_item; struct btrfs_root; struct btrfs_path; /* * Signal that a direct IO write is in progress, to avoid deadlock for sync * direct IO writes when fsync is called during the direct IO write path. */ #define BTRFS_TRANS_DIO_WRITE_STUB ((void *) 1) /* Radix-tree tag for roots that are part of the transaction. */ #define BTRFS_ROOT_TRANS_TAG 0 enum btrfs_trans_state { TRANS_STATE_RUNNING, TRANS_STATE_COMMIT_PREP, TRANS_STATE_COMMIT_START, TRANS_STATE_COMMIT_DOING, TRANS_STATE_UNBLOCKED, TRANS_STATE_SUPER_COMMITTED, TRANS_STATE_COMPLETED, TRANS_STATE_MAX, }; #define BTRFS_TRANS_HAVE_FREE_BGS 0 #define BTRFS_TRANS_DIRTY_BG_RUN 1 #define BTRFS_TRANS_CACHE_ENOSPC 2 struct btrfs_transaction { u64 transid; /* * total external writers(USERSPACE/START/ATTACH) in this * transaction, it must be zero before the transaction is * being committed */ atomic_t num_extwriters; /* * total writers in this transaction, it must be zero before the * transaction can end */ atomic_t num_writers; refcount_t use_count; unsigned long flags; /* Be protected by fs_info->trans_lock when we want to change it. */ enum btrfs_trans_state state; int aborted; struct list_head list; struct extent_io_tree dirty_pages; time64_t start_time; wait_queue_head_t writer_wait; wait_queue_head_t commit_wait; struct list_head pending_snapshots; struct list_head dev_update_list; struct list_head switch_commits; struct list_head dirty_bgs; /* * There is no explicit lock which protects io_bgs, rather its * consistency is implied by the fact that all the sites which modify * it do so under some form of transaction critical section, namely: * * - btrfs_start_dirty_block_groups - This function can only ever be * run by one of the transaction committers. Refer to * BTRFS_TRANS_DIRTY_BG_RUN usage in btrfs_commit_transaction * * - btrfs_write_dirty_blockgroups - this is called by * commit_cowonly_roots from transaction critical section * (TRANS_STATE_COMMIT_DOING) * * - btrfs_cleanup_dirty_bgs - called on transaction abort */ struct list_head io_bgs; struct list_head dropped_roots; struct extent_io_tree pinned_extents; /* * we need to make sure block group deletion doesn't race with * free space cache writeout. This mutex keeps them from stomping * on each other */ struct mutex cache_write_mutex; spinlock_t dirty_bgs_lock; /* Protected by spin lock fs_info->unused_bgs_lock. */ struct list_head deleted_bgs; spinlock_t dropped_roots_lock; struct btrfs_delayed_ref_root delayed_refs; struct btrfs_fs_info *fs_info; /* * Number of ordered extents the transaction must wait for before * committing. These are ordered extents started by a fast fsync. */ atomic_t pending_ordered; wait_queue_head_t pending_wait; }; enum { ENUM_BIT(__TRANS_FREEZABLE), ENUM_BIT(__TRANS_START), ENUM_BIT(__TRANS_ATTACH), ENUM_BIT(__TRANS_JOIN), ENUM_BIT(__TRANS_JOIN_NOLOCK), ENUM_BIT(__TRANS_DUMMY), ENUM_BIT(__TRANS_JOIN_NOSTART), }; #define TRANS_START (__TRANS_START | __TRANS_FREEZABLE) #define TRANS_ATTACH (__TRANS_ATTACH) #define TRANS_JOIN (__TRANS_JOIN | __TRANS_FREEZABLE) #define TRANS_JOIN_NOLOCK (__TRANS_JOIN_NOLOCK) #define TRANS_JOIN_NOSTART (__TRANS_JOIN_NOSTART) #define TRANS_EXTWRITERS (__TRANS_START | __TRANS_ATTACH) struct btrfs_trans_handle { u64 transid; u64 bytes_reserved; u64 delayed_refs_bytes_reserved; u64 chunk_bytes_reserved; unsigned long delayed_ref_updates; unsigned long delayed_ref_csum_deletions; struct btrfs_transaction *transaction; struct btrfs_block_rsv *block_rsv; struct btrfs_block_rsv *orig_rsv; /* Set by a task that wants to create a snapshot. */ struct btrfs_pending_snapshot *pending_snapshot; refcount_t use_count; unsigned int type; /* * Error code of transaction abort, set outside of locks and must use * the READ_ONCE/WRITE_ONCE access */ short aborted; bool adding_csums; bool allocating_chunk; bool removing_chunk; bool reloc_reserved; bool in_fsync; struct btrfs_fs_info *fs_info; struct list_head new_bgs; struct btrfs_block_rsv delayed_rsv; }; /* * The abort status can be changed between calls and is not protected by locks. * This accepts btrfs_transaction and btrfs_trans_handle as types. Once it's * set to a non-zero value it does not change, so the macro should be in checks * but is not necessary for further reads of the value. */ #define TRANS_ABORTED(trans) (unlikely(READ_ONCE((trans)->aborted))) struct btrfs_pending_snapshot { struct dentry *dentry; struct btrfs_inode *dir; struct btrfs_root *root; struct btrfs_root_item *root_item; struct btrfs_root *snap; struct btrfs_qgroup_inherit *inherit; struct btrfs_path *path; /* block reservation for the operation */ struct btrfs_block_rsv block_rsv; /* extra metadata reservation for relocation */ int error; /* Preallocated anonymous block device number */ dev_t anon_dev; bool readonly; struct list_head list; }; static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans, struct btrfs_inode *inode) { spin_lock(&inode->lock); inode->last_trans = trans->transaction->transid; inode->last_sub_trans = btrfs_get_root_log_transid(inode->root); inode->last_log_commit = inode->last_sub_trans - 1; spin_unlock(&inode->lock); } /* * Make qgroup codes to skip given qgroupid, means the old/new_roots for * qgroup won't contain the qgroupid in it. */ static inline void btrfs_set_skip_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) { struct btrfs_delayed_ref_root *delayed_refs; delayed_refs = &trans->transaction->delayed_refs; WARN_ON(delayed_refs->qgroup_to_skip); delayed_refs->qgroup_to_skip = qgroupid; } static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans) { struct btrfs_delayed_ref_root *delayed_refs; delayed_refs = &trans->transaction->delayed_refs; WARN_ON(!delayed_refs->qgroup_to_skip); delayed_refs->qgroup_to_skip = 0; } /* * We want the transaction abort to print stack trace only for errors where the * cause could be a bug, eg. due to ENOSPC, and not for common errors that are * caused by external factors. */ static inline bool btrfs_abort_should_print_stack(int error) { switch (error) { case -EIO: case -EROFS: case -ENOMEM: return false; } return true; } /* * Call btrfs_abort_transaction as early as possible when an error condition is * detected, that way the exact stack trace is reported for some errors. */ #define btrfs_abort_transaction(trans, error) \ do { \ bool __first = false; \ /* Report first abort since mount */ \ if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ &((trans)->fs_info->fs_state))) { \ __first = true; \ if (WARN(btrfs_abort_should_print_stack(error), \ KERN_ERR \ "BTRFS: Transaction aborted (error %d)\n", \ (error))) { \ /* Stack trace printed. */ \ } else { \ btrfs_err((trans)->fs_info, \ "Transaction aborted (error %d)", \ (error)); \ } \ } \ __btrfs_abort_transaction((trans), __func__, \ __LINE__, (error), __first); \ } while (0) int btrfs_end_transaction(struct btrfs_trans_handle *trans); struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, unsigned int num_items); struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv( struct btrfs_root *root, unsigned int num_items); struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root); struct btrfs_trans_handle *btrfs_join_transaction_spacecache(struct btrfs_root *root); struct btrfs_trans_handle *btrfs_join_transaction_nostart(struct btrfs_root *root); struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root); struct btrfs_trans_handle *btrfs_attach_transaction_barrier( struct btrfs_root *root); int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid); void btrfs_add_dead_root(struct btrfs_root *root); void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info); int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info); int btrfs_commit_transaction(struct btrfs_trans_handle *trans); void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans); int btrfs_commit_current_transaction(struct btrfs_root *root); int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans); bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans); void btrfs_throttle(struct btrfs_fs_info *fs_info); int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, struct extent_io_tree *dirty_pages, int mark); int btrfs_wait_tree_log_extents(struct btrfs_root *root, int mark); int btrfs_transaction_blocked(struct btrfs_fs_info *info); void btrfs_put_transaction(struct btrfs_transaction *transaction); void btrfs_add_dropped_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans, const char *function, unsigned int line, int error, bool first_hit); int __init btrfs_transaction_init(void); void __cold btrfs_transaction_exit(void); #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NETFILTER_NETDEV_H_ #define _NETFILTER_NETDEV_H_ #include <linux/netfilter.h> #include <linux/netdevice.h> #ifdef CONFIG_NETFILTER_INGRESS static inline bool nf_hook_ingress_active(const struct sk_buff *skb) { #ifdef CONFIG_JUMP_LABEL if (!static_key_false(&nf_hooks_needed[NFPROTO_NETDEV][NF_NETDEV_INGRESS])) return false; #endif return rcu_access_pointer(skb->dev->nf_hooks_ingress); } /* caller must hold rcu_read_lock */ static inline int nf_hook_ingress(struct sk_buff *skb) { struct nf_hook_entries *e = rcu_dereference(skb->dev->nf_hooks_ingress); struct nf_hook_state state; int ret; /* Must recheck the ingress hook head, in the event it became NULL * after the check in nf_hook_ingress_active evaluated to true. */ if (unlikely(!e)) return 0; nf_hook_state_init(&state, NF_NETDEV_INGRESS, NFPROTO_NETDEV, skb->dev, NULL, NULL, dev_net(skb->dev), NULL); ret = nf_hook_slow(skb, &state, e, 0); if (ret == 0) return -1; return ret; } #else /* CONFIG_NETFILTER_INGRESS */ static inline int nf_hook_ingress_active(struct sk_buff *skb) { return 0; } static inline int nf_hook_ingress(struct sk_buff *skb) { return 0; } #endif /* CONFIG_NETFILTER_INGRESS */ #ifdef CONFIG_NETFILTER_EGRESS static inline bool nf_hook_egress_active(void) { #ifdef CONFIG_JUMP_LABEL if (!static_key_false(&nf_hooks_needed[NFPROTO_NETDEV][NF_NETDEV_EGRESS])) return false; #endif return true; } /** * nf_hook_egress - classify packets before transmission * @skb: packet to be classified * @rc: result code which shall be returned by __dev_queue_xmit() on failure * @dev: netdev whose egress hooks shall be applied to @skb * * Caller must hold rcu_read_lock. * * On ingress, packets are classified first by tc, then by netfilter. * On egress, the order is reversed for symmetry. Conceptually, tc and * netfilter can be thought of as layers, with netfilter layered above tc: * When tc redirects a packet to another interface, netfilter is not applied * because the packet is on the tc layer. * * The nf_skip_egress flag controls whether netfilter is applied on egress. * It is updated by __netif_receive_skb_core() and __dev_queue_xmit() when the * packet passes through tc and netfilter. Because __dev_queue_xmit() may be * called recursively by tunnel drivers such as vxlan, the flag is reverted to * false after sch_handle_egress(). This ensures that netfilter is applied * both on the overlay and underlying network. * * Returns: @skb on success or %NULL if the packet was consumed or filtered. */ static inline struct sk_buff *nf_hook_egress(struct sk_buff *skb, int *rc, struct net_device *dev) { struct nf_hook_entries *e; struct nf_hook_state state; int ret; #ifdef CONFIG_NETFILTER_SKIP_EGRESS if (skb->nf_skip_egress) return skb; #endif e = rcu_dereference_check(dev->nf_hooks_egress, rcu_read_lock_bh_held()); if (!e) return skb; nf_hook_state_init(&state, NF_NETDEV_EGRESS, NFPROTO_NETDEV, NULL, dev, NULL, dev_net(dev), NULL); /* nf assumes rcu_read_lock, not just read_lock_bh */ rcu_read_lock(); ret = nf_hook_slow(skb, &state, e, 0); rcu_read_unlock(); if (ret == 1) { return skb; } else if (ret < 0) { *rc = NET_XMIT_DROP; return NULL; } else { /* ret == 0 */ *rc = NET_XMIT_SUCCESS; return NULL; } } #else /* CONFIG_NETFILTER_EGRESS */ static inline bool nf_hook_egress_active(void) { return false; } static inline struct sk_buff *nf_hook_egress(struct sk_buff *skb, int *rc, struct net_device *dev) { return skb; } #endif /* CONFIG_NETFILTER_EGRESS */ static inline void nf_skip_egress(struct sk_buff *skb, bool skip) { #ifdef CONFIG_NETFILTER_SKIP_EGRESS skb->nf_skip_egress = skip; #endif } static inline void nf_hook_netdev_init(struct net_device *dev) { #ifdef CONFIG_NETFILTER_INGRESS RCU_INIT_POINTER(dev->nf_hooks_ingress, NULL); #endif #ifdef CONFIG_NETFILTER_EGRESS RCU_INIT_POINTER(dev->nf_hooks_egress, NULL); #endif } #endif /* _NETFILTER_NETDEV_H_ */
1 1 1 1 1 3 1 2 7 7 2 5 2 3 1 1 3 3 208 207 8 207 207 202 6 1 2 3 20 14 17 12 4 2 4 10 16 16 219 2 216 4 216 222 222 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. */ #include "xfs_platform.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_trans.h" #include "xfs_error.h" #include "xfs_alloc.h" #include "xfs_fsops.h" #include "xfs_trans_space.h" #include "xfs_log.h" #include "xfs_log_priv.h" #include "xfs_ag.h" #include "xfs_ag_resv.h" #include "xfs_trace.h" #include "xfs_rtalloc.h" #include "xfs_rtrmap_btree.h" #include "xfs_rtrefcount_btree.h" #include "xfs_metafile.h" #include "xfs_healthmon.h" #include <linux/fserror.h> /* * Write new AG headers to disk. Non-transactional, but need to be * written and completed prior to the growfs transaction being logged. * To do this, we use a delayed write buffer list and wait for * submission and IO completion of the list as a whole. This allows the * IO subsystem to merge all the AG headers in a single AG into a single * IO and hide most of the latency of the IO from us. * * This also means that if we get an error whilst building the buffer * list to write, we can cancel the entire list without having written * anything. */ static int xfs_resizefs_init_new_ags( struct xfs_trans *tp, struct aghdr_init_data *id, xfs_agnumber_t oagcount, xfs_agnumber_t nagcount, xfs_rfsblock_t delta, struct xfs_perag *last_pag, bool *lastag_extended) { struct xfs_mount *mp = tp->t_mountp; xfs_rfsblock_t nb = mp->m_sb.sb_dblocks + delta; int error; *lastag_extended = false; INIT_LIST_HEAD(&id->buffer_list); for (id->agno = nagcount - 1; id->agno >= oagcount; id->agno--, delta -= id->agsize) { if (id->agno == nagcount - 1) id->agsize = nb - (id->agno * (xfs_rfsblock_t)mp->m_sb.sb_agblocks); else id->agsize = mp->m_sb.sb_agblocks; error = xfs_ag_init_headers(mp, id); if (error) { xfs_buf_delwri_cancel(&id->buffer_list); return error; } } error = xfs_buf_delwri_submit(&id->buffer_list); if (error) return error; if (delta) { *lastag_extended = true; error = xfs_ag_extend_space(last_pag, tp, delta); } return error; } /* * growfs operations */ static int xfs_growfs_data_private( struct xfs_mount *mp, /* mount point for filesystem */ struct xfs_growfs_data *in) /* growfs data input struct */ { xfs_agnumber_t oagcount = mp->m_sb.sb_agcount; xfs_rfsblock_t nb = in->newblocks; struct xfs_buf *bp; int error; xfs_agnumber_t nagcount; xfs_agnumber_t nagimax = 0; int64_t delta; bool lastag_extended = false; struct xfs_trans *tp; struct aghdr_init_data id = {}; struct xfs_perag *last_pag; error = xfs_sb_validate_fsb_count(&mp->m_sb, nb); if (error) return error; if (nb > mp->m_sb.sb_dblocks) { error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1), XFS_FSS_TO_BB(mp, 1), &bp, NULL); if (error) return error; xfs_buf_relse(bp); } /* Make sure the new fs size won't cause problems with the log. */ error = xfs_growfs_check_rtgeom(mp, nb, mp->m_sb.sb_rblocks, mp->m_sb.sb_rextsize); if (error) return error; xfs_growfs_compute_deltas(mp, nb, &delta, &nagcount); /* * Reject filesystems with a single AG because they are not * supported, and reject a shrink operation that would cause a * filesystem to become unsupported. */ if (delta < 0 && nagcount < 2) return -EINVAL; /* No work to do */ if (delta == 0) return 0; /* TODO: shrinking the entire AGs hasn't yet completed */ if (nagcount < oagcount) return -EINVAL; /* allocate the new per-ag structures */ error = xfs_initialize_perag(mp, oagcount, nagcount, nb, &nagimax); if (error) return error; if (delta > 0) error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata, XFS_GROWFS_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp); else error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata, -delta, 0, 0, &tp); if (error) goto out_free_unused_perag; last_pag = xfs_perag_get(mp, oagcount - 1); if (delta > 0) { error = xfs_resizefs_init_new_ags(tp, &id, oagcount, nagcount, delta, last_pag, &lastag_extended); } else { xfs_warn_experimental(mp, XFS_EXPERIMENTAL_SHRINK); error = xfs_ag_shrink_space(last_pag, &tp, -delta); } xfs_perag_put(last_pag); if (error) goto out_trans_cancel; /* * Update changed superblock fields transactionally. These are not * seen by the rest of the world until the transaction commit applies * them atomically to the superblock. */ if (nagcount > oagcount) xfs_trans_mod_sb(tp, XFS_TRANS_SB_AGCOUNT, nagcount - oagcount); if (delta) xfs_trans_mod_sb(tp, XFS_TRANS_SB_DBLOCKS, delta); if (id.nfree) xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, id.nfree); /* * Sync sb counters now to reflect the updated values. This is * particularly important for shrink because the write verifier * will fail if sb_fdblocks is ever larger than sb_dblocks. */ if (xfs_has_lazysbcount(mp)) xfs_log_sb(tp); xfs_trans_set_sync(tp); error = xfs_trans_commit(tp); if (error) return error; /* New allocation groups fully initialized, so update mount struct */ if (nagimax) mp->m_maxagi = nagimax; xfs_set_low_space_thresholds(mp); mp->m_alloc_set_aside = xfs_alloc_set_aside(mp); if (delta > 0) { /* * If we expanded the last AG, free the per-AG reservation * so we can reinitialize it with the new size. */ if (lastag_extended) { struct xfs_perag *pag; pag = xfs_perag_get(mp, id.agno); xfs_ag_resv_free(pag); xfs_perag_put(pag); } /* * Reserve AG metadata blocks. ENOSPC here does not mean there * was a growfs failure, just that there still isn't space for * new user data after the grow has been run. */ error = xfs_fs_reserve_ag_blocks(mp); if (error == -ENOSPC) error = 0; /* Compute new maxlevels for rt btrees. */ xfs_rtrmapbt_compute_maxlevels(mp); xfs_rtrefcountbt_compute_maxlevels(mp); } return error; out_trans_cancel: xfs_trans_cancel(tp); out_free_unused_perag: if (nagcount > oagcount) xfs_free_perag_range(mp, oagcount, nagcount); return error; } static int xfs_growfs_log_private( struct xfs_mount *mp, /* mount point for filesystem */ struct xfs_growfs_log *in) /* growfs log input struct */ { xfs_extlen_t nb; nb = in->newblocks; if (nb < XFS_MIN_LOG_BLOCKS || nb < XFS_B_TO_FSB(mp, XFS_MIN_LOG_BYTES)) return -EINVAL; if (nb == mp->m_sb.sb_logblocks && in->isint == (mp->m_sb.sb_logstart != 0)) return -EINVAL; /* * Moving the log is hard, need new interfaces to sync * the log first, hold off all activity while moving it. * Can have shorter or longer log in the same space, * or transform internal to external log or vice versa. */ return -ENOSYS; } static int xfs_growfs_imaxpct( struct xfs_mount *mp, __u32 imaxpct) { struct xfs_trans *tp; int dpct; int error; if (imaxpct > 100) return -EINVAL; error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata, XFS_GROWFS_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp); if (error) return error; dpct = imaxpct - mp->m_sb.sb_imax_pct; xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct); xfs_trans_set_sync(tp); return xfs_trans_commit(tp); } /* * protected versions of growfs function acquire and release locks on the mount * point - exported through ioctls: XFS_IOC_FSGROWFSDATA, XFS_IOC_FSGROWFSLOG, * XFS_IOC_FSGROWFSRT */ int xfs_growfs_data( struct xfs_mount *mp, struct xfs_growfs_data *in) { int error; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (!mutex_trylock(&mp->m_growlock)) return -EWOULDBLOCK; /* we can't grow the data section when an internal RT section exists */ if (in->newblocks != mp->m_sb.sb_dblocks && mp->m_sb.sb_rtstart) { error = -EINVAL; goto out_unlock; } /* update imaxpct separately to the physical grow of the filesystem */ if (in->imaxpct != mp->m_sb.sb_imax_pct) { error = xfs_growfs_imaxpct(mp, in->imaxpct); if (error) goto out_unlock; } if (in->newblocks != mp->m_sb.sb_dblocks) { error = xfs_growfs_data_private(mp, in); if (error) goto out_unlock; } /* Post growfs calculations needed to reflect new state in operations */ if (mp->m_sb.sb_imax_pct) { uint64_t icount = mp->m_sb.sb_dblocks * mp->m_sb.sb_imax_pct; do_div(icount, 100); M_IGEO(mp)->maxicount = XFS_FSB_TO_INO(mp, icount); } else M_IGEO(mp)->maxicount = 0; /* Update secondary superblocks now the physical grow has completed */ error = xfs_update_secondary_sbs(mp); /* * Increment the generation unconditionally, after trying to update the * secondary superblocks, as the new size is live already at this point. */ mp->m_generation++; out_unlock: mutex_unlock(&mp->m_growlock); return error; } int xfs_growfs_log( xfs_mount_t *mp, struct xfs_growfs_log *in) { int error; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (!mutex_trylock(&mp->m_growlock)) return -EWOULDBLOCK; error = xfs_growfs_log_private(mp, in); mutex_unlock(&mp->m_growlock); return error; } /* * Reserve the requested number of blocks if available. Otherwise return * as many as possible to satisfy the request. The actual number * reserved are returned in outval. */ int xfs_reserve_blocks( struct xfs_mount *mp, enum xfs_free_counter ctr, uint64_t request) { int64_t lcounter, delta; int64_t fdblks_delta = 0; int64_t free; int error = 0; ASSERT(ctr < XC_FREE_NR); /* * With per-cpu counters, this becomes an interesting problem. we need * to work out if we are freeing or allocation blocks first, then we can * do the modification as necessary. * * We do this under the m_sb_lock so that if we are near ENOSPC, we will * hold out any changes while we work out what to do. This means that * the amount of free space can change while we do this, so we need to * retry if we end up trying to reserve more space than is available. */ spin_lock(&mp->m_sb_lock); /* * If our previous reservation was larger than the current value, * then move any unused blocks back to the free pool. Modify the resblks * counters directly since we shouldn't have any problems unreserving * space. */ if (mp->m_free[ctr].res_total > request) { lcounter = mp->m_free[ctr].res_avail - request; if (lcounter > 0) { /* release unused blocks */ fdblks_delta = lcounter; mp->m_free[ctr].res_avail -= lcounter; } mp->m_free[ctr].res_total = request; if (fdblks_delta) { spin_unlock(&mp->m_sb_lock); xfs_add_freecounter(mp, ctr, fdblks_delta); spin_lock(&mp->m_sb_lock); } goto out; } /* * If the request is larger than the current reservation, reserve the * blocks before we update the reserve counters. Sample m_free and * perform a partial reservation if the request exceeds free space. * * The code below estimates how many blocks it can request from * fdblocks to stash in the reserve pool. This is a classic TOCTOU * race since fdblocks updates are not always coordinated via * m_sb_lock. Set the reserve size even if there's not enough free * space to fill it because mod_fdblocks will refill an undersized * reserve when it can. */ free = xfs_sum_freecounter_raw(mp, ctr) - xfs_freecounter_unavailable(mp, ctr); delta = request - mp->m_free[ctr].res_total; mp->m_free[ctr].res_total = request; if (delta > 0 && free > 0) { /* * We'll either succeed in getting space from the free block * count or we'll get an ENOSPC. Don't set the reserved flag * here - we don't want to reserve the extra reserve blocks * from the reserve. * * The desired reserve size can change after we drop the lock. * Use mod_fdblocks to put the space into the reserve or into * fdblocks as appropriate. */ fdblks_delta = min(free, delta); spin_unlock(&mp->m_sb_lock); error = xfs_dec_freecounter(mp, ctr, fdblks_delta, 0); if (!error) xfs_add_freecounter(mp, ctr, fdblks_delta); spin_lock(&mp->m_sb_lock); } out: spin_unlock(&mp->m_sb_lock); return error; } int xfs_fs_goingdown( xfs_mount_t *mp, uint32_t inflags) { switch (inflags) { case XFS_FSOP_GOING_FLAGS_DEFAULT: { if (!bdev_freeze(mp->m_super->s_bdev)) { xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT); bdev_thaw(mp->m_super->s_bdev); } break; } case XFS_FSOP_GOING_FLAGS_LOGFLUSH: xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT); break; case XFS_FSOP_GOING_FLAGS_NOLOGFLUSH: xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT | SHUTDOWN_LOG_IO_ERROR); break; default: return -EINVAL; } return 0; } /* * Force a shutdown of the filesystem instantly while keeping the filesystem * consistent. We don't do an unmount here; just shutdown the shop, make sure * that absolutely nothing persistent happens to this filesystem after this * point. * * The shutdown state change is atomic, resulting in the first and only the * first shutdown call processing the shutdown. This means we only shutdown the * log once as it requires, and we don't spam the logs when multiple concurrent * shutdowns race to set the shutdown flags. */ void xfs_do_force_shutdown( struct xfs_mount *mp, uint32_t flags, char *fname, int lnnum) { int tag; const char *why; if (xfs_set_shutdown(mp)) { xlog_shutdown_wait(mp->m_log); return; } if (mp->m_sb_bp) mp->m_sb_bp->b_flags |= XBF_DONE; if (flags & SHUTDOWN_FORCE_UMOUNT) xfs_alert(mp, "User initiated shutdown received."); if (xlog_force_shutdown(mp->m_log, flags)) { tag = XFS_PTAG_SHUTDOWN_LOGERROR; why = "Log I/O Error"; } else if (flags & SHUTDOWN_CORRUPT_INCORE) { tag = XFS_PTAG_SHUTDOWN_CORRUPT; why = "Corruption of in-memory data"; } else if (flags & SHUTDOWN_CORRUPT_ONDISK) { tag = XFS_PTAG_SHUTDOWN_CORRUPT; why = "Corruption of on-disk metadata"; } else if (flags & SHUTDOWN_DEVICE_REMOVED) { tag = XFS_PTAG_SHUTDOWN_IOERROR; why = "Block device removal"; } else { tag = XFS_PTAG_SHUTDOWN_IOERROR; why = "Metadata I/O Error"; } trace_xfs_force_shutdown(mp, tag, flags, fname, lnnum); xfs_alert_tag(mp, tag, "%s (0x%x) detected at %pS (%s:%d). Shutting down filesystem.", why, flags, __return_address, fname, lnnum); xfs_alert(mp, "Please unmount the filesystem and rectify the problem(s)"); if (xfs_error_level >= XFS_ERRLEVEL_HIGH) xfs_stack_trace(); fserror_report_shutdown(mp->m_super, GFP_KERNEL); xfs_healthmon_report_shutdown(mp, flags); } /* * Reserve free space for per-AG metadata. */ int xfs_fs_reserve_ag_blocks( struct xfs_mount *mp) { struct xfs_perag *pag = NULL; int error = 0; int err2; mp->m_finobt_nores = false; while ((pag = xfs_perag_next(mp, pag))) { err2 = xfs_ag_resv_init(pag, NULL); if (err2 && !error) error = err2; } if (error && error != -ENOSPC) { xfs_warn(mp, "Error %d reserving per-AG metadata reserve pool.", error); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); return error; } err2 = xfs_metafile_resv_init(mp); if (err2 && err2 != -ENOSPC) { xfs_warn(mp, "Error %d reserving realtime metadata reserve pool.", err2); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); if (!error) error = err2; } return error; } /* * Free space reserved for per-AG metadata. */ void xfs_fs_unreserve_ag_blocks( struct xfs_mount *mp) { struct xfs_perag *pag = NULL; xfs_metafile_resv_free(mp); while ((pag = xfs_perag_next(mp, pag))) xfs_ag_resv_free(pag); }
146 187 17 260 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_POLL_H #define _LINUX_POLL_H #include <linux/compiler.h> #include <linux/ktime.h> #include <linux/wait.h> #include <linux/string.h> #include <linux/fs.h> #include <linux/uaccess.h> #include <uapi/linux/poll.h> #include <uapi/linux/eventpoll.h> /* ~832 bytes of stack space used max in sys_select/sys_poll before allocating additional memory. */ #define MAX_STACK_ALLOC 832 #define FRONTEND_STACK_ALLOC 256 #define SELECT_STACK_ALLOC FRONTEND_STACK_ALLOC #define POLL_STACK_ALLOC FRONTEND_STACK_ALLOC #define WQUEUES_STACK_ALLOC (MAX_STACK_ALLOC - FRONTEND_STACK_ALLOC) #define N_INLINE_POLL_ENTRIES (WQUEUES_STACK_ALLOC / sizeof(struct poll_table_entry)) #define DEFAULT_POLLMASK (EPOLLIN | EPOLLOUT | EPOLLRDNORM | EPOLLWRNORM) struct poll_table_struct; /* * structures and helpers for f_op->poll implementations */ typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *, struct poll_table_struct *); /* * Do not touch the structure directly, use the access function * poll_requested_events() instead. */ typedef struct poll_table_struct { poll_queue_proc _qproc; __poll_t _key; } poll_table; static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p) { if (p && p->_qproc) { p->_qproc(filp, wait_address, p); /* * This memory barrier is paired in the wq_has_sleeper(). * See the comment above prepare_to_wait(), we need to * ensure that subsequent tests in this thread can't be * reordered with __add_wait_queue() in _qproc() paths. */ smp_mb(); } } /* * Return the set of events that the application wants to poll for. * This is useful for drivers that need to know whether a DMA transfer has * to be started implicitly on poll(). You typically only want to do that * if the application is actually polling for POLLIN and/or POLLOUT. */ static inline __poll_t poll_requested_events(const poll_table *p) { return p ? p->_key : ~(__poll_t)0; } static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc) { pt->_qproc = qproc; pt->_key = ~(__poll_t)0; /* all events enabled */ } static inline bool file_can_poll(struct file *file) { return file->f_op->poll; } static inline __poll_t vfs_poll(struct file *file, struct poll_table_struct *pt) { if (unlikely(!file->f_op->poll)) return DEFAULT_POLLMASK; return file->f_op->poll(file, pt); } struct poll_table_entry { struct file *filp; __poll_t key; wait_queue_entry_t wait; wait_queue_head_t *wait_address; }; /* * Structures and helpers for select/poll syscall */ struct poll_wqueues { poll_table pt; struct poll_table_page *table; struct task_struct *polling_task; int triggered; int error; int inline_index; struct poll_table_entry inline_entries[N_INLINE_POLL_ENTRIES]; }; extern void poll_initwait(struct poll_wqueues *pwq); extern void poll_freewait(struct poll_wqueues *pwq); extern u64 select_estimate_accuracy(struct timespec64 *tv); #define MAX_INT64_SECONDS (((s64)(~((u64)0)>>1)/HZ)-1) extern int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct timespec64 *end_time); extern int poll_select_set_timeout(struct timespec64 *to, time64_t sec, long nsec); #define __MAP(v, from, to) \ (from < to ? (v & from) * (to/from) : (v & from) / (from/to)) static inline __u16 mangle_poll(__poll_t val) { __u16 v = (__force __u16)val; #define M(X) __MAP(v, (__force __u16)EPOLL##X, POLL##X) return M(IN) | M(OUT) | M(PRI) | M(ERR) | M(NVAL) | M(RDNORM) | M(RDBAND) | M(WRNORM) | M(WRBAND) | M(HUP) | M(RDHUP) | M(MSG); #undef M } static inline __poll_t demangle_poll(u16 val) { #define M(X) (__force __poll_t)__MAP(val, POLL##X, (__force __u16)EPOLL##X) return M(IN) | M(OUT) | M(PRI) | M(ERR) | M(NVAL) | M(RDNORM) | M(RDBAND) | M(WRNORM) | M(WRBAND) | M(HUP) | M(RDHUP) | M(MSG); #undef M } #undef __MAP #endif /* _LINUX_POLL_H */
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 /* file-mmu.c: ramfs MMU-based file operations * * Resizable simple ram filesystem for Linux. * * Copyright (C) 2000 Linus Torvalds. * 2000 Transmeta Corp. * * Usage limits added by David Gibson, Linuxcare Australia. * This file is released under the GPL. */ /* * NOTE! This filesystem is probably most useful * not as a real filesystem, but as an example of * how virtual filesystems can be written. * * It doesn't get much simpler than this. Consider * that this file implements the full semantics of * a POSIX-compliant read-write filesystem. * * Note in particular how the filesystem does not * need to implement any data structures of its own * to keep track of the virtual data: using the VFS * caches is sufficient. */ #include <linux/fs.h> #include <linux/mm.h> #include <linux/ramfs.h> #include <linux/sched.h> #include "internal.h" static unsigned long ramfs_mmu_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { return mm_get_unmapped_area(file, addr, len, pgoff, flags); } const struct file_operations ramfs_file_operations = { .read_iter = generic_file_read_iter, .write_iter = generic_file_write_iter, .mmap_prepare = generic_file_mmap_prepare, .fsync = noop_fsync, .splice_read = filemap_splice_read, .splice_write = iter_file_splice_write, .llseek = generic_file_llseek, .get_unmapped_area = ramfs_mmu_get_unmapped_area, }; const struct inode_operations ramfs_file_inode_operations = { .setattr = simple_setattr, .getattr = simple_getattr, };
2 2 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 // SPDX-License-Identifier: GPL-2.0-only /* * Generic HDLC support routines for Linux * * Copyright (C) 1999 - 2008 Krzysztof Halasa <khc@pm.waw.pl> * * Currently supported: * * raw IP-in-HDLC * * Cisco HDLC * * Frame Relay with ANSI or CCITT LMI (both user and network side) * * PPP * * X.25 * * Use sethdlc utility to set line parameters, protocol and PVCs * * How does it work: * - proto->open(), close(), start(), stop() calls are serialized. * The order is: open, [ start, stop ... ] close ... * - proto->start() and stop() are called with spin_lock_irq held. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/errno.h> #include <linux/hdlc.h> #include <linux/if_arp.h> #include <linux/inetdevice.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/notifier.h> #include <linux/pkt_sched.h> #include <linux/poll.h> #include <linux/rtnetlink.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <net/net_namespace.h> static const char *version = "HDLC support module revision 1.22"; #undef DEBUG_LINK static struct hdlc_proto *first_proto; static int hdlc_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *p, struct net_device *orig_dev) { struct hdlc_device *hdlc; /* First make sure "dev" is an HDLC device */ if (!(dev->priv_flags & IFF_WAN_HDLC)) { kfree_skb(skb); return NET_RX_SUCCESS; } hdlc = dev_to_hdlc(dev); if (!net_eq(dev_net(dev), &init_net)) { kfree_skb(skb); return 0; } BUG_ON(!hdlc->proto->netif_rx); return hdlc->proto->netif_rx(skb); } netdev_tx_t hdlc_start_xmit(struct sk_buff *skb, struct net_device *dev) { hdlc_device *hdlc = dev_to_hdlc(dev); if (hdlc->proto->xmit) return hdlc->proto->xmit(skb, dev); return hdlc->xmit(skb, dev); /* call hardware driver directly */ } EXPORT_SYMBOL(hdlc_start_xmit); static inline void hdlc_proto_start(struct net_device *dev) { hdlc_device *hdlc = dev_to_hdlc(dev); if (hdlc->proto->start) hdlc->proto->start(dev); } static inline void hdlc_proto_stop(struct net_device *dev) { hdlc_device *hdlc = dev_to_hdlc(dev); if (hdlc->proto->stop) hdlc->proto->stop(dev); } static int hdlc_device_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); hdlc_device *hdlc; unsigned long flags; int on; if (!net_eq(dev_net(dev), &init_net)) return NOTIFY_DONE; if (!(dev->priv_flags & IFF_WAN_HDLC)) return NOTIFY_DONE; /* not an HDLC device */ if (event != NETDEV_CHANGE) return NOTIFY_DONE; /* Only interested in carrier changes */ on = netif_carrier_ok(dev); #ifdef DEBUG_LINK printk(KERN_DEBUG "%s: hdlc_device_event NETDEV_CHANGE, carrier %i\n", dev->name, on); #endif hdlc = dev_to_hdlc(dev); spin_lock_irqsave(&hdlc->state_lock, flags); if (hdlc->carrier == on) goto carrier_exit; /* no change in DCD line level */ hdlc->carrier = on; if (!hdlc->open) goto carrier_exit; if (hdlc->carrier) { netdev_info(dev, "Carrier detected\n"); hdlc_proto_start(dev); } else { netdev_info(dev, "Carrier lost\n"); hdlc_proto_stop(dev); } carrier_exit: spin_unlock_irqrestore(&hdlc->state_lock, flags); return NOTIFY_DONE; } /* Must be called by hardware driver when HDLC device is being opened */ int hdlc_open(struct net_device *dev) { hdlc_device *hdlc = dev_to_hdlc(dev); #ifdef DEBUG_LINK printk(KERN_DEBUG "%s: hdlc_open() carrier %i open %i\n", dev->name, hdlc->carrier, hdlc->open); #endif if (!hdlc->proto) return -ENOSYS; /* no protocol attached */ if (hdlc->proto->open) { int result = hdlc->proto->open(dev); if (result) return result; } spin_lock_irq(&hdlc->state_lock); if (hdlc->carrier) { netdev_info(dev, "Carrier detected\n"); hdlc_proto_start(dev); } else { netdev_info(dev, "No carrier\n"); } hdlc->open = 1; spin_unlock_irq(&hdlc->state_lock); return 0; } EXPORT_SYMBOL(hdlc_open); /* Must be called by hardware driver when HDLC device is being closed */ void hdlc_close(struct net_device *dev) { hdlc_device *hdlc = dev_to_hdlc(dev); #ifdef DEBUG_LINK printk(KERN_DEBUG "%s: hdlc_close() carrier %i open %i\n", dev->name, hdlc->carrier, hdlc->open); #endif spin_lock_irq(&hdlc->state_lock); hdlc->open = 0; if (hdlc->carrier) hdlc_proto_stop(dev); spin_unlock_irq(&hdlc->state_lock); if (hdlc->proto->close) hdlc->proto->close(dev); } EXPORT_SYMBOL(hdlc_close); int hdlc_ioctl(struct net_device *dev, struct if_settings *ifs) { struct hdlc_proto *proto = first_proto; int result; if (dev_to_hdlc(dev)->proto) { result = dev_to_hdlc(dev)->proto->ioctl(dev, ifs); if (result != -EINVAL) return result; } /* Not handled by currently attached protocol (if any) */ while (proto) { result = proto->ioctl(dev, ifs); if (result != -EINVAL) return result; proto = proto->next; } return -EINVAL; } EXPORT_SYMBOL(hdlc_ioctl); static const struct header_ops hdlc_null_ops; static void hdlc_setup_dev(struct net_device *dev) { /* Re-init all variables changed by HDLC protocol drivers, * including ether_setup() called from hdlc_raw_eth.c. */ dev->flags = IFF_POINTOPOINT | IFF_NOARP; dev->priv_flags = IFF_WAN_HDLC; dev->mtu = HDLC_MAX_MTU; dev->min_mtu = 68; dev->max_mtu = HDLC_MAX_MTU; dev->type = ARPHRD_RAWHDLC; dev->hard_header_len = 0; dev->needed_headroom = 0; dev->addr_len = 0; dev->header_ops = &hdlc_null_ops; } static void hdlc_setup(struct net_device *dev) { hdlc_device *hdlc = dev_to_hdlc(dev); hdlc_setup_dev(dev); hdlc->carrier = 1; hdlc->open = 0; spin_lock_init(&hdlc->state_lock); } struct net_device *alloc_hdlcdev(void *priv) { struct net_device *dev; dev = alloc_netdev(sizeof(struct hdlc_device), "hdlc%d", NET_NAME_UNKNOWN, hdlc_setup); if (dev) dev_to_hdlc(dev)->priv = priv; return dev; } EXPORT_SYMBOL(alloc_hdlcdev); void unregister_hdlc_device(struct net_device *dev) { rtnl_lock(); detach_hdlc_protocol(dev); unregister_netdevice(dev); rtnl_unlock(); } EXPORT_SYMBOL(unregister_hdlc_device); int attach_hdlc_protocol(struct net_device *dev, struct hdlc_proto *proto, size_t size) { int err; err = detach_hdlc_protocol(dev); if (err) return err; if (!try_module_get(proto->module)) return -ENOSYS; if (size) { dev_to_hdlc(dev)->state = kmalloc(size, GFP_KERNEL); if (!dev_to_hdlc(dev)->state) { module_put(proto->module); return -ENOBUFS; } } dev_to_hdlc(dev)->proto = proto; return 0; } EXPORT_SYMBOL(attach_hdlc_protocol); int detach_hdlc_protocol(struct net_device *dev) { hdlc_device *hdlc = dev_to_hdlc(dev); int err; if (hdlc->proto) { err = call_netdevice_notifiers(NETDEV_PRE_TYPE_CHANGE, dev); err = notifier_to_errno(err); if (err) { netdev_err(dev, "Refused to change device type\n"); return err; } if (hdlc->proto->detach) hdlc->proto->detach(dev); module_put(hdlc->proto->module); hdlc->proto = NULL; } kfree(hdlc->state); hdlc->state = NULL; hdlc_setup_dev(dev); return 0; } EXPORT_SYMBOL(detach_hdlc_protocol); void register_hdlc_protocol(struct hdlc_proto *proto) { rtnl_lock(); proto->next = first_proto; first_proto = proto; rtnl_unlock(); } EXPORT_SYMBOL(register_hdlc_protocol); void unregister_hdlc_protocol(struct hdlc_proto *proto) { struct hdlc_proto **p; rtnl_lock(); p = &first_proto; while (*p != proto) { BUG_ON(!*p); p = &((*p)->next); } *p = proto->next; rtnl_unlock(); } EXPORT_SYMBOL(unregister_hdlc_protocol); MODULE_AUTHOR("Krzysztof Halasa <khc@pm.waw.pl>"); MODULE_DESCRIPTION("HDLC support module"); MODULE_LICENSE("GPL v2"); static struct packet_type hdlc_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_HDLC), .func = hdlc_rcv, }; static struct notifier_block hdlc_notifier = { .notifier_call = hdlc_device_event, }; static int __init hdlc_module_init(void) { int result; pr_info("%s\n", version); result = register_netdevice_notifier(&hdlc_notifier); if (result) return result; dev_add_pack(&hdlc_packet_type); return 0; } static void __exit hdlc_module_exit(void) { dev_remove_pack(&hdlc_packet_type); unregister_netdevice_notifier(&hdlc_notifier); } module_init(hdlc_module_init); module_exit(hdlc_module_exit);
4 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 // SPDX-License-Identifier: GPL-2.0 /* * sysctl_net_ipv4.c: sysctl interface to net IPV4 subsystem. * * Begun April 1, 1996, Mike Shaver. * Added /proc/sys/net/ipv4 directory entry (empty =) ). [MS] */ #include <linux/sysctl.h> #include <linux/seqlock.h> #include <linux/init.h> #include <linux/slab.h> #include <net/icmp.h> #include <net/ip.h> #include <net/ip_fib.h> #include <net/tcp.h> #include <net/udp.h> #include <net/cipso_ipv4.h> #include <net/ping.h> #include <net/protocol.h> #include <net/netevent.h> static int tcp_retr1_max = 255; static int ip_local_port_range_min[] = { 1, 1 }; static int ip_local_port_range_max[] = { 65535, 65535 }; static int tcp_adv_win_scale_min = -31; static int tcp_adv_win_scale_max = 31; static int tcp_app_win_max = 31; static int tcp_min_snd_mss_min = TCP_MIN_SND_MSS; static int tcp_min_snd_mss_max = 65535; static int tcp_rto_max_max = TCP_RTO_MAX_SEC * MSEC_PER_SEC; static int ip_privileged_port_min; static int ip_privileged_port_max = 65535; static int ip_ttl_min = 1; static int ip_ttl_max = 255; static int tcp_syn_retries_min = 1; static int tcp_syn_retries_max = MAX_TCP_SYNCNT; static int tcp_syn_linear_timeouts_max = MAX_TCP_SYNCNT; static unsigned long ip_ping_group_range_min[] = { 0, 0 }; static unsigned long ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX }; static u32 u32_max_div_HZ = UINT_MAX / HZ; static int one_day_secs = 24 * 3600; static u32 fib_multipath_hash_fields_all_mask __maybe_unused = FIB_MULTIPATH_HASH_FIELD_ALL_MASK; static unsigned int tcp_child_ehash_entries_max = 16 * 1024 * 1024; static unsigned int udp_child_hash_entries_max = UDP_HTABLE_SIZE_MAX; static int tcp_plb_max_rounds = 31; static int tcp_plb_max_cong_thresh = 256; static unsigned int tcp_tw_reuse_delay_max = TCP_PAWS_MSL * MSEC_PER_SEC; static int tcp_ecn_mode_max = 5; static u32 icmp_errors_extension_mask_all = GENMASK_U8(ICMP_ERR_EXT_COUNT - 1, 0); /* obsolete */ static int sysctl_tcp_low_latency __read_mostly; /* Update system visible IP port range */ static void set_local_port_range(struct net *net, unsigned int low, unsigned int high) { bool same_parity = !((low ^ high) & 1); if (same_parity && !net->ipv4.ip_local_ports.warned) { net->ipv4.ip_local_ports.warned = true; pr_err_ratelimited("ip_local_port_range: prefer different parity for start/end values.\n"); } WRITE_ONCE(net->ipv4.ip_local_ports.range, high << 16 | low); } /* Validate changes from /proc interface. */ static int ipv4_local_port_range(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = table->data; int ret; int range[2]; struct ctl_table tmp = { .data = &range, .maxlen = sizeof(range), .mode = table->mode, .extra1 = &ip_local_port_range_min, .extra2 = &ip_local_port_range_max, }; inet_get_local_port_range(net, &range[0], &range[1]); ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); if (write && ret == 0) { /* Ensure that the upper limit is not smaller than the lower, * and that the lower does not encroach upon the privileged * port limit. */ if ((range[1] < range[0]) || (range[0] < READ_ONCE(net->ipv4.sysctl_ip_prot_sock))) ret = -EINVAL; else set_local_port_range(net, range[0], range[1]); } return ret; } /* Validate changes from /proc interface. */ static int ipv4_privileged_ports(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(table->data, struct net, ipv4.sysctl_ip_prot_sock); int ret; int pports; int range[2]; struct ctl_table tmp = { .data = &pports, .maxlen = sizeof(pports), .mode = table->mode, .extra1 = &ip_privileged_port_min, .extra2 = &ip_privileged_port_max, }; pports = READ_ONCE(net->ipv4.sysctl_ip_prot_sock); ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); if (write && ret == 0) { inet_get_local_port_range(net, &range[0], &range[1]); /* Ensure that the local port range doesn't overlap with the * privileged port range. */ if (range[0] < pports) ret = -EINVAL; else WRITE_ONCE(net->ipv4.sysctl_ip_prot_sock, pports); } return ret; } static void inet_get_ping_group_range_table(const struct ctl_table *table, kgid_t *low, kgid_t *high) { kgid_t *data = table->data; struct net *net = container_of(table->data, struct net, ipv4.ping_group_range.range); unsigned int seq; do { seq = read_seqbegin(&net->ipv4.ping_group_range.lock); *low = data[0]; *high = data[1]; } while (read_seqretry(&net->ipv4.ping_group_range.lock, seq)); } /* Update system visible IP port range */ static void set_ping_group_range(const struct ctl_table *table, kgid_t low, kgid_t high) { kgid_t *data = table->data; struct net *net = container_of(table->data, struct net, ipv4.ping_group_range.range); write_seqlock(&net->ipv4.ping_group_range.lock); data[0] = low; data[1] = high; write_sequnlock(&net->ipv4.ping_group_range.lock); } /* Validate changes from /proc interface. */ static int ipv4_ping_group_range(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct user_namespace *user_ns = current_user_ns(); int ret; unsigned long urange[2]; kgid_t low, high; struct ctl_table tmp = { .data = &urange, .maxlen = sizeof(urange), .mode = table->mode, .extra1 = &ip_ping_group_range_min, .extra2 = &ip_ping_group_range_max, }; inet_get_ping_group_range_table(table, &low, &high); urange[0] = from_kgid_munged(user_ns, low); urange[1] = from_kgid_munged(user_ns, high); ret = proc_doulongvec_minmax(&tmp, write, buffer, lenp, ppos); if (write && ret == 0) { low = make_kgid(user_ns, urange[0]); high = make_kgid(user_ns, urange[1]); if (!gid_valid(low) || !gid_valid(high)) return -EINVAL; if (urange[1] < urange[0] || gid_lt(high, low)) { low = make_kgid(&init_user_ns, 1); high = make_kgid(&init_user_ns, 0); } set_ping_group_range(table, low, high); } return ret; } static int ipv4_fwd_update_priority(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net; int ret; net = container_of(table->data, struct net, ipv4.sysctl_ip_fwd_update_priority); ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); if (write && ret == 0) call_netevent_notifiers(NETEVENT_IPV4_FWD_UPDATE_PRIORITY_UPDATE, net); return ret; } static int proc_tcp_congestion_control(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(ctl->data, struct net, ipv4.tcp_congestion_control); char val[TCP_CA_NAME_MAX]; struct ctl_table tbl = { .data = val, .maxlen = TCP_CA_NAME_MAX, }; int ret; tcp_get_default_congestion_control(net, val); ret = proc_dostring(&tbl, write, buffer, lenp, ppos); if (write && ret == 0) ret = tcp_set_default_congestion_control(net, val); return ret; } static int proc_tcp_available_congestion_control(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX, }; int ret; tbl.data = kmalloc(tbl.maxlen, GFP_USER); if (!tbl.data) return -ENOMEM; tcp_get_available_congestion_control(tbl.data, TCP_CA_BUF_MAX); ret = proc_dostring(&tbl, write, buffer, lenp, ppos); kfree(tbl.data); return ret; } static int proc_allowed_congestion_control(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX }; int ret; tbl.data = kmalloc(tbl.maxlen, GFP_USER); if (!tbl.data) return -ENOMEM; tcp_get_allowed_congestion_control(tbl.data, tbl.maxlen); ret = proc_dostring(&tbl, write, buffer, lenp, ppos); if (write && ret == 0) ret = tcp_set_allowed_congestion_control(tbl.data); kfree(tbl.data); return ret; } static int sscanf_key(char *buf, __le32 *key) { u32 user_key[4]; int i, ret = 0; if (sscanf(buf, "%x-%x-%x-%x", user_key, user_key + 1, user_key + 2, user_key + 3) != 4) { ret = -EINVAL; } else { for (i = 0; i < ARRAY_SIZE(user_key); i++) key[i] = cpu_to_le32(user_key[i]); } pr_debug("proc TFO key set 0x%x-%x-%x-%x <- 0x%s: %u\n", user_key[0], user_key[1], user_key[2], user_key[3], buf, ret); return ret; } static int proc_tcp_fastopen_key(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(table->data, struct net, ipv4.sysctl_tcp_fastopen); /* maxlen to print the list of keys in hex (*2), with dashes * separating doublewords and a comma in between keys. */ struct ctl_table tbl = { .maxlen = ((TCP_FASTOPEN_KEY_LENGTH * 2 * TCP_FASTOPEN_KEY_MAX) + (TCP_FASTOPEN_KEY_MAX * 5)) }; u32 user_key[TCP_FASTOPEN_KEY_BUF_LENGTH / sizeof(u32)]; __le32 key[TCP_FASTOPEN_KEY_BUF_LENGTH / sizeof(__le32)]; char *backup_data; int ret, i = 0, off = 0, n_keys; tbl.data = kmalloc(tbl.maxlen, GFP_KERNEL); if (!tbl.data) return -ENOMEM; n_keys = tcp_fastopen_get_cipher(net, NULL, (u64 *)key); if (!n_keys) { memset(&key[0], 0, TCP_FASTOPEN_KEY_LENGTH); n_keys = 1; } for (i = 0; i < n_keys * 4; i++) user_key[i] = le32_to_cpu(key[i]); for (i = 0; i < n_keys; i++) { off += snprintf(tbl.data + off, tbl.maxlen - off, "%08x-%08x-%08x-%08x", user_key[i * 4], user_key[i * 4 + 1], user_key[i * 4 + 2], user_key[i * 4 + 3]); if (WARN_ON_ONCE(off >= tbl.maxlen - 1)) break; if (i + 1 < n_keys) off += snprintf(tbl.data + off, tbl.maxlen - off, ","); } ret = proc_dostring(&tbl, write, buffer, lenp, ppos); if (write && ret == 0) { backup_data = strchr(tbl.data, ','); if (backup_data) { *backup_data = '\0'; backup_data++; } if (sscanf_key(tbl.data, key)) { ret = -EINVAL; goto bad_key; } if (backup_data) { if (sscanf_key(backup_data, key + 4)) { ret = -EINVAL; goto bad_key; } } tcp_fastopen_reset_cipher(net, NULL, key, backup_data ? key + 4 : NULL); } bad_key: kfree(tbl.data); return ret; } static int proc_tfo_blackhole_detect_timeout(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(table->data, struct net, ipv4.sysctl_tcp_fastopen_blackhole_timeout); int ret; ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (write && ret == 0) atomic_set(&net->ipv4.tfo_active_disable_times, 0); return ret; } static int proc_tcp_available_ulp(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table tbl = { .maxlen = TCP_ULP_BUF_MAX, }; int ret; tbl.data = kmalloc(tbl.maxlen, GFP_USER); if (!tbl.data) return -ENOMEM; tcp_get_available_ulp(tbl.data, TCP_ULP_BUF_MAX); ret = proc_dostring(&tbl, write, buffer, lenp, ppos); kfree(tbl.data); return ret; } static int proc_tcp_ehash_entries(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(table->data, struct net, ipv4.sysctl_tcp_child_ehash_entries); struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo; int tcp_ehash_entries; struct ctl_table tbl; tcp_ehash_entries = hinfo->ehash_mask + 1; /* A negative number indicates that the child netns * shares the global ehash. */ if (!net_eq(net, &init_net) && !hinfo->pernet) tcp_ehash_entries *= -1; memset(&tbl, 0, sizeof(tbl)); tbl.data = &tcp_ehash_entries; tbl.maxlen = sizeof(int); return proc_dointvec(&tbl, write, buffer, lenp, ppos); } static int proc_udp_hash_entries(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(table->data, struct net, ipv4.sysctl_udp_child_hash_entries); int udp_hash_entries; struct ctl_table tbl; udp_hash_entries = net->ipv4.udp_table->mask + 1; /* A negative number indicates that the child netns * shares the global udp_table. */ if (!net_eq(net, &init_net) && net->ipv4.udp_table == &udp_table) udp_hash_entries *= -1; memset(&tbl, 0, sizeof(tbl)); tbl.data = &udp_hash_entries; tbl.maxlen = sizeof(int); return proc_dointvec(&tbl, write, buffer, lenp, ppos); } #ifdef CONFIG_IP_ROUTE_MULTIPATH static int proc_fib_multipath_hash_policy(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(table->data, struct net, ipv4.sysctl_fib_multipath_hash_policy); int ret; ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); if (write && ret == 0) call_netevent_notifiers(NETEVENT_IPV4_MPATH_HASH_UPDATE, net); return ret; } static int proc_fib_multipath_hash_fields(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net; int ret; net = container_of(table->data, struct net, ipv4.sysctl_fib_multipath_hash_fields); ret = proc_douintvec_minmax(table, write, buffer, lenp, ppos); if (write && ret == 0) call_netevent_notifiers(NETEVENT_IPV4_MPATH_HASH_UPDATE, net); return ret; } static u32 proc_fib_multipath_hash_rand_seed __ro_after_init; static void proc_fib_multipath_hash_init_rand_seed(void) { get_random_bytes(&proc_fib_multipath_hash_rand_seed, sizeof(proc_fib_multipath_hash_rand_seed)); } static void proc_fib_multipath_hash_set_seed(struct net *net, u32 user_seed) { struct sysctl_fib_multipath_hash_seed new = { .user_seed = user_seed, .mp_seed = (user_seed ? user_seed : proc_fib_multipath_hash_rand_seed), }; WRITE_ONCE(net->ipv4.sysctl_fib_multipath_hash_seed.user_seed, new.user_seed); WRITE_ONCE(net->ipv4.sysctl_fib_multipath_hash_seed.mp_seed, new.mp_seed); } static int proc_fib_multipath_hash_seed(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct sysctl_fib_multipath_hash_seed *mphs; struct net *net = table->data; struct ctl_table tmp; u32 user_seed; int ret; mphs = &net->ipv4.sysctl_fib_multipath_hash_seed; user_seed = READ_ONCE(mphs->user_seed); tmp = *table; tmp.data = &user_seed; ret = proc_douintvec_minmax(&tmp, write, buffer, lenp, ppos); if (write && ret == 0) { proc_fib_multipath_hash_set_seed(net, user_seed); call_netevent_notifiers(NETEVENT_IPV4_MPATH_HASH_UPDATE, net); } return ret; } #else static void proc_fib_multipath_hash_init_rand_seed(void) { } static void proc_fib_multipath_hash_set_seed(struct net *net, u32 user_seed) { } #endif static struct ctl_table ipv4_table[] = { { .procname = "tcp_max_orphans", .data = &sysctl_tcp_max_orphans, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "inet_peer_threshold", .data = &inet_peer_threshold, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "inet_peer_minttl", .data = &inet_peer_minttl, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "inet_peer_maxttl", .data = &inet_peer_maxttl, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "tcp_mem", .maxlen = sizeof(sysctl_tcp_mem), .data = &sysctl_tcp_mem, .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, { .procname = "tcp_low_latency", .data = &sysctl_tcp_low_latency, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, #ifdef CONFIG_NETLABEL { .procname = "cipso_cache_enable", .data = &cipso_v4_cache_enabled, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "cipso_cache_bucket_size", .data = &cipso_v4_cache_bucketsize, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "cipso_rbm_optfmt", .data = &cipso_v4_rbm_optfmt, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "cipso_rbm_strictvalid", .data = &cipso_v4_rbm_strictvalid, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, #endif /* CONFIG_NETLABEL */ { .procname = "tcp_available_ulp", .maxlen = TCP_ULP_BUF_MAX, .mode = 0444, .proc_handler = proc_tcp_available_ulp, }, { .procname = "udp_mem", .data = &sysctl_udp_mem, .maxlen = sizeof(sysctl_udp_mem), .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, { .procname = "fib_sync_mem", .data = &sysctl_fib_sync_mem, .maxlen = sizeof(sysctl_fib_sync_mem), .mode = 0644, .proc_handler = proc_douintvec_minmax, .extra1 = &sysctl_fib_sync_mem_min, .extra2 = &sysctl_fib_sync_mem_max, }, }; static struct ctl_table ipv4_net_table[] = { { .procname = "tcp_max_tw_buckets", .data = &init_net.ipv4.tcp_death_row.sysctl_max_tw_buckets, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "icmp_echo_ignore_all", .data = &init_net.ipv4.sysctl_icmp_echo_ignore_all, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE }, { .procname = "icmp_echo_enable_probe", .data = &init_net.ipv4.sysctl_icmp_echo_enable_probe, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE }, { .procname = "icmp_echo_ignore_broadcasts", .data = &init_net.ipv4.sysctl_icmp_echo_ignore_broadcasts, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE }, { .procname = "icmp_ignore_bogus_error_responses", .data = &init_net.ipv4.sysctl_icmp_ignore_bogus_error_responses, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE }, { .procname = "icmp_errors_use_inbound_ifaddr", .data = &init_net.ipv4.sysctl_icmp_errors_use_inbound_ifaddr, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE }, { .procname = "icmp_errors_extension_mask", .data = &init_net.ipv4.sysctl_icmp_errors_extension_mask, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &icmp_errors_extension_mask_all, }, { .procname = "icmp_ratelimit", .data = &init_net.ipv4.sysctl_icmp_ratelimit, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_ms_jiffies, }, { .procname = "icmp_ratemask", .data = &init_net.ipv4.sysctl_icmp_ratemask, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "icmp_msgs_per_sec", .data = &init_net.ipv4.sysctl_icmp_msgs_per_sec, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, { .procname = "icmp_msgs_burst", .data = &init_net.ipv4.sysctl_icmp_msgs_burst, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, { .procname = "ping_group_range", .data = &init_net.ipv4.ping_group_range.range, .maxlen = sizeof(gid_t)*2, .mode = 0644, .proc_handler = ipv4_ping_group_range, }, #ifdef CONFIG_NET_L3_MASTER_DEV { .procname = "raw_l3mdev_accept", .data = &init_net.ipv4.sysctl_raw_l3mdev_accept, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, #endif { .procname = "tcp_ecn", .data = &init_net.ipv4.sysctl_tcp_ecn, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &tcp_ecn_mode_max, }, { .procname = "tcp_ecn_option", .data = &init_net.ipv4.sysctl_tcp_ecn_option, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_THREE, }, { .procname = "tcp_ecn_option_beacon", .data = &init_net.ipv4.sysctl_tcp_ecn_option_beacon, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_THREE, }, { .procname = "tcp_ecn_fallback", .data = &init_net.ipv4.sysctl_tcp_ecn_fallback, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "ip_dynaddr", .data = &init_net.ipv4.sysctl_ip_dynaddr, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "ip_early_demux", .data = &init_net.ipv4.sysctl_ip_early_demux, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "udp_early_demux", .data = &init_net.ipv4.sysctl_udp_early_demux, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_early_demux", .data = &init_net.ipv4.sysctl_tcp_early_demux, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "nexthop_compat_mode", .data = &init_net.ipv4.sysctl_nexthop_compat_mode, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "ip_default_ttl", .data = &init_net.ipv4.sysctl_ip_default_ttl, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = &ip_ttl_min, .extra2 = &ip_ttl_max, }, { .procname = "ip_local_port_range", .maxlen = 0, .data = &init_net, .mode = 0644, .proc_handler = ipv4_local_port_range, }, { .procname = "ip_local_reserved_ports", .data = &init_net.ipv4.sysctl_local_reserved_ports, .maxlen = 65536, .mode = 0644, .proc_handler = proc_do_large_bitmap, }, { .procname = "ip_no_pmtu_disc", .data = &init_net.ipv4.sysctl_ip_no_pmtu_disc, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "ip_forward_use_pmtu", .data = &init_net.ipv4.sysctl_ip_fwd_use_pmtu, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "ip_forward_update_priority", .data = &init_net.ipv4.sysctl_ip_fwd_update_priority, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = ipv4_fwd_update_priority, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "ip_nonlocal_bind", .data = &init_net.ipv4.sysctl_ip_nonlocal_bind, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "ip_autobind_reuse", .data = &init_net.ipv4.sysctl_ip_autobind_reuse, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "fwmark_reflect", .data = &init_net.ipv4.sysctl_fwmark_reflect, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_fwmark_accept", .data = &init_net.ipv4.sysctl_tcp_fwmark_accept, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, #ifdef CONFIG_NET_L3_MASTER_DEV { .procname = "tcp_l3mdev_accept", .data = &init_net.ipv4.sysctl_tcp_l3mdev_accept, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, #endif { .procname = "tcp_mtu_probing", .data = &init_net.ipv4.sysctl_tcp_mtu_probing, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_base_mss", .data = &init_net.ipv4.sysctl_tcp_base_mss, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "tcp_min_snd_mss", .data = &init_net.ipv4.sysctl_tcp_min_snd_mss, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &tcp_min_snd_mss_min, .extra2 = &tcp_min_snd_mss_max, }, { .procname = "tcp_mtu_probe_floor", .data = &init_net.ipv4.sysctl_tcp_mtu_probe_floor, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &tcp_min_snd_mss_min, .extra2 = &tcp_min_snd_mss_max, }, { .procname = "tcp_probe_threshold", .data = &init_net.ipv4.sysctl_tcp_probe_threshold, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "tcp_probe_interval", .data = &init_net.ipv4.sysctl_tcp_probe_interval, .maxlen = sizeof(u32), .mode = 0644, .proc_handler = proc_douintvec_minmax, .extra2 = &u32_max_div_HZ, }, { .procname = "igmp_link_local_mcast_reports", .data = &init_net.ipv4.sysctl_igmp_llm_reports, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "igmp_max_memberships", .data = &init_net.ipv4.sysctl_igmp_max_memberships, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "igmp_max_msf", .data = &init_net.ipv4.sysctl_igmp_max_msf, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, #ifdef CONFIG_IP_MULTICAST { .procname = "igmp_qrv", .data = &init_net.ipv4.sysctl_igmp_qrv, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE }, #endif { .procname = "tcp_congestion_control", .data = &init_net.ipv4.tcp_congestion_control, .mode = 0644, .maxlen = TCP_CA_NAME_MAX, .proc_handler = proc_tcp_congestion_control, }, { .procname = "tcp_available_congestion_control", .maxlen = TCP_CA_BUF_MAX, .mode = 0444, .proc_handler = proc_tcp_available_congestion_control, }, { .procname = "tcp_allowed_congestion_control", .maxlen = TCP_CA_BUF_MAX, .mode = 0644, .proc_handler = proc_allowed_congestion_control, }, { .procname = "tcp_keepalive_time", .data = &init_net.ipv4.sysctl_tcp_keepalive_time, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "tcp_keepalive_probes", .data = &init_net.ipv4.sysctl_tcp_keepalive_probes, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_keepalive_intvl", .data = &init_net.ipv4.sysctl_tcp_keepalive_intvl, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "tcp_syn_retries", .data = &init_net.ipv4.sysctl_tcp_syn_retries, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = &tcp_syn_retries_min, .extra2 = &tcp_syn_retries_max }, { .procname = "tcp_synack_retries", .data = &init_net.ipv4.sysctl_tcp_synack_retries, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, #ifdef CONFIG_SYN_COOKIES { .procname = "tcp_syncookies", .data = &init_net.ipv4.sysctl_tcp_syncookies, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, #endif { .procname = "tcp_migrate_req", .data = &init_net.ipv4.sysctl_tcp_migrate_req, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE }, { .procname = "tcp_reordering", .data = &init_net.ipv4.sysctl_tcp_reordering, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "tcp_retries1", .data = &init_net.ipv4.sysctl_tcp_retries1, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra2 = &tcp_retr1_max }, { .procname = "tcp_retries2", .data = &init_net.ipv4.sysctl_tcp_retries2, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_orphan_retries", .data = &init_net.ipv4.sysctl_tcp_orphan_retries, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_fin_timeout", .data = &init_net.ipv4.sysctl_tcp_fin_timeout, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "tcp_notsent_lowat", .data = &init_net.ipv4.sysctl_tcp_notsent_lowat, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_douintvec, }, { .procname = "tcp_tw_reuse", .data = &init_net.ipv4.sysctl_tcp_tw_reuse, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, { .procname = "tcp_tw_reuse_delay", .data = &init_net.ipv4.sysctl_tcp_tw_reuse_delay, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_douintvec_minmax, .extra1 = SYSCTL_ONE, .extra2 = &tcp_tw_reuse_delay_max, }, { .procname = "tcp_max_syn_backlog", .data = &init_net.ipv4.sysctl_max_syn_backlog, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "tcp_fastopen", .data = &init_net.ipv4.sysctl_tcp_fastopen, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "tcp_fastopen_key", .mode = 0600, .data = &init_net.ipv4.sysctl_tcp_fastopen, /* maxlen to print the list of keys in hex (*2), with dashes * separating doublewords and a comma in between keys. */ .maxlen = ((TCP_FASTOPEN_KEY_LENGTH * 2 * TCP_FASTOPEN_KEY_MAX) + (TCP_FASTOPEN_KEY_MAX * 5)), .proc_handler = proc_tcp_fastopen_key, }, { .procname = "tcp_fastopen_blackhole_timeout_sec", .data = &init_net.ipv4.sysctl_tcp_fastopen_blackhole_timeout, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_tfo_blackhole_detect_timeout, .extra1 = SYSCTL_ZERO, }, #ifdef CONFIG_IP_ROUTE_MULTIPATH { .procname = "fib_multipath_use_neigh", .data = &init_net.ipv4.sysctl_fib_multipath_use_neigh, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "fib_multipath_hash_policy", .data = &init_net.ipv4.sysctl_fib_multipath_hash_policy, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_fib_multipath_hash_policy, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_THREE, }, { .procname = "fib_multipath_hash_fields", .data = &init_net.ipv4.sysctl_fib_multipath_hash_fields, .maxlen = sizeof(u32), .mode = 0644, .proc_handler = proc_fib_multipath_hash_fields, .extra1 = SYSCTL_ONE, .extra2 = &fib_multipath_hash_fields_all_mask, }, { .procname = "fib_multipath_hash_seed", .data = &init_net, .maxlen = sizeof(u32), .mode = 0644, .proc_handler = proc_fib_multipath_hash_seed, }, #endif { .procname = "ip_unprivileged_port_start", .maxlen = sizeof(int), .data = &init_net.ipv4.sysctl_ip_prot_sock, .mode = 0644, .proc_handler = ipv4_privileged_ports, }, #ifdef CONFIG_NET_L3_MASTER_DEV { .procname = "udp_l3mdev_accept", .data = &init_net.ipv4.sysctl_udp_l3mdev_accept, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, #endif { .procname = "tcp_sack", .data = &init_net.ipv4.sysctl_tcp_sack, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_window_scaling", .data = &init_net.ipv4.sysctl_tcp_window_scaling, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_timestamps", .data = &init_net.ipv4.sysctl_tcp_timestamps, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_early_retrans", .data = &init_net.ipv4.sysctl_tcp_early_retrans, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_FOUR, }, { .procname = "tcp_recovery", .data = &init_net.ipv4.sysctl_tcp_recovery, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_thin_linear_timeouts", .data = &init_net.ipv4.sysctl_tcp_thin_linear_timeouts, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_slow_start_after_idle", .data = &init_net.ipv4.sysctl_tcp_slow_start_after_idle, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_retrans_collapse", .data = &init_net.ipv4.sysctl_tcp_retrans_collapse, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_stdurg", .data = &init_net.ipv4.sysctl_tcp_stdurg, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_rfc1337", .data = &init_net.ipv4.sysctl_tcp_rfc1337, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_abort_on_overflow", .data = &init_net.ipv4.sysctl_tcp_abort_on_overflow, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_fack", .data = &init_net.ipv4.sysctl_tcp_fack, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_max_reordering", .data = &init_net.ipv4.sysctl_tcp_max_reordering, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "tcp_dsack", .data = &init_net.ipv4.sysctl_tcp_dsack, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_app_win", .data = &init_net.ipv4.sysctl_tcp_app_win, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &tcp_app_win_max, }, { .procname = "tcp_adv_win_scale", .data = &init_net.ipv4.sysctl_tcp_adv_win_scale, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &tcp_adv_win_scale_min, .extra2 = &tcp_adv_win_scale_max, }, { .procname = "tcp_frto", .data = &init_net.ipv4.sysctl_tcp_frto, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_no_metrics_save", .data = &init_net.ipv4.sysctl_tcp_nometrics_save, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_no_ssthresh_metrics_save", .data = &init_net.ipv4.sysctl_tcp_no_ssthresh_metrics_save, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "tcp_moderate_rcvbuf", .data = &init_net.ipv4.sysctl_tcp_moderate_rcvbuf, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_rcvbuf_low_rtt", .data = &init_net.ipv4.sysctl_tcp_rcvbuf_low_rtt, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_INT_MAX, }, { .procname = "tcp_tso_win_divisor", .data = &init_net.ipv4.sysctl_tcp_tso_win_divisor, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_workaround_signed_windows", .data = &init_net.ipv4.sysctl_tcp_workaround_signed_windows, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_limit_output_bytes", .data = &init_net.ipv4.sysctl_tcp_limit_output_bytes, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "tcp_challenge_ack_limit", .data = &init_net.ipv4.sysctl_tcp_challenge_ack_limit, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "tcp_min_tso_segs", .data = &init_net.ipv4.sysctl_tcp_min_tso_segs, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ONE, }, { .procname = "tcp_tso_rtt_log", .data = &init_net.ipv4.sysctl_tcp_tso_rtt_log, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_min_rtt_wlen", .data = &init_net.ipv4.sysctl_tcp_min_rtt_wlen, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &one_day_secs }, { .procname = "tcp_autocorking", .data = &init_net.ipv4.sysctl_tcp_autocorking, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "tcp_invalid_ratelimit", .data = &init_net.ipv4.sysctl_tcp_invalid_ratelimit, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_ms_jiffies, }, { .procname = "tcp_pacing_ss_ratio", .data = &init_net.ipv4.sysctl_tcp_pacing_ss_ratio, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE_THOUSAND, }, { .procname = "tcp_pacing_ca_ratio", .data = &init_net.ipv4.sysctl_tcp_pacing_ca_ratio, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE_THOUSAND, }, { .procname = "tcp_wmem", .data = &init_net.ipv4.sysctl_tcp_wmem, .maxlen = sizeof(init_net.ipv4.sysctl_tcp_wmem), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE, }, { .procname = "tcp_rmem", .data = &init_net.ipv4.sysctl_tcp_rmem, .maxlen = sizeof(init_net.ipv4.sysctl_tcp_rmem), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE, }, { .procname = "tcp_comp_sack_delay_ns", .data = &init_net.ipv4.sysctl_tcp_comp_sack_delay_ns, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, { .procname = "tcp_comp_sack_rtt_percent", .data = &init_net.ipv4.sysctl_tcp_comp_sack_rtt_percent, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE, .extra2 = SYSCTL_ONE_THOUSAND, }, { .procname = "tcp_comp_sack_slack_ns", .data = &init_net.ipv4.sysctl_tcp_comp_sack_slack_ns, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, { .procname = "tcp_comp_sack_nr", .data = &init_net.ipv4.sysctl_tcp_comp_sack_nr, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, }, { .procname = "tcp_backlog_ack_defer", .data = &init_net.ipv4.sysctl_tcp_backlog_ack_defer, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "tcp_reflect_tos", .data = &init_net.ipv4.sysctl_tcp_reflect_tos, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "tcp_ehash_entries", .data = &init_net.ipv4.sysctl_tcp_child_ehash_entries, .mode = 0444, .proc_handler = proc_tcp_ehash_entries, }, { .procname = "tcp_child_ehash_entries", .data = &init_net.ipv4.sysctl_tcp_child_ehash_entries, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_douintvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &tcp_child_ehash_entries_max, }, { .procname = "udp_hash_entries", .data = &init_net.ipv4.sysctl_udp_child_hash_entries, .mode = 0444, .proc_handler = proc_udp_hash_entries, }, { .procname = "udp_child_hash_entries", .data = &init_net.ipv4.sysctl_udp_child_hash_entries, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_douintvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &udp_child_hash_entries_max, }, { .procname = "udp_rmem_min", .data = &init_net.ipv4.sysctl_udp_rmem_min, .maxlen = sizeof(init_net.ipv4.sysctl_udp_rmem_min), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE }, { .procname = "udp_wmem_min", .data = &init_net.ipv4.sysctl_udp_wmem_min, .maxlen = sizeof(init_net.ipv4.sysctl_udp_wmem_min), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE }, { .procname = "fib_notify_on_flag_change", .data = &init_net.ipv4.sysctl_fib_notify_on_flag_change, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, { .procname = "tcp_plb_enabled", .data = &init_net.ipv4.sysctl_tcp_plb_enabled, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "tcp_plb_idle_rehash_rounds", .data = &init_net.ipv4.sysctl_tcp_plb_idle_rehash_rounds, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra2 = &tcp_plb_max_rounds, }, { .procname = "tcp_plb_rehash_rounds", .data = &init_net.ipv4.sysctl_tcp_plb_rehash_rounds, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra2 = &tcp_plb_max_rounds, }, { .procname = "tcp_plb_suspend_rto_sec", .data = &init_net.ipv4.sysctl_tcp_plb_suspend_rto_sec, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_plb_cong_thresh", .data = &init_net.ipv4.sysctl_tcp_plb_cong_thresh, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &tcp_plb_max_cong_thresh, }, { .procname = "tcp_syn_linear_timeouts", .data = &init_net.ipv4.sysctl_tcp_syn_linear_timeouts, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &tcp_syn_linear_timeouts_max, }, { .procname = "tcp_shrink_window", .data = &init_net.ipv4.sysctl_tcp_shrink_window, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "tcp_pingpong_thresh", .data = &init_net.ipv4.sysctl_tcp_pingpong_thresh, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ONE, }, { .procname = "tcp_rto_min_us", .data = &init_net.ipv4.sysctl_tcp_rto_min_us, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE, }, { .procname = "tcp_rto_max_ms", .data = &init_net.ipv4.sysctl_tcp_rto_max_ms, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE_THOUSAND, .extra2 = &tcp_rto_max_max, }, }; static __net_init int ipv4_sysctl_init_net(struct net *net) { size_t table_size = ARRAY_SIZE(ipv4_net_table); struct ctl_table *table; table = ipv4_net_table; if (!net_eq(net, &init_net)) { int i; table = kmemdup(table, sizeof(ipv4_net_table), GFP_KERNEL); if (!table) goto err_alloc; for (i = 0; i < table_size; i++) { if (table[i].data) { /* Update the variables to point into * the current struct net */ table[i].data += (void *)net - (void *)&init_net; } else { /* Entries without data pointer are global; * Make them read-only in non-init_net ns */ table[i].mode &= ~0222; } } } net->ipv4.ipv4_hdr = register_net_sysctl_sz(net, "net/ipv4", table, table_size); if (!net->ipv4.ipv4_hdr) goto err_reg; net->ipv4.sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL); if (!net->ipv4.sysctl_local_reserved_ports) goto err_ports; proc_fib_multipath_hash_set_seed(net, 0); return 0; err_ports: unregister_net_sysctl_table(net->ipv4.ipv4_hdr); err_reg: if (!net_eq(net, &init_net)) kfree(table); err_alloc: return -ENOMEM; } static __net_exit void ipv4_sysctl_exit_net(struct net *net) { const struct ctl_table *table; kfree(net->ipv4.sysctl_local_reserved_ports); table = net->ipv4.ipv4_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv4.ipv4_hdr); kfree(table); } static __net_initdata struct pernet_operations ipv4_sysctl_ops = { .init = ipv4_sysctl_init_net, .exit = ipv4_sysctl_exit_net, }; static __init int sysctl_ipv4_init(void) { struct ctl_table_header *hdr; hdr = register_net_sysctl(&init_net, "net/ipv4", ipv4_table); if (!hdr) return -ENOMEM; proc_fib_multipath_hash_init_rand_seed(); if (register_pernet_subsys(&ipv4_sysctl_ops)) { unregister_net_sysctl_table(hdr); return -ENOMEM; } return 0; } __initcall(sysctl_ipv4_init);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2023 Western Digital Corporation or its affiliates. */ #ifndef BTRFS_RAID_STRIPE_TREE_H #define BTRFS_RAID_STRIPE_TREE_H #include <linux/types.h> #include <uapi/linux/btrfs_tree.h> #include "fs.h" #include "accessors.h" #define BTRFS_RST_SUPP_BLOCK_GROUP_MASK (BTRFS_BLOCK_GROUP_DUP | \ BTRFS_BLOCK_GROUP_RAID1_MASK | \ BTRFS_BLOCK_GROUP_RAID0 | \ BTRFS_BLOCK_GROUP_RAID10) struct btrfs_io_context; struct btrfs_io_stripe; struct btrfs_fs_info; struct btrfs_ordered_extent; struct btrfs_trans_handle; int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 length); int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, u64 logical, u64 *length, u64 map_type, u32 stripe_index, struct btrfs_io_stripe *stripe); int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans, struct btrfs_ordered_extent *ordered_extent); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans, struct btrfs_io_context *bioc); #endif static inline bool btrfs_need_stripe_tree_update(struct btrfs_fs_info *fs_info, u64 map_type) { u64 type = map_type & BTRFS_BLOCK_GROUP_TYPE_MASK; u64 profile = map_type & BTRFS_BLOCK_GROUP_PROFILE_MASK; if (!btrfs_fs_incompat(fs_info, RAID_STRIPE_TREE)) return false; if (type != BTRFS_BLOCK_GROUP_DATA) return false; if (profile & BTRFS_RST_SUPP_BLOCK_GROUP_MASK) return true; return false; } static inline int btrfs_num_raid_stripes(u32 item_size) { return item_size / sizeof(struct btrfs_raid_stride); } #endif
4 4 4 4 4 1 4 4 4 4 4 2 4 4 4 4 4 4 4 4 4 4 4 3 4 3 3 4 4 4 4 4 4 4 4 4 4 4 4 4 3 4 3 4 4 4 3 4 4 4 3 3 4 4 4 4 4 4 4 4 4 4 4 4 4 3 4 4 4 4 4 4 4 1 1 2 2 4 2 4 4 4 4 4 4 4 4 4 2 2 8 8 4 4 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 // SPDX-License-Identifier: 0BSD /* * LZMA2 decoder * * Authors: Lasse Collin <lasse.collin@tukaani.org> * Igor Pavlov <https://7-zip.org/> */ #include "xz_private.h" #include "xz_lzma2.h" /* * Range decoder initialization eats the first five bytes of each LZMA chunk. */ #define RC_INIT_BYTES 5 /* * Minimum number of usable input buffer to safely decode one LZMA symbol. * The worst case is that we decode 22 bits using probabilities and 26 * direct bits. This may decode at maximum of 20 bytes of input. However, * lzma_main() does an extra normalization before returning, thus we * need to put 21 here. */ #define LZMA_IN_REQUIRED 21 /* * Dictionary (history buffer) * * These are always true: * start <= pos <= full <= end * pos <= limit <= end * * In multi-call mode, also these are true: * end == size * size <= size_max * allocated <= size * * Most of these variables are size_t to support single-call mode, * in which the dictionary variables address the actual output * buffer directly. */ struct dictionary { /* Beginning of the history buffer */ uint8_t *buf; /* Old position in buf (before decoding more data) */ size_t start; /* Position in buf */ size_t pos; /* * How full dictionary is. This is used to detect corrupt input that * would read beyond the beginning of the uncompressed stream. */ size_t full; /* Write limit; we don't write to buf[limit] or later bytes. */ size_t limit; /* * End of the dictionary buffer. In multi-call mode, this is * the same as the dictionary size. In single-call mode, this * indicates the size of the output buffer. */ size_t end; /* * Size of the dictionary as specified in Block Header. This is used * together with "full" to detect corrupt input that would make us * read beyond the beginning of the uncompressed stream. */ uint32_t size; /* * Maximum allowed dictionary size in multi-call mode. * This is ignored in single-call mode. */ uint32_t size_max; /* * Amount of memory currently allocated for the dictionary. * This is used only with XZ_DYNALLOC. (With XZ_PREALLOC, * size_max is always the same as the allocated size.) */ uint32_t allocated; /* Operation mode */ enum xz_mode mode; }; /* Range decoder */ struct rc_dec { uint32_t range; uint32_t code; /* * Number of initializing bytes remaining to be read * by rc_read_init(). */ uint32_t init_bytes_left; /* * Buffer from which we read our input. It can be either * temp.buf or the caller-provided input buffer. */ const uint8_t *in; size_t in_pos; size_t in_limit; }; /* Probabilities for a length decoder. */ struct lzma_len_dec { /* Probability of match length being at least 10 */ uint16_t choice; /* Probability of match length being at least 18 */ uint16_t choice2; /* Probabilities for match lengths 2-9 */ uint16_t low[POS_STATES_MAX][LEN_LOW_SYMBOLS]; /* Probabilities for match lengths 10-17 */ uint16_t mid[POS_STATES_MAX][LEN_MID_SYMBOLS]; /* Probabilities for match lengths 18-273 */ uint16_t high[LEN_HIGH_SYMBOLS]; }; struct lzma_dec { /* Distances of latest four matches */ uint32_t rep0; uint32_t rep1; uint32_t rep2; uint32_t rep3; /* Types of the most recently seen LZMA symbols */ enum lzma_state state; /* * Length of a match. This is updated so that dict_repeat can * be called again to finish repeating the whole match. */ uint32_t len; /* * LZMA properties or related bit masks (number of literal * context bits, a mask derived from the number of literal * position bits, and a mask derived from the number * position bits) */ uint32_t lc; uint32_t literal_pos_mask; /* (1 << lp) - 1 */ uint32_t pos_mask; /* (1 << pb) - 1 */ /* If 1, it's a match. Otherwise it's a single 8-bit literal. */ uint16_t is_match[STATES][POS_STATES_MAX]; /* If 1, it's a repeated match. The distance is one of rep0 .. rep3. */ uint16_t is_rep[STATES]; /* * If 0, distance of a repeated match is rep0. * Otherwise check is_rep1. */ uint16_t is_rep0[STATES]; /* * If 0, distance of a repeated match is rep1. * Otherwise check is_rep2. */ uint16_t is_rep1[STATES]; /* If 0, distance of a repeated match is rep2. Otherwise it is rep3. */ uint16_t is_rep2[STATES]; /* * If 1, the repeated match has length of one byte. Otherwise * the length is decoded from rep_len_decoder. */ uint16_t is_rep0_long[STATES][POS_STATES_MAX]; /* * Probability tree for the highest two bits of the match * distance. There is a separate probability tree for match * lengths of 2 (i.e. MATCH_LEN_MIN), 3, 4, and [5, 273]. */ uint16_t dist_slot[DIST_STATES][DIST_SLOTS]; /* * Probility trees for additional bits for match distance * when the distance is in the range [4, 127]. */ uint16_t dist_special[FULL_DISTANCES - DIST_MODEL_END]; /* * Probability tree for the lowest four bits of a match * distance that is equal to or greater than 128. */ uint16_t dist_align[ALIGN_SIZE]; /* Length of a normal match */ struct lzma_len_dec match_len_dec; /* Length of a repeated match */ struct lzma_len_dec rep_len_dec; /* Probabilities of literals */ uint16_t literal[LITERAL_CODERS_MAX][LITERAL_CODER_SIZE]; }; struct lzma2_dec { /* Position in xz_dec_lzma2_run(). */ enum lzma2_seq { SEQ_CONTROL, SEQ_UNCOMPRESSED_1, SEQ_UNCOMPRESSED_2, SEQ_COMPRESSED_0, SEQ_COMPRESSED_1, SEQ_PROPERTIES, SEQ_LZMA_PREPARE, SEQ_LZMA_RUN, SEQ_COPY } sequence; /* Next position after decoding the compressed size of the chunk. */ enum lzma2_seq next_sequence; /* Uncompressed size of LZMA chunk (2 MiB at maximum) */ uint32_t uncompressed; /* * Compressed size of LZMA chunk or compressed/uncompressed * size of uncompressed chunk (64 KiB at maximum) */ uint32_t compressed; /* * True if dictionary reset is needed. This is false before * the first chunk (LZMA or uncompressed). */ bool need_dict_reset; /* * True if new LZMA properties are needed. This is false * before the first LZMA chunk. */ bool need_props; #ifdef XZ_DEC_MICROLZMA bool pedantic_microlzma; #endif }; struct xz_dec_lzma2 { /* * The order below is important on x86 to reduce code size and * it shouldn't hurt on other platforms. Everything up to and * including lzma.pos_mask are in the first 128 bytes on x86-32, * which allows using smaller instructions to access those * variables. On x86-64, fewer variables fit into the first 128 * bytes, but this is still the best order without sacrificing * the readability by splitting the structures. */ struct rc_dec rc; struct dictionary dict; struct lzma2_dec lzma2; struct lzma_dec lzma; /* * Temporary buffer which holds small number of input bytes between * decoder calls. See lzma2_lzma() for details. */ struct { uint32_t size; uint8_t buf[3 * LZMA_IN_REQUIRED]; } temp; }; /************** * Dictionary * **************/ /* * Reset the dictionary state. When in single-call mode, set up the beginning * of the dictionary to point to the actual output buffer. */ static void dict_reset(struct dictionary *dict, struct xz_buf *b) { if (DEC_IS_SINGLE(dict->mode)) { dict->buf = b->out + b->out_pos; dict->end = b->out_size - b->out_pos; } dict->start = 0; dict->pos = 0; dict->limit = 0; dict->full = 0; } /* Set dictionary write limit */ static void dict_limit(struct dictionary *dict, size_t out_max) { if (dict->end - dict->pos <= out_max) dict->limit = dict->end; else dict->limit = dict->pos + out_max; } /* Return true if at least one byte can be written into the dictionary. */ static inline bool dict_has_space(const struct dictionary *dict) { return dict->pos < dict->limit; } /* * Get a byte from the dictionary at the given distance. The distance is * assumed to valid, or as a special case, zero when the dictionary is * still empty. This special case is needed for single-call decoding to * avoid writing a '\0' to the end of the destination buffer. */ static inline uint32_t dict_get(const struct dictionary *dict, uint32_t dist) { size_t offset = dict->pos - dist - 1; if (dist >= dict->pos) offset += dict->end; return dict->full > 0 ? dict->buf[offset] : 0; } /* * Put one byte into the dictionary. It is assumed that there is space for it. */ static inline void dict_put(struct dictionary *dict, uint8_t byte) { dict->buf[dict->pos++] = byte; if (dict->full < dict->pos) dict->full = dict->pos; } /* * Repeat given number of bytes from the given distance. If the distance is * invalid, false is returned. On success, true is returned and *len is * updated to indicate how many bytes were left to be repeated. */ static bool dict_repeat(struct dictionary *dict, uint32_t *len, uint32_t dist) { size_t back; uint32_t left; if (dist >= dict->full || dist >= dict->size) return false; left = min_t(size_t, dict->limit - dict->pos, *len); *len -= left; back = dict->pos - dist - 1; if (dist >= dict->pos) back += dict->end; do { dict->buf[dict->pos++] = dict->buf[back++]; if (back == dict->end) back = 0; } while (--left > 0); if (dict->full < dict->pos) dict->full = dict->pos; return true; } /* Copy uncompressed data as is from input to dictionary and output buffers. */ static void dict_uncompressed(struct dictionary *dict, struct xz_buf *b, uint32_t *left) { size_t copy_size; while (*left > 0 && b->in_pos < b->in_size && b->out_pos < b->out_size) { copy_size = min(b->in_size - b->in_pos, b->out_size - b->out_pos); if (copy_size > dict->end - dict->pos) copy_size = dict->end - dict->pos; if (copy_size > *left) copy_size = *left; *left -= copy_size; /* * If doing in-place decompression in single-call mode and the * uncompressed size of the file is larger than the caller * thought (i.e. it is invalid input!), the buffers below may * overlap and cause undefined behavior with memcpy(). * With valid inputs memcpy() would be fine here. */ memmove(dict->buf + dict->pos, b->in + b->in_pos, copy_size); dict->pos += copy_size; if (dict->full < dict->pos) dict->full = dict->pos; if (DEC_IS_MULTI(dict->mode)) { if (dict->pos == dict->end) dict->pos = 0; /* * Like above but for multi-call mode: use memmove() * to avoid undefined behavior with invalid input. */ memmove(b->out + b->out_pos, b->in + b->in_pos, copy_size); } dict->start = dict->pos; b->out_pos += copy_size; b->in_pos += copy_size; } } #ifdef XZ_DEC_MICROLZMA # define DICT_FLUSH_SUPPORTS_SKIPPING true #else # define DICT_FLUSH_SUPPORTS_SKIPPING false #endif /* * Flush pending data from dictionary to b->out. It is assumed that there is * enough space in b->out. This is guaranteed because caller uses dict_limit() * before decoding data into the dictionary. */ static uint32_t dict_flush(struct dictionary *dict, struct xz_buf *b) { size_t copy_size = dict->pos - dict->start; if (DEC_IS_MULTI(dict->mode)) { if (dict->pos == dict->end) dict->pos = 0; /* * These buffers cannot overlap even if doing in-place * decompression because in multi-call mode dict->buf * has been allocated by us in this file; it's not * provided by the caller like in single-call mode. * * With MicroLZMA, b->out can be NULL to skip bytes that * the caller doesn't need. This cannot be done with XZ * because it would break BCJ filters. */ if (!DICT_FLUSH_SUPPORTS_SKIPPING || b->out != NULL) memcpy(b->out + b->out_pos, dict->buf + dict->start, copy_size); } dict->start = dict->pos; b->out_pos += copy_size; return copy_size; } /***************** * Range decoder * *****************/ /* Reset the range decoder. */ static void rc_reset(struct rc_dec *rc) { rc->range = (uint32_t)-1; rc->code = 0; rc->init_bytes_left = RC_INIT_BYTES; } /* * Read the first five initial bytes into rc->code if they haven't been * read already. (Yes, the first byte gets completely ignored.) */ static bool rc_read_init(struct rc_dec *rc, struct xz_buf *b) { while (rc->init_bytes_left > 0) { if (b->in_pos == b->in_size) return false; rc->code = (rc->code << 8) + b->in[b->in_pos++]; --rc->init_bytes_left; } return true; } /* Return true if there may not be enough input for the next decoding loop. */ static inline bool rc_limit_exceeded(const struct rc_dec *rc) { return rc->in_pos > rc->in_limit; } /* * Return true if it is possible (from point of view of range decoder) that * we have reached the end of the LZMA chunk. */ static inline bool rc_is_finished(const struct rc_dec *rc) { return rc->code == 0; } /* Read the next input byte if needed. */ static __always_inline void rc_normalize(struct rc_dec *rc) { if (rc->range < RC_TOP_VALUE) { rc->range <<= RC_SHIFT_BITS; rc->code = (rc->code << RC_SHIFT_BITS) + rc->in[rc->in_pos++]; } } /* * Decode one bit. In some versions, this function has been split in three * functions so that the compiler is supposed to be able to more easily avoid * an extra branch. In this particular version of the LZMA decoder, this * doesn't seem to be a good idea (tested with GCC 3.3.6, 3.4.6, and 4.3.3 * on x86). Using a non-split version results in nicer looking code too. * * NOTE: This must return an int. Do not make it return a bool or the speed * of the code generated by GCC 3.x decreases 10-15 %. (GCC 4.3 doesn't care, * and it generates 10-20 % faster code than GCC 3.x from this file anyway.) */ static __always_inline int rc_bit(struct rc_dec *rc, uint16_t *prob) { uint32_t bound; int bit; rc_normalize(rc); bound = (rc->range >> RC_BIT_MODEL_TOTAL_BITS) * *prob; if (rc->code < bound) { rc->range = bound; *prob += (RC_BIT_MODEL_TOTAL - *prob) >> RC_MOVE_BITS; bit = 0; } else { rc->range -= bound; rc->code -= bound; *prob -= *prob >> RC_MOVE_BITS; bit = 1; } return bit; } /* Decode a bittree starting from the most significant bit. */ static __always_inline uint32_t rc_bittree(struct rc_dec *rc, uint16_t *probs, uint32_t limit) { uint32_t symbol = 1; do { if (rc_bit(rc, &probs[symbol])) symbol = (symbol << 1) + 1; else symbol <<= 1; } while (symbol < limit); return symbol; } /* Decode a bittree starting from the least significant bit. */ static __always_inline void rc_bittree_reverse(struct rc_dec *rc, uint16_t *probs, uint32_t *dest, uint32_t limit) { uint32_t symbol = 1; uint32_t i = 0; do { if (rc_bit(rc, &probs[symbol])) { symbol = (symbol << 1) + 1; *dest += 1 << i; } else { symbol <<= 1; } } while (++i < limit); } /* Decode direct bits (fixed fifty-fifty probability) */ static inline void rc_direct(struct rc_dec *rc, uint32_t *dest, uint32_t limit) { uint32_t mask; do { rc_normalize(rc); rc->range >>= 1; rc->code -= rc->range; mask = (uint32_t)0 - (rc->code >> 31); rc->code += rc->range & mask; *dest = (*dest << 1) + (mask + 1); } while (--limit > 0); } /******** * LZMA * ********/ /* Get pointer to literal coder probability array. */ static uint16_t *lzma_literal_probs(struct xz_dec_lzma2 *s) { uint32_t prev_byte = dict_get(&s->dict, 0); uint32_t low = prev_byte >> (8 - s->lzma.lc); uint32_t high = (s->dict.pos & s->lzma.literal_pos_mask) << s->lzma.lc; return s->lzma.literal[low + high]; } /* Decode a literal (one 8-bit byte) */ static void lzma_literal(struct xz_dec_lzma2 *s) { uint16_t *probs; uint32_t symbol; uint32_t match_byte; uint32_t match_bit; uint32_t offset; uint32_t i; probs = lzma_literal_probs(s); if (lzma_state_is_literal(s->lzma.state)) { symbol = rc_bittree(&s->rc, probs, 0x100); } else { symbol = 1; match_byte = dict_get(&s->dict, s->lzma.rep0) << 1; offset = 0x100; do { match_bit = match_byte & offset; match_byte <<= 1; i = offset + match_bit + symbol; if (rc_bit(&s->rc, &probs[i])) { symbol = (symbol << 1) + 1; offset &= match_bit; } else { symbol <<= 1; offset &= ~match_bit; } } while (symbol < 0x100); } dict_put(&s->dict, (uint8_t)symbol); lzma_state_literal(&s->lzma.state); } /* Decode the length of the match into s->lzma.len. */ static void lzma_len(struct xz_dec_lzma2 *s, struct lzma_len_dec *l, uint32_t pos_state) { uint16_t *probs; uint32_t limit; if (!rc_bit(&s->rc, &l->choice)) { probs = l->low[pos_state]; limit = LEN_LOW_SYMBOLS; s->lzma.len = MATCH_LEN_MIN; } else { if (!rc_bit(&s->rc, &l->choice2)) { probs = l->mid[pos_state]; limit = LEN_MID_SYMBOLS; s->lzma.len = MATCH_LEN_MIN + LEN_LOW_SYMBOLS; } else { probs = l->high; limit = LEN_HIGH_SYMBOLS; s->lzma.len = MATCH_LEN_MIN + LEN_LOW_SYMBOLS + LEN_MID_SYMBOLS; } } s->lzma.len += rc_bittree(&s->rc, probs, limit) - limit; } /* Decode a match. The distance will be stored in s->lzma.rep0. */ static void lzma_match(struct xz_dec_lzma2 *s, uint32_t pos_state) { uint16_t *probs; uint32_t dist_slot; uint32_t limit; lzma_state_match(&s->lzma.state); s->lzma.rep3 = s->lzma.rep2; s->lzma.rep2 = s->lzma.rep1; s->lzma.rep1 = s->lzma.rep0; lzma_len(s, &s->lzma.match_len_dec, pos_state); probs = s->lzma.dist_slot[lzma_get_dist_state(s->lzma.len)]; dist_slot = rc_bittree(&s->rc, probs, DIST_SLOTS) - DIST_SLOTS; if (dist_slot < DIST_MODEL_START) { s->lzma.rep0 = dist_slot; } else { limit = (dist_slot >> 1) - 1; s->lzma.rep0 = 2 + (dist_slot & 1); if (dist_slot < DIST_MODEL_END) { s->lzma.rep0 <<= limit; probs = s->lzma.dist_special + s->lzma.rep0 - dist_slot - 1; rc_bittree_reverse(&s->rc, probs, &s->lzma.rep0, limit); } else { rc_direct(&s->rc, &s->lzma.rep0, limit - ALIGN_BITS); s->lzma.rep0 <<= ALIGN_BITS; rc_bittree_reverse(&s->rc, s->lzma.dist_align, &s->lzma.rep0, ALIGN_BITS); } } } /* * Decode a repeated match. The distance is one of the four most recently * seen matches. The distance will be stored in s->lzma.rep0. */ static void lzma_rep_match(struct xz_dec_lzma2 *s, uint32_t pos_state) { uint32_t tmp; if (!rc_bit(&s->rc, &s->lzma.is_rep0[s->lzma.state])) { if (!rc_bit(&s->rc, &s->lzma.is_rep0_long[ s->lzma.state][pos_state])) { lzma_state_short_rep(&s->lzma.state); s->lzma.len = 1; return; } } else { if (!rc_bit(&s->rc, &s->lzma.is_rep1[s->lzma.state])) { tmp = s->lzma.rep1; } else { if (!rc_bit(&s->rc, &s->lzma.is_rep2[s->lzma.state])) { tmp = s->lzma.rep2; } else { tmp = s->lzma.rep3; s->lzma.rep3 = s->lzma.rep2; } s->lzma.rep2 = s->lzma.rep1; } s->lzma.rep1 = s->lzma.rep0; s->lzma.rep0 = tmp; } lzma_state_long_rep(&s->lzma.state); lzma_len(s, &s->lzma.rep_len_dec, pos_state); } /* LZMA decoder core */ static bool lzma_main(struct xz_dec_lzma2 *s) { uint32_t pos_state; /* * If the dictionary was reached during the previous call, try to * finish the possibly pending repeat in the dictionary. */ if (dict_has_space(&s->dict) && s->lzma.len > 0) dict_repeat(&s->dict, &s->lzma.len, s->lzma.rep0); /* * Decode more LZMA symbols. One iteration may consume up to * LZMA_IN_REQUIRED - 1 bytes. */ while (dict_has_space(&s->dict) && !rc_limit_exceeded(&s->rc)) { pos_state = s->dict.pos & s->lzma.pos_mask; if (!rc_bit(&s->rc, &s->lzma.is_match[ s->lzma.state][pos_state])) { lzma_literal(s); } else { if (rc_bit(&s->rc, &s->lzma.is_rep[s->lzma.state])) lzma_rep_match(s, pos_state); else lzma_match(s, pos_state); if (!dict_repeat(&s->dict, &s->lzma.len, s->lzma.rep0)) return false; } } /* * Having the range decoder always normalized when we are outside * this function makes it easier to correctly handle end of the chunk. */ rc_normalize(&s->rc); return true; } /* * Reset the LZMA decoder and range decoder state. Dictionary is not reset * here, because LZMA state may be reset without resetting the dictionary. */ static void lzma_reset(struct xz_dec_lzma2 *s) { uint16_t *probs; size_t i; s->lzma.state = STATE_LIT_LIT; s->lzma.rep0 = 0; s->lzma.rep1 = 0; s->lzma.rep2 = 0; s->lzma.rep3 = 0; s->lzma.len = 0; /* * All probabilities are initialized to the same value. This hack * makes the code smaller by avoiding a separate loop for each * probability array. * * This could be optimized so that only that part of literal * probabilities that are actually required. In the common case * we would write 12 KiB less. */ probs = s->lzma.is_match[0]; for (i = 0; i < PROBS_TOTAL; ++i) probs[i] = RC_BIT_MODEL_TOTAL / 2; rc_reset(&s->rc); } /* * Decode and validate LZMA properties (lc/lp/pb) and calculate the bit masks * from the decoded lp and pb values. On success, the LZMA decoder state is * reset and true is returned. */ static bool lzma_props(struct xz_dec_lzma2 *s, uint8_t props) { if (props > (4 * 5 + 4) * 9 + 8) return false; s->lzma.pos_mask = 0; while (props >= 9 * 5) { props -= 9 * 5; ++s->lzma.pos_mask; } s->lzma.pos_mask = (1 << s->lzma.pos_mask) - 1; s->lzma.literal_pos_mask = 0; while (props >= 9) { props -= 9; ++s->lzma.literal_pos_mask; } s->lzma.lc = props; if (s->lzma.lc + s->lzma.literal_pos_mask > 4) return false; s->lzma.literal_pos_mask = (1 << s->lzma.literal_pos_mask) - 1; lzma_reset(s); return true; } /********* * LZMA2 * *********/ /* * The LZMA decoder assumes that if the input limit (s->rc.in_limit) hasn't * been exceeded, it is safe to read up to LZMA_IN_REQUIRED bytes. This * wrapper function takes care of making the LZMA decoder's assumption safe. * * As long as there is plenty of input left to be decoded in the current LZMA * chunk, we decode directly from the caller-supplied input buffer until * there's LZMA_IN_REQUIRED bytes left. Those remaining bytes are copied into * s->temp.buf, which (hopefully) gets filled on the next call to this * function. We decode a few bytes from the temporary buffer so that we can * continue decoding from the caller-supplied input buffer again. */ static bool lzma2_lzma(struct xz_dec_lzma2 *s, struct xz_buf *b) { size_t in_avail; uint32_t tmp; in_avail = b->in_size - b->in_pos; if (s->temp.size > 0 || s->lzma2.compressed == 0) { tmp = 2 * LZMA_IN_REQUIRED - s->temp.size; if (tmp > s->lzma2.compressed - s->temp.size) tmp = s->lzma2.compressed - s->temp.size; if (tmp > in_avail) tmp = in_avail; memcpy(s->temp.buf + s->temp.size, b->in + b->in_pos, tmp); if (s->temp.size + tmp == s->lzma2.compressed) { memzero(s->temp.buf + s->temp.size + tmp, sizeof(s->temp.buf) - s->temp.size - tmp); s->rc.in_limit = s->temp.size + tmp; } else if (s->temp.size + tmp < LZMA_IN_REQUIRED) { s->temp.size += tmp; b->in_pos += tmp; return true; } else { s->rc.in_limit = s->temp.size + tmp - LZMA_IN_REQUIRED; } s->rc.in = s->temp.buf; s->rc.in_pos = 0; if (!lzma_main(s) || s->rc.in_pos > s->temp.size + tmp) return false; s->lzma2.compressed -= s->rc.in_pos; if (s->rc.in_pos < s->temp.size) { s->temp.size -= s->rc.in_pos; memmove(s->temp.buf, s->temp.buf + s->rc.in_pos, s->temp.size); return true; } b->in_pos += s->rc.in_pos - s->temp.size; s->temp.size = 0; } in_avail = b->in_size - b->in_pos; if (in_avail >= LZMA_IN_REQUIRED) { s->rc.in = b->in; s->rc.in_pos = b->in_pos; if (in_avail >= s->lzma2.compressed + LZMA_IN_REQUIRED) s->rc.in_limit = b->in_pos + s->lzma2.compressed; else s->rc.in_limit = b->in_size - LZMA_IN_REQUIRED; if (!lzma_main(s)) return false; in_avail = s->rc.in_pos - b->in_pos; if (in_avail > s->lzma2.compressed) return false; s->lzma2.compressed -= in_avail; b->in_pos = s->rc.in_pos; } in_avail = b->in_size - b->in_pos; if (in_avail < LZMA_IN_REQUIRED) { if (in_avail > s->lzma2.compressed) in_avail = s->lzma2.compressed; memcpy(s->temp.buf, b->in + b->in_pos, in_avail); s->temp.size = in_avail; b->in_pos += in_avail; } return true; } /* * Take care of the LZMA2 control layer, and forward the job of actual LZMA * decoding or copying of uncompressed chunks to other functions. */ enum xz_ret xz_dec_lzma2_run(struct xz_dec_lzma2 *s, struct xz_buf *b) { uint32_t tmp; while (b->in_pos < b->in_size || s->lzma2.sequence == SEQ_LZMA_RUN) { switch (s->lzma2.sequence) { case SEQ_CONTROL: /* * LZMA2 control byte * * Exact values: * 0x00 End marker * 0x01 Dictionary reset followed by * an uncompressed chunk * 0x02 Uncompressed chunk (no dictionary reset) * * Highest three bits (s->control & 0xE0): * 0xE0 Dictionary reset, new properties and state * reset, followed by LZMA compressed chunk * 0xC0 New properties and state reset, followed * by LZMA compressed chunk (no dictionary * reset) * 0xA0 State reset using old properties, * followed by LZMA compressed chunk (no * dictionary reset) * 0x80 LZMA chunk (no dictionary or state reset) * * For LZMA compressed chunks, the lowest five bits * (s->control & 1F) are the highest bits of the * uncompressed size (bits 16-20). * * A new LZMA2 stream must begin with a dictionary * reset. The first LZMA chunk must set new * properties and reset the LZMA state. * * Values that don't match anything described above * are invalid and we return XZ_DATA_ERROR. */ tmp = b->in[b->in_pos++]; if (tmp == 0x00) return XZ_STREAM_END; if (tmp >= 0xE0 || tmp == 0x01) { s->lzma2.need_props = true; s->lzma2.need_dict_reset = false; dict_reset(&s->dict, b); } else if (s->lzma2.need_dict_reset) { return XZ_DATA_ERROR; } if (tmp >= 0x80) { s->lzma2.uncompressed = (tmp & 0x1F) << 16; s->lzma2.sequence = SEQ_UNCOMPRESSED_1; if (tmp >= 0xC0) { /* * When there are new properties, * state reset is done at * SEQ_PROPERTIES. */ s->lzma2.need_props = false; s->lzma2.next_sequence = SEQ_PROPERTIES; } else if (s->lzma2.need_props) { return XZ_DATA_ERROR; } else { s->lzma2.next_sequence = SEQ_LZMA_PREPARE; if (tmp >= 0xA0) lzma_reset(s); } } else { if (tmp > 0x02) return XZ_DATA_ERROR; s->lzma2.sequence = SEQ_COMPRESSED_0; s->lzma2.next_sequence = SEQ_COPY; } break; case SEQ_UNCOMPRESSED_1: s->lzma2.uncompressed += (uint32_t)b->in[b->in_pos++] << 8; s->lzma2.sequence = SEQ_UNCOMPRESSED_2; break; case SEQ_UNCOMPRESSED_2: s->lzma2.uncompressed += (uint32_t)b->in[b->in_pos++] + 1; s->lzma2.sequence = SEQ_COMPRESSED_0; break; case SEQ_COMPRESSED_0: s->lzma2.compressed = (uint32_t)b->in[b->in_pos++] << 8; s->lzma2.sequence = SEQ_COMPRESSED_1; break; case SEQ_COMPRESSED_1: s->lzma2.compressed += (uint32_t)b->in[b->in_pos++] + 1; s->lzma2.sequence = s->lzma2.next_sequence; break; case SEQ_PROPERTIES: if (!lzma_props(s, b->in[b->in_pos++])) return XZ_DATA_ERROR; s->lzma2.sequence = SEQ_LZMA_PREPARE; fallthrough; case SEQ_LZMA_PREPARE: if (s->lzma2.compressed < RC_INIT_BYTES) return XZ_DATA_ERROR; if (!rc_read_init(&s->rc, b)) return XZ_OK; s->lzma2.compressed -= RC_INIT_BYTES; s->lzma2.sequence = SEQ_LZMA_RUN; fallthrough; case SEQ_LZMA_RUN: /* * Set dictionary limit to indicate how much we want * to be encoded at maximum. Decode new data into the * dictionary. Flush the new data from dictionary to * b->out. Check if we finished decoding this chunk. * In case the dictionary got full but we didn't fill * the output buffer yet, we may run this loop * multiple times without changing s->lzma2.sequence. */ dict_limit(&s->dict, min_t(size_t, b->out_size - b->out_pos, s->lzma2.uncompressed)); if (!lzma2_lzma(s, b)) return XZ_DATA_ERROR; s->lzma2.uncompressed -= dict_flush(&s->dict, b); if (s->lzma2.uncompressed == 0) { if (s->lzma2.compressed > 0 || s->lzma.len > 0 || !rc_is_finished(&s->rc)) return XZ_DATA_ERROR; rc_reset(&s->rc); s->lzma2.sequence = SEQ_CONTROL; } else if (b->out_pos == b->out_size || (b->in_pos == b->in_size && s->temp.size < s->lzma2.compressed)) { return XZ_OK; } break; case SEQ_COPY: dict_uncompressed(&s->dict, b, &s->lzma2.compressed); if (s->lzma2.compressed > 0) return XZ_OK; s->lzma2.sequence = SEQ_CONTROL; break; } } return XZ_OK; } struct xz_dec_lzma2 *xz_dec_lzma2_create(enum xz_mode mode, uint32_t dict_max) { struct xz_dec_lzma2 *s = kmalloc_obj(*s); if (s == NULL) return NULL; s->dict.mode = mode; s->dict.size_max = dict_max; if (DEC_IS_PREALLOC(mode)) { s->dict.buf = vmalloc(dict_max); if (s->dict.buf == NULL) { kfree(s); return NULL; } } else if (DEC_IS_DYNALLOC(mode)) { s->dict.buf = NULL; s->dict.allocated = 0; } return s; } enum xz_ret xz_dec_lzma2_reset(struct xz_dec_lzma2 *s, uint8_t props) { /* This limits dictionary size to 3 GiB to keep parsing simpler. */ if (props > 39) return XZ_OPTIONS_ERROR; s->dict.size = 2 + (props & 1); s->dict.size <<= (props >> 1) + 11; if (DEC_IS_MULTI(s->dict.mode)) { if (s->dict.size > s->dict.size_max) return XZ_MEMLIMIT_ERROR; s->dict.end = s->dict.size; if (DEC_IS_DYNALLOC(s->dict.mode)) { if (s->dict.allocated < s->dict.size) { s->dict.allocated = s->dict.size; vfree(s->dict.buf); s->dict.buf = vmalloc(s->dict.size); if (s->dict.buf == NULL) { s->dict.allocated = 0; return XZ_MEM_ERROR; } } } } s->lzma2.sequence = SEQ_CONTROL; s->lzma2.need_dict_reset = true; s->temp.size = 0; return XZ_OK; } void xz_dec_lzma2_end(struct xz_dec_lzma2 *s) { if (DEC_IS_MULTI(s->dict.mode)) vfree(s->dict.buf); kfree(s); } #ifdef XZ_DEC_MICROLZMA /* This is a wrapper struct to have a nice struct name in the public API. */ struct xz_dec_microlzma { struct xz_dec_lzma2 s; }; enum xz_ret xz_dec_microlzma_run(struct xz_dec_microlzma *s_ptr, struct xz_buf *b) { struct xz_dec_lzma2 *s = &s_ptr->s; /* * sequence is SEQ_PROPERTIES before the first input byte, * SEQ_LZMA_PREPARE until a total of five bytes have been read, * and SEQ_LZMA_RUN for the rest of the input stream. */ if (s->lzma2.sequence != SEQ_LZMA_RUN) { if (s->lzma2.sequence == SEQ_PROPERTIES) { /* One byte is needed for the props. */ if (b->in_pos >= b->in_size) return XZ_OK; /* * Don't increment b->in_pos here. The same byte is * also passed to rc_read_init() which will ignore it. */ if (!lzma_props(s, ~b->in[b->in_pos])) return XZ_DATA_ERROR; s->lzma2.sequence = SEQ_LZMA_PREPARE; } /* * xz_dec_microlzma_reset() doesn't validate the compressed * size so we do it here. We have to limit the maximum size * to avoid integer overflows in lzma2_lzma(). 3 GiB is a nice * round number and much more than users of this code should * ever need. */ if (s->lzma2.compressed < RC_INIT_BYTES || s->lzma2.compressed > (3U << 30)) return XZ_DATA_ERROR; if (!rc_read_init(&s->rc, b)) return XZ_OK; s->lzma2.compressed -= RC_INIT_BYTES; s->lzma2.sequence = SEQ_LZMA_RUN; dict_reset(&s->dict, b); } /* This is to allow increasing b->out_size between calls. */ if (DEC_IS_SINGLE(s->dict.mode)) s->dict.end = b->out_size - b->out_pos; while (true) { dict_limit(&s->dict, min_t(size_t, b->out_size - b->out_pos, s->lzma2.uncompressed)); if (!lzma2_lzma(s, b)) return XZ_DATA_ERROR; s->lzma2.uncompressed -= dict_flush(&s->dict, b); if (s->lzma2.uncompressed == 0) { if (s->lzma2.pedantic_microlzma) { if (s->lzma2.compressed > 0 || s->lzma.len > 0 || !rc_is_finished(&s->rc)) return XZ_DATA_ERROR; } return XZ_STREAM_END; } if (b->out_pos == b->out_size) return XZ_OK; if (b->in_pos == b->in_size && s->temp.size < s->lzma2.compressed) return XZ_OK; } } struct xz_dec_microlzma *xz_dec_microlzma_alloc(enum xz_mode mode, uint32_t dict_size) { struct xz_dec_microlzma *s; /* Restrict dict_size to the same range as in the LZMA2 code. */ if (dict_size < 4096 || dict_size > (3U << 30)) return NULL; s = kmalloc_obj(*s); if (s == NULL) return NULL; s->s.dict.mode = mode; s->s.dict.size = dict_size; if (DEC_IS_MULTI(mode)) { s->s.dict.end = dict_size; s->s.dict.buf = vmalloc(dict_size); if (s->s.dict.buf == NULL) { kfree(s); return NULL; } } return s; } void xz_dec_microlzma_reset(struct xz_dec_microlzma *s, uint32_t comp_size, uint32_t uncomp_size, int uncomp_size_is_exact) { /* * comp_size is validated in xz_dec_microlzma_run(). * uncomp_size can safely be anything. */ s->s.lzma2.compressed = comp_size; s->s.lzma2.uncompressed = uncomp_size; s->s.lzma2.pedantic_microlzma = uncomp_size_is_exact; s->s.lzma2.sequence = SEQ_PROPERTIES; s->s.temp.size = 0; } void xz_dec_microlzma_end(struct xz_dec_microlzma *s) { if (DEC_IS_MULTI(s->s.dict.mode)) vfree(s->s.dict.buf); kfree(s); } #endif
16 16 18 1 17 7 17 2 16 2 7 2 4 14 7 5 1 2 2 2 3 2 1 3 3 3 2 2 2 2 2 2 3 3 3 1 2 1 1 4 4 1 1 1 1 1 1 1 1 1 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 /* FUSE: Filesystem in Userspace Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu> This program can be distributed under the terms of the GNU GPL. See the file COPYING. */ #include "fuse_i.h" #include <linux/pagemap.h> #include <linux/slab.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/sched/signal.h> #include <linux/module.h> #include <linux/swap.h> #include <linux/falloc.h> #include <linux/uio.h> #include <linux/fs.h> #include <linux/filelock.h> #include <linux/splice.h> #include <linux/task_io_accounting_ops.h> #include <linux/iomap.h> static int fuse_send_open(struct fuse_mount *fm, u64 nodeid, unsigned int open_flags, int opcode, struct fuse_open_out *outargp) { struct fuse_open_in inarg; FUSE_ARGS(args); memset(&inarg, 0, sizeof(inarg)); inarg.flags = open_flags & ~(O_CREAT | O_EXCL | O_NOCTTY); if (!fm->fc->atomic_o_trunc) inarg.flags &= ~O_TRUNC; if (fm->fc->handle_killpriv_v2 && (inarg.flags & O_TRUNC) && !capable(CAP_FSETID)) { inarg.open_flags |= FUSE_OPEN_KILL_SUIDGID; } args.opcode = opcode; args.nodeid = nodeid; args.in_numargs = 1; args.in_args[0].size = sizeof(inarg); args.in_args[0].value = &inarg; args.out_numargs = 1; args.out_args[0].size = sizeof(*outargp); args.out_args[0].value = outargp; return fuse_simple_request(fm, &args); } struct fuse_file *fuse_file_alloc(struct fuse_mount *fm, bool release) { struct fuse_file *ff; ff = kzalloc_obj(struct fuse_file, GFP_KERNEL_ACCOUNT); if (unlikely(!ff)) return NULL; ff->fm = fm; if (release) { ff->args = kzalloc_obj(*ff->args, GFP_KERNEL_ACCOUNT); if (!ff->args) { kfree(ff); return NULL; } } INIT_LIST_HEAD(&ff->write_entry); refcount_set(&ff->count, 1); RB_CLEAR_NODE(&ff->polled_node); init_waitqueue_head(&ff->poll_wait); ff->kh = atomic64_inc_return(&fm->fc->khctr); return ff; } void fuse_file_free(struct fuse_file *ff) { kfree(ff->args); kfree(ff); } static struct fuse_file *fuse_file_get(struct fuse_file *ff) { refcount_inc(&ff->count); return ff; } static void fuse_release_end(struct fuse_mount *fm, struct fuse_args *args, int error) { struct fuse_release_args *ra = container_of(args, typeof(*ra), args); iput(ra->inode); kfree(ra); } static void fuse_file_put(struct fuse_file *ff, bool sync) { if (refcount_dec_and_test(&ff->count)) { struct fuse_release_args *ra = &ff->args->release_args; struct fuse_args *args = (ra ? &ra->args : NULL); if (ra && ra->inode) fuse_file_io_release(ff, ra->inode); if (!args) { /* Do nothing when server does not implement 'opendir' */ } else if (args->opcode == FUSE_RELEASE && ff->fm->fc->no_open) { fuse_release_end(ff->fm, args, 0); } else if (sync) { fuse_simple_request(ff->fm, args); fuse_release_end(ff->fm, args, 0); } else { args->end = fuse_release_end; if (fuse_simple_background(ff->fm, args, GFP_KERNEL | __GFP_NOFAIL)) fuse_release_end(ff->fm, args, -ENOTCONN); } kfree(ff); } } struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid, unsigned int open_flags, bool isdir) { struct fuse_conn *fc = fm->fc; struct fuse_file *ff; int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN; bool open = isdir ? !fc->no_opendir : !fc->no_open; bool release = !isdir || open; /* * ff->args->release_args still needs to be allocated (so we can hold an * inode reference while there are pending inflight file operations when * ->release() is called, see fuse_prepare_release()) even if * fc->no_open is set else it becomes possible for reclaim to deadlock * if while servicing the readahead request the server triggers reclaim * and reclaim evicts the inode of the file being read ahead. */ ff = fuse_file_alloc(fm, release); if (!ff) return ERR_PTR(-ENOMEM); ff->fh = 0; /* Default for no-open */ ff->open_flags = FOPEN_KEEP_CACHE | (isdir ? FOPEN_CACHE_DIR : 0); if (open) { /* Store outarg for fuse_finish_open() */ struct fuse_open_out *outargp = &ff->args->open_outarg; int err; err = fuse_send_open(fm, nodeid, open_flags, opcode, outargp); if (!err) { ff->fh = outargp->fh; ff->open_flags = outargp->open_flags; } else if (err != -ENOSYS) { fuse_file_free(ff); return ERR_PTR(err); } else { if (isdir) { /* No release needed */ kfree(ff->args); ff->args = NULL; fc->no_opendir = 1; } else { fc->no_open = 1; } } } if (isdir) ff->open_flags &= ~FOPEN_DIRECT_IO; ff->nodeid = nodeid; return ff; } int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file, bool isdir) { struct fuse_file *ff = fuse_file_open(fm, nodeid, file->f_flags, isdir); if (!IS_ERR(ff)) file->private_data = ff; return PTR_ERR_OR_ZERO(ff); } EXPORT_SYMBOL_GPL(fuse_do_open); static void fuse_link_write_file(struct file *file) { struct inode *inode = file_inode(file); struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_file *ff = file->private_data; /* * file may be written through mmap, so chain it onto the * inodes's write_file list */ spin_lock(&fi->lock); if (list_empty(&ff->write_entry)) list_add(&ff->write_entry, &fi->write_files); spin_unlock(&fi->lock); } int fuse_finish_open(struct inode *inode, struct file *file) { struct fuse_file *ff = file->private_data; struct fuse_conn *fc = get_fuse_conn(inode); int err; err = fuse_file_io_open(file, inode); if (err) return err; if (ff->open_flags & FOPEN_STREAM) stream_open(inode, file); else if (ff->open_flags & FOPEN_NONSEEKABLE) nonseekable_open(inode, file); if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache) fuse_link_write_file(file); return 0; } static void fuse_truncate_update_attr(struct inode *inode, struct file *file) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); spin_lock(&fi->lock); fi->attr_version = atomic64_inc_return(&fc->attr_version); i_size_write(inode, 0); spin_unlock(&fi->lock); file_update_time(file); fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); } static int fuse_open(struct inode *inode, struct file *file) { struct fuse_mount *fm = get_fuse_mount(inode); struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = fm->fc; struct fuse_file *ff; int err; bool is_truncate = (file->f_flags & O_TRUNC) && fc->atomic_o_trunc; bool is_wb_truncate = is_truncate && fc->writeback_cache; bool dax_truncate = is_truncate && FUSE_IS_DAX(inode); if (fuse_is_bad(inode)) return -EIO; err = generic_file_open(inode, file); if (err) return err; if (is_wb_truncate || dax_truncate) inode_lock(inode); if (dax_truncate) { filemap_invalidate_lock(inode->i_mapping); err = fuse_dax_break_layouts(inode, 0, -1); if (err) goto out_inode_unlock; } if (is_wb_truncate || dax_truncate) fuse_set_nowrite(inode); err = fuse_do_open(fm, get_node_id(inode), file, false); if (!err) { ff = file->private_data; err = fuse_finish_open(inode, file); if (err) fuse_sync_release(fi, ff, file->f_flags); else if (is_truncate) fuse_truncate_update_attr(inode, file); } if (is_wb_truncate || dax_truncate) fuse_release_nowrite(inode); if (!err) { if (is_truncate) truncate_pagecache(inode, 0); else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) invalidate_inode_pages2(inode->i_mapping); } if (dax_truncate) filemap_invalidate_unlock(inode->i_mapping); out_inode_unlock: if (is_wb_truncate || dax_truncate) inode_unlock(inode); return err; } static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff, unsigned int flags, int opcode, bool sync) { struct fuse_conn *fc = ff->fm->fc; struct fuse_release_args *ra = &ff->args->release_args; if (fuse_file_passthrough(ff)) fuse_passthrough_release(ff, fuse_inode_backing(fi)); /* Inode is NULL on error path of fuse_create_open() */ if (likely(fi)) { spin_lock(&fi->lock); list_del(&ff->write_entry); spin_unlock(&fi->lock); } spin_lock(&fc->lock); if (!RB_EMPTY_NODE(&ff->polled_node)) rb_erase(&ff->polled_node, &fc->polled_files); spin_unlock(&fc->lock); wake_up_interruptible_all(&ff->poll_wait); if (!ra) return; /* ff->args was used for open outarg */ memset(ff->args, 0, sizeof(*ff->args)); ra->inarg.fh = ff->fh; ra->inarg.flags = flags; ra->args.in_numargs = 1; ra->args.in_args[0].size = sizeof(struct fuse_release_in); ra->args.in_args[0].value = &ra->inarg; ra->args.opcode = opcode; ra->args.nodeid = ff->nodeid; ra->args.force = true; ra->args.nocreds = true; /* * Hold inode until release is finished. * From fuse_sync_release() the refcount is 1 and everything's * synchronous, so we are fine with not doing igrab() here. */ ra->inode = sync ? NULL : igrab(&fi->inode); } void fuse_file_release(struct inode *inode, struct fuse_file *ff, unsigned int open_flags, fl_owner_t id, bool isdir) { struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_release_args *ra = &ff->args->release_args; int opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE; fuse_prepare_release(fi, ff, open_flags, opcode, false); if (ra && ff->flock) { ra->inarg.release_flags |= FUSE_RELEASE_FLOCK_UNLOCK; ra->inarg.lock_owner = fuse_lock_owner_id(ff->fm->fc, id); } /* * Normally this will send the RELEASE request, however if * some asynchronous READ or WRITE requests are outstanding, * the sending will be delayed. * * Make the release synchronous if this is a fuseblk mount, * synchronous RELEASE is allowed (and desirable) in this case * because the server can be trusted not to screw up. * * Always use the asynchronous file put because the current thread * might be the fuse server. This can happen if a process starts some * aio and closes the fd before the aio completes. Since aio takes its * own ref to the file, the IO completion has to drop the ref, which is * how the fuse server can end up closing its clients' files. */ fuse_file_put(ff, false); } void fuse_release_common(struct file *file, bool isdir) { fuse_file_release(file_inode(file), file->private_data, file->f_flags, (fl_owner_t) file, isdir); } static int fuse_release(struct inode *inode, struct file *file) { struct fuse_conn *fc = get_fuse_conn(inode); /* * Dirty pages might remain despite write_inode_now() call from * fuse_flush() due to writes racing with the close. */ if (fc->writeback_cache) write_inode_now(inode, 1); fuse_release_common(file, false); /* return value is ignored by VFS */ return 0; } void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff, unsigned int flags) { WARN_ON(refcount_read(&ff->count) > 1); fuse_prepare_release(fi, ff, flags, FUSE_RELEASE, true); fuse_file_put(ff, true); } EXPORT_SYMBOL_GPL(fuse_sync_release); /* * Scramble the ID space with XTEA, so that the value of the files_struct * pointer is not exposed to userspace. */ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id) { u32 *k = fc->scramble_key; u64 v = (unsigned long) id; u32 v0 = v; u32 v1 = v >> 32; u32 sum = 0; int i; for (i = 0; i < 32; i++) { v0 += ((v1 << 4 ^ v1 >> 5) + v1) ^ (sum + k[sum & 3]); sum += 0x9E3779B9; v1 += ((v0 << 4 ^ v0 >> 5) + v0) ^ (sum + k[sum>>11 & 3]); } return (u64) v0 + ((u64) v1 << 32); } struct fuse_writepage_args { struct fuse_io_args ia; struct list_head queue_entry; struct inode *inode; struct fuse_sync_bucket *bucket; }; /* * Wait for all pending writepages on the inode to finish. * * This is currently done by blocking further writes with FUSE_NOWRITE * and waiting for all sent writes to complete. * * This must be called under i_mutex, otherwise the FUSE_NOWRITE usage * could conflict with truncation. */ static void fuse_sync_writes(struct inode *inode) { fuse_set_nowrite(inode); fuse_release_nowrite(inode); } static int fuse_flush(struct file *file, fl_owner_t id) { struct inode *inode = file_inode(file); struct fuse_mount *fm = get_fuse_mount(inode); struct fuse_file *ff = file->private_data; struct fuse_flush_in inarg; FUSE_ARGS(args); int err; if (fuse_is_bad(inode)) return -EIO; if (ff->open_flags & FOPEN_NOFLUSH && !fm->fc->writeback_cache) return 0; err = write_inode_now(inode, 1); if (err) return err; err = filemap_check_errors(file->f_mapping); if (err) return err; err = 0; if (fm->fc->no_flush) goto inval_attr_out; memset(&inarg, 0, sizeof(inarg)); inarg.fh = ff->fh; inarg.lock_owner = fuse_lock_owner_id(fm->fc, id); args.opcode = FUSE_FLUSH; args.nodeid = get_node_id(inode); args.in_numargs = 1; args.in_args[0].size = sizeof(inarg); args.in_args[0].value = &inarg; args.force = true; err = fuse_simple_request(fm, &args); if (err == -ENOSYS) { fm->fc->no_flush = 1; err = 0; } inval_attr_out: /* * In memory i_blocks is not maintained by fuse, if writeback cache is * enabled, i_blocks from cached attr may not be accurate. */ if (!err && fm->fc->writeback_cache) fuse_invalidate_attr_mask(inode, STATX_BLOCKS); return err; } int fuse_fsync_common(struct file *file, loff_t start, loff_t end, int datasync, int opcode) { struct inode *inode = file->f_mapping->host; struct fuse_mount *fm = get_fuse_mount(inode); struct fuse_file *ff = file->private_data; FUSE_ARGS(args); struct fuse_fsync_in inarg; memset(&inarg, 0, sizeof(inarg)); inarg.fh = ff->fh; inarg.fsync_flags = datasync ? FUSE_FSYNC_FDATASYNC : 0; args.opcode = opcode; args.nodeid = get_node_id(inode); args.in_numargs = 1; args.in_args[0].size = sizeof(inarg); args.in_args[0].value = &inarg; return fuse_simple_request(fm, &args); } static int fuse_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); int err; if (fuse_is_bad(inode)) return -EIO; inode_lock(inode); /* * Start writeback against all dirty pages of the inode, then * wait for all outstanding writes, before sending the FSYNC * request. */ err = file_write_and_wait_range(file, start, end); if (err) goto out; fuse_sync_writes(inode); /* * Due to implementation of fuse writeback * file_write_and_wait_range() does not catch errors. * We have to do this directly after fuse_sync_writes() */ err = file_check_and_advance_wb_err(file); if (err) goto out; err = sync_inode_metadata(inode, 1); if (err) goto out; if (fc->no_fsync) goto out; err = fuse_fsync_common(file, start, end, datasync, FUSE_FSYNC); if (err == -ENOSYS) { fc->no_fsync = 1; err = 0; } out: inode_unlock(inode); return err; } void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos, size_t count, int opcode) { struct fuse_file *ff = file->private_data; struct fuse_args *args = &ia->ap.args; ia->read.in.fh = ff->fh; ia->read.in.offset = pos; ia->read.in.size = count; ia->read.in.flags = file->f_flags; args->opcode = opcode; args->nodeid = ff->nodeid; args->in_numargs = 1; args->in_args[0].size = sizeof(ia->read.in); args->in_args[0].value = &ia->read.in; args->out_argvar = true; args->out_numargs = 1; args->out_args[0].size = count; } static void fuse_release_user_pages(struct fuse_args_pages *ap, ssize_t nres, bool should_dirty) { unsigned int i; for (i = 0; i < ap->num_folios; i++) { if (should_dirty) folio_mark_dirty_lock(ap->folios[i]); if (ap->args.is_pinned) unpin_folio(ap->folios[i]); } if (nres > 0 && ap->args.invalidate_vmap) invalidate_kernel_vmap_range(ap->args.vmap_base, nres); } static void fuse_io_release(struct kref *kref) { kfree(container_of(kref, struct fuse_io_priv, refcnt)); } static ssize_t fuse_get_res_by_io(struct fuse_io_priv *io) { if (io->err) return io->err; if (io->bytes >= 0 && io->write) return -EIO; return io->bytes < 0 ? io->size : io->bytes; } /* * In case of short read, the caller sets 'pos' to the position of * actual end of fuse request in IO request. Otherwise, if bytes_requested * == bytes_transferred or rw == WRITE, the caller sets 'pos' to -1. * * An example: * User requested DIO read of 64K. It was split into two 32K fuse requests, * both submitted asynchronously. The first of them was ACKed by userspace as * fully completed (req->out.args[0].size == 32K) resulting in pos == -1. The * second request was ACKed as short, e.g. only 1K was read, resulting in * pos == 33K. * * Thus, when all fuse requests are completed, the minimal non-negative 'pos' * will be equal to the length of the longest contiguous fragment of * transferred data starting from the beginning of IO request. */ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos) { int left; spin_lock(&io->lock); if (err) io->err = io->err ? : err; else if (pos >= 0 && (io->bytes < 0 || pos < io->bytes)) io->bytes = pos; left = --io->reqs; if (!left && io->blocking) complete(io->done); spin_unlock(&io->lock); if (!left && !io->blocking) { ssize_t res = fuse_get_res_by_io(io); if (res >= 0) { struct inode *inode = file_inode(io->iocb->ki_filp); struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); spin_lock(&fi->lock); fi->attr_version = atomic64_inc_return(&fc->attr_version); spin_unlock(&fi->lock); } io->iocb->ki_complete(io->iocb, res); } kref_put(&io->refcnt, fuse_io_release); } static struct fuse_io_args *fuse_io_alloc(struct fuse_io_priv *io, unsigned int nfolios) { struct fuse_io_args *ia; ia = kzalloc_obj(*ia); if (ia) { ia->io = io; ia->ap.folios = fuse_folios_alloc(nfolios, GFP_KERNEL, &ia->ap.descs); if (!ia->ap.folios) { kfree(ia); ia = NULL; } } return ia; } static void fuse_io_free(struct fuse_io_args *ia) { kfree(ia->ap.folios); kfree(ia); } static void fuse_aio_complete_req(struct fuse_mount *fm, struct fuse_args *args, int err) { struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args); struct fuse_io_priv *io = ia->io; ssize_t pos = -1; size_t nres; if (err) { /* Nothing */ } else if (io->write) { if (ia->write.out.size > ia->write.in.size) { err = -EIO; } else { nres = ia->write.out.size; if (ia->write.in.size != ia->write.out.size) pos = ia->write.in.offset - io->offset + ia->write.out.size; } } else { u32 outsize = args->out_args[0].size; nres = outsize; if (ia->read.in.size != outsize) pos = ia->read.in.offset - io->offset + outsize; } fuse_release_user_pages(&ia->ap, err ?: nres, io->should_dirty); fuse_aio_complete(io, err, pos); fuse_io_free(ia); } static ssize_t fuse_async_req_send(struct fuse_mount *fm, struct fuse_io_args *ia, size_t num_bytes) { ssize_t err; struct fuse_io_priv *io = ia->io; spin_lock(&io->lock); kref_get(&io->refcnt); io->size += num_bytes; io->reqs++; spin_unlock(&io->lock); ia->ap.args.end = fuse_aio_complete_req; ia->ap.args.may_block = io->should_dirty; err = fuse_simple_background(fm, &ia->ap.args, GFP_KERNEL); if (err) fuse_aio_complete_req(fm, &ia->ap.args, err); return num_bytes; } static ssize_t fuse_send_read(struct fuse_io_args *ia, loff_t pos, size_t count, fl_owner_t owner) { struct file *file = ia->io->iocb->ki_filp; struct fuse_file *ff = file->private_data; struct fuse_mount *fm = ff->fm; fuse_read_args_fill(ia, file, pos, count, FUSE_READ); if (owner != NULL) { ia->read.in.read_flags |= FUSE_READ_LOCKOWNER; ia->read.in.lock_owner = fuse_lock_owner_id(fm->fc, owner); } if (ia->io->async) return fuse_async_req_send(fm, ia, count); return fuse_simple_request(fm, &ia->ap.args); } static void fuse_read_update_size(struct inode *inode, loff_t size, u64 attr_ver) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); spin_lock(&fi->lock); if (attr_ver >= fi->attr_version && size < inode->i_size && !test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) { fi->attr_version = atomic64_inc_return(&fc->attr_version); i_size_write(inode, size); } spin_unlock(&fi->lock); } static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read, struct fuse_args_pages *ap) { struct fuse_conn *fc = get_fuse_conn(inode); /* * If writeback_cache is enabled, a short read means there's a hole in * the file. Some data after the hole is in page cache, but has not * reached the client fs yet. So the hole is not present there. */ if (!fc->writeback_cache) { loff_t pos = folio_pos(ap->folios[0]) + num_read; fuse_read_update_size(inode, pos, attr_ver); } } static int fuse_do_readfolio(struct file *file, struct folio *folio, size_t off, size_t len) { struct inode *inode = folio->mapping->host; struct fuse_mount *fm = get_fuse_mount(inode); loff_t pos = folio_pos(folio) + off; struct fuse_folio_desc desc = { .offset = off, .length = len, }; struct fuse_io_args ia = { .ap.args.page_zeroing = true, .ap.args.out_pages = true, .ap.num_folios = 1, .ap.folios = &folio, .ap.descs = &desc, }; ssize_t res; u64 attr_ver; attr_ver = fuse_get_attr_version(fm->fc); /* Don't overflow end offset */ if (pos + (desc.length - 1) == LLONG_MAX) desc.length--; fuse_read_args_fill(&ia, file, pos, desc.length, FUSE_READ); res = fuse_simple_request(fm, &ia.ap.args); if (res < 0) return res; /* * Short read means EOF. If file size is larger, truncate it */ if (res < desc.length) fuse_short_read(inode, attr_ver, res, &ia.ap); return 0; } static int fuse_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned int flags, struct iomap *iomap, struct iomap *srcmap) { iomap->type = IOMAP_MAPPED; iomap->length = length; iomap->offset = offset; return 0; } static const struct iomap_ops fuse_iomap_ops = { .iomap_begin = fuse_iomap_begin, }; struct fuse_fill_read_data { struct file *file; /* Fields below are used if sending the read request asynchronously */ struct fuse_conn *fc; struct fuse_io_args *ia; unsigned int nr_bytes; }; /* forward declarations */ static bool fuse_folios_need_send(struct fuse_conn *fc, loff_t pos, unsigned len, struct fuse_args_pages *ap, unsigned cur_bytes, bool write); static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file, unsigned int count, bool async); static int fuse_handle_readahead(struct folio *folio, struct readahead_control *rac, struct fuse_fill_read_data *data, loff_t pos, size_t len) { struct fuse_io_args *ia = data->ia; size_t off = offset_in_folio(folio, pos); struct fuse_conn *fc = data->fc; struct fuse_args_pages *ap; unsigned int nr_pages; if (ia && fuse_folios_need_send(fc, pos, len, &ia->ap, data->nr_bytes, false)) { fuse_send_readpages(ia, data->file, data->nr_bytes, fc->async_read); data->nr_bytes = 0; data->ia = NULL; ia = NULL; } if (!ia) { if (fc->num_background >= fc->congestion_threshold && rac->ra->async_size >= readahead_count(rac)) /* * Congested and only async pages left, so skip the * rest. */ return -EAGAIN; nr_pages = min(fc->max_pages, readahead_count(rac)); data->ia = fuse_io_alloc(NULL, nr_pages); if (!data->ia) return -ENOMEM; ia = data->ia; } folio_get(folio); ap = &ia->ap; ap->folios[ap->num_folios] = folio; ap->descs[ap->num_folios].offset = off; ap->descs[ap->num_folios].length = len; data->nr_bytes += len; ap->num_folios++; return 0; } static int fuse_iomap_read_folio_range_async(const struct iomap_iter *iter, struct iomap_read_folio_ctx *ctx, size_t len) { struct fuse_fill_read_data *data = ctx->read_ctx; struct folio *folio = ctx->cur_folio; loff_t pos = iter->pos; size_t off = offset_in_folio(folio, pos); struct file *file = data->file; int ret; if (ctx->rac) { ret = fuse_handle_readahead(folio, ctx->rac, data, pos, len); } else { /* * for non-readahead read requests, do reads synchronously * since it's not guaranteed that the server can handle * out-of-order reads */ ret = fuse_do_readfolio(file, folio, off, len); if (!ret) iomap_finish_folio_read(folio, off, len, ret); } return ret; } static void fuse_iomap_read_submit(struct iomap_read_folio_ctx *ctx) { struct fuse_fill_read_data *data = ctx->read_ctx; if (data->ia) fuse_send_readpages(data->ia, data->file, data->nr_bytes, data->fc->async_read); } static const struct iomap_read_ops fuse_iomap_read_ops = { .read_folio_range = fuse_iomap_read_folio_range_async, .submit_read = fuse_iomap_read_submit, }; static int fuse_read_folio(struct file *file, struct folio *folio) { struct inode *inode = folio->mapping->host; struct fuse_fill_read_data data = { .file = file, }; struct iomap_read_folio_ctx ctx = { .cur_folio = folio, .ops = &fuse_iomap_read_ops, .read_ctx = &data, }; if (fuse_is_bad(inode)) { folio_unlock(folio); return -EIO; } iomap_read_folio(&fuse_iomap_ops, &ctx, NULL); fuse_invalidate_atime(inode); return 0; } static int fuse_iomap_read_folio_range(const struct iomap_iter *iter, struct folio *folio, loff_t pos, size_t len) { struct file *file = iter->private; size_t off = offset_in_folio(folio, pos); return fuse_do_readfolio(file, folio, off, len); } static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args, int err) { int i; struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args); struct fuse_args_pages *ap = &ia->ap; size_t count = ia->read.in.size; size_t num_read = args->out_args[0].size; struct address_space *mapping; struct inode *inode; WARN_ON_ONCE(!ap->num_folios); mapping = ap->folios[0]->mapping; inode = mapping->host; /* * Short read means EOF. If file size is larger, truncate it */ if (!err && num_read < count) fuse_short_read(inode, ia->read.attr_ver, num_read, ap); fuse_invalidate_atime(inode); for (i = 0; i < ap->num_folios; i++) { iomap_finish_folio_read(ap->folios[i], ap->descs[i].offset, ap->descs[i].length, err); folio_put(ap->folios[i]); } if (ia->ff) fuse_file_put(ia->ff, false); fuse_io_free(ia); } static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file, unsigned int count, bool async) { struct fuse_file *ff = file->private_data; struct fuse_mount *fm = ff->fm; struct fuse_args_pages *ap = &ia->ap; loff_t pos = folio_pos(ap->folios[0]); ssize_t res; int err; ap->args.out_pages = true; ap->args.page_zeroing = true; ap->args.page_replace = true; /* Don't overflow end offset */ if (pos + (count - 1) == LLONG_MAX) { count--; ap->descs[ap->num_folios - 1].length--; } WARN_ON((loff_t) (pos + count) < 0); fuse_read_args_fill(ia, file, pos, count, FUSE_READ); ia->read.attr_ver = fuse_get_attr_version(fm->fc); if (async) { ia->ff = fuse_file_get(ff); ap->args.end = fuse_readpages_end; err = fuse_simple_background(fm, &ap->args, GFP_KERNEL); if (!err) return; } else { res = fuse_simple_request(fm, &ap->args); err = res < 0 ? res : 0; } fuse_readpages_end(fm, &ap->args, err); } static void fuse_readahead(struct readahead_control *rac) { struct inode *inode = rac->mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_fill_read_data data = { .file = rac->file, .fc = fc, }; struct iomap_read_folio_ctx ctx = { .ops = &fuse_iomap_read_ops, .rac = rac, .read_ctx = &data }; if (fuse_is_bad(inode)) return; iomap_readahead(&fuse_iomap_ops, &ctx, NULL); } static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct inode *inode = iocb->ki_filp->f_mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); /* * In auto invalidate mode, always update attributes on read. * Otherwise, only update if we attempt to read past EOF (to ensure * i_size is up to date). */ if (fc->auto_inval_data || (iocb->ki_pos + iov_iter_count(to) > i_size_read(inode))) { int err; err = fuse_update_attributes(inode, iocb->ki_filp, STATX_SIZE); if (err) return err; } return generic_file_read_iter(iocb, to); } static void fuse_write_args_fill(struct fuse_io_args *ia, struct fuse_file *ff, loff_t pos, size_t count) { struct fuse_args *args = &ia->ap.args; ia->write.in.fh = ff->fh; ia->write.in.offset = pos; ia->write.in.size = count; args->opcode = FUSE_WRITE; args->nodeid = ff->nodeid; args->in_numargs = 2; if (ff->fm->fc->minor < 9) args->in_args[0].size = FUSE_COMPAT_WRITE_IN_SIZE; else args->in_args[0].size = sizeof(ia->write.in); args->in_args[0].value = &ia->write.in; args->in_args[1].size = count; args->out_numargs = 1; args->out_args[0].size = sizeof(ia->write.out); args->out_args[0].value = &ia->write.out; } static unsigned int fuse_write_flags(struct kiocb *iocb) { unsigned int flags = iocb->ki_filp->f_flags; if (iocb_is_dsync(iocb)) flags |= O_DSYNC; if (iocb->ki_flags & IOCB_SYNC) flags |= O_SYNC; return flags; } static ssize_t fuse_send_write(struct fuse_io_args *ia, loff_t pos, size_t count, fl_owner_t owner) { struct kiocb *iocb = ia->io->iocb; struct file *file = iocb->ki_filp; struct fuse_file *ff = file->private_data; struct fuse_mount *fm = ff->fm; struct fuse_write_in *inarg = &ia->write.in; ssize_t err; fuse_write_args_fill(ia, ff, pos, count); inarg->flags = fuse_write_flags(iocb); if (owner != NULL) { inarg->write_flags |= FUSE_WRITE_LOCKOWNER; inarg->lock_owner = fuse_lock_owner_id(fm->fc, owner); } if (ia->io->async) return fuse_async_req_send(fm, ia, count); err = fuse_simple_request(fm, &ia->ap.args); if (!err && ia->write.out.size > count) err = -EIO; return err ?: ia->write.out.size; } bool fuse_write_update_attr(struct inode *inode, loff_t pos, ssize_t written) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); bool ret = false; spin_lock(&fi->lock); fi->attr_version = atomic64_inc_return(&fc->attr_version); if (written > 0 && pos > inode->i_size) { i_size_write(inode, pos); ret = true; } spin_unlock(&fi->lock); fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); return ret; } static ssize_t fuse_send_write_pages(struct fuse_io_args *ia, struct kiocb *iocb, struct inode *inode, loff_t pos, size_t count) { struct fuse_args_pages *ap = &ia->ap; struct file *file = iocb->ki_filp; struct fuse_file *ff = file->private_data; struct fuse_mount *fm = ff->fm; unsigned int offset, i; bool short_write; int err; for (i = 0; i < ap->num_folios; i++) folio_wait_writeback(ap->folios[i]); fuse_write_args_fill(ia, ff, pos, count); ia->write.in.flags = fuse_write_flags(iocb); if (fm->fc->handle_killpriv_v2 && !capable(CAP_FSETID)) ia->write.in.write_flags |= FUSE_WRITE_KILL_SUIDGID; err = fuse_simple_request(fm, &ap->args); if (!err && ia->write.out.size > count) err = -EIO; short_write = ia->write.out.size < count; offset = ap->descs[0].offset; count = ia->write.out.size; for (i = 0; i < ap->num_folios; i++) { struct folio *folio = ap->folios[i]; if (err) { folio_clear_uptodate(folio); } else { if (count >= folio_size(folio) - offset) count -= folio_size(folio) - offset; else { if (short_write) folio_clear_uptodate(folio); count = 0; } offset = 0; } if (ia->write.folio_locked && (i == ap->num_folios - 1)) folio_unlock(folio); folio_put(folio); } return err; } static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, struct address_space *mapping, struct iov_iter *ii, loff_t pos, unsigned int max_folios) { struct fuse_args_pages *ap = &ia->ap; struct fuse_conn *fc = get_fuse_conn(mapping->host); unsigned offset = pos & (PAGE_SIZE - 1); size_t count = 0; unsigned int num; int err = 0; num = min(iov_iter_count(ii), fc->max_write); ap->args.in_pages = true; while (num && ap->num_folios < max_folios) { size_t tmp; struct folio *folio; pgoff_t index = pos >> PAGE_SHIFT; unsigned int bytes; unsigned int folio_offset; again: folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, mapping_gfp_mask(mapping)); if (IS_ERR(folio)) { err = PTR_ERR(folio); break; } if (mapping_writably_mapped(mapping)) flush_dcache_folio(folio); folio_offset = ((index - folio->index) << PAGE_SHIFT) + offset; bytes = min(folio_size(folio) - folio_offset, num); tmp = copy_folio_from_iter_atomic(folio, folio_offset, bytes, ii); flush_dcache_folio(folio); if (!tmp) { folio_unlock(folio); folio_put(folio); /* * Ensure forward progress by faulting in * while not holding the folio lock: */ if (fault_in_iov_iter_readable(ii, bytes)) { err = -EFAULT; break; } goto again; } ap->folios[ap->num_folios] = folio; ap->descs[ap->num_folios].offset = folio_offset; ap->descs[ap->num_folios].length = tmp; ap->num_folios++; count += tmp; pos += tmp; num -= tmp; offset += tmp; if (offset == folio_size(folio)) offset = 0; /* If we copied full folio, mark it uptodate */ if (tmp == folio_size(folio)) folio_mark_uptodate(folio); if (folio_test_uptodate(folio)) { folio_unlock(folio); } else { ia->write.folio_locked = true; break; } if (!fc->big_writes || offset != 0) break; } return count > 0 ? count : err; } static inline unsigned int fuse_wr_pages(loff_t pos, size_t len, unsigned int max_pages) { unsigned int pages = ((pos + len - 1) >> PAGE_SHIFT) - (pos >> PAGE_SHIFT) + 1; return min(pages, max_pages); } static ssize_t fuse_perform_write(struct kiocb *iocb, struct iov_iter *ii) { struct address_space *mapping = iocb->ki_filp->f_mapping; struct inode *inode = mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); loff_t pos = iocb->ki_pos; int err = 0; ssize_t res = 0; if (inode->i_size < pos + iov_iter_count(ii)) set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); do { ssize_t count; struct fuse_io_args ia = {}; struct fuse_args_pages *ap = &ia.ap; unsigned int nr_pages = fuse_wr_pages(pos, iov_iter_count(ii), fc->max_pages); ap->folios = fuse_folios_alloc(nr_pages, GFP_KERNEL, &ap->descs); if (!ap->folios) { err = -ENOMEM; break; } count = fuse_fill_write_pages(&ia, mapping, ii, pos, nr_pages); if (count <= 0) { err = count; } else { err = fuse_send_write_pages(&ia, iocb, inode, pos, count); if (!err) { size_t num_written = ia.write.out.size; res += num_written; pos += num_written; /* break out of the loop on short write */ if (num_written != count) err = -EIO; } } kfree(ap->folios); } while (!err && iov_iter_count(ii)); fuse_write_update_attr(inode, pos, res); clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); if (!res) return err; iocb->ki_pos += res; return res; } static bool fuse_io_past_eof(struct kiocb *iocb, struct iov_iter *iter) { struct inode *inode = file_inode(iocb->ki_filp); return iocb->ki_pos + iov_iter_count(iter) > i_size_read(inode); } /* * @return true if an exclusive lock for direct IO writes is needed */ static bool fuse_dio_wr_exclusive_lock(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct fuse_file *ff = file->private_data; struct inode *inode = file_inode(iocb->ki_filp); struct fuse_inode *fi = get_fuse_inode(inode); /* Server side has to advise that it supports parallel dio writes. */ if (!(ff->open_flags & FOPEN_PARALLEL_DIRECT_WRITES)) return true; /* * Append will need to know the eventual EOF - always needs an * exclusive lock. */ if (iocb->ki_flags & IOCB_APPEND) return true; /* shared locks are not allowed with parallel page cache IO */ if (test_bit(FUSE_I_CACHE_IO_MODE, &fi->state)) return true; /* Parallel dio beyond EOF is not supported, at least for now. */ if (fuse_io_past_eof(iocb, from)) return true; return false; } static void fuse_dio_lock(struct kiocb *iocb, struct iov_iter *from, bool *exclusive) { struct inode *inode = file_inode(iocb->ki_filp); struct fuse_inode *fi = get_fuse_inode(inode); *exclusive = fuse_dio_wr_exclusive_lock(iocb, from); if (*exclusive) { inode_lock(inode); } else { inode_lock_shared(inode); /* * New parallal dio allowed only if inode is not in caching * mode and denies new opens in caching mode. This check * should be performed only after taking shared inode lock. * Previous past eof check was without inode lock and might * have raced, so check it again. */ if (fuse_io_past_eof(iocb, from) || fuse_inode_uncached_io_start(fi, NULL) != 0) { inode_unlock_shared(inode); inode_lock(inode); *exclusive = true; } } } static void fuse_dio_unlock(struct kiocb *iocb, bool exclusive) { struct inode *inode = file_inode(iocb->ki_filp); struct fuse_inode *fi = get_fuse_inode(inode); if (exclusive) { inode_unlock(inode); } else { /* Allow opens in caching mode after last parallel dio end */ fuse_inode_uncached_io_end(fi); inode_unlock_shared(inode); } } static const struct iomap_write_ops fuse_iomap_write_ops = { .read_folio_range = fuse_iomap_read_folio_range, }; static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct mnt_idmap *idmap = file_mnt_idmap(file); struct address_space *mapping = file->f_mapping; ssize_t written = 0; struct inode *inode = mapping->host; ssize_t err, count; struct fuse_conn *fc = get_fuse_conn(inode); bool writeback = false; if (fc->writeback_cache) { /* Update size (EOF optimization) and mode (SUID clearing) */ err = fuse_update_attributes(mapping->host, file, STATX_SIZE | STATX_MODE); if (err) return err; if (!fc->handle_killpriv_v2 || !setattr_should_drop_suidgid(idmap, file_inode(file))) writeback = true; } inode_lock(inode); err = count = generic_write_checks(iocb, from); if (err <= 0) goto out; task_io_account_write(count); err = kiocb_modified(iocb); if (err) goto out; if (iocb->ki_flags & IOCB_DIRECT) { written = generic_file_direct_write(iocb, from); if (written < 0 || !iov_iter_count(from)) goto out; written = direct_write_fallback(iocb, from, written, fuse_perform_write(iocb, from)); } else if (writeback) { /* * Use iomap so that we can do granular uptodate reads * and granular dirty tracking for large folios. */ written = iomap_file_buffered_write(iocb, from, &fuse_iomap_ops, &fuse_iomap_write_ops, file); } else { written = fuse_perform_write(iocb, from); } out: inode_unlock(inode); if (written > 0) written = generic_write_sync(iocb, written); return written ? written : err; } static inline unsigned long fuse_get_user_addr(const struct iov_iter *ii) { return (unsigned long)iter_iov(ii)->iov_base + ii->iov_offset; } static inline size_t fuse_get_frag_size(const struct iov_iter *ii, size_t max_size) { return min(iov_iter_single_seg_count(ii), max_size); } static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, size_t *nbytesp, int write, unsigned int max_pages, bool use_pages_for_kvec_io) { bool flush_or_invalidate = false; unsigned int nr_pages = 0; size_t nbytes = 0; /* # bytes already packed in req */ ssize_t ret = 0; /* Special case for kernel I/O: can copy directly into the buffer. * However if the implementation of fuse_conn requires pages instead of * pointer (e.g., virtio-fs), use iov_iter_extract_pages() instead. */ if (iov_iter_is_kvec(ii)) { void *user_addr = (void *)fuse_get_user_addr(ii); if (!use_pages_for_kvec_io) { size_t frag_size = fuse_get_frag_size(ii, *nbytesp); if (write) ap->args.in_args[1].value = user_addr; else ap->args.out_args[0].value = user_addr; iov_iter_advance(ii, frag_size); *nbytesp = frag_size; return 0; } if (is_vmalloc_addr(user_addr)) { ap->args.vmap_base = user_addr; flush_or_invalidate = true; } } /* * Until there is support for iov_iter_extract_folios(), we have to * manually extract pages using iov_iter_extract_pages() and then * copy that to a folios array. */ struct page **pages = kzalloc(max_pages * sizeof(struct page *), GFP_KERNEL); if (!pages) { ret = -ENOMEM; goto out; } while (nbytes < *nbytesp && nr_pages < max_pages) { unsigned nfolios, i; size_t start; ret = iov_iter_extract_pages(ii, &pages, *nbytesp - nbytes, max_pages - nr_pages, 0, &start); if (ret < 0) break; nbytes += ret; nfolios = DIV_ROUND_UP(ret + start, PAGE_SIZE); for (i = 0; i < nfolios; i++) { struct folio *folio = page_folio(pages[i]); unsigned int offset = start + (folio_page_idx(folio, pages[i]) << PAGE_SHIFT); unsigned int len = umin(ret, PAGE_SIZE - start); ap->descs[ap->num_folios].offset = offset; ap->descs[ap->num_folios].length = len; ap->folios[ap->num_folios] = folio; start = 0; ret -= len; ap->num_folios++; } nr_pages += nfolios; } kfree(pages); if (write && flush_or_invalidate) flush_kernel_vmap_range(ap->args.vmap_base, nbytes); ap->args.invalidate_vmap = !write && flush_or_invalidate; ap->args.is_pinned = iov_iter_extract_will_pin(ii); ap->args.user_pages = true; if (write) ap->args.in_pages = true; else ap->args.out_pages = true; out: *nbytesp = nbytes; return ret < 0 ? ret : 0; } ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, loff_t *ppos, int flags) { int write = flags & FUSE_DIO_WRITE; int cuse = flags & FUSE_DIO_CUSE; struct file *file = io->iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fm->fc; size_t nmax = write ? fc->max_write : fc->max_read; loff_t pos = *ppos; size_t count = iov_iter_count(iter); pgoff_t idx_from = pos >> PAGE_SHIFT; pgoff_t idx_to = (pos + count - 1) >> PAGE_SHIFT; ssize_t res = 0; int err = 0; struct fuse_io_args *ia; unsigned int max_pages; bool fopen_direct_io = ff->open_flags & FOPEN_DIRECT_IO; max_pages = iov_iter_npages(iter, fc->max_pages); ia = fuse_io_alloc(io, max_pages); if (!ia) return -ENOMEM; if (fopen_direct_io) { res = filemap_write_and_wait_range(mapping, pos, pos + count - 1); if (res) { fuse_io_free(ia); return res; } } if (!cuse && filemap_range_has_writeback(mapping, pos, (pos + count - 1))) { if (!write) inode_lock(inode); fuse_sync_writes(inode); if (!write) inode_unlock(inode); } if (fopen_direct_io && write) { res = invalidate_inode_pages2_range(mapping, idx_from, idx_to); if (res) { fuse_io_free(ia); return res; } } io->should_dirty = !write && user_backed_iter(iter); while (count) { ssize_t nres; fl_owner_t owner = current->files; size_t nbytes = min(count, nmax); err = fuse_get_user_pages(&ia->ap, iter, &nbytes, write, max_pages, fc->use_pages_for_kvec_io); if (err && !nbytes) break; if (write) { if (!capable(CAP_FSETID)) ia->write.in.write_flags |= FUSE_WRITE_KILL_SUIDGID; nres = fuse_send_write(ia, pos, nbytes, owner); } else { nres = fuse_send_read(ia, pos, nbytes, owner); } if (!io->async || nres < 0) { fuse_release_user_pages(&ia->ap, nres, io->should_dirty); fuse_io_free(ia); } ia = NULL; if (nres < 0) { iov_iter_revert(iter, nbytes); err = nres; break; } WARN_ON(nres > nbytes); count -= nres; res += nres; pos += nres; if (nres != nbytes) { iov_iter_revert(iter, nbytes - nres); break; } if (count) { max_pages = iov_iter_npages(iter, fc->max_pages); ia = fuse_io_alloc(io, max_pages); if (!ia) break; } } if (ia) fuse_io_free(ia); if (res > 0) *ppos = pos; if (res > 0 && write && fopen_direct_io) { /* * As in generic_file_direct_write(), invalidate after the * write, to invalidate read-ahead cache that may have competed * with the write. */ invalidate_inode_pages2_range(mapping, idx_from, idx_to); } return res > 0 ? res : err; } EXPORT_SYMBOL_GPL(fuse_direct_io); static ssize_t __fuse_direct_read(struct fuse_io_priv *io, struct iov_iter *iter, loff_t *ppos) { ssize_t res; struct inode *inode = file_inode(io->iocb->ki_filp); res = fuse_direct_io(io, iter, ppos, 0); fuse_invalidate_atime(inode); return res; } static ssize_t fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter); static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to) { ssize_t res; if (!is_sync_kiocb(iocb)) { res = fuse_direct_IO(iocb, to); } else { struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); res = __fuse_direct_read(&io, to, &iocb->ki_pos); } return res; } static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); ssize_t res; bool exclusive; fuse_dio_lock(iocb, from, &exclusive); res = generic_write_checks(iocb, from); if (res > 0) { task_io_account_write(res); if (!is_sync_kiocb(iocb)) { res = fuse_direct_IO(iocb, from); } else { struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); res = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE); fuse_write_update_attr(inode, iocb->ki_pos, res); } } fuse_dio_unlock(iocb, exclusive); return res; } static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct fuse_file *ff = file->private_data; struct inode *inode = file_inode(file); if (fuse_is_bad(inode)) return -EIO; if (FUSE_IS_DAX(inode)) return fuse_dax_read_iter(iocb, to); /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */ if (ff->open_flags & FOPEN_DIRECT_IO) return fuse_direct_read_iter(iocb, to); else if (fuse_file_passthrough(ff)) return fuse_passthrough_read_iter(iocb, to); else return fuse_cache_read_iter(iocb, to); } static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct fuse_file *ff = file->private_data; struct inode *inode = file_inode(file); if (fuse_is_bad(inode)) return -EIO; if (FUSE_IS_DAX(inode)) return fuse_dax_write_iter(iocb, from); /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */ if (ff->open_flags & FOPEN_DIRECT_IO) return fuse_direct_write_iter(iocb, from); else if (fuse_file_passthrough(ff)) return fuse_passthrough_write_iter(iocb, from); else return fuse_cache_write_iter(iocb, from); } static ssize_t fuse_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { struct fuse_file *ff = in->private_data; /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */ if (fuse_file_passthrough(ff) && !(ff->open_flags & FOPEN_DIRECT_IO)) return fuse_passthrough_splice_read(in, ppos, pipe, len, flags); else return filemap_splice_read(in, ppos, pipe, len, flags); } static ssize_t fuse_splice_write(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) { struct fuse_file *ff = out->private_data; /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */ if (fuse_file_passthrough(ff) && !(ff->open_flags & FOPEN_DIRECT_IO)) return fuse_passthrough_splice_write(pipe, out, ppos, len, flags); else return iter_file_splice_write(pipe, out, ppos, len, flags); } static void fuse_writepage_free(struct fuse_writepage_args *wpa) { struct fuse_args_pages *ap = &wpa->ia.ap; if (wpa->bucket) fuse_sync_bucket_dec(wpa->bucket); fuse_file_put(wpa->ia.ff, false); kfree(ap->folios); kfree(wpa); } static void fuse_writepage_finish(struct fuse_writepage_args *wpa) { struct fuse_args_pages *ap = &wpa->ia.ap; struct inode *inode = wpa->inode; struct fuse_inode *fi = get_fuse_inode(inode); int i; for (i = 0; i < ap->num_folios; i++) /* * Benchmarks showed that ending writeback within the * scope of the fi->lock alleviates xarray lock * contention and noticeably improves performance. */ iomap_finish_folio_write(inode, ap->folios[i], ap->descs[i].length); wake_up(&fi->page_waitq); } /* Called under fi->lock, may release and reacquire it */ static void fuse_send_writepage(struct fuse_mount *fm, struct fuse_writepage_args *wpa, loff_t size) __releases(fi->lock) __acquires(fi->lock) { struct fuse_inode *fi = get_fuse_inode(wpa->inode); struct fuse_args_pages *ap = &wpa->ia.ap; struct fuse_write_in *inarg = &wpa->ia.write.in; struct fuse_args *args = &ap->args; __u64 data_size = 0; int err, i; for (i = 0; i < ap->num_folios; i++) data_size += ap->descs[i].length; fi->writectr++; if (inarg->offset + data_size <= size) { inarg->size = data_size; } else if (inarg->offset < size) { inarg->size = size - inarg->offset; } else { /* Got truncated off completely */ goto out_free; } args->in_args[1].size = inarg->size; args->force = true; args->nocreds = true; err = fuse_simple_background(fm, args, GFP_ATOMIC); if (err == -ENOMEM) { spin_unlock(&fi->lock); err = fuse_simple_background(fm, args, GFP_NOFS | __GFP_NOFAIL); spin_lock(&fi->lock); } /* Fails on broken connection only */ if (unlikely(err)) goto out_free; return; out_free: fi->writectr--; fuse_writepage_finish(wpa); spin_unlock(&fi->lock); fuse_writepage_free(wpa); spin_lock(&fi->lock); } /* * If fi->writectr is positive (no truncate or fsync going on) send * all queued writepage requests. * * Called with fi->lock */ void fuse_flush_writepages(struct inode *inode) __releases(fi->lock) __acquires(fi->lock) { struct fuse_mount *fm = get_fuse_mount(inode); struct fuse_inode *fi = get_fuse_inode(inode); loff_t crop = i_size_read(inode); struct fuse_writepage_args *wpa; while (fi->writectr >= 0 && !list_empty(&fi->queued_writes)) { wpa = list_entry(fi->queued_writes.next, struct fuse_writepage_args, queue_entry); list_del_init(&wpa->queue_entry); fuse_send_writepage(fm, wpa, crop); } } static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args, int error) { struct fuse_writepage_args *wpa = container_of(args, typeof(*wpa), ia.ap.args); struct inode *inode = wpa->inode; struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = get_fuse_conn(inode); mapping_set_error(inode->i_mapping, error); /* * A writeback finished and this might have updated mtime/ctime on * server making local mtime/ctime stale. Hence invalidate attrs. * Do this only if writeback_cache is not enabled. If writeback_cache * is enabled, we trust local ctime/mtime. */ if (!fc->writeback_cache) fuse_invalidate_attr_mask(inode, FUSE_STATX_MODIFY); spin_lock(&fi->lock); fi->writectr--; fuse_writepage_finish(wpa); spin_unlock(&fi->lock); fuse_writepage_free(wpa); } static struct fuse_file *__fuse_write_file_get(struct fuse_inode *fi) { struct fuse_file *ff; spin_lock(&fi->lock); ff = list_first_entry_or_null(&fi->write_files, struct fuse_file, write_entry); if (ff) fuse_file_get(ff); spin_unlock(&fi->lock); return ff; } static struct fuse_file *fuse_write_file_get(struct fuse_inode *fi) { struct fuse_file *ff = __fuse_write_file_get(fi); WARN_ON(!ff); return ff; } int fuse_write_inode(struct inode *inode, struct writeback_control *wbc) { struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_file *ff; int err; ff = __fuse_write_file_get(fi); err = fuse_flush_times(inode, ff); if (ff) fuse_file_put(ff, false); return err; } static struct fuse_writepage_args *fuse_writepage_args_alloc(void) { struct fuse_writepage_args *wpa; struct fuse_args_pages *ap; wpa = kzalloc_obj(*wpa, GFP_NOFS); if (wpa) { ap = &wpa->ia.ap; ap->num_folios = 0; ap->folios = fuse_folios_alloc(1, GFP_NOFS, &ap->descs); if (!ap->folios) { kfree(wpa); wpa = NULL; } } return wpa; } static void fuse_writepage_add_to_bucket(struct fuse_conn *fc, struct fuse_writepage_args *wpa) { if (!fc->sync_fs) return; rcu_read_lock(); /* Prevent resurrection of dead bucket in unlikely race with syncfs */ do { wpa->bucket = rcu_dereference(fc->curr_bucket); } while (unlikely(!atomic_inc_not_zero(&wpa->bucket->count))); rcu_read_unlock(); } static void fuse_writepage_args_page_fill(struct fuse_writepage_args *wpa, struct folio *folio, uint32_t folio_index, loff_t offset, unsigned len) { struct fuse_args_pages *ap = &wpa->ia.ap; ap->folios[folio_index] = folio; ap->descs[folio_index].offset = offset; ap->descs[folio_index].length = len; } static struct fuse_writepage_args *fuse_writepage_args_setup(struct folio *folio, size_t offset, struct fuse_file *ff) { struct inode *inode = folio->mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_writepage_args *wpa; struct fuse_args_pages *ap; wpa = fuse_writepage_args_alloc(); if (!wpa) return NULL; fuse_writepage_add_to_bucket(fc, wpa); fuse_write_args_fill(&wpa->ia, ff, folio_pos(folio) + offset, 0); wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE; wpa->inode = inode; wpa->ia.ff = ff; ap = &wpa->ia.ap; ap->args.in_pages = true; ap->args.end = fuse_writepage_end; return wpa; } struct fuse_fill_wb_data { struct fuse_writepage_args *wpa; struct fuse_file *ff; unsigned int max_folios; /* * nr_bytes won't overflow since fuse_folios_need_send() caps * wb requests to never exceed fc->max_pages (which has an upper bound * of U16_MAX). */ unsigned int nr_bytes; }; static bool fuse_pages_realloc(struct fuse_fill_wb_data *data, unsigned int max_pages) { struct fuse_args_pages *ap = &data->wpa->ia.ap; struct folio **folios; struct fuse_folio_desc *descs; unsigned int nfolios = min_t(unsigned int, max_t(unsigned int, data->max_folios * 2, FUSE_DEFAULT_MAX_PAGES_PER_REQ), max_pages); WARN_ON(nfolios <= data->max_folios); folios = fuse_folios_alloc(nfolios, GFP_NOFS, &descs); if (!folios) return false; memcpy(folios, ap->folios, sizeof(struct folio *) * ap->num_folios); memcpy(descs, ap->descs, sizeof(struct fuse_folio_desc) * ap->num_folios); kfree(ap->folios); ap->folios = folios; ap->descs = descs; data->max_folios = nfolios; return true; } static void fuse_writepages_send(struct inode *inode, struct fuse_fill_wb_data *data) { struct fuse_writepage_args *wpa = data->wpa; struct fuse_inode *fi = get_fuse_inode(inode); spin_lock(&fi->lock); list_add_tail(&wpa->queue_entry, &fi->queued_writes); fuse_flush_writepages(inode); spin_unlock(&fi->lock); } static bool fuse_folios_need_send(struct fuse_conn *fc, loff_t pos, unsigned len, struct fuse_args_pages *ap, unsigned cur_bytes, bool write) { struct folio *prev_folio; struct fuse_folio_desc prev_desc; unsigned bytes = cur_bytes + len; loff_t prev_pos; size_t max_bytes = write ? fc->max_write : fc->max_read; WARN_ON(!ap->num_folios); /* Reached max pages */ if ((bytes + PAGE_SIZE - 1) >> PAGE_SHIFT > fc->max_pages) return true; if (bytes > max_bytes) return true; /* Discontinuity */ prev_folio = ap->folios[ap->num_folios - 1]; prev_desc = ap->descs[ap->num_folios - 1]; prev_pos = folio_pos(prev_folio) + prev_desc.offset + prev_desc.length; if (prev_pos != pos) return true; return false; } static ssize_t fuse_iomap_writeback_range(struct iomap_writepage_ctx *wpc, struct folio *folio, u64 pos, unsigned len, u64 end_pos) { struct fuse_fill_wb_data *data = wpc->wb_ctx; struct fuse_writepage_args *wpa = data->wpa; struct fuse_args_pages *ap = &wpa->ia.ap; struct inode *inode = wpc->inode; struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = get_fuse_conn(inode); loff_t offset = offset_in_folio(folio, pos); WARN_ON_ONCE(!data); if (!data->ff) { data->ff = fuse_write_file_get(fi); if (!data->ff) return -EIO; } if (wpa) { bool send = fuse_folios_need_send(fc, pos, len, ap, data->nr_bytes, true); if (!send) { /* * Need to grow the pages array? If so, did the * expansion fail? */ send = (ap->num_folios == data->max_folios) && !fuse_pages_realloc(data, fc->max_pages); } if (send) { fuse_writepages_send(inode, data); data->wpa = NULL; data->nr_bytes = 0; } } if (data->wpa == NULL) { wpa = fuse_writepage_args_setup(folio, offset, data->ff); if (!wpa) return -ENOMEM; fuse_file_get(wpa->ia.ff); data->max_folios = 1; ap = &wpa->ia.ap; } fuse_writepage_args_page_fill(wpa, folio, ap->num_folios, offset, len); data->nr_bytes += len; ap->num_folios++; if (!data->wpa) data->wpa = wpa; return len; } static int fuse_iomap_writeback_submit(struct iomap_writepage_ctx *wpc, int error) { struct fuse_fill_wb_data *data = wpc->wb_ctx; WARN_ON_ONCE(!data); if (data->wpa) { WARN_ON(!data->wpa->ia.ap.num_folios); fuse_writepages_send(wpc->inode, data); } if (data->ff) fuse_file_put(data->ff, false); return error; } static const struct iomap_writeback_ops fuse_writeback_ops = { .writeback_range = fuse_iomap_writeback_range, .writeback_submit = fuse_iomap_writeback_submit, }; static int fuse_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct inode *inode = mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_fill_wb_data data = {}; struct iomap_writepage_ctx wpc = { .inode = inode, .iomap.type = IOMAP_MAPPED, .wbc = wbc, .ops = &fuse_writeback_ops, .wb_ctx = &data, }; if (fuse_is_bad(inode)) return -EIO; if (wbc->sync_mode == WB_SYNC_NONE && fc->num_background >= fc->congestion_threshold) return 0; return iomap_writepages(&wpc); } static int fuse_launder_folio(struct folio *folio) { int err = 0; struct fuse_fill_wb_data data = {}; struct iomap_writepage_ctx wpc = { .inode = folio->mapping->host, .iomap.type = IOMAP_MAPPED, .ops = &fuse_writeback_ops, .wb_ctx = &data, }; if (folio_clear_dirty_for_io(folio)) { err = iomap_writeback_folio(&wpc, folio); err = fuse_iomap_writeback_submit(&wpc, err); if (!err) folio_wait_writeback(folio); } return err; } /* * Write back dirty data/metadata now (there may not be any suitable * open files later for data) */ static void fuse_vma_close(struct vm_area_struct *vma) { int err; err = write_inode_now(vma->vm_file->f_mapping->host, 1); mapping_set_error(vma->vm_file->f_mapping, err); } /* * Wait for writeback against this page to complete before allowing it * to be marked dirty again, and hence written back again, possibly * before the previous writepage completed. * * Block here, instead of in ->writepage(), so that the userspace fs * can only block processes actually operating on the filesystem. * * Otherwise unprivileged userspace fs would be able to block * unrelated: * * - page migration * - sync(2) * - try_to_free_pages() with order > PAGE_ALLOC_COSTLY_ORDER */ static vm_fault_t fuse_page_mkwrite(struct vm_fault *vmf) { struct folio *folio = page_folio(vmf->page); struct inode *inode = file_inode(vmf->vma->vm_file); file_update_time(vmf->vma->vm_file); folio_lock(folio); if (folio->mapping != inode->i_mapping) { folio_unlock(folio); return VM_FAULT_NOPAGE; } folio_wait_writeback(folio); return VM_FAULT_LOCKED; } static const struct vm_operations_struct fuse_file_vm_ops = { .close = fuse_vma_close, .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = fuse_page_mkwrite, }; static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) { struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fm->fc; struct inode *inode = file_inode(file); int rc; /* DAX mmap is superior to direct_io mmap */ if (FUSE_IS_DAX(inode)) return fuse_dax_mmap(file, vma); /* * If inode is in passthrough io mode, because it has some file open * in passthrough mode, either mmap to backing file or fail mmap, * because mixing cached mmap and passthrough io mode is not allowed. */ if (fuse_file_passthrough(ff)) return fuse_passthrough_mmap(file, vma); else if (fuse_inode_backing(get_fuse_inode(inode))) return -ENODEV; /* * FOPEN_DIRECT_IO handling is special compared to O_DIRECT, * as does not allow MAP_SHARED mmap without FUSE_DIRECT_IO_ALLOW_MMAP. */ if (ff->open_flags & FOPEN_DIRECT_IO) { /* * Can't provide the coherency needed for MAP_SHARED * if FUSE_DIRECT_IO_ALLOW_MMAP isn't set. */ if ((vma->vm_flags & VM_MAYSHARE) && !fc->direct_io_allow_mmap) return -ENODEV; invalidate_inode_pages2(file->f_mapping); if (!(vma->vm_flags & VM_MAYSHARE)) { /* MAP_PRIVATE */ return generic_file_mmap(file, vma); } /* * First mmap of direct_io file enters caching inode io mode. * Also waits for parallel dio writers to go into serial mode * (exclusive instead of shared lock). * After first mmap, the inode stays in caching io mode until * the direct_io file release. */ rc = fuse_file_cached_io_open(inode, ff); if (rc) return rc; } if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) fuse_link_write_file(file); file_accessed(file); vma->vm_ops = &fuse_file_vm_ops; return 0; } static int convert_fuse_file_lock(struct fuse_conn *fc, const struct fuse_file_lock *ffl, struct file_lock *fl) { switch (ffl->type) { case F_UNLCK: break; case F_RDLCK: case F_WRLCK: if (ffl->start > OFFSET_MAX || ffl->end > OFFSET_MAX || ffl->end < ffl->start) return -EIO; fl->fl_start = ffl->start; fl->fl_end = ffl->end; /* * Convert pid into init's pid namespace. The locks API will * translate it into the caller's pid namespace. */ rcu_read_lock(); fl->c.flc_pid = pid_nr_ns(find_pid_ns(ffl->pid, fc->pid_ns), &init_pid_ns); rcu_read_unlock(); break; default: return -EIO; } fl->c.flc_type = ffl->type; return 0; } static void fuse_lk_fill(struct fuse_args *args, struct file *file, const struct file_lock *fl, int opcode, pid_t pid, int flock, struct fuse_lk_in *inarg) { struct inode *inode = file_inode(file); struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_file *ff = file->private_data; memset(inarg, 0, sizeof(*inarg)); inarg->fh = ff->fh; inarg->owner = fuse_lock_owner_id(fc, fl->c.flc_owner); inarg->lk.start = fl->fl_start; inarg->lk.end = fl->fl_end; inarg->lk.type = fl->c.flc_type; inarg->lk.pid = pid; if (flock) inarg->lk_flags |= FUSE_LK_FLOCK; args->opcode = opcode; args->nodeid = get_node_id(inode); args->in_numargs = 1; args->in_args[0].size = sizeof(*inarg); args->in_args[0].value = inarg; } static int fuse_getlk(struct file *file, struct file_lock *fl) { struct inode *inode = file_inode(file); struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); struct fuse_lk_in inarg; struct fuse_lk_out outarg; int err; fuse_lk_fill(&args, file, fl, FUSE_GETLK, 0, 0, &inarg); args.out_numargs = 1; args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; err = fuse_simple_request(fm, &args); if (!err) err = convert_fuse_file_lock(fm->fc, &outarg.lk, fl); return err; } static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) { struct inode *inode = file_inode(file); struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); struct fuse_lk_in inarg; int opcode = (fl->c.flc_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK; struct pid *pid = fl->c.flc_type != F_UNLCK ? task_tgid(current) : NULL; pid_t pid_nr = pid_nr_ns(pid, fm->fc->pid_ns); int err; if (fl->fl_lmops && fl->fl_lmops->lm_grant) { /* NLM needs asynchronous locks, which we don't support yet */ return -ENOLCK; } fuse_lk_fill(&args, file, fl, opcode, pid_nr, flock, &inarg); err = fuse_simple_request(fm, &args); /* locking is restartable */ if (err == -EINTR) err = -ERESTARTSYS; return err; } static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl) { struct inode *inode = file_inode(file); struct fuse_conn *fc = get_fuse_conn(inode); int err; if (cmd == F_CANCELLK) { err = 0; } else if (cmd == F_GETLK) { if (fc->no_lock) { posix_test_lock(file, fl); err = 0; } else err = fuse_getlk(file, fl); } else { if (fc->no_lock) err = posix_lock_file(file, fl, NULL); else err = fuse_setlk(file, fl, 0); } return err; } static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl) { struct inode *inode = file_inode(file); struct fuse_conn *fc = get_fuse_conn(inode); int err; if (fc->no_flock) { err = locks_lock_file_wait(file, fl); } else { struct fuse_file *ff = file->private_data; /* emulate flock with POSIX locks */ ff->flock = true; err = fuse_setlk(file, fl, 1); } return err; } static sector_t fuse_bmap(struct address_space *mapping, sector_t block) { struct inode *inode = mapping->host; struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); struct fuse_bmap_in inarg; struct fuse_bmap_out outarg; int err; if (!inode->i_sb->s_bdev || fm->fc->no_bmap) return 0; memset(&inarg, 0, sizeof(inarg)); inarg.block = block; inarg.blocksize = inode->i_sb->s_blocksize; args.opcode = FUSE_BMAP; args.nodeid = get_node_id(inode); args.in_numargs = 1; args.in_args[0].size = sizeof(inarg); args.in_args[0].value = &inarg; args.out_numargs = 1; args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; err = fuse_simple_request(fm, &args); if (err == -ENOSYS) fm->fc->no_bmap = 1; return err ? 0 : outarg.block; } static loff_t fuse_lseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; struct fuse_mount *fm = get_fuse_mount(inode); struct fuse_file *ff = file->private_data; FUSE_ARGS(args); struct fuse_lseek_in inarg = { .fh = ff->fh, .offset = offset, .whence = whence }; struct fuse_lseek_out outarg; int err; if (fm->fc->no_lseek) goto fallback; args.opcode = FUSE_LSEEK; args.nodeid = ff->nodeid; args.in_numargs = 1; args.in_args[0].size = sizeof(inarg); args.in_args[0].value = &inarg; args.out_numargs = 1; args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; err = fuse_simple_request(fm, &args); if (err) { if (err == -ENOSYS) { fm->fc->no_lseek = 1; goto fallback; } return err; } return vfs_setpos(file, outarg.offset, inode->i_sb->s_maxbytes); fallback: err = fuse_update_attributes(inode, file, STATX_SIZE); if (!err) return generic_file_llseek(file, offset, whence); else return err; } static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence) { loff_t retval; struct inode *inode = file_inode(file); switch (whence) { case SEEK_SET: case SEEK_CUR: /* No i_mutex protection necessary for SEEK_CUR and SEEK_SET */ retval = generic_file_llseek(file, offset, whence); break; case SEEK_END: inode_lock(inode); retval = fuse_update_attributes(inode, file, STATX_SIZE); if (!retval) retval = generic_file_llseek(file, offset, whence); inode_unlock(inode); break; case SEEK_HOLE: case SEEK_DATA: inode_lock(inode); retval = fuse_lseek(file, offset, whence); inode_unlock(inode); break; default: retval = -EINVAL; } return retval; } /* * All files which have been polled are linked to RB tree * fuse_conn->polled_files which is indexed by kh. Walk the tree and * find the matching one. */ static struct rb_node **fuse_find_polled_node(struct fuse_conn *fc, u64 kh, struct rb_node **parent_out) { struct rb_node **link = &fc->polled_files.rb_node; struct rb_node *last = NULL; while (*link) { struct fuse_file *ff; last = *link; ff = rb_entry(last, struct fuse_file, polled_node); if (kh < ff->kh) link = &last->rb_left; else if (kh > ff->kh) link = &last->rb_right; else return link; } if (parent_out) *parent_out = last; return link; } /* * The file is about to be polled. Make sure it's on the polled_files * RB tree. Note that files once added to the polled_files tree are * not removed before the file is released. This is because a file * polled once is likely to be polled again. */ static void fuse_register_polled_file(struct fuse_conn *fc, struct fuse_file *ff) { spin_lock(&fc->lock); if (RB_EMPTY_NODE(&ff->polled_node)) { struct rb_node **link, *parent; link = fuse_find_polled_node(fc, ff->kh, &parent); BUG_ON(*link); rb_link_node(&ff->polled_node, parent, link); rb_insert_color(&ff->polled_node, &fc->polled_files); } spin_unlock(&fc->lock); } __poll_t fuse_file_poll(struct file *file, poll_table *wait) { struct fuse_file *ff = file->private_data; struct fuse_mount *fm = ff->fm; struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh }; struct fuse_poll_out outarg; FUSE_ARGS(args); int err; if (fm->fc->no_poll) return DEFAULT_POLLMASK; poll_wait(file, &ff->poll_wait, wait); inarg.events = mangle_poll(poll_requested_events(wait)); /* * Ask for notification iff there's someone waiting for it. * The client may ignore the flag and always notify. */ if (waitqueue_active(&ff->poll_wait)) { inarg.flags |= FUSE_POLL_SCHEDULE_NOTIFY; fuse_register_polled_file(fm->fc, ff); } args.opcode = FUSE_POLL; args.nodeid = ff->nodeid; args.in_numargs = 1; args.in_args[0].size = sizeof(inarg); args.in_args[0].value = &inarg; args.out_numargs = 1; args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; err = fuse_simple_request(fm, &args); if (!err) return demangle_poll(outarg.revents); if (err == -ENOSYS) { fm->fc->no_poll = 1; return DEFAULT_POLLMASK; } return EPOLLERR; } EXPORT_SYMBOL_GPL(fuse_file_poll); /* * This is called from fuse_handle_notify() on FUSE_NOTIFY_POLL and * wakes up the poll waiters. */ int fuse_notify_poll_wakeup(struct fuse_conn *fc, struct fuse_notify_poll_wakeup_out *outarg) { u64 kh = outarg->kh; struct rb_node **link; spin_lock(&fc->lock); link = fuse_find_polled_node(fc, kh, NULL); if (*link) { struct fuse_file *ff; ff = rb_entry(*link, struct fuse_file, polled_node); wake_up_interruptible_sync(&ff->poll_wait); } spin_unlock(&fc->lock); return 0; } static void fuse_do_truncate(struct file *file) { struct inode *inode = file->f_mapping->host; struct iattr attr; attr.ia_valid = ATTR_SIZE; attr.ia_size = i_size_read(inode); attr.ia_file = file; attr.ia_valid |= ATTR_FILE; fuse_do_setattr(file_mnt_idmap(file), file_dentry(file), &attr, file); } static inline loff_t fuse_round_up(struct fuse_conn *fc, loff_t off) { return round_up(off, fc->max_pages << PAGE_SHIFT); } static ssize_t fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { DECLARE_COMPLETION_ONSTACK(wait); ssize_t ret = 0; struct file *file = iocb->ki_filp; struct fuse_file *ff = file->private_data; loff_t pos = 0; struct inode *inode; loff_t i_size; size_t count = iov_iter_count(iter), shortened = 0; loff_t offset = iocb->ki_pos; struct fuse_io_priv *io; pos = offset; inode = file->f_mapping->host; i_size = i_size_read(inode); if ((iov_iter_rw(iter) == READ) && (offset >= i_size)) return 0; io = kmalloc_obj(struct fuse_io_priv); if (!io) return -ENOMEM; spin_lock_init(&io->lock); kref_init(&io->refcnt); io->reqs = 1; io->bytes = -1; io->size = 0; io->offset = offset; io->write = (iov_iter_rw(iter) == WRITE); io->err = 0; /* * By default, we want to optimize all I/Os with async request * submission to the client filesystem if supported. */ io->async = ff->fm->fc->async_dio; io->iocb = iocb; io->blocking = is_sync_kiocb(iocb); /* optimization for short read */ if (io->async && !io->write && offset + count > i_size) { iov_iter_truncate(iter, fuse_round_up(ff->fm->fc, i_size - offset)); shortened = count - iov_iter_count(iter); count -= shortened; } /* * We cannot asynchronously extend the size of a file. * In such case the aio will behave exactly like sync io. */ if ((offset + count > i_size) && io->write) io->blocking = true; if (io->async && io->blocking) { /* * Additional reference to keep io around after * calling fuse_aio_complete() */ kref_get(&io->refcnt); io->done = &wait; } if (iov_iter_rw(iter) == WRITE) { ret = fuse_direct_io(io, iter, &pos, FUSE_DIO_WRITE); fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); } else { ret = __fuse_direct_read(io, iter, &pos); } iov_iter_reexpand(iter, iov_iter_count(iter) + shortened); if (io->async) { bool blocking = io->blocking; fuse_aio_complete(io, ret < 0 ? ret : 0, -1); /* we have a non-extending, async request, so return */ if (!blocking) return -EIOCBQUEUED; wait_for_completion(&wait); ret = fuse_get_res_by_io(io); } kref_put(&io->refcnt, fuse_io_release); if (iov_iter_rw(iter) == WRITE) { fuse_write_update_attr(inode, pos, ret); /* For extending writes we already hold exclusive lock */ if (ret < 0 && offset + count > i_size) fuse_do_truncate(file); } return ret; } static int fuse_writeback_range(struct inode *inode, loff_t start, loff_t end) { int err = filemap_write_and_wait_range(inode->i_mapping, start, LLONG_MAX); if (!err) fuse_sync_writes(inode); return err; } static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, loff_t length) { struct fuse_file *ff = file->private_data; struct inode *inode = file_inode(file); struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_mount *fm = ff->fm; FUSE_ARGS(args); struct fuse_fallocate_in inarg = { .fh = ff->fh, .offset = offset, .length = length, .mode = mode }; int err; bool block_faults = FUSE_IS_DAX(inode) && (!(mode & FALLOC_FL_KEEP_SIZE) || (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE))); if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) return -EOPNOTSUPP; if (fm->fc->no_fallocate) return -EOPNOTSUPP; inode_lock(inode); if (block_faults) { filemap_invalidate_lock(inode->i_mapping); err = fuse_dax_break_layouts(inode, 0, -1); if (err) goto out; } if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) { loff_t endbyte = offset + length - 1; err = fuse_writeback_range(inode, offset, endbyte); if (err) goto out; } if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + length > i_size_read(inode)) { err = inode_newsize_ok(inode, offset + length); if (err) goto out; } err = file_modified(file); if (err) goto out; if (!(mode & FALLOC_FL_KEEP_SIZE)) set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); args.opcode = FUSE_FALLOCATE; args.nodeid = ff->nodeid; args.in_numargs = 1; args.in_args[0].size = sizeof(inarg); args.in_args[0].value = &inarg; err = fuse_simple_request(fm, &args); if (err == -ENOSYS) { fm->fc->no_fallocate = 1; err = -EOPNOTSUPP; } if (err) goto out; /* we could have extended the file */ if (!(mode & FALLOC_FL_KEEP_SIZE)) { if (fuse_write_update_attr(inode, offset + length, length)) file_update_time(file); } if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) truncate_pagecache_range(inode, offset, offset + length - 1); fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); out: if (!(mode & FALLOC_FL_KEEP_SIZE)) clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); if (block_faults) filemap_invalidate_unlock(inode->i_mapping); inode_unlock(inode); fuse_flush_time_update(inode); return err; } static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t len, unsigned int flags) { struct fuse_file *ff_in = file_in->private_data; struct fuse_file *ff_out = file_out->private_data; struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); struct fuse_inode *fi_out = get_fuse_inode(inode_out); struct fuse_mount *fm = ff_in->fm; struct fuse_conn *fc = fm->fc; FUSE_ARGS(args); struct fuse_copy_file_range_in inarg = { .fh_in = ff_in->fh, .off_in = pos_in, .nodeid_out = ff_out->nodeid, .fh_out = ff_out->fh, .off_out = pos_out, .len = len, .flags = flags }; struct fuse_write_out outarg; struct fuse_copy_file_range_out outarg_64; u64 bytes_copied; ssize_t err; /* mark unstable when write-back is not used, and file_out gets * extended */ bool is_unstable = (!fc->writeback_cache) && ((pos_out + len) > inode_out->i_size); if (fc->no_copy_file_range) return -EOPNOTSUPP; if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb) return -EXDEV; inode_lock(inode_in); err = fuse_writeback_range(inode_in, pos_in, pos_in + len - 1); inode_unlock(inode_in); if (err) return err; inode_lock(inode_out); err = file_modified(file_out); if (err) goto out; /* * Write out dirty pages in the destination file before sending the COPY * request to userspace. After the request is completed, truncate off * pages (including partial ones) from the cache that have been copied, * since these contain stale data at that point. * * This should be mostly correct, but if the COPY writes to partial * pages (at the start or end) and the parts not covered by the COPY are * written through a memory map after calling fuse_writeback_range(), * then these partial page modifications will be lost on truncation. * * It is unlikely that someone would rely on such mixed style * modifications. Yet this does give less guarantees than if the * copying was performed with write(2). * * To fix this a mapping->invalidate_lock could be used to prevent new * faults while the copy is ongoing. */ err = fuse_writeback_range(inode_out, pos_out, pos_out + len - 1); if (err) goto out; if (is_unstable) set_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state); args.opcode = FUSE_COPY_FILE_RANGE_64; args.nodeid = ff_in->nodeid; args.in_numargs = 1; args.in_args[0].size = sizeof(inarg); args.in_args[0].value = &inarg; args.out_numargs = 1; args.out_args[0].size = sizeof(outarg_64); args.out_args[0].value = &outarg_64; if (fc->no_copy_file_range_64) { fallback: /* Fall back to old op that can't handle large copy length */ args.opcode = FUSE_COPY_FILE_RANGE; args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; inarg.len = len = min_t(size_t, len, UINT_MAX & PAGE_MASK); } err = fuse_simple_request(fm, &args); if (err == -ENOSYS) { if (fc->no_copy_file_range_64) { fc->no_copy_file_range = 1; err = -EOPNOTSUPP; } else { fc->no_copy_file_range_64 = 1; goto fallback; } } if (err) goto out; bytes_copied = fc->no_copy_file_range_64 ? outarg.size : outarg_64.bytes_copied; if (bytes_copied > len) { err = -EIO; goto out; } truncate_inode_pages_range(inode_out->i_mapping, ALIGN_DOWN(pos_out, PAGE_SIZE), ALIGN(pos_out + bytes_copied, PAGE_SIZE) - 1); file_update_time(file_out); fuse_write_update_attr(inode_out, pos_out + bytes_copied, bytes_copied); err = bytes_copied; out: if (is_unstable) clear_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state); inode_unlock(inode_out); file_accessed(file_in); fuse_flush_time_update(inode_out); return err; } static ssize_t fuse_copy_file_range(struct file *src_file, loff_t src_off, struct file *dst_file, loff_t dst_off, size_t len, unsigned int flags) { ssize_t ret; ret = __fuse_copy_file_range(src_file, src_off, dst_file, dst_off, len, flags); if (ret == -EOPNOTSUPP || ret == -EXDEV) ret = splice_copy_file_range(src_file, src_off, dst_file, dst_off, len); return ret; } static const struct file_operations fuse_file_operations = { .llseek = fuse_file_llseek, .read_iter = fuse_file_read_iter, .write_iter = fuse_file_write_iter, .mmap = fuse_file_mmap, .open = fuse_open, .flush = fuse_flush, .release = fuse_release, .fsync = fuse_fsync, .lock = fuse_file_lock, .get_unmapped_area = thp_get_unmapped_area, .flock = fuse_file_flock, .splice_read = fuse_splice_read, .splice_write = fuse_splice_write, .unlocked_ioctl = fuse_file_ioctl, .compat_ioctl = fuse_file_compat_ioctl, .poll = fuse_file_poll, .fallocate = fuse_file_fallocate, .copy_file_range = fuse_copy_file_range, .setlease = generic_setlease, }; static const struct address_space_operations fuse_file_aops = { .read_folio = fuse_read_folio, .readahead = fuse_readahead, .writepages = fuse_writepages, .launder_folio = fuse_launder_folio, .dirty_folio = iomap_dirty_folio, .release_folio = iomap_release_folio, .invalidate_folio = iomap_invalidate_folio, .is_partially_uptodate = iomap_is_partially_uptodate, .migrate_folio = filemap_migrate_folio, .bmap = fuse_bmap, .direct_IO = fuse_direct_IO, }; void fuse_init_file_inode(struct inode *inode, unsigned int flags) { struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = get_fuse_conn(inode); inode->i_fop = &fuse_file_operations; inode->i_data.a_ops = &fuse_file_aops; if (fc->writeback_cache) mapping_set_writeback_may_deadlock_on_reclaim(&inode->i_data); INIT_LIST_HEAD(&fi->write_files); INIT_LIST_HEAD(&fi->queued_writes); fi->writectr = 0; fi->iocachectr = 0; init_waitqueue_head(&fi->page_waitq); init_waitqueue_head(&fi->direct_io_waitq); if (IS_ENABLED(CONFIG_FUSE_DAX)) fuse_dax_inode_init(inode, flags); }
2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 // SPDX-License-Identifier: (BSD-3-Clause OR GPL-2.0-only) /* Copyright(c) 2014 - 2020 Intel Corporation */ #include <crypto/algapi.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/fs.h> #include <linux/bitops.h> #include <linux/pci.h> #include <linux/cdev.h> #include <linux/uaccess.h> #include "adf_accel_devices.h" #include "adf_common_drv.h" #include "adf_cfg.h" #include "adf_cfg_common.h" #include "adf_cfg_user.h" #define ADF_CFG_MAX_SECTION 512 #define ADF_CFG_MAX_KEY_VAL 256 #define DEVICE_NAME "qat_adf_ctl" static DEFINE_MUTEX(adf_ctl_lock); static long adf_ctl_ioctl(struct file *fp, unsigned int cmd, unsigned long arg); static const struct file_operations adf_ctl_ops = { .owner = THIS_MODULE, .unlocked_ioctl = adf_ctl_ioctl, .compat_ioctl = compat_ptr_ioctl, }; static const struct class adf_ctl_class = { .name = DEVICE_NAME, }; struct adf_ctl_drv_info { unsigned int major; struct cdev drv_cdev; }; static struct adf_ctl_drv_info adf_ctl_drv; static void adf_chr_drv_destroy(void) { device_destroy(&adf_ctl_class, MKDEV(adf_ctl_drv.major, 0)); cdev_del(&adf_ctl_drv.drv_cdev); class_unregister(&adf_ctl_class); unregister_chrdev_region(MKDEV(adf_ctl_drv.major, 0), 1); } static int adf_chr_drv_create(void) { dev_t dev_id; struct device *drv_device; int ret; if (alloc_chrdev_region(&dev_id, 0, 1, DEVICE_NAME)) { pr_err("QAT: unable to allocate chrdev region\n"); return -EFAULT; } ret = class_register(&adf_ctl_class); if (ret) goto err_chrdev_unreg; adf_ctl_drv.major = MAJOR(dev_id); cdev_init(&adf_ctl_drv.drv_cdev, &adf_ctl_ops); if (cdev_add(&adf_ctl_drv.drv_cdev, dev_id, 1)) { pr_err("QAT: cdev add failed\n"); goto err_class_destr; } drv_device = device_create(&adf_ctl_class, NULL, MKDEV(adf_ctl_drv.major, 0), NULL, DEVICE_NAME); if (IS_ERR(drv_device)) { pr_err("QAT: failed to create device\n"); goto err_cdev_del; } return 0; err_cdev_del: cdev_del(&adf_ctl_drv.drv_cdev); err_class_destr: class_unregister(&adf_ctl_class); err_chrdev_unreg: unregister_chrdev_region(dev_id, 1); return -EFAULT; } static struct adf_user_cfg_ctl_data *adf_ctl_alloc_resources(unsigned long arg) { struct adf_user_cfg_ctl_data *cfg_data; cfg_data = memdup_user((void __user *)arg, sizeof(*cfg_data)); if (IS_ERR(cfg_data)) pr_err("QAT: failed to copy from user cfg_data.\n"); return cfg_data; } static int adf_add_key_value_data(struct adf_accel_dev *accel_dev, const char *section, const struct adf_user_cfg_key_val *key_val) { if (key_val->type == ADF_HEX) { long *ptr = (long *)key_val->val; long val = *ptr; if (adf_cfg_add_key_value_param(accel_dev, section, key_val->key, (void *)val, key_val->type)) { dev_err(&GET_DEV(accel_dev), "failed to add hex keyvalue.\n"); return -EFAULT; } } else { if (adf_cfg_add_key_value_param(accel_dev, section, key_val->key, key_val->val, key_val->type)) { dev_err(&GET_DEV(accel_dev), "failed to add keyvalue.\n"); return -EFAULT; } } return 0; } static int adf_copy_key_value_data(struct adf_accel_dev *accel_dev, struct adf_user_cfg_ctl_data *ctl_data) { struct adf_user_cfg_key_val key_val; struct adf_user_cfg_key_val *params_head; struct adf_user_cfg_section section, *section_head; int i, j; section_head = ctl_data->config_section; for (i = 0; section_head && i < ADF_CFG_MAX_SECTION; i++) { if (copy_from_user(&section, (void __user *)section_head, sizeof(*section_head))) { dev_err(&GET_DEV(accel_dev), "failed to copy section info\n"); goto out_err; } if (adf_cfg_section_add(accel_dev, section.name)) { dev_err(&GET_DEV(accel_dev), "failed to add section.\n"); goto out_err; } params_head = section.params; for (j = 0; params_head && j < ADF_CFG_MAX_KEY_VAL; j++) { if (copy_from_user(&key_val, (void __user *)params_head, sizeof(key_val))) { dev_err(&GET_DEV(accel_dev), "Failed to copy keyvalue.\n"); goto out_err; } if (adf_add_key_value_data(accel_dev, section.name, &key_val)) { goto out_err; } params_head = key_val.next; } section_head = section.next; } return 0; out_err: adf_cfg_del_all(accel_dev); return -EFAULT; } static int adf_ctl_ioctl_dev_config(struct file *fp, unsigned int cmd, unsigned long arg) { struct adf_user_cfg_ctl_data *ctl_data; struct adf_accel_dev *accel_dev; int ret = 0; ctl_data = adf_ctl_alloc_resources(arg); if (IS_ERR(ctl_data)) return PTR_ERR(ctl_data); accel_dev = adf_devmgr_get_dev_by_id(ctl_data->device_id); if (!accel_dev) { ret = -EFAULT; goto out; } if (adf_dev_started(accel_dev)) { ret = -EFAULT; goto out; } if (adf_copy_key_value_data(accel_dev, ctl_data)) { ret = -EFAULT; goto out; } set_bit(ADF_STATUS_CONFIGURED, &accel_dev->status); out: kfree(ctl_data); return ret; } static int adf_ctl_is_device_in_use(int id) { struct adf_accel_dev *dev; list_for_each_entry(dev, adf_devmgr_get_head(), list) { if (id == dev->accel_id || id == ADF_CFG_ALL_DEVICES) { if (adf_devmgr_in_reset(dev) || adf_dev_in_use(dev)) { dev_info(&GET_DEV(dev), "device qat_dev%d is busy\n", dev->accel_id); return -EBUSY; } } } return 0; } static void adf_ctl_stop_devices(u32 id) { struct adf_accel_dev *accel_dev; list_for_each_entry(accel_dev, adf_devmgr_get_head(), list) { if (id == accel_dev->accel_id || id == ADF_CFG_ALL_DEVICES) { if (!adf_dev_started(accel_dev)) continue; /* First stop all VFs */ if (!accel_dev->is_vf) continue; adf_dev_down(accel_dev); } } list_for_each_entry(accel_dev, adf_devmgr_get_head(), list) { if (id == accel_dev->accel_id || id == ADF_CFG_ALL_DEVICES) { if (!adf_dev_started(accel_dev)) continue; adf_dev_down(accel_dev); } } } static int adf_ctl_ioctl_dev_stop(struct file *fp, unsigned int cmd, unsigned long arg) { int ret; struct adf_user_cfg_ctl_data *ctl_data; ctl_data = adf_ctl_alloc_resources(arg); if (IS_ERR(ctl_data)) return PTR_ERR(ctl_data); if (adf_devmgr_verify_id(ctl_data->device_id)) { pr_err("QAT: Device %d not found\n", ctl_data->device_id); ret = -ENODEV; goto out; } ret = adf_ctl_is_device_in_use(ctl_data->device_id); if (ret) goto out; if (ctl_data->device_id == ADF_CFG_ALL_DEVICES) pr_info("QAT: Stopping all acceleration devices.\n"); else pr_info("QAT: Stopping acceleration device qat_dev%d.\n", ctl_data->device_id); adf_ctl_stop_devices(ctl_data->device_id); out: kfree(ctl_data); return ret; } static int adf_ctl_ioctl_dev_start(struct file *fp, unsigned int cmd, unsigned long arg) { int ret; struct adf_user_cfg_ctl_data *ctl_data; struct adf_accel_dev *accel_dev; ctl_data = adf_ctl_alloc_resources(arg); if (IS_ERR(ctl_data)) return PTR_ERR(ctl_data); ret = -ENODEV; accel_dev = adf_devmgr_get_dev_by_id(ctl_data->device_id); if (!accel_dev) goto out; dev_info(&GET_DEV(accel_dev), "Starting acceleration device qat_dev%d.\n", ctl_data->device_id); ret = adf_dev_up(accel_dev, false); if (ret) { dev_err(&GET_DEV(accel_dev), "Failed to start qat_dev%d\n", ctl_data->device_id); adf_dev_down(accel_dev); } out: kfree(ctl_data); return ret; } static int adf_ctl_ioctl_get_num_devices(struct file *fp, unsigned int cmd, unsigned long arg) { u32 num_devices = 0; adf_devmgr_get_num_dev(&num_devices); if (copy_to_user((void __user *)arg, &num_devices, sizeof(num_devices))) return -EFAULT; return 0; } static int adf_ctl_ioctl_get_status(struct file *fp, unsigned int cmd, unsigned long arg) { struct adf_hw_device_data *hw_data; struct adf_dev_status_info dev_info; struct adf_accel_dev *accel_dev; if (copy_from_user(&dev_info, (void __user *)arg, sizeof(struct adf_dev_status_info))) { pr_err("QAT: failed to copy from user.\n"); return -EFAULT; } accel_dev = adf_devmgr_get_dev_by_id(dev_info.accel_id); if (!accel_dev) return -ENODEV; hw_data = accel_dev->hw_device; dev_info.state = adf_dev_started(accel_dev) ? DEV_UP : DEV_DOWN; dev_info.num_ae = hw_data->get_num_aes(hw_data); dev_info.num_accel = hw_data->get_num_accels(hw_data); dev_info.num_logical_accel = hw_data->num_logical_accel; dev_info.banks_per_accel = hw_data->num_banks / hw_data->num_logical_accel; strscpy(dev_info.name, hw_data->dev_class->name, sizeof(dev_info.name)); dev_info.instance_id = hw_data->instance_id; dev_info.type = hw_data->dev_class->type; dev_info.bus = accel_to_pci_dev(accel_dev)->bus->number; dev_info.dev = PCI_SLOT(accel_to_pci_dev(accel_dev)->devfn); dev_info.fun = PCI_FUNC(accel_to_pci_dev(accel_dev)->devfn); if (copy_to_user((void __user *)arg, &dev_info, sizeof(struct adf_dev_status_info))) { dev_err(&GET_DEV(accel_dev), "failed to copy status.\n"); return -EFAULT; } return 0; } static long adf_ctl_ioctl(struct file *fp, unsigned int cmd, unsigned long arg) { int ret; if (mutex_lock_interruptible(&adf_ctl_lock)) return -EFAULT; switch (cmd) { case IOCTL_CONFIG_SYS_RESOURCE_PARAMETERS: ret = adf_ctl_ioctl_dev_config(fp, cmd, arg); break; case IOCTL_STOP_ACCEL_DEV: ret = adf_ctl_ioctl_dev_stop(fp, cmd, arg); break; case IOCTL_START_ACCEL_DEV: ret = adf_ctl_ioctl_dev_start(fp, cmd, arg); break; case IOCTL_GET_NUM_DEVICES: ret = adf_ctl_ioctl_get_num_devices(fp, cmd, arg); break; case IOCTL_STATUS_ACCEL_DEV: ret = adf_ctl_ioctl_get_status(fp, cmd, arg); break; default: pr_err_ratelimited("QAT: Invalid ioctl %d\n", cmd); ret = -EFAULT; break; } mutex_unlock(&adf_ctl_lock); return ret; } static int __init adf_register_ctl_device_driver(void) { if (adf_chr_drv_create()) goto err_chr_dev; if (adf_init_misc_wq()) goto err_misc_wq; if (adf_init_aer()) goto err_aer; if (adf_init_pf_wq()) goto err_pf_wq; if (adf_init_vf_wq()) goto err_vf_wq; if (qat_crypto_register()) goto err_crypto_register; if (qat_compression_register()) goto err_compression_register; return 0; err_compression_register: qat_crypto_unregister(); err_crypto_register: adf_exit_vf_wq(); err_vf_wq: adf_exit_pf_wq(); err_pf_wq: adf_exit_aer(); err_aer: adf_exit_misc_wq(); err_misc_wq: adf_chr_drv_destroy(); err_chr_dev: mutex_destroy(&adf_ctl_lock); return -EFAULT; } static void __exit adf_unregister_ctl_device_driver(void) { adf_chr_drv_destroy(); adf_exit_misc_wq(); adf_exit_aer(); adf_exit_vf_wq(); adf_exit_pf_wq(); qat_crypto_unregister(); qat_compression_unregister(); adf_clean_vf_map(false); mutex_destroy(&adf_ctl_lock); } module_init(adf_register_ctl_device_driver); module_exit(adf_unregister_ctl_device_driver); MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Intel"); MODULE_DESCRIPTION("Intel(R) QuickAssist Technology"); MODULE_ALIAS_CRYPTO("intel_qat"); MODULE_VERSION(ADF_DRV_VERSION); MODULE_IMPORT_NS("CRYPTO_INTERNAL");
2 2 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 /* SPDX-License-Identifier: GPL-2.0 */ /* * linux/fs/ufs/util.h * * Copyright (C) 1998 * Daniel Pirkl <daniel.pirkl@email.cz> * Charles University, Faculty of Mathematics and Physics */ #include <linux/buffer_head.h> #include <linux/fs.h> #include "swab.h" /* * functions used for retyping */ static inline struct ufs_buffer_head *UCPI_UBH(struct ufs_cg_private_info *cpi) { return &cpi->c_ubh; } static inline struct ufs_buffer_head *USPI_UBH(struct ufs_sb_private_info *spi) { return &spi->s_ubh; } /* * macros used for accessing structures */ static inline s32 ufs_get_fs_state(struct super_block *sb, struct ufs_super_block_first *usb1, struct ufs_super_block_third *usb3) { switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) { case UFS_ST_SUNOS: if (fs32_to_cpu(sb, usb3->fs_postblformat) == UFS_42POSTBLFMT) return fs32_to_cpu(sb, usb1->fs_u0.fs_sun.fs_state); fallthrough; /* to UFS_ST_SUN */ case UFS_ST_SUN: return fs32_to_cpu(sb, usb3->fs_un2.fs_sun.fs_state); case UFS_ST_SUNx86: return fs32_to_cpu(sb, usb1->fs_u1.fs_sunx86.fs_state); case UFS_ST_44BSD: default: return fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_state); } } static inline void ufs_set_fs_state(struct super_block *sb, struct ufs_super_block_first *usb1, struct ufs_super_block_third *usb3, s32 value) { switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) { case UFS_ST_SUNOS: if (fs32_to_cpu(sb, usb3->fs_postblformat) == UFS_42POSTBLFMT) { usb1->fs_u0.fs_sun.fs_state = cpu_to_fs32(sb, value); break; } fallthrough; /* to UFS_ST_SUN */ case UFS_ST_SUN: usb3->fs_un2.fs_sun.fs_state = cpu_to_fs32(sb, value); break; case UFS_ST_SUNx86: usb1->fs_u1.fs_sunx86.fs_state = cpu_to_fs32(sb, value); break; case UFS_ST_44BSD: usb3->fs_un2.fs_44.fs_state = cpu_to_fs32(sb, value); break; } } static inline u32 ufs_get_fs_npsect(struct super_block *sb, struct ufs_super_block_first *usb1, struct ufs_super_block_third *usb3) { if ((UFS_SB(sb)->s_flags & UFS_ST_MASK) == UFS_ST_SUNx86) return fs32_to_cpu(sb, usb3->fs_un2.fs_sunx86.fs_npsect); else return fs32_to_cpu(sb, usb1->fs_u1.fs_sun.fs_npsect); } static inline u64 ufs_get_fs_qbmask(struct super_block *sb, struct ufs_super_block_third *usb3) { __fs64 tmp; switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) { case UFS_ST_SUNOS: case UFS_ST_SUN: ((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_sun.fs_qbmask[0]; ((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_sun.fs_qbmask[1]; break; case UFS_ST_SUNx86: ((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_sunx86.fs_qbmask[0]; ((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_sunx86.fs_qbmask[1]; break; case UFS_ST_44BSD: ((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_44.fs_qbmask[0]; ((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_44.fs_qbmask[1]; break; } return fs64_to_cpu(sb, tmp); } static inline u64 ufs_get_fs_qfmask(struct super_block *sb, struct ufs_super_block_third *usb3) { __fs64 tmp; switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) { case UFS_ST_SUNOS: case UFS_ST_SUN: ((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_sun.fs_qfmask[0]; ((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_sun.fs_qfmask[1]; break; case UFS_ST_SUNx86: ((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_sunx86.fs_qfmask[0]; ((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_sunx86.fs_qfmask[1]; break; case UFS_ST_44BSD: ((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_44.fs_qfmask[0]; ((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_44.fs_qfmask[1]; break; } return fs64_to_cpu(sb, tmp); } static inline u16 ufs_get_de_namlen(struct super_block *sb, struct ufs_dir_entry *de) { if ((UFS_SB(sb)->s_flags & UFS_DE_MASK) == UFS_DE_OLD) return fs16_to_cpu(sb, de->d_u.d_namlen); else return de->d_u.d_44.d_namlen; /* XXX this seems wrong */ } static inline void ufs_set_de_namlen(struct super_block *sb, struct ufs_dir_entry *de, u16 value) { if ((UFS_SB(sb)->s_flags & UFS_DE_MASK) == UFS_DE_OLD) de->d_u.d_namlen = cpu_to_fs16(sb, value); else de->d_u.d_44.d_namlen = value; /* XXX this seems wrong */ } static inline void ufs_set_de_type(struct super_block *sb, struct ufs_dir_entry *de, int mode) { if ((UFS_SB(sb)->s_flags & UFS_DE_MASK) != UFS_DE_44BSD) return; /* * TODO turn this into a table lookup */ switch (mode & S_IFMT) { case S_IFSOCK: de->d_u.d_44.d_type = DT_SOCK; break; case S_IFLNK: de->d_u.d_44.d_type = DT_LNK; break; case S_IFREG: de->d_u.d_44.d_type = DT_REG; break; case S_IFBLK: de->d_u.d_44.d_type = DT_BLK; break; case S_IFDIR: de->d_u.d_44.d_type = DT_DIR; break; case S_IFCHR: de->d_u.d_44.d_type = DT_CHR; break; case S_IFIFO: de->d_u.d_44.d_type = DT_FIFO; break; default: de->d_u.d_44.d_type = DT_UNKNOWN; } } static inline u32 ufs_get_inode_uid(struct super_block *sb, struct ufs_inode *inode) { switch (UFS_SB(sb)->s_flags & UFS_UID_MASK) { case UFS_UID_44BSD: return fs32_to_cpu(sb, inode->ui_u3.ui_44.ui_uid); case UFS_UID_EFT: if (inode->ui_u1.oldids.ui_suid == 0xFFFF) return fs32_to_cpu(sb, inode->ui_u3.ui_sun.ui_uid); fallthrough; default: return fs16_to_cpu(sb, inode->ui_u1.oldids.ui_suid); } } static inline void ufs_set_inode_uid(struct super_block *sb, struct ufs_inode *inode, u32 value) { switch (UFS_SB(sb)->s_flags & UFS_UID_MASK) { case UFS_UID_44BSD: inode->ui_u3.ui_44.ui_uid = cpu_to_fs32(sb, value); inode->ui_u1.oldids.ui_suid = cpu_to_fs16(sb, value); break; case UFS_UID_EFT: inode->ui_u3.ui_sun.ui_uid = cpu_to_fs32(sb, value); if (value > 0xFFFF) value = 0xFFFF; fallthrough; default: inode->ui_u1.oldids.ui_suid = cpu_to_fs16(sb, value); break; } } static inline u32 ufs_get_inode_gid(struct super_block *sb, struct ufs_inode *inode) { switch (UFS_SB(sb)->s_flags & UFS_UID_MASK) { case UFS_UID_44BSD: return fs32_to_cpu(sb, inode->ui_u3.ui_44.ui_gid); case UFS_UID_EFT: if (inode->ui_u1.oldids.ui_sgid == 0xFFFF) return fs32_to_cpu(sb, inode->ui_u3.ui_sun.ui_gid); fallthrough; default: return fs16_to_cpu(sb, inode->ui_u1.oldids.ui_sgid); } } static inline void ufs_set_inode_gid(struct super_block *sb, struct ufs_inode *inode, u32 value) { switch (UFS_SB(sb)->s_flags & UFS_UID_MASK) { case UFS_UID_44BSD: inode->ui_u3.ui_44.ui_gid = cpu_to_fs32(sb, value); inode->ui_u1.oldids.ui_sgid = cpu_to_fs16(sb, value); break; case UFS_UID_EFT: inode->ui_u3.ui_sun.ui_gid = cpu_to_fs32(sb, value); if (value > 0xFFFF) value = 0xFFFF; fallthrough; default: inode->ui_u1.oldids.ui_sgid = cpu_to_fs16(sb, value); break; } } dev_t ufs_get_inode_dev(struct super_block *, struct ufs_inode_info *); void ufs_set_inode_dev(struct super_block *, struct ufs_inode_info *, dev_t); int ufs_prepare_chunk(struct folio *folio, loff_t pos, unsigned len); /* * These functions manipulate ufs buffers */ #define ubh_bread(sb,fragment,size) _ubh_bread_(uspi,sb,fragment,size) extern struct ufs_buffer_head * _ubh_bread_(struct ufs_sb_private_info *, struct super_block *, u64 , u64); extern struct ufs_buffer_head * ubh_bread_uspi(struct ufs_sb_private_info *, struct super_block *, u64, u64); extern void ubh_brelse (struct ufs_buffer_head *); extern void ubh_brelse_uspi (struct ufs_sb_private_info *); extern void ubh_mark_buffer_dirty (struct ufs_buffer_head *); extern void ubh_sync_block(struct ufs_buffer_head *); extern void ubh_bforget (struct ufs_buffer_head *); extern int ubh_buffer_dirty (struct ufs_buffer_head *); /* This functions works with cache pages*/ struct folio *ufs_get_locked_folio(struct address_space *mapping, pgoff_t index); static inline void ufs_put_locked_folio(struct folio *folio) { folio_unlock(folio); folio_put(folio); } /* * macros and inline function to get important structures from ufs_sb_private_info */ static inline void *get_usb_offset(struct ufs_sb_private_info *uspi, unsigned int offset) { unsigned int index; index = offset >> uspi->s_fshift; offset &= ~uspi->s_fmask; return uspi->s_ubh.bh[index]->b_data + offset; } #define ubh_get_usb_first(uspi) \ ((struct ufs_super_block_first *)get_usb_offset((uspi), 0)) #define ubh_get_usb_second(uspi) \ ((struct ufs_super_block_second *)get_usb_offset((uspi), UFS_SECTOR_SIZE)) #define ubh_get_usb_third(uspi) \ ((struct ufs_super_block_third *)get_usb_offset((uspi), 2*UFS_SECTOR_SIZE)) #define ubh_get_ucg(ubh) \ ((struct ufs_cylinder_group *)((ubh)->bh[0]->b_data)) /* * Extract byte from ufs_buffer_head * Extract the bits for a block from a map inside ufs_buffer_head */ #define ubh_get_addr8(ubh,begin) \ ((u8*)(ubh)->bh[(begin) >> uspi->s_fshift]->b_data + \ ((begin) & ~uspi->s_fmask)) #define ubh_get_addr16(ubh,begin) \ (((__fs16*)((ubh)->bh[(begin) >> (uspi->s_fshift-1)]->b_data)) + \ ((begin) & ((uspi->fsize>>1) - 1))) #define ubh_get_addr32(ubh,begin) \ (((__fs32*)((ubh)->bh[(begin) >> (uspi->s_fshift-2)]->b_data)) + \ ((begin) & ((uspi->s_fsize>>2) - 1))) #define ubh_get_addr64(ubh,begin) \ (((__fs64*)((ubh)->bh[(begin) >> (uspi->s_fshift-3)]->b_data)) + \ ((begin) & ((uspi->s_fsize>>3) - 1))) #define ubh_get_addr ubh_get_addr8 static inline void *ubh_get_data_ptr(struct ufs_sb_private_info *uspi, struct ufs_buffer_head *ubh, u64 blk) { if (uspi->fs_magic == UFS2_MAGIC) return ubh_get_addr64(ubh, blk); else return ubh_get_addr32(ubh, blk); } #define ubh_blkmap(ubh,begin,bit) \ ((*ubh_get_addr(ubh, (begin) + ((bit) >> 3)) >> ((bit) & 7)) & (0xff >> (UFS_MAXFRAG - uspi->s_fpb))) static inline u64 ufs_freefrags(struct ufs_sb_private_info *uspi) { return ufs_blkstofrags(uspi->cs_total.cs_nbfree) + uspi->cs_total.cs_nffree; } /* * Macros to access cylinder group array structures */ #define ubh_cg_blktot(ucpi,cylno) \ (*((__fs32*)ubh_get_addr(UCPI_UBH(ucpi), (ucpi)->c_btotoff + ((cylno) << 2)))) #define ubh_cg_blks(ucpi,cylno,rpos) \ (*((__fs16*)ubh_get_addr(UCPI_UBH(ucpi), \ (ucpi)->c_boff + (((cylno) * uspi->s_nrpos + (rpos)) << 1 )))) /* * Bitmap operations * These functions work like classical bitmap operations. * The difference is that we don't have the whole bitmap * in one contiguous chunk of memory, but in several buffers. * The parameters of each function are super_block, ufs_buffer_head and * position of the beginning of the bitmap. */ #define ubh_setbit(ubh,begin,bit) \ (*ubh_get_addr(ubh, (begin) + ((bit) >> 3)) |= (1 << ((bit) & 7))) #define ubh_clrbit(ubh,begin,bit) \ (*ubh_get_addr (ubh, (begin) + ((bit) >> 3)) &= ~(1 << ((bit) & 7))) #define ubh_isset(ubh,begin,bit) \ (*ubh_get_addr (ubh, (begin) + ((bit) >> 3)) & (1 << ((bit) & 7))) #define ubh_isclr(ubh,begin,bit) (!ubh_isset(ubh,begin,bit)) #define ubh_find_first_zero_bit(ubh,begin,size) _ubh_find_next_zero_bit_(uspi,ubh,begin,size,0) #define ubh_find_next_zero_bit(ubh,begin,size,offset) _ubh_find_next_zero_bit_(uspi,ubh,begin,size,offset) static inline unsigned _ubh_find_next_zero_bit_( struct ufs_sb_private_info * uspi, struct ufs_buffer_head * ubh, unsigned begin, unsigned size, unsigned offset) { unsigned base, count, pos; size -= offset; begin <<= 3; offset += begin; base = offset >> uspi->s_bpfshift; offset &= uspi->s_bpfmask; for (;;) { count = min_t(unsigned int, size + offset, uspi->s_bpf); size -= count - offset; pos = find_next_zero_bit_le(ubh->bh[base]->b_data, count, offset); if (pos < count || !size) break; base++; offset = 0; } return (base << uspi->s_bpfshift) + pos - begin; } static inline unsigned find_last_zero_bit (unsigned char * bitmap, unsigned size, unsigned offset) { unsigned bit, i; unsigned char * mapp; unsigned char map; mapp = bitmap + (size >> 3); map = *mapp--; bit = 1 << (size & 7); for (i = size; i > offset; i--) { if ((map & bit) == 0) break; if ((i & 7) != 0) { bit >>= 1; } else { map = *mapp--; bit = 1 << 7; } } return i; } #define ubh_find_last_zero_bit(ubh,begin,size,offset) _ubh_find_last_zero_bit_(uspi,ubh,begin,size,offset) static inline unsigned _ubh_find_last_zero_bit_( struct ufs_sb_private_info * uspi, struct ufs_buffer_head * ubh, unsigned begin, unsigned start, unsigned end) { unsigned base, count, pos, size; size = start - end; begin <<= 3; start += begin; base = start >> uspi->s_bpfshift; start &= uspi->s_bpfmask; for (;;) { count = min_t(unsigned int, size + (uspi->s_bpf - start), uspi->s_bpf) - (uspi->s_bpf - start); size -= count; pos = find_last_zero_bit (ubh->bh[base]->b_data, start, start - count); if (pos > start - count || !size) break; base--; start = uspi->s_bpf; } return (base << uspi->s_bpfshift) + pos - begin; } static inline int ubh_isblockset(struct ufs_sb_private_info *uspi, struct ufs_cg_private_info *ucpi, unsigned int frag) { struct ufs_buffer_head *ubh = UCPI_UBH(ucpi); u8 *p = ubh_get_addr(ubh, ucpi->c_freeoff + (frag >> 3)); u8 mask; switch (uspi->s_fpb) { case 8: return *p == 0xff; case 4: mask = 0x0f << (frag & 4); return (*p & mask) == mask; case 2: mask = 0x03 << (frag & 6); return (*p & mask) == mask; case 1: mask = 0x01 << (frag & 7); return (*p & mask) == mask; } return 0; } static inline void ubh_clrblock(struct ufs_sb_private_info *uspi, struct ufs_cg_private_info *ucpi, unsigned int frag) { struct ufs_buffer_head *ubh = UCPI_UBH(ucpi); u8 *p = ubh_get_addr(ubh, ucpi->c_freeoff + (frag >> 3)); switch (uspi->s_fpb) { case 8: *p = 0x00; return; case 4: *p &= ~(0x0f << (frag & 4)); return; case 2: *p &= ~(0x03 << (frag & 6)); return; case 1: *p &= ~(0x01 << (frag & 7)); return; } } static inline void ubh_setblock(struct ufs_sb_private_info * uspi, struct ufs_cg_private_info *ucpi, unsigned int frag) { struct ufs_buffer_head *ubh = UCPI_UBH(ucpi); u8 *p = ubh_get_addr(ubh, ucpi->c_freeoff + (frag >> 3)); switch (uspi->s_fpb) { case 8: *p = 0xff; return; case 4: *p |= 0x0f << (frag & 4); return; case 2: *p |= 0x03 << (frag & 6); return; case 1: *p |= 0x01 << (frag & 7); return; } } static inline void ufs_fragacct (struct super_block * sb, unsigned blockmap, __fs32 * fraglist, int cnt) { struct ufs_sb_private_info * uspi; unsigned fragsize, pos; uspi = UFS_SB(sb)->s_uspi; fragsize = 0; for (pos = 0; pos < uspi->s_fpb; pos++) { if (blockmap & (1 << pos)) { fragsize++; } else if (fragsize > 0) { fs32_add(sb, &fraglist[fragsize], cnt); fragsize = 0; } } if (fragsize > 0 && fragsize < uspi->s_fpb) fs32_add(sb, &fraglist[fragsize], cnt); } static inline void *ufs_get_direct_data_ptr(struct ufs_sb_private_info *uspi, struct ufs_inode_info *ufsi, unsigned blk) { BUG_ON(blk > UFS_TIND_BLOCK); return uspi->fs_magic == UFS2_MAGIC ? (void *)&ufsi->i_u1.u2_i_data[blk] : (void *)&ufsi->i_u1.i_data[blk]; } static inline u64 ufs_data_ptr_to_cpu(struct super_block *sb, void *p) { return UFS_SB(sb)->s_uspi->fs_magic == UFS2_MAGIC ? fs64_to_cpu(sb, *(__fs64 *)p) : fs32_to_cpu(sb, *(__fs32 *)p); } static inline void ufs_cpu_to_data_ptr(struct super_block *sb, void *p, u64 val) { if (UFS_SB(sb)->s_uspi->fs_magic == UFS2_MAGIC) *(__fs64 *)p = cpu_to_fs64(sb, val); else *(__fs32 *)p = cpu_to_fs32(sb, val); } static inline void ufs_data_ptr_clear(struct ufs_sb_private_info *uspi, void *p) { if (uspi->fs_magic == UFS2_MAGIC) *(__fs64 *)p = 0; else *(__fs32 *)p = 0; } static inline int ufs_is_data_ptr_zero(struct ufs_sb_private_info *uspi, void *p) { if (uspi->fs_magic == UFS2_MAGIC) return *(__fs64 *)p == 0; else return *(__fs32 *)p == 0; } static inline __fs32 ufs_get_seconds(struct super_block *sbp) { time64_t now = ktime_get_real_seconds(); /* Signed 32-bit interpretation wraps around in 2038, which * happens in ufs1 inode stamps but not ufs2 using 64-bits * stamps. For superblock and blockgroup, let's assume * unsigned 32-bit stamps, which are good until y2106. * Wrap around rather than clamp here to make the dirty * file system detection work in the superblock stamp. */ return cpu_to_fs32(sbp, lower_32_bits(now)); }
39 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef IO_URING_MEMMAP_H #define IO_URING_MEMMAP_H #define IORING_MAP_OFF_PARAM_REGION 0x20000000ULL #define IORING_MAP_OFF_ZCRX_REGION 0x30000000ULL #define IORING_OFF_ZCRX_SHIFT 16 struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages); #ifndef CONFIG_MMU unsigned int io_uring_nommu_mmap_capabilities(struct file *file); #endif unsigned long io_uring_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); int io_uring_mmap(struct file *file, struct vm_area_struct *vma); void io_free_region(struct user_struct *user, struct io_mapped_region *mr); int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, struct io_uring_region_desc *reg, unsigned long mmap_offset); static inline void *io_region_get_ptr(struct io_mapped_region *mr) { return mr->ptr; } static inline bool io_region_is_set(struct io_mapped_region *mr) { return !!mr->nr_pages; } static inline void io_region_publish(struct io_ring_ctx *ctx, struct io_mapped_region *src_region, struct io_mapped_region *dst_region) { /* * Once published mmap can find it without holding only the ->mmap_lock * and not ->uring_lock. */ guard(mutex)(&ctx->mmap_lock); *dst_region = *src_region; } static inline size_t io_region_size(struct io_mapped_region *mr) { return (size_t) mr->nr_pages << PAGE_SHIFT; } #endif
2 1 1 6 1 1 2 2 2 2 2 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar * Copyright (C) 2005-2006, Thomas Gleixner, Russell King * * This file contains the interrupt descriptor management code. Detailed * information is available in Documentation/core-api/genericirq.rst * */ #include <linux/irq.h> #include <linux/slab.h> #include <linux/export.h> #include <linux/interrupt.h> #include <linux/kernel_stat.h> #include <linux/maple_tree.h> #include <linux/irqdomain.h> #include <linux/sysfs.h> #include <linux/string_choices.h> #include "internals.h" /* * lockdep: we want to handle all irq_desc locks as a single lock-class: */ static struct lock_class_key irq_desc_lock_class; #if defined(CONFIG_SMP) static int __init irq_affinity_setup(char *str) { alloc_bootmem_cpumask_var(&irq_default_affinity); cpulist_parse(str, irq_default_affinity); /* * Set at least the boot cpu. We don't want to end up with * bugreports caused by random commandline masks */ cpumask_set_cpu(smp_processor_id(), irq_default_affinity); return 1; } __setup("irqaffinity=", irq_affinity_setup); static void __init init_irq_default_affinity(void) { if (!cpumask_available(irq_default_affinity)) zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); if (cpumask_empty(irq_default_affinity)) cpumask_setall(irq_default_affinity); } #else static void __init init_irq_default_affinity(void) { } #endif #ifdef CONFIG_SMP static int alloc_masks(struct irq_desc *desc, int node) { if (!zalloc_cpumask_var_node(&desc->irq_common_data.affinity, GFP_KERNEL, node)) return -ENOMEM; #ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK if (!zalloc_cpumask_var_node(&desc->irq_common_data.effective_affinity, GFP_KERNEL, node)) { free_cpumask_var(desc->irq_common_data.affinity); return -ENOMEM; } #endif #ifdef CONFIG_GENERIC_PENDING_IRQ if (!zalloc_cpumask_var_node(&desc->pending_mask, GFP_KERNEL, node)) { #ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK free_cpumask_var(desc->irq_common_data.effective_affinity); #endif free_cpumask_var(desc->irq_common_data.affinity); return -ENOMEM; } #endif return 0; } static void irq_redirect_work(struct irq_work *work) { handle_irq_desc(container_of(work, struct irq_desc, redirect.work)); } static void desc_smp_init(struct irq_desc *desc, int node, const struct cpumask *affinity) { if (!affinity) affinity = irq_default_affinity; cpumask_copy(desc->irq_common_data.affinity, affinity); #ifdef CONFIG_GENERIC_PENDING_IRQ cpumask_clear(desc->pending_mask); #endif #ifdef CONFIG_NUMA desc->irq_common_data.node = node; #endif desc->redirect.work = IRQ_WORK_INIT_HARD(irq_redirect_work); } static void free_masks(struct irq_desc *desc) { #ifdef CONFIG_GENERIC_PENDING_IRQ free_cpumask_var(desc->pending_mask); #endif free_cpumask_var(desc->irq_common_data.affinity); #ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK free_cpumask_var(desc->irq_common_data.effective_affinity); #endif } #else static inline int alloc_masks(struct irq_desc *desc, int node) { return 0; } static inline void desc_smp_init(struct irq_desc *desc, int node, const struct cpumask *affinity) { } static inline void free_masks(struct irq_desc *desc) { } #endif static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node, const struct cpumask *affinity, struct module *owner) { desc->irq_common_data.handler_data = NULL; desc->irq_common_data.msi_desc = NULL; desc->irq_data.common = &desc->irq_common_data; desc->irq_data.irq = irq; desc->irq_data.chip = &no_irq_chip; desc->irq_data.chip_data = NULL; irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS); irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); irqd_set(&desc->irq_data, IRQD_IRQ_MASKED); desc->handle_irq = handle_bad_irq; desc->depth = 1; desc->irq_count = 0; desc->irqs_unhandled = 0; desc->tot_count = 0; desc->name = NULL; desc->owner = owner; desc_smp_init(desc, node, affinity); } static unsigned int nr_irqs = NR_IRQS; /** * irq_get_nr_irqs() - Number of interrupts supported by the system. */ unsigned int irq_get_nr_irqs(void) { return nr_irqs; } EXPORT_SYMBOL_GPL(irq_get_nr_irqs); /** * irq_set_nr_irqs() - Set the number of interrupts supported by the system. * @nr: New number of interrupts. * * Return: @nr. */ unsigned int irq_set_nr_irqs(unsigned int nr) { nr_irqs = nr; return nr; } EXPORT_SYMBOL_GPL(irq_set_nr_irqs); static DEFINE_MUTEX(sparse_irq_lock); static struct maple_tree sparse_irqs = MTREE_INIT_EXT(sparse_irqs, MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN | MT_FLAGS_USE_RCU, sparse_irq_lock); static int irq_find_free_area(unsigned int from, unsigned int cnt) { MA_STATE(mas, &sparse_irqs, 0, 0); if (mas_empty_area(&mas, from, MAX_SPARSE_IRQS, cnt)) return -ENOSPC; return mas.index; } static unsigned int irq_find_at_or_after(unsigned int offset) { unsigned long index = offset; struct irq_desc *desc; guard(rcu)(); desc = mt_find(&sparse_irqs, &index, nr_irqs); return desc ? irq_desc_get_irq(desc) : nr_irqs; } static void irq_insert_desc(unsigned int irq, struct irq_desc *desc) { MA_STATE(mas, &sparse_irqs, irq, irq); WARN_ON(mas_store_gfp(&mas, desc, GFP_KERNEL) != 0); } static void delete_irq_desc(unsigned int irq) { MA_STATE(mas, &sparse_irqs, irq, irq); mas_erase(&mas); } #ifdef CONFIG_SPARSE_IRQ static const struct kobj_type irq_kobj_type; #endif static int init_desc(struct irq_desc *desc, int irq, int node, unsigned int flags, const struct cpumask *affinity, struct module *owner) { desc->kstat_irqs = alloc_percpu(struct irqstat); if (!desc->kstat_irqs) return -ENOMEM; if (alloc_masks(desc, node)) { free_percpu(desc->kstat_irqs); return -ENOMEM; } raw_spin_lock_init(&desc->lock); lockdep_set_class(&desc->lock, &irq_desc_lock_class); mutex_init(&desc->request_mutex); init_waitqueue_head(&desc->wait_for_threads); desc_set_defaults(irq, desc, node, affinity, owner); irqd_set(&desc->irq_data, flags); irq_resend_init(desc); #ifdef CONFIG_SPARSE_IRQ kobject_init(&desc->kobj, &irq_kobj_type); init_rcu_head(&desc->rcu); #endif return 0; } #ifdef CONFIG_SPARSE_IRQ static void irq_kobj_release(struct kobject *kobj); #ifdef CONFIG_SYSFS static struct kobject *irq_kobj_base; #define IRQ_ATTR_RO(_name) \ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) static ssize_t per_cpu_count_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); ssize_t ret = 0; char *p = ""; int cpu; for_each_possible_cpu(cpu) { unsigned int c = irq_desc_kstat_cpu(desc, cpu); ret += sysfs_emit_at(buf, ret, "%s%u", p, c); p = ","; } ret += sysfs_emit_at(buf, ret, "\n"); return ret; } IRQ_ATTR_RO(per_cpu_count); static ssize_t chip_name_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); guard(raw_spinlock_irq)(&desc->lock); if (desc->irq_data.chip && desc->irq_data.chip->name) return sysfs_emit(buf, "%s\n", desc->irq_data.chip->name); return 0; } IRQ_ATTR_RO(chip_name); static ssize_t hwirq_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); guard(raw_spinlock_irq)(&desc->lock); if (desc->irq_data.domain) return sysfs_emit(buf, "%lu\n", desc->irq_data.hwirq); return 0; } IRQ_ATTR_RO(hwirq); static ssize_t type_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); guard(raw_spinlock_irq)(&desc->lock); return sysfs_emit(buf, "%s\n", irqd_is_level_type(&desc->irq_data) ? "level" : "edge"); } IRQ_ATTR_RO(type); static ssize_t wakeup_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); guard(raw_spinlock_irq)(&desc->lock); return sysfs_emit(buf, "%s\n", str_enabled_disabled(irqd_is_wakeup_set(&desc->irq_data))); } IRQ_ATTR_RO(wakeup); static ssize_t name_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); guard(raw_spinlock_irq)(&desc->lock); if (desc->name) return sysfs_emit(buf, "%s\n", desc->name); return 0; } IRQ_ATTR_RO(name); static ssize_t actions_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); struct irqaction *action; ssize_t ret = 0; char *p = ""; scoped_guard(raw_spinlock_irq, &desc->lock) { for_each_action_of_desc(desc, action) { ret += sysfs_emit_at(buf, ret, "%s%s", p, action->name); p = ","; } } if (ret) ret += sysfs_emit_at(buf, ret, "\n"); return ret; } IRQ_ATTR_RO(actions); static struct attribute *irq_attrs[] = { &per_cpu_count_attr.attr, &chip_name_attr.attr, &hwirq_attr.attr, &type_attr.attr, &wakeup_attr.attr, &name_attr.attr, &actions_attr.attr, NULL }; ATTRIBUTE_GROUPS(irq); static const struct kobj_type irq_kobj_type = { .release = irq_kobj_release, .sysfs_ops = &kobj_sysfs_ops, .default_groups = irq_groups, }; static void irq_sysfs_add(int irq, struct irq_desc *desc) { if (irq_kobj_base) { /* * Continue even in case of failure as this is nothing * crucial and failures in the late irq_sysfs_init() * cannot be rolled back. */ if (kobject_add(&desc->kobj, irq_kobj_base, "%d", irq)) pr_warn("Failed to add kobject for irq %d\n", irq); else desc->istate |= IRQS_SYSFS; } } static void irq_sysfs_del(struct irq_desc *desc) { /* * Only invoke kobject_del() when kobject_add() was successfully * invoked for the descriptor. This covers both early boot, where * sysfs is not initialized yet, and the case of a failed * kobject_add() invocation. */ if (desc->istate & IRQS_SYSFS) kobject_del(&desc->kobj); } static int __init irq_sysfs_init(void) { struct irq_desc *desc; int irq; /* Prevent concurrent irq alloc/free */ guard(mutex)(&sparse_irq_lock); irq_kobj_base = kobject_create_and_add("irq", kernel_kobj); if (!irq_kobj_base) return -ENOMEM; /* Add the already allocated interrupts */ for_each_irq_desc(irq, desc) irq_sysfs_add(irq, desc); return 0; } postcore_initcall(irq_sysfs_init); #else /* !CONFIG_SYSFS */ static const struct kobj_type irq_kobj_type = { .release = irq_kobj_release, }; static void irq_sysfs_add(int irq, struct irq_desc *desc) {} static void irq_sysfs_del(struct irq_desc *desc) {} #endif /* CONFIG_SYSFS */ struct irq_desc *irq_to_desc(unsigned int irq) { return mtree_load(&sparse_irqs, irq); } #ifdef CONFIG_KVM_BOOK3S_64_HV_MODULE EXPORT_SYMBOL_GPL(irq_to_desc); #endif void irq_lock_sparse(void) { mutex_lock(&sparse_irq_lock); } void irq_unlock_sparse(void) { mutex_unlock(&sparse_irq_lock); } static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags, const struct cpumask *affinity, struct module *owner) { struct irq_desc *desc; int ret; desc = kzalloc_node(sizeof(*desc), GFP_KERNEL, node); if (!desc) return NULL; ret = init_desc(desc, irq, node, flags, affinity, owner); if (unlikely(ret)) { kfree(desc); return NULL; } return desc; } static void irq_kobj_release(struct kobject *kobj) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); free_masks(desc); free_percpu(desc->kstat_irqs); kfree(desc); } static void delayed_free_desc(struct rcu_head *rhp) { struct irq_desc *desc = container_of(rhp, struct irq_desc, rcu); kobject_put(&desc->kobj); } static void free_desc(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); irq_remove_debugfs_entry(desc); unregister_irq_proc(irq, desc); /* * sparse_irq_lock protects also show_interrupts() and * kstat_irq_usr(). Once we deleted the descriptor from the * sparse tree we can free it. Access in proc will fail to * lookup the descriptor. * * The sysfs entry must be serialized against a concurrent * irq_sysfs_init() as well. */ irq_sysfs_del(desc); delete_irq_desc(irq); /* * We free the descriptor, masks and stat fields via RCU. That * allows demultiplex interrupts to do rcu based management of * the child interrupts. * This also allows us to use rcu in kstat_irqs_usr(). */ call_rcu(&desc->rcu, delayed_free_desc); } static int alloc_descs(unsigned int start, unsigned int cnt, int node, const struct irq_affinity_desc *affinity, struct module *owner) { struct irq_desc *desc; int i; /* Validate affinity mask(s) */ if (affinity) { for (i = 0; i < cnt; i++) { if (cpumask_empty(&affinity[i].mask)) return -EINVAL; } } for (i = 0; i < cnt; i++) { const struct cpumask *mask = NULL; unsigned int flags = 0; if (affinity) { if (affinity->is_managed) { flags = IRQD_AFFINITY_MANAGED | IRQD_MANAGED_SHUTDOWN; } flags |= IRQD_AFFINITY_SET; mask = &affinity->mask; node = cpu_to_node(cpumask_first(mask)); affinity++; } desc = alloc_desc(start + i, node, flags, mask, owner); if (!desc) goto err; irq_insert_desc(start + i, desc); irq_sysfs_add(start + i, desc); irq_add_debugfs_entry(start + i, desc); } return start; err: for (i--; i >= 0; i--) free_desc(start + i); return -ENOMEM; } static bool irq_expand_nr_irqs(unsigned int nr) { if (nr > MAX_SPARSE_IRQS) return false; nr_irqs = nr; return true; } int __init early_irq_init(void) { int i, initcnt, node = first_online_node; struct irq_desc *desc; init_irq_default_affinity(); /* Let arch update nr_irqs and return the nr of preallocated irqs */ initcnt = arch_probe_nr_irqs(); printk(KERN_INFO "NR_IRQS: %d, nr_irqs: %d, preallocated irqs: %d\n", NR_IRQS, nr_irqs, initcnt); if (WARN_ON(nr_irqs > MAX_SPARSE_IRQS)) nr_irqs = MAX_SPARSE_IRQS; if (WARN_ON(initcnt > MAX_SPARSE_IRQS)) initcnt = MAX_SPARSE_IRQS; if (initcnt > nr_irqs) nr_irqs = initcnt; for (i = 0; i < initcnt; i++) { desc = alloc_desc(i, node, 0, NULL, NULL); irq_insert_desc(i, desc); } return arch_early_irq_init(); } #else /* !CONFIG_SPARSE_IRQ */ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { [0 ... NR_IRQS-1] = { .handle_irq = handle_bad_irq, .depth = 1, .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock), } }; int __init early_irq_init(void) { int count, i, node = first_online_node; int ret; init_irq_default_affinity(); printk(KERN_INFO "NR_IRQS: %d\n", NR_IRQS); count = ARRAY_SIZE(irq_desc); for (i = 0; i < count; i++) { ret = init_desc(irq_desc + i, i, node, 0, NULL, NULL); if (unlikely(ret)) goto __free_desc_res; } return arch_early_irq_init(); __free_desc_res: while (--i >= 0) { free_masks(irq_desc + i); free_percpu(irq_desc[i].kstat_irqs); } return ret; } struct irq_desc *irq_to_desc(unsigned int irq) { return (irq < NR_IRQS) ? irq_desc + irq : NULL; } EXPORT_SYMBOL(irq_to_desc); static void free_desc(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); int cpu; scoped_guard(raw_spinlock_irqsave, &desc->lock) desc_set_defaults(irq, desc, irq_desc_get_node(desc), NULL, NULL); for_each_possible_cpu(cpu) *per_cpu_ptr(desc->kstat_irqs, cpu) = (struct irqstat) { }; delete_irq_desc(irq); } static inline int alloc_descs(unsigned int start, unsigned int cnt, int node, const struct irq_affinity_desc *affinity, struct module *owner) { u32 i; for (i = 0; i < cnt; i++) { struct irq_desc *desc = irq_to_desc(start + i); desc->owner = owner; irq_insert_desc(start + i, desc); } return start; } static inline bool irq_expand_nr_irqs(unsigned int nr) { return false; } void irq_mark_irq(unsigned int irq) { guard(mutex)(&sparse_irq_lock); irq_insert_desc(irq, irq_desc + irq); } #endif /* !CONFIG_SPARSE_IRQ */ int handle_irq_desc(struct irq_desc *desc) { struct irq_data *data; if (!desc) return -EINVAL; data = irq_desc_get_irq_data(desc); if (WARN_ON_ONCE(!in_hardirq() && irqd_is_handle_enforce_irqctx(data))) return -EPERM; generic_handle_irq_desc(desc); return 0; } /** * generic_handle_irq - Invoke the handler for a particular irq * @irq: The irq number to handle * * Returns: 0 on success, or -EINVAL if conversion has failed * * This function must be called from an IRQ context with irq regs * initialized. */ int generic_handle_irq(unsigned int irq) { return handle_irq_desc(irq_to_desc(irq)); } EXPORT_SYMBOL_GPL(generic_handle_irq); /** * generic_handle_irq_safe - Invoke the handler for a particular irq from any * context. * @irq: The irq number to handle * * Returns: 0 on success, a negative value on error. * * This function can be called from any context (IRQ or process context). It * will report an error if not invoked from IRQ context and the irq has been * marked to enforce IRQ-context only. */ int generic_handle_irq_safe(unsigned int irq) { unsigned long flags; int ret; local_irq_save(flags); ret = handle_irq_desc(irq_to_desc(irq)); local_irq_restore(flags); return ret; } EXPORT_SYMBOL_GPL(generic_handle_irq_safe); #ifdef CONFIG_IRQ_DOMAIN /** * generic_handle_domain_irq - Invoke the handler for a HW irq belonging * to a domain. * @domain: The domain where to perform the lookup * @hwirq: The HW irq number to convert to a logical one * * Returns: 0 on success, or -EINVAL if conversion has failed * * This function must be called from an IRQ context with irq regs * initialized. */ int generic_handle_domain_irq(struct irq_domain *domain, irq_hw_number_t hwirq) { return handle_irq_desc(irq_resolve_mapping(domain, hwirq)); } EXPORT_SYMBOL_GPL(generic_handle_domain_irq); /** * generic_handle_irq_safe - Invoke the handler for a HW irq belonging * to a domain from any context. * @domain: The domain where to perform the lookup * @hwirq: The HW irq number to convert to a logical one * * Returns: 0 on success, a negative value on error. * * This function can be called from any context (IRQ or process * context). If the interrupt is marked as 'enforce IRQ-context only' then * the function must be invoked from hard interrupt context. */ int generic_handle_domain_irq_safe(struct irq_domain *domain, irq_hw_number_t hwirq) { unsigned long flags; int ret; local_irq_save(flags); ret = handle_irq_desc(irq_resolve_mapping(domain, hwirq)); local_irq_restore(flags); return ret; } EXPORT_SYMBOL_GPL(generic_handle_domain_irq_safe); /** * generic_handle_domain_nmi - Invoke the handler for a HW nmi belonging * to a domain. * @domain: The domain where to perform the lookup * @hwirq: The HW irq number to convert to a logical one * * Returns: 0 on success, or -EINVAL if conversion has failed * * This function must be called from an NMI context with irq regs * initialized. **/ int generic_handle_domain_nmi(struct irq_domain *domain, irq_hw_number_t hwirq) { WARN_ON_ONCE(!in_nmi()); return handle_irq_desc(irq_resolve_mapping(domain, hwirq)); } #ifdef CONFIG_SMP static bool demux_redirect_remote(struct irq_desc *desc) { guard(raw_spinlock)(&desc->lock); const struct cpumask *m = irq_data_get_effective_affinity_mask(&desc->irq_data); unsigned int target_cpu = READ_ONCE(desc->redirect.target_cpu); if (desc->irq_data.chip->irq_pre_redirect) desc->irq_data.chip->irq_pre_redirect(&desc->irq_data); /* * If the interrupt handler is already running on a CPU that's included * in the interrupt's affinity mask, redirection is not necessary. */ if (cpumask_test_cpu(smp_processor_id(), m)) return false; /* * The desc->action check protects against IRQ shutdown: __free_irq() sets * desc->action to NULL while holding desc->lock, which we also hold. * * Calling irq_work_queue_on() here is safe w.r.t. CPU unplugging: * - takedown_cpu() schedules multi_cpu_stop() on all active CPUs, * including the one that's taken down. * - multi_cpu_stop() acts like a barrier, which means all active * CPUs go through MULTI_STOP_DISABLE_IRQ and disable hard IRQs * *before* the dying CPU runs take_cpu_down() in MULTI_STOP_RUN. * - Hard IRQs are re-enabled at the end of multi_cpu_stop(), *after* * the dying CPU has run take_cpu_down() in MULTI_STOP_RUN. * - Since we run in hard IRQ context, we run either before or after * take_cpu_down() but never concurrently. * - If we run before take_cpu_down(), the dying CPU hasn't been marked * offline yet (it's marked via take_cpu_down() -> __cpu_disable()), * so the WARN in irq_work_queue_on() can't occur. * - Furthermore, the work item we queue will be flushed later via * take_cpu_down() -> cpuhp_invoke_callback_range_nofail() -> * smpcfd_dying_cpu() -> irq_work_run(). * - If we run after take_cpu_down(), target_cpu has been already * updated via take_cpu_down() -> __cpu_disable(), which eventually * calls irq_do_set_affinity() during IRQ migration. So, target_cpu * no longer points to the dying CPU in this case. */ if (desc->action) irq_work_queue_on(&desc->redirect.work, target_cpu); return true; } #else /* CONFIG_SMP */ static bool demux_redirect_remote(struct irq_desc *desc) { return false; } #endif /** * generic_handle_demux_domain_irq - Invoke the handler for a hardware interrupt * of a demultiplexing domain. * @domain: The domain where to perform the lookup * @hwirq: The hardware interrupt number to convert to a logical one * * Returns: True on success, or false if lookup has failed */ bool generic_handle_demux_domain_irq(struct irq_domain *domain, irq_hw_number_t hwirq) { struct irq_desc *desc = irq_resolve_mapping(domain, hwirq); if (unlikely(!desc)) return false; if (demux_redirect_remote(desc)) return true; return !handle_irq_desc(desc); } EXPORT_SYMBOL_GPL(generic_handle_demux_domain_irq); #endif /* Dynamic interrupt handling */ /** * irq_free_descs - free irq descriptors * @from: Start of descriptor range * @cnt: Number of consecutive irqs to free */ void irq_free_descs(unsigned int from, unsigned int cnt) { int i; if (from >= nr_irqs || (from + cnt) > nr_irqs) return; guard(mutex)(&sparse_irq_lock); for (i = 0; i < cnt; i++) free_desc(from + i); } EXPORT_SYMBOL_GPL(irq_free_descs); /** * __irq_alloc_descs - allocate and initialize a range of irq descriptors * @irq: Allocate for specific irq number if irq >= 0 * @from: Start the search from this irq number * @cnt: Number of consecutive irqs to allocate. * @node: Preferred node on which the irq descriptor should be allocated * @owner: Owning module (can be NULL) * @affinity: Optional pointer to an affinity mask array of size @cnt which * hints where the irq descriptors should be allocated and which * default affinities to use * * Returns the first irq number or error code */ int __ref __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, struct module *owner, const struct irq_affinity_desc *affinity) { int start; if (!cnt) return -EINVAL; if (irq >= 0) { if (from > irq) return -EINVAL; from = irq; } else { /* * For interrupts which are freely allocated the * architecture can force a lower bound to the @from * argument. x86 uses this to exclude the GSI space. */ from = arch_dynirq_lower_bound(from); } guard(mutex)(&sparse_irq_lock); start = irq_find_free_area(from, cnt); if (irq >=0 && start != irq) return -EEXIST; if (start + cnt > nr_irqs) { if (!irq_expand_nr_irqs(start + cnt)) return -ENOMEM; } return alloc_descs(start, cnt, node, affinity, owner); } EXPORT_SYMBOL_GPL(__irq_alloc_descs); /** * irq_get_next_irq - get next allocated irq number * @offset: where to start the search * * Returns next irq number after offset or nr_irqs if none is found. */ unsigned int irq_get_next_irq(unsigned int offset) { return irq_find_at_or_after(offset); } struct irq_desc *__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus, unsigned int check) { struct irq_desc *desc; desc = irq_to_desc(irq); if (!desc) return NULL; if (check & _IRQ_DESC_CHECK) { if ((check & _IRQ_DESC_PERCPU) && !irq_settings_is_per_cpu_devid(desc)) return NULL; if (!(check & _IRQ_DESC_PERCPU) && irq_settings_is_per_cpu_devid(desc)) return NULL; } if (bus) chip_bus_lock(desc); raw_spin_lock_irqsave(&desc->lock, *flags); return desc; } void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus) __releases(&desc->lock) { raw_spin_unlock_irqrestore(&desc->lock, flags); if (bus) chip_bus_sync_unlock(desc); } int irq_set_percpu_devid(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); if (!desc || desc->percpu_enabled) return -EINVAL; desc->percpu_enabled = kzalloc_obj(*desc->percpu_enabled); if (!desc->percpu_enabled) return -ENOMEM; irq_set_percpu_devid_flags(irq); return 0; } void kstat_incr_irq_this_cpu(unsigned int irq) { kstat_incr_irqs_this_cpu(irq_to_desc(irq)); } /** * kstat_irqs_cpu - Get the statistics for an interrupt on a cpu * @irq: The interrupt number * @cpu: The cpu number * * Returns the sum of interrupt counts on @cpu since boot for * @irq. The caller must ensure that the interrupt is not removed * concurrently. */ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) { struct irq_desc *desc = irq_to_desc(irq); return desc && desc->kstat_irqs ? per_cpu(desc->kstat_irqs->cnt, cpu) : 0; } static unsigned int kstat_irqs_desc(struct irq_desc *desc, const struct cpumask *cpumask) { unsigned int sum = 0; int cpu; if (!irq_settings_is_per_cpu_devid(desc) && !irq_settings_is_per_cpu(desc) && !irq_is_nmi(desc)) return data_race(desc->tot_count); for_each_cpu(cpu, cpumask) sum += data_race(per_cpu(desc->kstat_irqs->cnt, cpu)); return sum; } static unsigned int kstat_irqs(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); if (!desc || !desc->kstat_irqs) return 0; return kstat_irqs_desc(desc, cpu_possible_mask); } #ifdef CONFIG_GENERIC_IRQ_STAT_SNAPSHOT void kstat_snapshot_irqs(void) { struct irq_desc *desc; unsigned int irq; for_each_irq_desc(irq, desc) { if (!desc->kstat_irqs) continue; this_cpu_write(desc->kstat_irqs->ref, this_cpu_read(desc->kstat_irqs->cnt)); } } unsigned int kstat_get_irq_since_snapshot(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); if (!desc || !desc->kstat_irqs) return 0; return this_cpu_read(desc->kstat_irqs->cnt) - this_cpu_read(desc->kstat_irqs->ref); } #endif /** * kstat_irqs_usr - Get the statistics for an interrupt from thread context * @irq: The interrupt number * * Returns the sum of interrupt counts on all cpus since boot for @irq. * * It uses rcu to protect the access since a concurrent removal of an * interrupt descriptor is observing an rcu grace period before * delayed_free_desc()/irq_kobj_release(). */ unsigned int kstat_irqs_usr(unsigned int irq) { unsigned int sum; rcu_read_lock(); sum = kstat_irqs(irq); rcu_read_unlock(); return sum; } #ifdef CONFIG_LOCKDEP void __irq_set_lockdep_class(unsigned int irq, struct lock_class_key *lock_class, struct lock_class_key *request_class) { struct irq_desc *desc = irq_to_desc(irq); if (desc) { lockdep_set_class(&desc->lock, lock_class); lockdep_set_class(&desc->request_mutex, request_class); } } EXPORT_SYMBOL_GPL(__irq_set_lockdep_class); #endif
69 69 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 // SPDX-License-Identifier: GPL-2.0 /* * fs/partitions/karma.c * Rio Karma partition info. * * Copyright (C) 2006 Bob Copeland (me@bobcopeland.com) * based on osf.c */ #include "check.h" #include <linux/compiler.h> #define KARMA_LABEL_MAGIC 0xAB56 int karma_partition(struct parsed_partitions *state) { int i; int slot = 1; Sector sect; unsigned char *data; struct disklabel { u8 d_reserved[270]; struct d_partition { __le32 p_res; u8 p_fstype; u8 p_res2[3]; __le32 p_offset; __le32 p_size; } d_partitions[2]; u8 d_blank[208]; __le16 d_magic; } __packed *label; struct d_partition *p; data = read_part_sector(state, 0, &sect); if (!data) return -1; label = (struct disklabel *)data; if (le16_to_cpu(label->d_magic) != KARMA_LABEL_MAGIC) { put_dev_sector(sect); return 0; } p = label->d_partitions; for (i = 0 ; i < 2; i++, p++) { if (slot == state->limit) break; if (p->p_fstype == 0x4d && le32_to_cpu(p->p_size)) { put_partition(state, slot, le32_to_cpu(p->p_offset), le32_to_cpu(p->p_size)); } slot++; } strlcat(state->pp_buf, "\n", PAGE_SIZE); put_dev_sector(sect); return 1; }
1 1 1 1 110 110 85 24 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 // SPDX-License-Identifier: GPL-2.0-only /* * linux/lib/cmdline.c * Helper functions generally used for parsing kernel command line * and module options. * * Code and copyrights come from init/main.c and arch/i386/kernel/setup.c. * * GNU Indent formatting options for this file: -kr -i8 -npsl -pcs */ #include <linux/export.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/ctype.h> /* * If a hyphen was found in get_option, this will handle the * range of numbers, M-N. This will expand the range and insert * the values[M, M+1, ..., N] into the ints array in get_options. */ static int get_range(char **str, int *pint, int n) { int x, inc_counter, upper_range; (*str)++; upper_range = simple_strtol((*str), NULL, 0); inc_counter = upper_range - *pint; for (x = *pint; n && x < upper_range; x++, n--) *pint++ = x; return inc_counter; } /** * get_option - Parse integer from an option string * @str: option string * @pint: (optional output) integer value parsed from @str * * Read an int from an option string; if available accept a subsequent * comma as well. * * When @pint is NULL the function can be used as a validator of * the current option in the string. * * Return values: * 0 - no int in string * 1 - int found, no subsequent comma * 2 - int found including a subsequent comma * 3 - hyphen found to denote a range * * Leading hyphen without integer is no integer case, but we consume it * for the sake of simplification. */ int get_option(char **str, int *pint) { char *cur = *str; int value; if (!cur || !(*cur)) return 0; if (*cur == '-') value = -simple_strtoull(++cur, str, 0); else value = simple_strtoull(cur, str, 0); if (pint) *pint = value; if (cur == *str) return 0; if (**str == ',') { (*str)++; return 2; } if (**str == '-') return 3; return 1; } EXPORT_SYMBOL(get_option); /** * get_options - Parse a string into a list of integers * @str: String to be parsed * @nints: size of integer array * @ints: integer array (must have room for at least one element) * * This function parses a string containing a comma-separated * list of integers, a hyphen-separated range of _positive_ integers, * or a combination of both. The parse halts when the array is * full, or when no more numbers can be retrieved from the * string. * * When @nints is 0, the function just validates the given @str and * returns the amount of parseable integers as described below. * * Returns: * * The first element is filled by the number of collected integers * in the range. The rest is what was parsed from the @str. * * Return value is the character in the string which caused * the parse to end (typically a null terminator, if @str is * completely parseable). */ char *get_options(const char *str, int nints, int *ints) { bool validate = (nints == 0); int res, i = 1; while (i < nints || validate) { int *pint = validate ? ints : ints + i; res = get_option((char **)&str, pint); if (res == 0) break; if (res == 3) { int n = validate ? 0 : nints - i; int range_nums; range_nums = get_range((char **)&str, pint, n); if (range_nums < 0) break; /* * Decrement the result by one to leave out the * last number in the range. The next iteration * will handle the upper number in the range */ i += (range_nums - 1); } i++; if (res == 1) break; } ints[0] = i - 1; return (char *)str; } EXPORT_SYMBOL(get_options); /** * memparse - parse a string with mem suffixes into a number * @ptr: Where parse begins * @retptr: (output) Optional pointer to next char after parse completes * * Parses a string into a number. The number stored at @ptr is * potentially suffixed with K, M, G, T, P, E. */ unsigned long long memparse(const char *ptr, char **retptr) { char *endptr; /* local pointer to end of parsed string */ unsigned long long ret = simple_strtoull(ptr, &endptr, 0); switch (*endptr) { case 'E': case 'e': ret <<= 10; fallthrough; case 'P': case 'p': ret <<= 10; fallthrough; case 'T': case 't': ret <<= 10; fallthrough; case 'G': case 'g': ret <<= 10; fallthrough; case 'M': case 'm': ret <<= 10; fallthrough; case 'K': case 'k': ret <<= 10; endptr++; fallthrough; default: break; } if (retptr) *retptr = endptr; return ret; } EXPORT_SYMBOL(memparse); /** * parse_option_str - Parse a string and check an option is set or not * @str: String to be parsed * @option: option name * * This function parses a string containing a comma-separated list of * strings like a=b,c. * * Return true if there's such option in the string, or return false. */ bool parse_option_str(const char *str, const char *option) { while (*str) { if (!strncmp(str, option, strlen(option))) { str += strlen(option); if (!*str || *str == ',') return true; } while (*str && *str != ',') str++; if (*str == ',') str++; } return false; } /* * Parse a string to get a param value pair. * You can use " around spaces, but can't escape ". * Hyphens and underscores equivalent in parameter names. */ char *next_arg(char *args, char **param, char **val) { unsigned int i, equals = 0; int in_quote = 0, quoted = 0; if (*args == '"') { args++; in_quote = 1; quoted = 1; } for (i = 0; args[i]; i++) { if (isspace(args[i]) && !in_quote) break; if (equals == 0) { if (args[i] == '=') equals = i; } if (args[i] == '"') in_quote = !in_quote; } *param = args; if (!equals) *val = NULL; else { args[equals] = '\0'; *val = args + equals + 1; /* Don't include quotes in value. */ if (**val == '"') { (*val)++; if (args[i-1] == '"') args[i-1] = '\0'; } } if (quoted && i > 0 && args[i-1] == '"') args[i-1] = '\0'; if (args[i]) { args[i] = '\0'; args += i + 1; } else args += i; /* Chew up trailing spaces. */ return skip_spaces(args); } EXPORT_SYMBOL(next_arg);
2 2 2 2 2 2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 // SPDX-License-Identifier: GPL-2.0 /* * (C) 2001 Clemson University and The University of Chicago * (C) 2011 Omnibond Systems * * Changes by Acxiom Corporation to implement generic service_operation() * function, Copyright Acxiom Corporation, 2005. * * See COPYING in top-level directory. */ /* * In-kernel waitqueue operations. */ #include "protocol.h" #include "orangefs-kernel.h" #include "orangefs-bufmap.h" static int wait_for_matching_downcall(struct orangefs_kernel_op_s *op, long timeout, int flags) __acquires(op->lock); static void orangefs_clean_up_interrupted_operation(struct orangefs_kernel_op_s *op) __releases(op->lock); /* * What we do in this function is to walk the list of operations that are * present in the request queue and mark them as purged. * NOTE: This is called from the device close after client-core has * guaranteed that no new operations could appear on the list since the * client-core is anyway going to exit. */ void purge_waiting_ops(void) { struct orangefs_kernel_op_s *op, *tmp; spin_lock(&orangefs_request_list_lock); list_for_each_entry_safe(op, tmp, &orangefs_request_list, list) { gossip_debug(GOSSIP_WAIT_DEBUG, "pvfs2-client-core: purging op tag %llu %s\n", llu(op->tag), get_opname_string(op)); set_op_state_purged(op); gossip_debug(GOSSIP_DEV_DEBUG, "%s: op:%s: op_state:%d: process:%s:\n", __func__, get_opname_string(op), op->op_state, current->comm); } spin_unlock(&orangefs_request_list_lock); } /* * submits a ORANGEFS operation and waits for it to complete * * Note op->downcall.status will contain the status of the operation (in * errno format), whether provided by pvfs2-client or a result of failure to * service the operation. If the caller wishes to distinguish, then * op->state can be checked to see if it was serviced or not. * * Returns contents of op->downcall.status for convenience */ int service_operation(struct orangefs_kernel_op_s *op, const char *op_name, int flags) { long timeout = MAX_SCHEDULE_TIMEOUT; int ret = 0; DEFINE_WAIT(wait_entry); op->upcall.tgid = current->tgid; op->upcall.pid = current->pid; retry_servicing: op->downcall.status = 0; gossip_debug(GOSSIP_WAIT_DEBUG, "%s: %s op:%p: process:%s: pid:%d:\n", __func__, op_name, op, current->comm, current->pid); /* * If ORANGEFS_OP_NO_MUTEX was set in flags, we need to avoid * acquiring the request_mutex because we're servicing a * high priority remount operation and the request_mutex is * already taken. */ if (!(flags & ORANGEFS_OP_NO_MUTEX)) { if (flags & ORANGEFS_OP_INTERRUPTIBLE) ret = mutex_lock_interruptible(&orangefs_request_mutex); else ret = mutex_lock_killable(&orangefs_request_mutex); /* * check to see if we were interrupted while waiting for * mutex */ if (ret < 0) { op->downcall.status = ret; gossip_debug(GOSSIP_WAIT_DEBUG, "%s: service_operation interrupted.\n", __func__); return ret; } } /* queue up the operation */ spin_lock(&orangefs_request_list_lock); spin_lock(&op->lock); set_op_state_waiting(op); gossip_debug(GOSSIP_DEV_DEBUG, "%s: op:%s: op_state:%d: process:%s:\n", __func__, get_opname_string(op), op->op_state, current->comm); /* add high priority remount op to the front of the line. */ if (flags & ORANGEFS_OP_PRIORITY) list_add(&op->list, &orangefs_request_list); else list_add_tail(&op->list, &orangefs_request_list); spin_unlock(&op->lock); wake_up_interruptible(&orangefs_request_list_waitq); if (!__is_daemon_in_service()) { gossip_debug(GOSSIP_WAIT_DEBUG, "%s:client core is NOT in service.\n", __func__); /* * Don't wait for the userspace component to return if * the filesystem is being umounted anyway. */ if (op->upcall.type == ORANGEFS_VFS_OP_FS_UMOUNT) timeout = 0; else timeout = op_timeout_secs * HZ; } spin_unlock(&orangefs_request_list_lock); if (!(flags & ORANGEFS_OP_NO_MUTEX)) mutex_unlock(&orangefs_request_mutex); ret = wait_for_matching_downcall(op, timeout, flags); gossip_debug(GOSSIP_WAIT_DEBUG, "%s: wait_for_matching_downcall returned %d for %p\n", __func__, ret, op); /* got matching downcall; make sure status is in errno format */ if (!ret) { spin_unlock(&op->lock); op->downcall.status = orangefs_normalize_to_errno(op->downcall.status); ret = op->downcall.status; goto out; } /* failed to get matching downcall */ if (ret == -ETIMEDOUT) { gossip_err("%s: %s -- wait timed out; aborting attempt.\n", __func__, op_name); } /* * remove a waiting op from the request list or * remove an in-progress op from the in-progress list. */ orangefs_clean_up_interrupted_operation(op); op->downcall.status = ret; /* retry if operation has not been serviced and if requested */ if (ret == -EAGAIN) { op->attempts++; timeout = op_timeout_secs * HZ; gossip_debug(GOSSIP_WAIT_DEBUG, "orangefs: tag %llu (%s)" " -- operation to be retried (%d attempt)\n", llu(op->tag), op_name, op->attempts); /* * io ops (ops that use the shared memory buffer) have * to be returned to their caller for a retry. Other ops * can just be recycled here. */ if (!op->uses_shared_memory) goto retry_servicing; } out: gossip_debug(GOSSIP_WAIT_DEBUG, "%s: %s returning: %d for %p.\n", __func__, op_name, ret, op); return ret; } /* This can get called on an I/O op if it had a bad service_operation. */ bool orangefs_cancel_op_in_progress(struct orangefs_kernel_op_s *op) { u64 tag = op->tag; if (!op_state_in_progress(op)) return false; op->slot_to_free = op->upcall.req.io.buf_index; memset(&op->upcall, 0, sizeof(op->upcall)); memset(&op->downcall, 0, sizeof(op->downcall)); op->upcall.type = ORANGEFS_VFS_OP_CANCEL; op->upcall.req.cancel.op_tag = tag; op->downcall.type = ORANGEFS_VFS_OP_INVALID; op->downcall.status = -1; orangefs_new_tag(op); spin_lock(&orangefs_request_list_lock); /* orangefs_request_list_lock is enough of a barrier here */ if (!__is_daemon_in_service()) { spin_unlock(&orangefs_request_list_lock); return false; } spin_lock(&op->lock); set_op_state_waiting(op); gossip_debug(GOSSIP_DEV_DEBUG, "%s: op:%s: op_state:%d: process:%s:\n", __func__, get_opname_string(op), op->op_state, current->comm); list_add(&op->list, &orangefs_request_list); spin_unlock(&op->lock); spin_unlock(&orangefs_request_list_lock); gossip_debug(GOSSIP_WAIT_DEBUG, "Attempting ORANGEFS operation cancellation of tag %llu\n", llu(tag)); return true; } /* * Change an op to the "given up" state and remove it from its list. */ static void orangefs_clean_up_interrupted_operation(struct orangefs_kernel_op_s *op) __releases(op->lock) { /* * handle interrupted cases depending on what state we were in when * the interruption is detected. * * Called with op->lock held. */ /* * List manipulation code elsewhere will ignore ops that * have been given up upon. */ op->op_state |= OP_VFS_STATE_GIVEN_UP; if (list_empty(&op->list)) { /* caught copying to/from daemon */ BUG_ON(op_state_serviced(op)); spin_unlock(&op->lock); wait_for_completion(&op->waitq); } else if (op_state_waiting(op)) { /* * upcall hasn't been read; remove op from upcall request * list. */ spin_unlock(&op->lock); spin_lock(&orangefs_request_list_lock); list_del_init(&op->list); spin_unlock(&orangefs_request_list_lock); gossip_debug(GOSSIP_WAIT_DEBUG, "Interrupted: Removed op %p from request_list\n", op); } else if (op_state_in_progress(op)) { /* op must be removed from the in progress htable */ spin_unlock(&op->lock); spin_lock(&orangefs_htable_ops_in_progress_lock); list_del_init(&op->list); spin_unlock(&orangefs_htable_ops_in_progress_lock); gossip_debug(GOSSIP_WAIT_DEBUG, "Interrupted: Removed op %p" " from htable_ops_in_progress\n", op); } else { spin_unlock(&op->lock); gossip_err("interrupted operation is in a weird state 0x%x\n", op->op_state); } reinit_completion(&op->waitq); } /* * Sleeps on waitqueue waiting for matching downcall. * If client-core finishes servicing, then we are good to go. * else if client-core exits, we get woken up here, and retry with a timeout * * When this call returns to the caller, the specified op will no * longer be in either the in_progress hash table or on the request list. * * Returns 0 on success and -errno on failure * Errors are: * EAGAIN in case we want the caller to requeue and try again.. * EINTR/EIO/ETIMEDOUT indicating we are done trying to service this * operation since client-core seems to be exiting too often * or if we were interrupted. * * Returns with op->lock taken. */ static int wait_for_matching_downcall(struct orangefs_kernel_op_s *op, long timeout, int flags) __acquires(op->lock) { long n; int writeback = flags & ORANGEFS_OP_WRITEBACK, interruptible = flags & ORANGEFS_OP_INTERRUPTIBLE; /* * There's a "schedule_timeout" inside of these wait * primitives, during which the op is out of the hands of the * user process that needs something done and is being * manipulated by the client-core process. */ if (writeback) n = wait_for_completion_io_timeout(&op->waitq, timeout); else if (!writeback && interruptible) n = wait_for_completion_interruptible_timeout(&op->waitq, timeout); else /* !writeback && !interruptible but compiler complains */ n = wait_for_completion_killable_timeout(&op->waitq, timeout); spin_lock(&op->lock); if (op_state_serviced(op)) return 0; if (unlikely(n < 0)) { gossip_debug(GOSSIP_WAIT_DEBUG, "%s: operation interrupted, tag %llu, %p\n", __func__, llu(op->tag), op); return -EINTR; } if (op_state_purged(op)) { gossip_debug(GOSSIP_WAIT_DEBUG, "%s: operation purged, tag %llu, %p, %d\n", __func__, llu(op->tag), op, op->attempts); return (op->attempts < ORANGEFS_PURGE_RETRY_COUNT) ? -EAGAIN : -EIO; } /* must have timed out, then... */ gossip_debug(GOSSIP_WAIT_DEBUG, "%s: operation timed out, tag %llu, %p, %d)\n", __func__, llu(op->tag), op, op->attempts); return -ETIMEDOUT; }
5003 5013 5019 5003 4531 4531 4528 4522 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 // SPDX-License-Identifier: GPL-2.0 /* * Workingset detection * * Copyright (C) 2013 Red Hat, Inc., Johannes Weiner */ #include <linux/memcontrol.h> #include <linux/mm_inline.h> #include <linux/writeback.h> #include <linux/shmem_fs.h> #include <linux/pagemap.h> #include <linux/atomic.h> #include <linux/module.h> #include <linux/swap.h> #include <linux/dax.h> #include <linux/fs.h> #include <linux/mm.h> #include "internal.h" /* * Double CLOCK lists * * Per node, two clock lists are maintained for file pages: the * inactive and the active list. Freshly faulted pages start out at * the head of the inactive list and page reclaim scans pages from the * tail. Pages that are accessed multiple times on the inactive list * are promoted to the active list, to protect them from reclaim, * whereas active pages are demoted to the inactive list when the * active list grows too big. * * fault ------------------------+ * | * +--------------+ | +-------------+ * reclaim <- | inactive | <-+-- demotion | active | <--+ * +--------------+ +-------------+ | * | | * +-------------- promotion ------------------+ * * * Access frequency and refault distance * * A workload is thrashing when its pages are frequently used but they * are evicted from the inactive list every time before another access * would have promoted them to the active list. * * In cases where the average access distance between thrashing pages * is bigger than the size of memory there is nothing that can be * done - the thrashing set could never fit into memory under any * circumstance. * * However, the average access distance could be bigger than the * inactive list, yet smaller than the size of memory. In this case, * the set could fit into memory if it weren't for the currently * active pages - which may be used more, hopefully less frequently: * * +-memory available to cache-+ * | | * +-inactive------+-active----+ * a b | c d e f g h i | J K L M N | * +---------------+-----------+ * * It is prohibitively expensive to accurately track access frequency * of pages. But a reasonable approximation can be made to measure * thrashing on the inactive list, after which refaulting pages can be * activated optimistically to compete with the existing active pages. * * Approximating inactive page access frequency - Observations: * * 1. When a page is accessed for the first time, it is added to the * head of the inactive list, slides every existing inactive page * towards the tail by one slot, and pushes the current tail page * out of memory. * * 2. When a page is accessed for the second time, it is promoted to * the active list, shrinking the inactive list by one slot. This * also slides all inactive pages that were faulted into the cache * more recently than the activated page towards the tail of the * inactive list. * * Thus: * * 1. The sum of evictions and activations between any two points in * time indicate the minimum number of inactive pages accessed in * between. * * 2. Moving one inactive page N page slots towards the tail of the * list requires at least N inactive page accesses. * * Combining these: * * 1. When a page is finally evicted from memory, the number of * inactive pages accessed while the page was in cache is at least * the number of page slots on the inactive list. * * 2. In addition, measuring the sum of evictions and activations (E) * at the time of a page's eviction, and comparing it to another * reading (R) at the time the page faults back into memory tells * the minimum number of accesses while the page was not cached. * This is called the refault distance. * * Because the first access of the page was the fault and the second * access the refault, we combine the in-cache distance with the * out-of-cache distance to get the complete minimum access distance * of this page: * * NR_inactive + (R - E) * * And knowing the minimum access distance of a page, we can easily * tell if the page would be able to stay in cache assuming all page * slots in the cache were available: * * NR_inactive + (R - E) <= NR_inactive + NR_active * * If we have swap we should consider about NR_inactive_anon and * NR_active_anon, so for page cache and anonymous respectively: * * NR_inactive_file + (R - E) <= NR_inactive_file + NR_active_file * + NR_inactive_anon + NR_active_anon * * NR_inactive_anon + (R - E) <= NR_inactive_anon + NR_active_anon * + NR_inactive_file + NR_active_file * * Which can be further simplified to: * * (R - E) <= NR_active_file + NR_inactive_anon + NR_active_anon * * (R - E) <= NR_active_anon + NR_inactive_file + NR_active_file * * Put into words, the refault distance (out-of-cache) can be seen as * a deficit in inactive list space (in-cache). If the inactive list * had (R - E) more page slots, the page would not have been evicted * in between accesses, but activated instead. And on a full system, * the only thing eating into inactive list space is active pages. * * * Refaulting inactive pages * * All that is known about the active list is that the pages have been * accessed more than once in the past. This means that at any given * time there is actually a good chance that pages on the active list * are no longer in active use. * * So when a refault distance of (R - E) is observed and there are at * least (R - E) pages in the userspace workingset, the refaulting page * is activated optimistically in the hope that (R - E) pages are actually * used less frequently than the refaulting page - or even not used at * all anymore. * * That means if inactive cache is refaulting with a suitable refault * distance, we assume the cache workingset is transitioning and put * pressure on the current workingset. * * If this is wrong and demotion kicks in, the pages which are truly * used more frequently will be reactivated while the less frequently * used once will be evicted from memory. * * But if this is right, the stale pages will be pushed out of memory * and the used pages get to stay in cache. * * Refaulting active pages * * If on the other hand the refaulting pages have recently been * deactivated, it means that the active list is no longer protecting * actively used cache from reclaim. The cache is NOT transitioning to * a different workingset; the existing workingset is thrashing in the * space allocated to the page cache. * * * Implementation * * For each node's LRU lists, a counter for inactive evictions and * activations is maintained (node->nonresident_age). * * On eviction, a snapshot of this counter (along with some bits to * identify the node) is stored in the now empty page cache * slot of the evicted page. This is called a shadow entry. * * On cache misses for which there are shadow entries, an eligible * refault distance will immediately activate the refaulting page. */ #define WORKINGSET_SHIFT 1 #define EVICTION_SHIFT ((BITS_PER_LONG - BITS_PER_XA_VALUE) + \ WORKINGSET_SHIFT + NODES_SHIFT + \ MEM_CGROUP_ID_SHIFT) #define EVICTION_MASK (~0UL >> EVICTION_SHIFT) /* * Eviction timestamps need to be able to cover the full range of * actionable refaults. However, bits are tight in the xarray * entry, and after storing the identifier for the lruvec there might * not be enough left to represent every single actionable refault. In * that case, we have to sacrifice granularity for distance, and group * evictions into coarser buckets by shaving off lower timestamp bits. */ static unsigned int bucket_order __read_mostly; static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction, bool workingset) { eviction &= EVICTION_MASK; eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; eviction = (eviction << NODES_SHIFT) | pgdat->node_id; eviction = (eviction << WORKINGSET_SHIFT) | workingset; return xa_mk_value(eviction); } static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, unsigned long *evictionp, bool *workingsetp) { unsigned long entry = xa_to_value(shadow); int memcgid, nid; bool workingset; workingset = entry & ((1UL << WORKINGSET_SHIFT) - 1); entry >>= WORKINGSET_SHIFT; nid = entry & ((1UL << NODES_SHIFT) - 1); entry >>= NODES_SHIFT; memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1); entry >>= MEM_CGROUP_ID_SHIFT; *memcgidp = memcgid; *pgdat = NODE_DATA(nid); *evictionp = entry; *workingsetp = workingset; } #ifdef CONFIG_LRU_GEN static void *lru_gen_eviction(struct folio *folio) { int hist; unsigned long token; unsigned long min_seq; struct lruvec *lruvec; struct lru_gen_folio *lrugen; int type = folio_is_file_lru(folio); int delta = folio_nr_pages(folio); int refs = folio_lru_refs(folio); bool workingset = folio_test_workingset(folio); int tier = lru_tier_from_refs(refs, workingset); struct mem_cgroup *memcg = folio_memcg(folio); struct pglist_data *pgdat = folio_pgdat(folio); BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > BITS_PER_LONG - EVICTION_SHIFT); lruvec = mem_cgroup_lruvec(memcg, pgdat); lrugen = &lruvec->lrugen; min_seq = READ_ONCE(lrugen->min_seq[type]); token = (min_seq << LRU_REFS_WIDTH) | max(refs - 1, 0); hist = lru_hist_from_seq(min_seq); atomic_long_add(delta, &lrugen->evicted[hist][type][tier]); return pack_shadow(mem_cgroup_private_id(memcg), pgdat, token, workingset); } /* * Tests if the shadow entry is for a folio that was recently evicted. * Fills in @lruvec, @token, @workingset with the values unpacked from shadow. */ static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec, unsigned long *token, bool *workingset) { int memcg_id; unsigned long max_seq; struct mem_cgroup *memcg; struct pglist_data *pgdat; unpack_shadow(shadow, &memcg_id, &pgdat, token, workingset); memcg = mem_cgroup_from_private_id(memcg_id); *lruvec = mem_cgroup_lruvec(memcg, pgdat); max_seq = READ_ONCE((*lruvec)->lrugen.max_seq); max_seq &= EVICTION_MASK >> LRU_REFS_WIDTH; return abs_diff(max_seq, *token >> LRU_REFS_WIDTH) < MAX_NR_GENS; } static void lru_gen_refault(struct folio *folio, void *shadow) { bool recent; int hist, tier, refs; bool workingset; unsigned long token; struct lruvec *lruvec; struct lru_gen_folio *lrugen; int type = folio_is_file_lru(folio); int delta = folio_nr_pages(folio); rcu_read_lock(); recent = lru_gen_test_recent(shadow, &lruvec, &token, &workingset); if (lruvec != folio_lruvec(folio)) goto unlock; mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta); if (!recent) goto unlock; lrugen = &lruvec->lrugen; hist = lru_hist_from_seq(READ_ONCE(lrugen->min_seq[type])); refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + 1; tier = lru_tier_from_refs(refs, workingset); atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]); /* see folio_add_lru() where folio_set_active() will be called */ if (lru_gen_in_fault()) mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta); if (workingset) { folio_set_workingset(folio); mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta); } else set_mask_bits(&folio->flags.f, LRU_REFS_MASK, (refs - 1UL) << LRU_REFS_PGOFF); unlock: rcu_read_unlock(); } #else /* !CONFIG_LRU_GEN */ static void *lru_gen_eviction(struct folio *folio) { return NULL; } static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec, unsigned long *token, bool *workingset) { return false; } static void lru_gen_refault(struct folio *folio, void *shadow) { } #endif /* CONFIG_LRU_GEN */ /** * workingset_age_nonresident - age non-resident entries as LRU ages * @lruvec: the lruvec that was aged * @nr_pages: the number of pages to count * * As in-memory pages are aged, non-resident pages need to be aged as * well, in order for the refault distances later on to be comparable * to the in-memory dimensions. This function allows reclaim and LRU * operations to drive the non-resident aging along in parallel. */ void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages) { /* * Reclaiming a cgroup means reclaiming all its children in a * round-robin fashion. That means that each cgroup has an LRU * order that is composed of the LRU orders of its child * cgroups; and every page has an LRU position not just in the * cgroup that owns it, but in all of that group's ancestors. * * So when the physical inactive list of a leaf cgroup ages, * the virtual inactive lists of all its parents, including * the root cgroup's, age as well. */ do { atomic_long_add(nr_pages, &lruvec->nonresident_age); } while ((lruvec = parent_lruvec(lruvec))); } /** * workingset_eviction - note the eviction of a folio from memory * @target_memcg: the cgroup that is causing the reclaim * @folio: the folio being evicted * * Return: a shadow entry to be stored in @folio->mapping->i_pages in place * of the evicted @folio so that a later refault can be detected. */ void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg) { struct pglist_data *pgdat = folio_pgdat(folio); unsigned long eviction; struct lruvec *lruvec; int memcgid; /* Folio is fully exclusive and pins folio's memory cgroup pointer */ VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); VM_BUG_ON_FOLIO(folio_ref_count(folio), folio); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (lru_gen_enabled()) return lru_gen_eviction(folio); lruvec = mem_cgroup_lruvec(target_memcg, pgdat); /* XXX: target_memcg can be NULL, go through lruvec */ memcgid = mem_cgroup_private_id(lruvec_memcg(lruvec)); eviction = atomic_long_read(&lruvec->nonresident_age); eviction >>= bucket_order; workingset_age_nonresident(lruvec, folio_nr_pages(folio)); return pack_shadow(memcgid, pgdat, eviction, folio_test_workingset(folio)); } /** * workingset_test_recent - tests if the shadow entry is for a folio that was * recently evicted. Also fills in @workingset with the value unpacked from * shadow. * @shadow: the shadow entry to be tested. * @file: whether the corresponding folio is from the file lru. * @workingset: where the workingset value unpacked from shadow should * be stored. * @flush: whether to flush cgroup rstat. * * Return: true if the shadow is for a recently evicted folio; false otherwise. */ bool workingset_test_recent(void *shadow, bool file, bool *workingset, bool flush) { struct mem_cgroup *eviction_memcg; struct lruvec *eviction_lruvec; unsigned long refault_distance; unsigned long workingset_size; unsigned long refault; int memcgid; struct pglist_data *pgdat; unsigned long eviction; if (lru_gen_enabled()) { bool recent; rcu_read_lock(); recent = lru_gen_test_recent(shadow, &eviction_lruvec, &eviction, workingset); rcu_read_unlock(); return recent; } rcu_read_lock(); unpack_shadow(shadow, &memcgid, &pgdat, &eviction, workingset); eviction <<= bucket_order; /* * Look up the memcg associated with the stored ID. It might * have been deleted since the folio's eviction. * * Note that in rare events the ID could have been recycled * for a new cgroup that refaults a shared folio. This is * impossible to tell from the available data. However, this * should be a rare and limited disturbance, and activations * are always speculative anyway. Ultimately, it's the aging * algorithm's job to shake out the minimum access frequency * for the active cache. * * XXX: On !CONFIG_MEMCG, this will always return NULL; it * would be better if the root_mem_cgroup existed in all * configurations instead. */ eviction_memcg = mem_cgroup_from_private_id(memcgid); if (!mem_cgroup_tryget(eviction_memcg)) eviction_memcg = NULL; rcu_read_unlock(); if (!mem_cgroup_disabled() && !eviction_memcg) return false; /* * Flush stats (and potentially sleep) outside the RCU read section. * * Note that workingset_test_recent() itself might be called in RCU read * section (for e.g, in cachestat) - these callers need to skip flushing * stats (via the flush argument). * * XXX: With per-memcg flushing and thresholding, is ratelimiting * still needed here? */ if (flush) mem_cgroup_flush_stats_ratelimited(eviction_memcg); eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat); refault = atomic_long_read(&eviction_lruvec->nonresident_age); /* * Calculate the refault distance * * The unsigned subtraction here gives an accurate distance * across nonresident_age overflows in most cases. There is a * special case: usually, shadow entries have a short lifetime * and are either refaulted or reclaimed along with the inode * before they get too old. But it is not impossible for the * nonresident_age to lap a shadow entry in the field, which * can then result in a false small refault distance, leading * to a false activation should this old entry actually * refault again. However, earlier kernels used to deactivate * unconditionally with *every* reclaim invocation for the * longest time, so the occasional inappropriate activation * leading to pressure on the active list is not a problem. */ refault_distance = (refault - eviction) & EVICTION_MASK; /* * Compare the distance to the existing workingset size. We * don't activate pages that couldn't stay resident even if * all the memory was available to the workingset. Whether * workingset competition needs to consider anon or not depends * on having free swap space. */ workingset_size = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE); if (!file) { workingset_size += lruvec_page_state(eviction_lruvec, NR_INACTIVE_FILE); } if (mem_cgroup_get_nr_swap_pages(eviction_memcg) > 0) { workingset_size += lruvec_page_state(eviction_lruvec, NR_ACTIVE_ANON); if (file) { workingset_size += lruvec_page_state(eviction_lruvec, NR_INACTIVE_ANON); } } mem_cgroup_put(eviction_memcg); return refault_distance <= workingset_size; } /** * workingset_refault - Evaluate the refault of a previously evicted folio. * @folio: The freshly allocated replacement folio. * @shadow: Shadow entry of the evicted folio. * * Calculates and evaluates the refault distance of the previously * evicted folio in the context of the node and the memcg whose memory * pressure caused the eviction. */ void workingset_refault(struct folio *folio, void *shadow) { bool file = folio_is_file_lru(folio); struct pglist_data *pgdat; struct mem_cgroup *memcg; struct lruvec *lruvec; bool workingset; long nr; VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (lru_gen_enabled()) { lru_gen_refault(folio, shadow); return; } /* * The activation decision for this folio is made at the level * where the eviction occurred, as that is where the LRU order * during folio reclaim is being determined. * * However, the cgroup that will own the folio is the one that * is actually experiencing the refault event. Make sure the folio is * locked to guarantee folio_memcg() stability throughout. */ nr = folio_nr_pages(folio); memcg = folio_memcg(folio); pgdat = folio_pgdat(folio); lruvec = mem_cgroup_lruvec(memcg, pgdat); mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr); if (!workingset_test_recent(shadow, file, &workingset, true)) return; folio_set_active(folio); workingset_age_nonresident(lruvec, nr); mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file, nr); /* Folio was active prior to eviction */ if (workingset) { folio_set_workingset(folio); /* * XXX: Move to folio_add_lru() when it supports new vs * putback */ lru_note_cost_refault(folio); mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr); } } /** * workingset_activation - note a page activation * @folio: Folio that is being activated. */ void workingset_activation(struct folio *folio) { /* * Filter non-memcg pages here, e.g. unmap can call * mark_page_accessed() on VDSO pages. */ if (mem_cgroup_disabled() || folio_memcg_charged(folio)) workingset_age_nonresident(folio_lruvec(folio), folio_nr_pages(folio)); } /* * Shadow entries reflect the share of the working set that does not * fit into memory, so their number depends on the access pattern of * the workload. In most cases, they will refault or get reclaimed * along with the inode, but a (malicious) workload that streams * through files with a total size several times that of available * memory, while preventing the inodes from being reclaimed, can * create excessive amounts of shadow nodes. To keep a lid on this, * track shadow nodes and reclaim them when they grow way past the * point where they would still be useful. */ struct list_lru shadow_nodes; void workingset_update_node(struct xa_node *node) { struct page *page = virt_to_page(node); /* * Track non-empty nodes that contain only shadow entries; * unlink those that contain pages or are being freed. * * Avoid acquiring the list_lru lock when the nodes are * already where they should be. The list_empty() test is safe * as node->private_list is protected by the i_pages lock. */ lockdep_assert_held(&node->array->xa_lock); if (node->count && node->count == node->nr_values) { if (list_empty(&node->private_list)) { list_lru_add_obj(&shadow_nodes, &node->private_list); __inc_node_page_state(page, WORKINGSET_NODES); } } else { if (!list_empty(&node->private_list)) { list_lru_del_obj(&shadow_nodes, &node->private_list); __dec_node_page_state(page, WORKINGSET_NODES); } } } static unsigned long count_shadow_nodes(struct shrinker *shrinker, struct shrink_control *sc) { unsigned long max_nodes; unsigned long nodes; unsigned long pages; nodes = list_lru_shrink_count(&shadow_nodes, sc); if (!nodes) return SHRINK_EMPTY; /* * Approximate a reasonable limit for the nodes * containing shadow entries. We don't need to keep more * shadow entries than possible pages on the active list, * since refault distances bigger than that are dismissed. * * The size of the active list converges toward 100% of * overall page cache as memory grows, with only a tiny * inactive list. Assume the total cache size for that. * * Nodes might be sparsely populated, with only one shadow * entry in the extreme case. Obviously, we cannot keep one * node for every eligible shadow entry, so compromise on a * worst-case density of 1/8th. Below that, not all eligible * refaults can be detected anymore. * * On 64-bit with 7 xa_nodes per page and 64 slots * each, this will reclaim shadow entries when they consume * ~1.8% of available memory: * * PAGE_SIZE / xa_nodes / node_entries * 8 / PAGE_SIZE */ #ifdef CONFIG_MEMCG if (sc->memcg) { struct lruvec *lruvec; int i; mem_cgroup_flush_stats_ratelimited(sc->memcg); lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid)); for (pages = 0, i = 0; i < NR_LRU_LISTS; i++) pages += lruvec_page_state_local(lruvec, NR_LRU_BASE + i); pages += lruvec_page_state_local( lruvec, NR_SLAB_RECLAIMABLE_B) >> PAGE_SHIFT; pages += lruvec_page_state_local( lruvec, NR_SLAB_UNRECLAIMABLE_B) >> PAGE_SHIFT; } else #endif pages = node_present_pages(sc->nid); max_nodes = pages >> (XA_CHUNK_SHIFT - 3); if (nodes <= max_nodes) return 0; return nodes - max_nodes; } static enum lru_status shadow_lru_isolate(struct list_head *item, struct list_lru_one *lru, void *arg) __must_hold(lru->lock) { struct xa_node *node = container_of(item, struct xa_node, private_list); struct address_space *mapping; int ret; /* * Page cache insertions and deletions synchronously maintain * the shadow node LRU under the i_pages lock and the * &lru->lock. Because the page cache tree is emptied before * the inode can be destroyed, holding the &lru->lock pins any * address_space that has nodes on the LRU. * * We can then safely transition to the i_pages lock to * pin only the address_space of the particular node we want * to reclaim, take the node off-LRU, and drop the &lru->lock. */ mapping = container_of(node->array, struct address_space, i_pages); /* Coming from the list, invert the lock order */ if (!xa_trylock(&mapping->i_pages)) { spin_unlock_irq(&lru->lock); ret = LRU_RETRY; goto out; } /* For page cache we need to hold i_lock */ if (mapping->host != NULL) { if (!spin_trylock(&mapping->host->i_lock)) { xa_unlock(&mapping->i_pages); spin_unlock_irq(&lru->lock); ret = LRU_RETRY; goto out; } } list_lru_isolate(lru, item); __dec_node_page_state(virt_to_page(node), WORKINGSET_NODES); spin_unlock(&lru->lock); /* * The nodes should only contain one or more shadow entries, * no pages, so we expect to be able to remove them all and * delete and free the empty node afterwards. */ if (WARN_ON_ONCE(!node->nr_values)) goto out_invalid; if (WARN_ON_ONCE(node->count != node->nr_values)) goto out_invalid; xa_delete_node(node, workingset_update_node); mod_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM, 1); out_invalid: xa_unlock_irq(&mapping->i_pages); if (mapping->host != NULL) { if (mapping_shrinkable(mapping)) inode_lru_list_add(mapping->host); spin_unlock(&mapping->host->i_lock); } ret = LRU_REMOVED_RETRY; out: cond_resched(); return ret; } static unsigned long scan_shadow_nodes(struct shrinker *shrinker, struct shrink_control *sc) { /* list_lru lock nests inside the IRQ-safe i_pages lock */ return list_lru_shrink_walk_irq(&shadow_nodes, sc, shadow_lru_isolate, NULL); } /* * Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe * i_pages lock. */ static struct lock_class_key shadow_nodes_key; static int __init workingset_init(void) { struct shrinker *workingset_shadow_shrinker; unsigned int timestamp_bits; unsigned int max_order; int ret = -ENOMEM; BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT); /* * Calculate the eviction bucket size to cover the longest * actionable refault distance, which is currently half of * memory (totalram_pages/2). However, memory hotplug may add * some more pages at runtime, so keep working with up to * double the initial memory by using totalram_pages as-is. */ timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT; max_order = fls_long(totalram_pages() - 1); if (max_order > timestamp_bits) bucket_order = max_order - timestamp_bits; pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n", timestamp_bits, max_order, bucket_order); workingset_shadow_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, "mm-shadow"); if (!workingset_shadow_shrinker) goto err; ret = list_lru_init_memcg_key(&shadow_nodes, workingset_shadow_shrinker, &shadow_nodes_key); if (ret) goto err_list_lru; workingset_shadow_shrinker->count_objects = count_shadow_nodes; workingset_shadow_shrinker->scan_objects = scan_shadow_nodes; /* ->count reports only fully expendable nodes */ workingset_shadow_shrinker->seeks = 0; shrinker_register(workingset_shadow_shrinker); return 0; err_list_lru: shrinker_free(workingset_shadow_shrinker); err: return ret; } module_init(workingset_init);
1316 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 /* SPDX-License-Identifier: GPL-2.0 */ /* * Block data types and constants. Directly include this file only to * break include dependency loop. */ #ifndef __LINUX_BLK_TYPES_H #define __LINUX_BLK_TYPES_H #include <linux/types.h> #include <linux/bvec.h> #include <linux/device.h> #include <linux/ktime.h> #include <linux/rw_hint.h> struct bio_set; struct bio; struct bio_integrity_payload; struct page; struct io_context; struct cgroup_subsys_state; typedef void (bio_end_io_t) (struct bio *); struct bio_crypt_ctx; /* * The basic unit of block I/O is a sector. It is used in a number of contexts * in Linux (blk, bio, genhd). The size of one sector is 512 = 2**9 * bytes. Variables of type sector_t represent an offset or size that is a * multiple of 512 bytes. Hence these two constants. */ #ifndef SECTOR_SHIFT #define SECTOR_SHIFT 9 #endif #ifndef SECTOR_SIZE #define SECTOR_SIZE (1 << SECTOR_SHIFT) #endif #define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT) #define PAGE_SECTORS (1 << PAGE_SECTORS_SHIFT) #define SECTOR_MASK (PAGE_SECTORS - 1) struct block_device { sector_t bd_start_sect; sector_t bd_nr_sectors; struct gendisk * bd_disk; struct request_queue * bd_queue; struct disk_stats __percpu *bd_stats; unsigned long bd_stamp; atomic_t __bd_flags; // partition number + flags #define BD_PARTNO 255 // lower 8 bits; assign-once #define BD_READ_ONLY (1u<<8) // read-only policy #define BD_WRITE_HOLDER (1u<<9) #define BD_HAS_SUBMIT_BIO (1u<<10) #define BD_RO_WARNED (1u<<11) #ifdef CONFIG_FAIL_MAKE_REQUEST #define BD_MAKE_IT_FAIL (1u<<12) #endif dev_t bd_dev; struct address_space *bd_mapping; /* page cache */ atomic_t bd_openers; spinlock_t bd_size_lock; /* for bd_inode->i_size updates */ void * bd_claiming; void * bd_holder; const struct blk_holder_ops *bd_holder_ops; struct mutex bd_holder_lock; int bd_holders; struct kobject *bd_holder_dir; atomic_t bd_fsfreeze_count; /* number of freeze requests */ struct mutex bd_fsfreeze_mutex; /* serialize freeze/thaw */ struct partition_meta_info *bd_meta_info; int bd_writers; #ifdef CONFIG_SECURITY void *bd_security; #endif /* * keep this out-of-line as it's both big and not needed in the fast * path */ struct device bd_device; } __randomize_layout; #define bdev_whole(_bdev) \ ((_bdev)->bd_disk->part0) #define dev_to_bdev(device) \ container_of((device), struct block_device, bd_device) #define bdev_kobj(_bdev) \ (&((_bdev)->bd_device.kobj)) /* * Block error status values. See block/blk-core:blk_errors for the details. */ typedef u8 __bitwise blk_status_t; typedef u16 blk_short_t; #define BLK_STS_OK 0 #define BLK_STS_NOTSUPP ((__force blk_status_t)1) #define BLK_STS_TIMEOUT ((__force blk_status_t)2) #define BLK_STS_NOSPC ((__force blk_status_t)3) #define BLK_STS_TRANSPORT ((__force blk_status_t)4) #define BLK_STS_TARGET ((__force blk_status_t)5) #define BLK_STS_RESV_CONFLICT ((__force blk_status_t)6) #define BLK_STS_MEDIUM ((__force blk_status_t)7) #define BLK_STS_PROTECTION ((__force blk_status_t)8) #define BLK_STS_RESOURCE ((__force blk_status_t)9) #define BLK_STS_IOERR ((__force blk_status_t)10) /* hack for device mapper, don't use elsewhere: */ #define BLK_STS_DM_REQUEUE ((__force blk_status_t)11) /* * BLK_STS_AGAIN should only be returned if RQF_NOWAIT is set * and the bio would block (cf bio_wouldblock_error()) */ #define BLK_STS_AGAIN ((__force blk_status_t)12) /* * BLK_STS_DEV_RESOURCE is returned from the driver to the block layer if * device related resources are unavailable, but the driver can guarantee * that the queue will be rerun in the future once resources become * available again. This is typically the case for device specific * resources that are consumed for IO. If the driver fails allocating these * resources, we know that inflight (or pending) IO will free these * resource upon completion. * * This is different from BLK_STS_RESOURCE in that it explicitly references * a device specific resource. For resources of wider scope, allocation * failure can happen without having pending IO. This means that we can't * rely on request completions freeing these resources, as IO may not be in * flight. Examples of that are kernel memory allocations, DMA mappings, or * any other system wide resources. */ #define BLK_STS_DEV_RESOURCE ((__force blk_status_t)13) /* * BLK_STS_ZONE_OPEN_RESOURCE is returned from the driver in the completion * path if the device returns a status indicating that too many zone resources * are currently open. The same command should be successful if resubmitted * after the number of open zones decreases below the device's limits, which is * reported in the request_queue's max_open_zones. */ #define BLK_STS_ZONE_OPEN_RESOURCE ((__force blk_status_t)14) /* * BLK_STS_ZONE_ACTIVE_RESOURCE is returned from the driver in the completion * path if the device returns a status indicating that too many zone resources * are currently active. The same command should be successful if resubmitted * after the number of active zones decreases below the device's limits, which * is reported in the request_queue's max_active_zones. */ #define BLK_STS_ZONE_ACTIVE_RESOURCE ((__force blk_status_t)15) /* * BLK_STS_OFFLINE is returned from the driver when the target device is offline * or is being taken offline. This could help differentiate the case where a * device is intentionally being shut down from a real I/O error. */ #define BLK_STS_OFFLINE ((__force blk_status_t)16) /* * BLK_STS_DURATION_LIMIT is returned from the driver when the target device * aborted the command because it exceeded one of its Command Duration Limits. */ #define BLK_STS_DURATION_LIMIT ((__force blk_status_t)17) /* * Invalid size or alignment. */ #define BLK_STS_INVAL ((__force blk_status_t)19) /** * blk_path_error - returns true if error may be path related * @error: status the request was completed with * * Description: * This classifies block error status into non-retryable errors and ones * that may be successful if retried on a failover path. * * Return: * %false - retrying failover path will not help * %true - may succeed if retried */ static inline bool blk_path_error(blk_status_t error) { switch (error) { case BLK_STS_NOTSUPP: case BLK_STS_NOSPC: case BLK_STS_TARGET: case BLK_STS_RESV_CONFLICT: case BLK_STS_MEDIUM: case BLK_STS_PROTECTION: return false; } /* Anything else could be a path failure, so should be retried */ return true; } typedef __u32 __bitwise blk_opf_t; typedef unsigned int blk_qc_t; #define BLK_QC_T_NONE -1U /* * main unit of I/O for the block layer and lower layers (ie drivers and * stacking drivers) */ struct bio { struct bio *bi_next; /* request queue link */ struct block_device *bi_bdev; blk_opf_t bi_opf; /* bottom bits REQ_OP, top bits * req_flags. */ unsigned short bi_flags; /* BIO_* below */ unsigned short bi_ioprio; enum rw_hint bi_write_hint; u8 bi_write_stream; blk_status_t bi_status; /* * The bvec gap bit indicates the lowest set bit in any address offset * between all bi_io_vecs. This field is initialized only after the bio * is split to the hardware limits (see bio_split_io_at()). The value * may be used to consider DMA optimization when performing that * mapping. The value is compared to a power of two mask where the * result depends on any bit set within the mask, so saving the lowest * bit is sufficient to know if any segment gap collides with the mask. */ u8 bi_bvec_gap_bit; atomic_t __bi_remaining; /* The actual vec list, preserved by bio_reset() */ struct bio_vec *bi_io_vec; struct bvec_iter bi_iter; union { /* for polled bios: */ blk_qc_t bi_cookie; /* for plugged zoned writes only: */ unsigned int __bi_nr_segments; }; bio_end_io_t *bi_end_io; void *bi_private; #ifdef CONFIG_BLK_CGROUP /* * Represents the association of the css and request_queue for the bio. * If a bio goes direct to device, it will not have a blkg as it will * not have a request_queue associated with it. The reference is put * on release of the bio. */ struct blkcg_gq *bi_blkg; /* Time that this bio was issued. */ u64 issue_time_ns; #ifdef CONFIG_BLK_CGROUP_IOCOST u64 bi_iocost_cost; #endif #endif #ifdef CONFIG_BLK_INLINE_ENCRYPTION struct bio_crypt_ctx *bi_crypt_context; #endif #if defined(CONFIG_BLK_DEV_INTEGRITY) struct bio_integrity_payload *bi_integrity; /* data integrity */ #endif unsigned short bi_vcnt; /* how many bio_vec's */ /* * Everything starting with bi_max_vecs will be preserved by bio_reset() */ /* * Number of elements in `bi_io_vec` that were allocated for this bio. * Only used by the bio submitter to make `bio_add_page` fail once full * and to free the `bi_io_vec` allocation. Must not be used in drivers * and does not hold a useful value for cloned bios. */ unsigned short bi_max_vecs; atomic_t __bi_cnt; /* pin count */ struct bio_set *bi_pool; }; #define BIO_RESET_BYTES offsetof(struct bio, bi_max_vecs) #define BIO_MAX_SIZE UINT_MAX /* max value of bi_iter.bi_size */ #define BIO_MAX_SECTORS (BIO_MAX_SIZE >> SECTOR_SHIFT) static inline struct bio_vec *bio_inline_vecs(struct bio *bio) { return (struct bio_vec *)(bio + 1); } /* * bio flags */ enum { BIO_PAGE_PINNED, /* Unpin pages in bio_release_pages() */ BIO_CLONED, /* doesn't own data */ BIO_QUIET, /* Make BIO Quiet */ BIO_CHAIN, /* chained bio, ->bi_remaining in effect */ BIO_REFFED, /* bio has elevated ->bi_cnt */ BIO_BPS_THROTTLED, /* This bio has already been subjected to * throttling rules. Don't do it again. */ BIO_TRACE_COMPLETION, /* bio_endio() should trace the final completion * of this bio. */ BIO_CGROUP_ACCT, /* has been accounted to a cgroup */ BIO_QOS_THROTTLED, /* bio went through rq_qos throttle path */ /* * This bio has completed bps throttling at the single tg granularity, * which is different from BIO_BPS_THROTTLED. When the bio is enqueued * into the sq->queued of the upper tg, or is about to be dispatched, * this flag needs to be cleared. Since blk-throttle and rq_qos are not * on the same hierarchical level, reuse the value. */ BIO_TG_BPS_THROTTLED = BIO_QOS_THROTTLED, BIO_QOS_MERGED, /* but went through rq_qos merge path */ BIO_REMAPPED, BIO_ZONE_WRITE_PLUGGING, /* bio handled through zone write plugging */ BIO_EMULATES_ZONE_APPEND, /* bio emulates a zone append operation */ BIO_FLAG_LAST }; typedef __u32 __bitwise blk_mq_req_flags_t; #define REQ_OP_BITS 8 #define REQ_OP_MASK (__force blk_opf_t)((1 << REQ_OP_BITS) - 1) #define REQ_FLAG_BITS 24 /** * enum req_op - Operations common to the bio and request structures. * We use 8 bits for encoding the operation, and the remaining 24 for flags. * * The least significant bit of the operation number indicates the data * transfer direction: * * - if the least significant bit is set transfers are TO the device * - if the least significant bit is not set transfers are FROM the device * * If a operation does not transfer data the least significant bit has no * meaning. */ enum req_op { /** @REQ_OP_READ: read sectors from the device */ REQ_OP_READ = (__force blk_opf_t)0, /** @REQ_OP_WRITE: write sectors to the device */ REQ_OP_WRITE = (__force blk_opf_t)1, /** @REQ_OP_FLUSH: flush the volatile write cache */ REQ_OP_FLUSH = (__force blk_opf_t)2, /** @REQ_OP_DISCARD: discard sectors */ REQ_OP_DISCARD = (__force blk_opf_t)3, /** @REQ_OP_SECURE_ERASE: securely erase sectors */ REQ_OP_SECURE_ERASE = (__force blk_opf_t)5, /** @REQ_OP_ZONE_APPEND: write data at the current zone write pointer */ REQ_OP_ZONE_APPEND = (__force blk_opf_t)7, /** @REQ_OP_WRITE_ZEROES: write the zero filled sector many times */ REQ_OP_WRITE_ZEROES = (__force blk_opf_t)9, /** @REQ_OP_ZONE_OPEN: Open a zone */ REQ_OP_ZONE_OPEN = (__force blk_opf_t)11, /** @REQ_OP_ZONE_CLOSE: Close a zone */ REQ_OP_ZONE_CLOSE = (__force blk_opf_t)13, /** @REQ_OP_ZONE_FINISH: Transition a zone to full */ REQ_OP_ZONE_FINISH = (__force blk_opf_t)15, /** @REQ_OP_ZONE_RESET: reset a zone write pointer */ REQ_OP_ZONE_RESET = (__force blk_opf_t)17, /** @REQ_OP_ZONE_RESET_ALL: reset all the zone present on the device */ REQ_OP_ZONE_RESET_ALL = (__force blk_opf_t)19, /* Driver private requests */ /* private: */ REQ_OP_DRV_IN = (__force blk_opf_t)34, REQ_OP_DRV_OUT = (__force blk_opf_t)35, REQ_OP_LAST = (__force blk_opf_t)36, }; /* Keep cmd_flag_name[] in sync with the definitions below */ enum req_flag_bits { __REQ_FAILFAST_DEV = /* no driver retries of device errors */ REQ_OP_BITS, __REQ_FAILFAST_TRANSPORT, /* no driver retries of transport errors */ __REQ_FAILFAST_DRIVER, /* no driver retries of driver errors */ __REQ_SYNC, /* request is sync (sync write or read) */ __REQ_META, /* metadata io request */ __REQ_PRIO, /* boost priority in cfq */ __REQ_NOMERGE, /* don't touch this for merging */ __REQ_IDLE, /* anticipate more IO after this one */ __REQ_INTEGRITY, /* I/O includes block integrity payload */ __REQ_FUA, /* forced unit access */ __REQ_PREFLUSH, /* request for cache flush */ __REQ_RAHEAD, /* read ahead, can fail anytime */ __REQ_BACKGROUND, /* background IO */ __REQ_NOWAIT, /* Don't wait if request will block */ __REQ_POLLED, /* caller polls for completion using bio_poll */ __REQ_ALLOC_CACHE, /* allocate IO from cache if available */ __REQ_SWAP, /* swap I/O */ __REQ_DRV, /* for driver use */ __REQ_FS_PRIVATE, /* for file system (submitter) use */ __REQ_ATOMIC, /* for atomic write operations */ /* * Command specific flags, keep last: */ /* for REQ_OP_WRITE_ZEROES: */ __REQ_NOUNMAP, /* do not free blocks when zeroing */ __REQ_NR_BITS, /* stops here */ }; #define REQ_FAILFAST_DEV \ (__force blk_opf_t)(1ULL << __REQ_FAILFAST_DEV) #define REQ_FAILFAST_TRANSPORT \ (__force blk_opf_t)(1ULL << __REQ_FAILFAST_TRANSPORT) #define REQ_FAILFAST_DRIVER \ (__force blk_opf_t)(1ULL << __REQ_FAILFAST_DRIVER) #define REQ_SYNC (__force blk_opf_t)(1ULL << __REQ_SYNC) #define REQ_META (__force blk_opf_t)(1ULL << __REQ_META) #define REQ_PRIO (__force blk_opf_t)(1ULL << __REQ_PRIO) #define REQ_NOMERGE (__force blk_opf_t)(1ULL << __REQ_NOMERGE) #define REQ_IDLE (__force blk_opf_t)(1ULL << __REQ_IDLE) #define REQ_INTEGRITY (__force blk_opf_t)(1ULL << __REQ_INTEGRITY) #define REQ_FUA (__force blk_opf_t)(1ULL << __REQ_FUA) #define REQ_PREFLUSH (__force blk_opf_t)(1ULL << __REQ_PREFLUSH) #define REQ_RAHEAD (__force blk_opf_t)(1ULL << __REQ_RAHEAD) #define REQ_BACKGROUND (__force blk_opf_t)(1ULL << __REQ_BACKGROUND) #define REQ_NOWAIT (__force blk_opf_t)(1ULL << __REQ_NOWAIT) #define REQ_POLLED (__force blk_opf_t)(1ULL << __REQ_POLLED) #define REQ_ALLOC_CACHE (__force blk_opf_t)(1ULL << __REQ_ALLOC_CACHE) #define REQ_SWAP (__force blk_opf_t)(1ULL << __REQ_SWAP) #define REQ_DRV (__force blk_opf_t)(1ULL << __REQ_DRV) #define REQ_FS_PRIVATE (__force blk_opf_t)(1ULL << __REQ_FS_PRIVATE) #define REQ_ATOMIC (__force blk_opf_t)(1ULL << __REQ_ATOMIC) #define REQ_NOUNMAP (__force blk_opf_t)(1ULL << __REQ_NOUNMAP) #define REQ_FAILFAST_MASK \ (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER) #define REQ_NOMERGE_FLAGS \ (REQ_NOMERGE | REQ_PREFLUSH | REQ_FUA) enum stat_group { STAT_READ, STAT_WRITE, STAT_DISCARD, STAT_FLUSH, NR_STAT_GROUPS }; static inline enum req_op bio_op(const struct bio *bio) { return bio->bi_opf & REQ_OP_MASK; } static inline bool op_is_write(blk_opf_t op) { return !!(op & (__force blk_opf_t)1); } /* * Check if the bio or request is one that needs special treatment in the * flush state machine. */ static inline bool op_is_flush(blk_opf_t op) { return op & (REQ_FUA | REQ_PREFLUSH); } /* * Reads are always treated as synchronous, as are requests with the FUA or * PREFLUSH flag. Other operations may be marked as synchronous using the * REQ_SYNC flag. */ static inline bool op_is_sync(blk_opf_t op) { return (op & REQ_OP_MASK) == REQ_OP_READ || (op & (REQ_SYNC | REQ_FUA | REQ_PREFLUSH)); } static inline bool op_is_discard(blk_opf_t op) { return (op & REQ_OP_MASK) == REQ_OP_DISCARD; } /* * Check if a bio or request operation is a zone management operation. */ static inline bool op_is_zone_mgmt(enum req_op op) { switch (op & REQ_OP_MASK) { case REQ_OP_ZONE_RESET: case REQ_OP_ZONE_RESET_ALL: case REQ_OP_ZONE_OPEN: case REQ_OP_ZONE_CLOSE: case REQ_OP_ZONE_FINISH: return true; default: return false; } } static inline int op_stat_group(enum req_op op) { if (op_is_discard(op)) return STAT_DISCARD; return op_is_write(op); } struct blk_rq_stat { u64 mean; u64 min; u64 max; u32 nr_samples; u64 batch; }; #endif /* __LINUX_BLK_TYPES_H */
20 15 20 17 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 /* * linux/fs/nls/mac-turkish.c * * Charset macturkish translation tables. * Generated automatically from the Unicode and charset * tables from the Unicode Organization (www.unicode.org). * The Unicode to charset table has only exact mappings. */ /* * COPYRIGHT AND PERMISSION NOTICE * * Copyright 1991-2012 Unicode, Inc. All rights reserved. Distributed under * the Terms of Use in http://www.unicode.org/copyright.html. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of the Unicode data files and any associated documentation (the "Data * Files") or Unicode software and any associated documentation (the * "Software") to deal in the Data Files or Software without restriction, * including without limitation the rights to use, copy, modify, merge, * publish, distribute, and/or sell copies of the Data Files or Software, and * to permit persons to whom the Data Files or Software are furnished to do * so, provided that (a) the above copyright notice(s) and this permission * notice appear with all copies of the Data Files or Software, (b) both the * above copyright notice(s) and this permission notice appear in associated * documentation, and (c) there is clear notice in each modified Data File or * in the Software as well as in the documentation associated with the Data * File(s) or Software that the data or software has been modified. * * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY * KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF * THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS * INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR * PERFORMANCE OF THE DATA FILES OR SOFTWARE. * * Except as contained in this notice, the name of a copyright holder shall * not be used in advertising or otherwise to promote the sale, use or other * dealings in these Data Files or Software without prior written * authorization of the copyright holder. */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/nls.h> #include <linux/errno.h> static const wchar_t charset2uni[256] = { /* 0x00 */ 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, /* 0x10 */ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, /* 0x20 */ 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, /* 0x30 */ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, /* 0x40 */ 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, /* 0x50 */ 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, /* 0x60 */ 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, /* 0x70 */ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f, /* 0x80 */ 0x00c4, 0x00c5, 0x00c7, 0x00c9, 0x00d1, 0x00d6, 0x00dc, 0x00e1, 0x00e0, 0x00e2, 0x00e4, 0x00e3, 0x00e5, 0x00e7, 0x00e9, 0x00e8, /* 0x90 */ 0x00ea, 0x00eb, 0x00ed, 0x00ec, 0x00ee, 0x00ef, 0x00f1, 0x00f3, 0x00f2, 0x00f4, 0x00f6, 0x00f5, 0x00fa, 0x00f9, 0x00fb, 0x00fc, /* 0xa0 */ 0x2020, 0x00b0, 0x00a2, 0x00a3, 0x00a7, 0x2022, 0x00b6, 0x00df, 0x00ae, 0x00a9, 0x2122, 0x00b4, 0x00a8, 0x2260, 0x00c6, 0x00d8, /* 0xb0 */ 0x221e, 0x00b1, 0x2264, 0x2265, 0x00a5, 0x00b5, 0x2202, 0x2211, 0x220f, 0x03c0, 0x222b, 0x00aa, 0x00ba, 0x03a9, 0x00e6, 0x00f8, /* 0xc0 */ 0x00bf, 0x00a1, 0x00ac, 0x221a, 0x0192, 0x2248, 0x2206, 0x00ab, 0x00bb, 0x2026, 0x00a0, 0x00c0, 0x00c3, 0x00d5, 0x0152, 0x0153, /* 0xd0 */ 0x2013, 0x2014, 0x201c, 0x201d, 0x2018, 0x2019, 0x00f7, 0x25ca, 0x00ff, 0x0178, 0x011e, 0x011f, 0x0130, 0x0131, 0x015e, 0x015f, /* 0xe0 */ 0x2021, 0x00b7, 0x201a, 0x201e, 0x2030, 0x00c2, 0x00ca, 0x00c1, 0x00cb, 0x00c8, 0x00cd, 0x00ce, 0x00cf, 0x00cc, 0x00d3, 0x00d4, /* 0xf0 */ 0xf8ff, 0x00d2, 0x00da, 0x00db, 0x00d9, 0xf8a0, 0x02c6, 0x02dc, 0x00af, 0x02d8, 0x02d9, 0x02da, 0x00b8, 0x02dd, 0x02db, 0x02c7, }; static const unsigned char page00[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0xca, 0xc1, 0xa2, 0xa3, 0x00, 0xb4, 0x00, 0xa4, /* 0xa0-0xa7 */ 0xac, 0xa9, 0xbb, 0xc7, 0xc2, 0x00, 0xa8, 0xf8, /* 0xa8-0xaf */ 0xa1, 0xb1, 0x00, 0x00, 0xab, 0xb5, 0xa6, 0xe1, /* 0xb0-0xb7 */ 0xfc, 0x00, 0xbc, 0xc8, 0x00, 0x00, 0x00, 0xc0, /* 0xb8-0xbf */ 0xcb, 0xe7, 0xe5, 0xcc, 0x80, 0x81, 0xae, 0x82, /* 0xc0-0xc7 */ 0xe9, 0x83, 0xe6, 0xe8, 0xed, 0xea, 0xeb, 0xec, /* 0xc8-0xcf */ 0x00, 0x84, 0xf1, 0xee, 0xef, 0xcd, 0x85, 0x00, /* 0xd0-0xd7 */ 0xaf, 0xf4, 0xf2, 0xf3, 0x86, 0x00, 0x00, 0xa7, /* 0xd8-0xdf */ 0x88, 0x87, 0x89, 0x8b, 0x8a, 0x8c, 0xbe, 0x8d, /* 0xe0-0xe7 */ 0x8f, 0x8e, 0x90, 0x91, 0x93, 0x92, 0x94, 0x95, /* 0xe8-0xef */ 0x00, 0x96, 0x98, 0x97, 0x99, 0x9b, 0x9a, 0xd6, /* 0xf0-0xf7 */ 0xbf, 0x9d, 0x9c, 0x9e, 0x9f, 0x00, 0x00, 0xd8, /* 0xf8-0xff */ }; static const unsigned char page01[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xda, 0xdb, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0xdc, 0xdd, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0xce, 0xcf, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xde, 0xdf, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0xd9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0xc4, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page02[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf6, 0xff, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0xf9, 0xfa, 0xfb, 0xfe, 0xf7, 0xfd, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page03[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0xbd, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0xb9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page20[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0xd0, 0xd1, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0xd4, 0xd5, 0xe2, 0x00, 0xd2, 0xd3, 0xe3, 0x00, /* 0x18-0x1f */ 0xa0, 0xe0, 0xa5, 0x00, 0x00, 0x00, 0xc9, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0xe4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page21[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0xaa, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page22[256] = { 0x00, 0x00, 0xb6, 0x00, 0x00, 0x00, 0xc6, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb8, /* 0x08-0x0f */ 0x00, 0xb7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, 0xb0, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0xba, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0xc5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0xad, 0x00, 0x00, 0x00, 0xb2, 0xb3, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page25[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0xd7, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char pagef8[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0xf5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, /* 0xf8-0xff */ }; static const unsigned char *const page_uni2charset[256] = { page00, page01, page02, page03, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, page20, page21, page22, NULL, NULL, page25, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, pagef8, NULL, NULL, NULL, NULL, NULL, NULL, NULL, }; static const unsigned char charset2lower[256] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x00-0x07 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x08-0x0f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x10-0x17 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x18-0x1f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x20-0x27 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x28-0x2f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x30-0x37 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x38-0x3f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x40-0x47 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x48-0x4f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x50-0x57 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x58-0x5f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x60-0x67 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x68-0x6f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x70-0x77 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x78-0x7f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x80-0x87 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x88-0x8f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x90-0x97 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x98-0x9f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa0-0xa7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa8-0xaf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb0-0xb7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb8-0xbf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc0-0xc7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc8-0xcf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd0-0xd7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd8-0xdf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe0-0xe7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe8-0xef */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf0-0xf7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf8-0xff */ }; static const unsigned char charset2upper[256] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x00-0x07 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x08-0x0f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x10-0x17 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x18-0x1f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x20-0x27 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x28-0x2f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x30-0x37 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x38-0x3f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x40-0x47 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x48-0x4f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x50-0x57 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x58-0x5f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x60-0x67 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x68-0x6f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x70-0x77 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x78-0x7f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x80-0x87 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x88-0x8f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x90-0x97 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x98-0x9f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa0-0xa7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa8-0xaf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb0-0xb7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb8-0xbf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc0-0xc7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc8-0xcf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd0-0xd7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd8-0xdf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe0-0xe7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe8-0xef */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf0-0xf7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf8-0xff */ }; static int uni2char(wchar_t uni, unsigned char *out, int boundlen) { const unsigned char *uni2charset; unsigned char cl = uni & 0x00ff; unsigned char ch = (uni & 0xff00) >> 8; if (boundlen <= 0) return -ENAMETOOLONG; uni2charset = page_uni2charset[ch]; if (uni2charset && uni2charset[cl]) out[0] = uni2charset[cl]; else return -EINVAL; return 1; } static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) { *uni = charset2uni[*rawstring]; if (*uni == 0x0000) return -EINVAL; return 1; } static struct nls_table table = { .charset = "macturkish", .uni2char = uni2char, .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, }; static int __init init_nls_macturkish(void) { return register_nls(&table); } static void __exit exit_nls_macturkish(void) { unregister_nls(&table); } module_init(init_nls_macturkish) module_exit(exit_nls_macturkish) MODULE_DESCRIPTION("NLS Codepage macturkish"); MODULE_LICENSE("Dual BSD/GPL");
78 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Copyright (C) International Business Machines Corp., 2000-2004 * Portions Copyright (C) Christoph Hellwig, 2001-2002 */ #ifndef _H_JFS_INCORE #define _H_JFS_INCORE #include <linux/mutex.h> #include <linux/rwsem.h> #include <linux/slab.h> #include <linux/bitops.h> #include <linux/uuid.h> #include "jfs_types.h" #include "jfs_xtree.h" #include "jfs_dtree.h" /* * JFS magic number */ #define JFS_SUPER_MAGIC 0x3153464a /* "JFS1" */ /* * JFS-private inode information */ struct jfs_inode_info { int fileset; /* fileset number (always 16)*/ uint mode2; /* jfs-specific mode */ kuid_t saved_uid; /* saved for uid mount option */ kgid_t saved_gid; /* saved for gid mount option */ pxd_t ixpxd; /* inode extent descriptor */ dxd_t acl; /* dxd describing acl */ dxd_t ea; /* dxd describing ea */ time64_t otime; /* time created */ uint next_index; /* next available directory entry index */ int acltype; /* Type of ACL */ short btorder; /* access order */ short btindex; /* btpage entry index*/ struct inode *ipimap; /* inode map */ unsigned long cflag; /* commit flags */ u64 agstart; /* agstart of the containing IAG */ u16 bxflag; /* xflag of pseudo buffer? */ unchar pad; signed char active_ag; /* ag currently allocating from */ lid_t blid; /* lid of pseudo buffer? */ lid_t atlhead; /* anonymous tlock list head */ lid_t atltail; /* anonymous tlock list tail */ spinlock_t ag_lock; /* protects active_ag */ struct list_head anon_inode_list; /* inodes having anonymous txns */ /* * rdwrlock serializes xtree between reads & writes and synchronizes * changes to special inodes. It's use would be redundant on * directories since the i_mutex taken in the VFS is sufficient. */ struct rw_semaphore rdwrlock; /* * commit_mutex serializes transaction processing on an inode. * It must be taken after beginning a transaction (txBegin), since * dirty inodes may be committed while a new transaction on the * inode is blocked in txBegin or TxBeginAnon */ struct mutex commit_mutex; /* xattr_sem allows us to access the xattrs without taking i_mutex */ struct rw_semaphore xattr_sem; lid_t xtlid; /* lid of xtree lock on directory */ union { struct { xtroot_t _xtroot; /* 288: xtree root */ struct inomap *_imap; /* 4: inode map header */ } file; struct { struct dir_table_slot _table[12]; /* 96: dir index */ dtroot_t _dtroot; /* 288: dtree root */ } dir; struct { unchar _unused[16]; /* 16: */ dxd_t _dxd; /* 16: */ /* _inline_sym may overflow into _inline_ea when needed */ /* _inline_ea may overlay the last part of * file._xtroot if maxentry = XTROOTINITSLOT */ union { struct { /* 128: inline symlink */ unchar _inline_sym[128]; /* 128: inline extended attr */ unchar _inline_ea[128]; }; unchar _inline_all[256]; }; } link; } u; #ifdef CONFIG_QUOTA struct dquot __rcu *i_dquot[MAXQUOTAS]; #endif u32 dev; /* will die when we get wide dev_t */ struct inode vfs_inode; }; #define i_xtroot u.file._xtroot #define i_imap u.file._imap #define i_dirtable u.dir._table #define i_dtroot u.dir._dtroot #define i_inline u.link._inline_sym #define i_inline_ea u.link._inline_ea #define i_inline_all u.link._inline_all #define IREAD_LOCK(ip, subclass) \ down_read_nested(&JFS_IP(ip)->rdwrlock, subclass) #define IREAD_UNLOCK(ip) up_read(&JFS_IP(ip)->rdwrlock) #define IWRITE_LOCK(ip, subclass) \ down_write_nested(&JFS_IP(ip)->rdwrlock, subclass) #define IWRITE_UNLOCK(ip) up_write(&JFS_IP(ip)->rdwrlock) /* * cflag */ enum cflags { COMMIT_Nolink, /* inode committed with zero link count */ COMMIT_Inlineea, /* commit inode inline EA */ COMMIT_Freewmap, /* free WMAP at iClose() */ COMMIT_Dirty, /* Inode is really dirty */ COMMIT_Dirtable, /* commit changes to di_dirtable */ COMMIT_Stale, /* data extent is no longer valid */ COMMIT_Synclist, /* metadata pages on group commit synclist */ }; /* * commit_mutex nesting subclasses: */ enum commit_mutex_class { COMMIT_MUTEX_PARENT, COMMIT_MUTEX_CHILD, COMMIT_MUTEX_SECOND_PARENT, /* Renaming */ COMMIT_MUTEX_VICTIM /* Inode being unlinked due to rename */ }; /* * rdwrlock subclasses: * The dmap inode may be locked while a normal inode or the imap inode are * locked. */ enum rdwrlock_class { RDWRLOCK_NORMAL, RDWRLOCK_IMAP, RDWRLOCK_DMAP }; #define set_cflag(flag, ip) set_bit(flag, &(JFS_IP(ip)->cflag)) #define clear_cflag(flag, ip) clear_bit(flag, &(JFS_IP(ip)->cflag)) #define test_cflag(flag, ip) test_bit(flag, &(JFS_IP(ip)->cflag)) #define test_and_clear_cflag(flag, ip) \ test_and_clear_bit(flag, &(JFS_IP(ip)->cflag)) /* * JFS-private superblock information. */ struct jfs_sb_info { struct super_block *sb; /* Point back to vfs super block */ unsigned long mntflag; /* aggregate attributes */ struct inode *ipbmap; /* block map inode */ struct inode *ipaimap; /* aggregate inode map inode */ struct inode *ipaimap2; /* secondary aimap inode */ struct inode *ipimap; /* aggregate inode map inode */ struct jfs_log *log; /* log */ struct list_head log_list; /* volumes associated with a journal */ short bsize; /* logical block size */ short l2bsize; /* log2 logical block size */ short nbperpage; /* blocks per page */ short l2nbperpage; /* log2 blocks per page */ short l2niperblk; /* log2 inodes per page */ dev_t logdev; /* external log device */ uint aggregate; /* volume identifier in log record */ pxd_t logpxd; /* pxd describing log */ pxd_t fsckpxd; /* pxd describing fsck wkspc */ pxd_t ait2; /* pxd describing AIT copy */ uuid_t uuid; /* 128-bit uuid for volume */ uuid_t loguuid; /* 128-bit uuid for log */ /* * commit_state is used for synchronization of the jfs_commit * threads. It is protected by LAZY_LOCK(). */ int commit_state; /* commit state */ /* Formerly in ipimap */ uint gengen; /* inode generation generator*/ uint inostamp; /* shows inode belongs to fileset*/ /* Formerly in ipbmap */ struct bmap *bmap; /* incore bmap descriptor */ struct nls_table *nls_tab; /* current codepage */ struct inode *direct_inode; /* metadata inode */ uint state; /* mount/recovery state */ unsigned long flag; /* mount time flags */ uint p_state; /* state prior to going no integrity */ kuid_t uid; /* uid to override on-disk uid */ kgid_t gid; /* gid to override on-disk gid */ uint umask; /* umask to override on-disk umask */ uint minblks_trim; /* minimum blocks, for online trim */ }; /* jfs_sb_info commit_state */ #define IN_LAZYCOMMIT 1 static inline struct jfs_inode_info *JFS_IP(struct inode *inode) { return container_of(inode, struct jfs_inode_info, vfs_inode); } static inline int jfs_dirtable_inline(struct inode *inode) { return (JFS_IP(inode)->next_index <= (MAX_INLINE_DIRTABLE_ENTRY + 1)); } static inline struct jfs_sb_info *JFS_SBI(struct super_block *sb) { return sb->s_fs_info; } static inline int isReadOnly(struct inode *inode) { if (JFS_SBI(inode->i_sb)->log) return 0; return 1; } #endif /* _H_JFS_INCORE */
804 804 360 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_IVERSION_H #define _LINUX_IVERSION_H #include <linux/fs.h> /* * The inode->i_version field: * --------------------------- * The change attribute (i_version) is mandated by NFSv4 and is mostly for * knfsd, but is also used for other purposes (e.g. IMA). The i_version must * appear larger to observers if there was an explicit change to the inode's * data or metadata since it was last queried. * * An explicit change is one that would ordinarily result in a change to the * inode status change time (aka ctime). i_version must appear to change, even * if the ctime does not (since the whole point is to avoid missing updates due * to timestamp granularity). If POSIX or other relevant spec mandates that the * ctime must change due to an operation, then the i_version counter must be * incremented as well. * * Making the i_version update completely atomic with the operation itself would * be prohibitively expensive. Traditionally the kernel has updated the times on * directories after an operation that changes its contents. For regular files, * the ctime is usually updated before the data is copied into the cache for a * write. This means that there is a window of time when an observer can * associate a new timestamp with old file contents. Since the purpose of the * i_version is to allow for better cache coherency, the i_version must always * be updated after the results of the operation are visible. Updating it before * and after a change is also permitted. (Note that no filesystems currently do * this. Fixing that is a work-in-progress). * * Observers see the i_version as a 64-bit number that never decreases. If it * remains the same since it was last checked, then nothing has changed in the * inode. If it's different then something has changed. Observers cannot infer * anything about the nature or magnitude of the changes from the value, only * that the inode has changed in some fashion. * * Not all filesystems properly implement the i_version counter. Subsystems that * want to use i_version field on an inode should first check whether the * filesystem sets the SB_I_VERSION flag (usually via the IS_I_VERSION macro). * * Those that set SB_I_VERSION will automatically have their i_version counter * incremented on writes to normal files. If the SB_I_VERSION is not set, then * the VFS will not touch it on writes, and the filesystem can use it how it * wishes. Note that the filesystem is always responsible for updating the * i_version on namespace changes in directories (mkdir, rmdir, unlink, etc.). * We consider these sorts of filesystems to have a kernel-managed i_version. * * It may be impractical for filesystems to keep i_version updates atomic with * respect to the changes that cause them. They should, however, guarantee * that i_version updates are never visible before the changes that caused * them. Also, i_version updates should never be delayed longer than it takes * the original change to reach disk. * * This implementation uses the low bit in the i_version field as a flag to * track when the value has been queried. If it has not been queried since it * was last incremented, we can skip the increment in most cases. * * In the event that we're updating the ctime, we will usually go ahead and * bump the i_version anyway. Since that has to go to stable storage in some * fashion, we might as well increment it as well. * * With this implementation, the value should always appear to observers to * increase over time if the file has changed. It's recommended to use * inode_eq_iversion() helper to compare values. * * Note that some filesystems (e.g. NFS and AFS) just use the field to store * a server-provided value (for the most part). For that reason, those * filesystems do not set SB_I_VERSION. These filesystems are considered to * have a self-managed i_version. * * Persistently storing the i_version * ---------------------------------- * Queries of the i_version field are not gated on them hitting the backing * store. It's always possible that the host could crash after allowing * a query of the value but before it has made it to disk. * * To mitigate this problem, filesystems should always use * inode_set_iversion_queried when loading an existing inode from disk. This * ensures that the next attempted inode increment will result in the value * changing. * * Storing the value to disk therefore does not count as a query, so those * filesystems should use inode_peek_iversion to grab the value to be stored. * There is no need to flag the value as having been queried in that case. */ /* * We borrow the lowest bit in the i_version to use as a flag to tell whether * it has been queried since we last incremented it. If it has, then we must * increment it on the next change. After that, we can clear the flag and * avoid incrementing it again until it has again been queried. */ #define I_VERSION_QUERIED_SHIFT (1) #define I_VERSION_QUERIED (1ULL << (I_VERSION_QUERIED_SHIFT - 1)) #define I_VERSION_INCREMENT (1ULL << I_VERSION_QUERIED_SHIFT) /** * inode_set_iversion_raw - set i_version to the specified raw value * @inode: inode to set * @val: new i_version value to set * * Set @inode's i_version field to @val. This function is for use by * filesystems that self-manage the i_version. * * For example, the NFS client stores its NFSv4 change attribute in this way, * and the AFS client stores the data_version from the server here. */ static inline void inode_set_iversion_raw(struct inode *inode, u64 val) { atomic64_set(&inode->i_version, val); } /** * inode_peek_iversion_raw - grab a "raw" iversion value * @inode: inode from which i_version should be read * * Grab a "raw" inode->i_version value and return it. The i_version is not * flagged or converted in any way. This is mostly used to access a self-managed * i_version. * * With those filesystems, we want to treat the i_version as an entirely * opaque value. */ static inline u64 inode_peek_iversion_raw(const struct inode *inode) { return atomic64_read(&inode->i_version); } /** * inode_set_max_iversion_raw - update i_version new value is larger * @inode: inode to set * @val: new i_version to set * * Some self-managed filesystems (e.g Ceph) will only update the i_version * value if the new value is larger than the one we already have. */ static inline void inode_set_max_iversion_raw(struct inode *inode, u64 val) { u64 cur = inode_peek_iversion_raw(inode); do { if (cur > val) break; } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, val)); } /** * inode_set_iversion - set i_version to a particular value * @inode: inode to set * @val: new i_version value to set * * Set @inode's i_version field to @val. This function is for filesystems with * a kernel-managed i_version, for initializing a newly-created inode from * scratch. * * In this case, we do not set the QUERIED flag since we know that this value * has never been queried. */ static inline void inode_set_iversion(struct inode *inode, u64 val) { inode_set_iversion_raw(inode, val << I_VERSION_QUERIED_SHIFT); } /** * inode_set_iversion_queried - set i_version to a particular value as quereied * @inode: inode to set * @val: new i_version value to set * * Set @inode's i_version field to @val, and flag it for increment on the next * change. * * Filesystems that persistently store the i_version on disk should use this * when loading an existing inode from disk. * * When loading in an i_version value from a backing store, we can't be certain * that it wasn't previously viewed before being stored. Thus, we must assume * that it was, to ensure that we don't end up handing out the same value for * different versions of the same inode. */ static inline void inode_set_iversion_queried(struct inode *inode, u64 val) { inode_set_iversion_raw(inode, (val << I_VERSION_QUERIED_SHIFT) | I_VERSION_QUERIED); } bool inode_maybe_inc_iversion(struct inode *inode, bool force); /** * inode_inc_iversion - forcibly increment i_version * @inode: inode that needs to be updated * * Forcbily increment the i_version field. This always results in a change to * the observable value. */ static inline void inode_inc_iversion(struct inode *inode) { inode_maybe_inc_iversion(inode, true); } /** * inode_iversion_need_inc - is the i_version in need of being incremented? * @inode: inode to check * * Returns whether the inode->i_version counter needs incrementing on the next * change. Just fetch the value and check the QUERIED flag. */ static inline bool inode_iversion_need_inc(struct inode *inode) { return inode_peek_iversion_raw(inode) & I_VERSION_QUERIED; } /** * inode_inc_iversion_raw - forcibly increment raw i_version * @inode: inode that needs to be updated * * Forcbily increment the raw i_version field. This always results in a change * to the raw value. * * NFS will use the i_version field to store the value from the server. It * mostly treats it as opaque, but in the case where it holds a write * delegation, it must increment the value itself. This function does that. */ static inline void inode_inc_iversion_raw(struct inode *inode) { atomic64_inc(&inode->i_version); } /** * inode_peek_iversion - read i_version without flagging it to be incremented * @inode: inode from which i_version should be read * * Read the inode i_version counter for an inode without registering it as a * query. * * This is typically used by local filesystems that need to store an i_version * on disk. In that situation, it's not necessary to flag it as having been * viewed, as the result won't be used to gauge changes from that point. */ static inline u64 inode_peek_iversion(const struct inode *inode) { return inode_peek_iversion_raw(inode) >> I_VERSION_QUERIED_SHIFT; } /* * For filesystems without any sort of change attribute, the best we can * do is fake one up from the ctime: */ static inline u64 time_to_chattr(const struct timespec64 *t) { u64 chattr = t->tv_sec; chattr <<= 32; chattr += t->tv_nsec; return chattr; } u64 inode_query_iversion(struct inode *inode); /** * inode_eq_iversion_raw - check whether the raw i_version counter has changed * @inode: inode to check * @old: old value to check against its i_version * * Compare the current raw i_version counter with a previous one. Returns true * if they are the same or false if they are different. */ static inline bool inode_eq_iversion_raw(const struct inode *inode, u64 old) { return inode_peek_iversion_raw(inode) == old; } /** * inode_eq_iversion - check whether the i_version counter has changed * @inode: inode to check * @old: old value to check against its i_version * * Compare an i_version counter with a previous one. Returns true if they are * the same, and false if they are different. * * Note that we don't need to set the QUERIED flag in this case, as the value * in the inode is not being recorded for later use. */ static inline bool inode_eq_iversion(const struct inode *inode, u64 old) { return inode_peek_iversion(inode) == old; } #endif
175 175 481 480 271 271 322 23 321 175 38 175 175 24 24 14 14 402 86 319 22 339 338 339 255 255 30 246 328 327 147 146 147 309 309 309 10 10 10 85 85 85 85 85 85 85 263 257 258 11 257 245 250 245 248 4 269 3 277 272 1 2 1 1 1 1 253 253 253 47 47 11 36 13 34 47 275 5 271 267 11 11 13 253 253 253 253 1 27 4 3 3 259 11 104 104 104 11 24 24 24 24 24 31 31 31 30 31 31 31 31 31 31 31 31 31 31 11 24 335 48 283 283 284 284 284 8 9 9 9 8 8 272 263 263 270 149 107 15 271 256 256 10 256 6 254 252 256 7 255 255 255 13 3 14 3 1 4 254 254 3 2 1 251 7 1 245 252 9 9 243 250 3 345 345 344 345 345 344 344 343 344 345 345 345 345 345 345 1 345 1 345 1 343 344 345 345 344 345 343 269 1 345 344 345 345 345 263 262 14 262 13 256 260 259 17 260 17 6 11 11 11 260 256 7 251 283 222 49 247 25 9 10 212 212 212 212 212 212 212 212 150 62 150 62 62 182 30 30 212 212 211 1 212 286 286 286 286 286 220 271 272 272 272 1 2 1 269 245 24 269 268 6 263 263 1 2 9 22 229 1 2 249 249 249 2 39 204 4 10 10 208 12 5 27 163 166 97 166 5 160 163 74 97 163 159 158 166 161 166 162 166 161 160 12 12 12 12 170 79 104 5 166 163 166 159 163 158 12 8 4 1 12 12 7 7 5 2 2 5 7 7 7 7 286 286 77 60 125 125 15 16 2 4 1 3 3 3 4 4 4 4 4 4 4 4 4 4 4 258 1 2 258 61 61 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. */ #include <linux/fs.h> #include <linux/blkdev.h> #include <linux/radix-tree.h> #include <linux/writeback.h> #include <linux/workqueue.h> #include <linux/kthread.h> #include <linux/slab.h> #include <linux/migrate.h> #include <linux/ratelimit.h> #include <linux/uuid.h> #include <linux/semaphore.h> #include <linux/error-injection.h> #include <linux/crc32c.h> #include <linux/sched/mm.h> #include <linux/unaligned.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" #include "delayed-inode.h" #include "bio.h" #include "print-tree.h" #include "locking.h" #include "tree-log.h" #include "free-space-cache.h" #include "free-space-tree.h" #include "dev-replace.h" #include "raid56.h" #include "sysfs.h" #include "qgroup.h" #include "compression.h" #include "tree-checker.h" #include "ref-verify.h" #include "block-group.h" #include "discard.h" #include "space-info.h" #include "zoned.h" #include "subpage.h" #include "fs.h" #include "accessors.h" #include "extent-tree.h" #include "root-tree.h" #include "defrag.h" #include "uuid-tree.h" #include "relocation.h" #include "scrub.h" #include "super.h" #include "delayed-inode.h" #define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ BTRFS_HEADER_FLAG_RELOC |\ BTRFS_SUPER_FLAG_ERROR |\ BTRFS_SUPER_FLAG_SEEDING |\ BTRFS_SUPER_FLAG_METADUMP |\ BTRFS_SUPER_FLAG_METADUMP_V2) static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info); static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info); /* * Compute the csum of a btree block and store the result to provided buffer. */ static void csum_tree_block(struct extent_buffer *buf, u8 *result) { struct btrfs_fs_info *fs_info = buf->fs_info; int num_pages; u32 first_page_part; struct btrfs_csum_ctx csum; char *kaddr; int i; btrfs_csum_init(&csum, fs_info->csum_type); if (buf->addr) { /* Pages are contiguous, handle them as a big one. */ kaddr = buf->addr; first_page_part = fs_info->nodesize; num_pages = 1; } else { kaddr = folio_address(buf->folios[0]); first_page_part = min_t(u32, PAGE_SIZE, fs_info->nodesize); num_pages = num_extent_pages(buf); } btrfs_csum_update(&csum, kaddr + BTRFS_CSUM_SIZE, first_page_part - BTRFS_CSUM_SIZE); /* * Multiple single-page folios case would reach here. * * nodesize <= PAGE_SIZE and large folio all handled by above * btrfs_csum_update() already. */ for (i = 1; i < num_pages && INLINE_EXTENT_BUFFER_PAGES > 1; i++) { kaddr = folio_address(buf->folios[i]); btrfs_csum_update(&csum, kaddr, PAGE_SIZE); } memset(result, 0, BTRFS_CSUM_SIZE); btrfs_csum_final(&csum, result); } /* * we can't consider a given block up to date unless the transid of the * block matches the transid in the parent node's pointer. This is how we * detect blocks that either didn't get written at all or got written * in the wrong place. */ int btrfs_buffer_uptodate(struct extent_buffer *eb, u64 parent_transid, bool atomic) { if (!extent_buffer_uptodate(eb)) return 0; if (!parent_transid || btrfs_header_generation(eb) == parent_transid) return 1; if (atomic) return -EAGAIN; if (!extent_buffer_uptodate(eb) || btrfs_header_generation(eb) != parent_transid) { btrfs_err_rl(eb->fs_info, "parent transid verify failed on logical %llu mirror %u wanted %llu found %llu", eb->start, eb->read_mirror, parent_transid, btrfs_header_generation(eb)); clear_extent_buffer_uptodate(eb); return 0; } return 1; } static bool btrfs_supported_super_csum(u16 csum_type) { switch (csum_type) { case BTRFS_CSUM_TYPE_CRC32: case BTRFS_CSUM_TYPE_XXHASH: case BTRFS_CSUM_TYPE_SHA256: case BTRFS_CSUM_TYPE_BLAKE2: return true; default: return false; } } /* * Return 0 if the superblock checksum type matches the checksum value of that * algorithm. Pass the raw disk superblock data. */ int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, const struct btrfs_super_block *disk_sb) { u8 result[BTRFS_CSUM_SIZE]; /* * The super_block structure does not span the whole * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space is * filled with zeros and is included in the checksum. */ btrfs_csum(fs_info->csum_type, (const u8 *)disk_sb + BTRFS_CSUM_SIZE, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, result); if (memcmp(disk_sb->csum, result, fs_info->csum_size)) return 1; return 0; } static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num) { struct btrfs_fs_info *fs_info = eb->fs_info; const u32 step = min(fs_info->nodesize, PAGE_SIZE); const u32 nr_steps = eb->len / step; phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE]; if (sb_rdonly(fs_info->sb)) return -EROFS; for (int i = 0; i < num_extent_pages(eb); i++) { struct folio *folio = eb->folios[i]; /* No large folio support yet. */ ASSERT(folio_order(folio) == 0); ASSERT(i < nr_steps); /* * For nodesize < page size, there is just one paddr, with some * offset inside the page. * * For nodesize >= page size, it's one or more paddrs, and eb->start * must be aligned to page boundary. */ paddrs[i] = page_to_phys(&folio->page) + offset_in_page(eb->start); } return btrfs_repair_io_failure(fs_info, 0, eb->start, eb->len, eb->start, paddrs, step, mirror_num); } /* * helper to read a given tree block, doing retries as required when * the checksums don't match and we have alternate mirrors to try. * * @check: expected tree parentness check, see the comments of the * structure for details. */ int btrfs_read_extent_buffer(struct extent_buffer *eb, const struct btrfs_tree_parent_check *check) { struct btrfs_fs_info *fs_info = eb->fs_info; int failed = 0; int ret; int num_copies = 0; int mirror_num = 0; int failed_mirror = 0; ASSERT(check); while (1) { ret = read_extent_buffer_pages(eb, mirror_num, check); if (!ret) break; num_copies = btrfs_num_copies(fs_info, eb->start, eb->len); if (num_copies == 1) break; if (!failed_mirror) { failed = 1; failed_mirror = eb->read_mirror; } mirror_num++; if (mirror_num == failed_mirror) mirror_num++; if (mirror_num > num_copies) break; } if (failed && !ret && failed_mirror) btrfs_repair_eb_io_failure(eb, failed_mirror); return ret; } /* * Checksum a dirty tree block before IO. */ int btree_csum_one_bio(struct btrfs_bio *bbio) { struct extent_buffer *eb = bbio->private; struct btrfs_fs_info *fs_info = eb->fs_info; u64 found_start = btrfs_header_bytenr(eb); u64 last_trans; u8 result[BTRFS_CSUM_SIZE]; int ret; /* Btree blocks are always contiguous on disk. */ if (WARN_ON_ONCE(bbio->file_offset != eb->start)) return -EIO; if (WARN_ON_ONCE(bbio->bio.bi_iter.bi_size != eb->len)) return -EIO; /* * If an extent_buffer is marked as EXTENT_BUFFER_ZONED_ZEROOUT, don't * checksum it but zero-out its content. This is done to preserve * ordering of I/O without unnecessarily writing out data. */ if (test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags)) { memzero_extent_buffer(eb, 0, eb->len); return 0; } if (WARN_ON_ONCE(found_start != eb->start)) return -EIO; if (WARN_ON(!btrfs_meta_folio_test_uptodate(eb->folios[0], eb))) return -EIO; ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid, offsetof(struct btrfs_header, fsid), BTRFS_FSID_SIZE) == 0); csum_tree_block(eb, result); if (btrfs_header_level(eb)) ret = btrfs_check_node(eb); else ret = btrfs_check_leaf(eb); if (ret < 0) goto error; /* * Also check the generation, the eb reached here must be newer than * last committed. Or something seriously wrong happened. */ last_trans = btrfs_get_last_trans_committed(fs_info); if (unlikely(btrfs_header_generation(eb) <= last_trans)) { ret = -EUCLEAN; btrfs_err(fs_info, "block=%llu bad generation, have %llu expect > %llu", eb->start, btrfs_header_generation(eb), last_trans); goto error; } write_extent_buffer(eb, result, 0, fs_info->csum_size); return 0; error: btrfs_print_tree(eb, 0); btrfs_err(fs_info, "block=%llu write time tree block corruption detected", eb->start); /* * Be noisy if this is an extent buffer from a log tree. We don't abort * a transaction in case there's a bad log tree extent buffer, we just * fallback to a transaction commit. Still we want to know when there is * a bad log tree extent buffer, as that may signal a bug somewhere. */ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG) || btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID); return ret; } static bool check_tree_block_fsid(struct extent_buffer *eb) { struct btrfs_fs_info *fs_info = eb->fs_info; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; u8 fsid[BTRFS_FSID_SIZE]; read_extent_buffer(eb, fsid, offsetof(struct btrfs_header, fsid), BTRFS_FSID_SIZE); /* * alloc_fsid_devices() copies the fsid into fs_devices::metadata_uuid. * This is then overwritten by metadata_uuid if it is present in the * device_list_add(). The same true for a seed device as well. So use of * fs_devices::metadata_uuid is appropriate here. */ if (memcmp(fsid, fs_info->fs_devices->metadata_uuid, BTRFS_FSID_SIZE) == 0) return false; list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) if (!memcmp(fsid, seed_devs->fsid, BTRFS_FSID_SIZE)) return false; return true; } /* Do basic extent buffer checks at read time */ int btrfs_validate_extent_buffer(struct extent_buffer *eb, const struct btrfs_tree_parent_check *check) { struct btrfs_fs_info *fs_info = eb->fs_info; u64 found_start; const u32 csum_size = fs_info->csum_size; u8 found_level; u8 result[BTRFS_CSUM_SIZE]; const u8 *header_csum; int ret = 0; const bool ignore_csum = btrfs_test_opt(fs_info, IGNOREMETACSUMS); ASSERT(check); found_start = btrfs_header_bytenr(eb); if (unlikely(found_start != eb->start)) { btrfs_err_rl(fs_info, "bad tree block start, mirror %u want %llu have %llu", eb->read_mirror, eb->start, found_start); return -EIO; } if (unlikely(check_tree_block_fsid(eb))) { btrfs_err_rl(fs_info, "bad fsid on logical %llu mirror %u", eb->start, eb->read_mirror); return -EIO; } found_level = btrfs_header_level(eb); if (unlikely(found_level >= BTRFS_MAX_LEVEL)) { btrfs_err(fs_info, "bad tree block level, mirror %u level %d on logical %llu", eb->read_mirror, btrfs_header_level(eb), eb->start); return -EIO; } csum_tree_block(eb, result); header_csum = folio_address(eb->folios[0]) + get_eb_offset_in_folio(eb, offsetof(struct btrfs_header, csum)); if (memcmp(result, header_csum, csum_size) != 0) { btrfs_warn_rl(fs_info, "checksum verify failed on logical %llu mirror %u wanted " BTRFS_CSUM_FMT " found " BTRFS_CSUM_FMT " level %d%s", eb->start, eb->read_mirror, BTRFS_CSUM_FMT_VALUE(csum_size, header_csum), BTRFS_CSUM_FMT_VALUE(csum_size, result), btrfs_header_level(eb), ignore_csum ? ", ignored" : ""); if (unlikely(!ignore_csum)) return -EUCLEAN; } if (unlikely(found_level != check->level)) { btrfs_err(fs_info, "level verify failed on logical %llu mirror %u wanted %u found %u", eb->start, eb->read_mirror, check->level, found_level); return -EIO; } if (unlikely(check->transid && btrfs_header_generation(eb) != check->transid)) { btrfs_err_rl(eb->fs_info, "parent transid verify failed on logical %llu mirror %u wanted %llu found %llu", eb->start, eb->read_mirror, check->transid, btrfs_header_generation(eb)); return -EIO; } if (check->has_first_key) { const struct btrfs_key *expect_key = &check->first_key; struct btrfs_key found_key; if (found_level) btrfs_node_key_to_cpu(eb, &found_key, 0); else btrfs_item_key_to_cpu(eb, &found_key, 0); if (unlikely(btrfs_comp_cpu_keys(expect_key, &found_key))) { btrfs_err(fs_info, "tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)", eb->start, check->transid, expect_key->objectid, expect_key->type, expect_key->offset, found_key.objectid, found_key.type, found_key.offset); return -EUCLEAN; } } if (check->owner_root) { ret = btrfs_check_eb_owner(eb, check->owner_root); if (ret < 0) return ret; } /* If this is a leaf block and it is corrupt, just return -EIO. */ if (found_level == 0 && btrfs_check_leaf(eb)) ret = -EIO; if (found_level > 0 && btrfs_check_node(eb)) ret = -EIO; if (ret) btrfs_err(fs_info, "read time tree block corruption detected on logical %llu mirror %u", eb->start, eb->read_mirror); return ret; } #ifdef CONFIG_MIGRATION static int btree_migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode) { /* * we can't safely write a btree page from here, * we haven't done the locking hook */ if (folio_test_dirty(src)) return -EAGAIN; /* * Buffers may be managed in a filesystem specific way. * We must have no buffers or drop them. */ if (folio_get_private(src) && !filemap_release_folio(src, GFP_KERNEL)) return -EAGAIN; return migrate_folio(mapping, dst, src, mode); } #else #define btree_migrate_folio NULL #endif static bool btree_release_folio(struct folio *folio, gfp_t gfp_flags) { if (folio_test_writeback(folio) || folio_test_dirty(folio)) return false; return try_release_extent_buffer(folio); } static void btree_invalidate_folio(struct folio *folio, size_t offset, size_t length) { struct extent_io_tree *tree; tree = &folio_to_inode(folio)->io_tree; extent_invalidate_folio(tree, folio, offset); btree_release_folio(folio, GFP_NOFS); if (folio_get_private(folio)) { btrfs_warn(folio_to_fs_info(folio), "folio private not zero on folio %llu", (unsigned long long)folio_pos(folio)); folio_detach_private(folio); } } #ifdef DEBUG static bool btree_dirty_folio(struct address_space *mapping, struct folio *folio) { struct btrfs_fs_info *fs_info = inode_to_fs_info(mapping->host); struct btrfs_subpage_info *spi = fs_info->subpage_info; struct btrfs_subpage *subpage; struct extent_buffer *eb; int cur_bit = 0; u64 page_start = folio_pos(folio); if (fs_info->sectorsize == PAGE_SIZE) { eb = folio_get_private(folio); BUG_ON(!eb); BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); BUG_ON(!atomic_read(&eb->refs)); btrfs_assert_tree_write_locked(eb); return filemap_dirty_folio(mapping, folio); } ASSERT(spi); subpage = folio_get_private(folio); for (cur_bit = spi->dirty_offset; cur_bit < spi->dirty_offset + spi->bitmap_nr_bits; cur_bit++) { unsigned long flags; u64 cur; spin_lock_irqsave(&subpage->lock, flags); if (!test_bit(cur_bit, subpage->bitmaps)) { spin_unlock_irqrestore(&subpage->lock, flags); continue; } spin_unlock_irqrestore(&subpage->lock, flags); cur = page_start + cur_bit * fs_info->sectorsize; eb = find_extent_buffer(fs_info, cur); ASSERT(eb); ASSERT(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); ASSERT(atomic_read(&eb->refs)); btrfs_assert_tree_write_locked(eb); free_extent_buffer(eb); cur_bit += (fs_info->nodesize >> fs_info->sectorsize_bits) - 1; } return filemap_dirty_folio(mapping, folio); } #else #define btree_dirty_folio filemap_dirty_folio #endif static const struct address_space_operations btree_aops = { .writepages = btree_writepages, .release_folio = btree_release_folio, .invalidate_folio = btree_invalidate_folio, .migrate_folio = btree_migrate_folio, .dirty_folio = btree_dirty_folio, }; struct extent_buffer *btrfs_find_create_tree_block( struct btrfs_fs_info *fs_info, u64 bytenr, u64 owner_root, int level) { if (btrfs_is_testing(fs_info)) return alloc_test_extent_buffer(fs_info, bytenr); return alloc_extent_buffer(fs_info, bytenr, owner_root, level); } /* * Read tree block at logical address @bytenr and do variant basic but critical * verification. * * @check: expected tree parentness check, see comments of the * structure for details. */ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, struct btrfs_tree_parent_check *check) { struct extent_buffer *buf = NULL; int ret; ASSERT(check); buf = btrfs_find_create_tree_block(fs_info, bytenr, check->owner_root, check->level); if (IS_ERR(buf)) return buf; ret = btrfs_read_extent_buffer(buf, check); if (ret) { free_extent_buffer_stale(buf); return ERR_PTR(ret); } return buf; } static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info, u64 objectid, gfp_t flags) { struct btrfs_root *root; root = kzalloc_obj(*root, flags); if (!root) return NULL; root->fs_info = fs_info; root->root_key.objectid = objectid; RB_CLEAR_NODE(&root->rb_node); xa_init(&root->inodes); xa_init(&root->delayed_nodes); btrfs_init_root_block_rsv(root); INIT_LIST_HEAD(&root->dirty_list); INIT_LIST_HEAD(&root->root_list); INIT_LIST_HEAD(&root->delalloc_inodes); INIT_LIST_HEAD(&root->delalloc_root); INIT_LIST_HEAD(&root->ordered_extents); INIT_LIST_HEAD(&root->ordered_root); INIT_LIST_HEAD(&root->reloc_dirty_list); spin_lock_init(&root->delalloc_lock); spin_lock_init(&root->ordered_extent_lock); spin_lock_init(&root->accounting_lock); spin_lock_init(&root->qgroup_meta_rsv_lock); mutex_init(&root->objectid_mutex); mutex_init(&root->log_mutex); mutex_init(&root->ordered_extent_mutex); mutex_init(&root->delalloc_mutex); init_waitqueue_head(&root->qgroup_flush_wait); init_waitqueue_head(&root->log_writer_wait); init_waitqueue_head(&root->log_commit_wait[0]); init_waitqueue_head(&root->log_commit_wait[1]); INIT_LIST_HEAD(&root->log_ctxs[0]); INIT_LIST_HEAD(&root->log_ctxs[1]); atomic_set(&root->log_commit[0], 0); atomic_set(&root->log_commit[1], 0); atomic_set(&root->log_writers, 0); atomic_set(&root->log_batch, 0); refcount_set(&root->refs, 1); atomic_set(&root->snapshot_force_cow, 0); atomic_set(&root->nr_swapfiles, 0); root->log_transid_committed = -1; if (!btrfs_is_testing(fs_info)) { btrfs_extent_io_tree_init(fs_info, &root->dirty_log_pages, IO_TREE_ROOT_DIRTY_LOG_PAGES); btrfs_extent_io_tree_init(fs_info, &root->log_csum_range, IO_TREE_LOG_CSUM_RANGE); } spin_lock_init(&root->root_item_lock); btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks); #ifdef CONFIG_BTRFS_DEBUG INIT_LIST_HEAD(&root->leak_list); spin_lock(&fs_info->fs_roots_radix_lock); list_add_tail(&root->leak_list, &fs_info->allocated_roots); spin_unlock(&fs_info->fs_roots_radix_lock); #endif return root; } #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS /* Should only be used by the testing infrastructure */ struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info) { struct btrfs_root *root; if (!fs_info) return ERR_PTR(-EINVAL); root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID, GFP_KERNEL); if (!root) return ERR_PTR(-ENOMEM); /* We don't use the stripesize in selftest, set it as sectorsize */ root->alloc_bytenr = 0; return root; } #endif static int global_root_cmp(struct rb_node *a_node, const struct rb_node *b_node) { const struct btrfs_root *a = rb_entry(a_node, struct btrfs_root, rb_node); const struct btrfs_root *b = rb_entry(b_node, struct btrfs_root, rb_node); return btrfs_comp_cpu_keys(&a->root_key, &b->root_key); } static int global_root_key_cmp(const void *k, const struct rb_node *node) { const struct btrfs_key *key = k; const struct btrfs_root *root = rb_entry(node, struct btrfs_root, rb_node); return btrfs_comp_cpu_keys(key, &root->root_key); } int btrfs_global_root_insert(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; struct rb_node *tmp; int ret = 0; write_lock(&fs_info->global_root_lock); tmp = rb_find_add(&root->rb_node, &fs_info->global_root_tree, global_root_cmp); write_unlock(&fs_info->global_root_lock); if (tmp) { ret = -EEXIST; btrfs_warn(fs_info, "global root %llu %llu already exists", btrfs_root_id(root), root->root_key.offset); } return ret; } void btrfs_global_root_delete(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; write_lock(&fs_info->global_root_lock); rb_erase(&root->rb_node, &fs_info->global_root_tree); write_unlock(&fs_info->global_root_lock); } struct btrfs_root *btrfs_global_root(struct btrfs_fs_info *fs_info, struct btrfs_key *key) { struct rb_node *node; struct btrfs_root *root = NULL; read_lock(&fs_info->global_root_lock); node = rb_find(key, &fs_info->global_root_tree, global_root_key_cmp); if (node) root = container_of(node, struct btrfs_root, rb_node); read_unlock(&fs_info->global_root_lock); return root; } static u64 btrfs_global_root_id(struct btrfs_fs_info *fs_info, u64 bytenr) { struct btrfs_block_group *block_group; u64 ret; if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) return 0; if (bytenr) block_group = btrfs_lookup_block_group(fs_info, bytenr); else block_group = btrfs_lookup_first_block_group(fs_info, bytenr); ASSERT(block_group); if (!block_group) return 0; ret = block_group->global_root_id; btrfs_put_block_group(block_group); return ret; } struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr) { struct btrfs_key key = { .objectid = BTRFS_CSUM_TREE_OBJECTID, .type = BTRFS_ROOT_ITEM_KEY, .offset = btrfs_global_root_id(fs_info, bytenr), }; return btrfs_global_root(fs_info, &key); } struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr) { struct btrfs_key key = { .objectid = BTRFS_EXTENT_TREE_OBJECTID, .type = BTRFS_ROOT_ITEM_KEY, .offset = btrfs_global_root_id(fs_info, bytenr), }; return btrfs_global_root(fs_info, &key); } struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, u64 objectid) { struct btrfs_fs_info *fs_info = trans->fs_info; struct extent_buffer *leaf; struct btrfs_root *tree_root = fs_info->tree_root; struct btrfs_root *root; unsigned int nofs_flag; int ret = 0; /* * We're holding a transaction handle, so use a NOFS memory allocation * context to avoid deadlock if reclaim happens. */ nofs_flag = memalloc_nofs_save(); root = btrfs_alloc_root(fs_info, objectid, GFP_KERNEL); memalloc_nofs_restore(nofs_flag); if (!root) return ERR_PTR(-ENOMEM); root->root_key.objectid = objectid; root->root_key.type = BTRFS_ROOT_ITEM_KEY; root->root_key.offset = 0; leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0, 0, BTRFS_NESTING_NORMAL); if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); leaf = NULL; goto fail; } root->node = leaf; btrfs_mark_buffer_dirty(trans, leaf); root->commit_root = btrfs_root_node(root); set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); btrfs_set_root_flags(&root->root_item, 0); btrfs_set_root_limit(&root->root_item, 0); btrfs_set_root_bytenr(&root->root_item, leaf->start); btrfs_set_root_generation(&root->root_item, trans->transid); btrfs_set_root_level(&root->root_item, 0); btrfs_set_root_refs(&root->root_item, 1); btrfs_set_root_used(&root->root_item, leaf->len); btrfs_set_root_last_snapshot(&root->root_item, 0); btrfs_set_root_dirid(&root->root_item, 0); if (btrfs_is_fstree(objectid)) generate_random_guid(root->root_item.uuid); else export_guid(root->root_item.uuid, &guid_null); btrfs_set_root_drop_level(&root->root_item, 0); btrfs_tree_unlock(leaf); ret = btrfs_insert_root(trans, tree_root, &root->root_key, &root->root_item); if (ret) goto fail; return root; fail: btrfs_put_root(root); return ERR_PTR(ret); } static struct btrfs_root *alloc_log_tree(struct btrfs_fs_info *fs_info) { struct btrfs_root *root; root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID, GFP_NOFS); if (!root) return ERR_PTR(-ENOMEM); root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID; root->root_key.type = BTRFS_ROOT_ITEM_KEY; root->root_key.offset = BTRFS_TREE_LOG_OBJECTID; return root; } int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct extent_buffer *leaf; /* * DON'T set SHAREABLE bit for log trees. * * Log trees are not exposed to user space thus can't be snapshotted, * and they go away before a real commit is actually done. * * They do store pointers to file data extents, and those reference * counts still get updated (along with back refs to the log tree). */ leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0, 0, BTRFS_NESTING_NORMAL); if (IS_ERR(leaf)) return PTR_ERR(leaf); root->node = leaf; btrfs_mark_buffer_dirty(trans, root->node); btrfs_tree_unlock(root->node); return 0; } int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info) { struct btrfs_root *log_root; log_root = alloc_log_tree(fs_info); if (IS_ERR(log_root)) return PTR_ERR(log_root); if (!btrfs_is_zoned(fs_info)) { int ret = btrfs_alloc_log_tree_node(trans, log_root); if (ret) { btrfs_put_root(log_root); return ret; } } WARN_ON(fs_info->log_root_tree); fs_info->log_root_tree = log_root; return 0; } int btrfs_add_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *log_root; struct btrfs_inode_item *inode_item; int ret; log_root = alloc_log_tree(fs_info); if (IS_ERR(log_root)) return PTR_ERR(log_root); ret = btrfs_alloc_log_tree_node(trans, log_root); if (ret) { btrfs_put_root(log_root); return ret; } btrfs_set_root_last_trans(log_root, trans->transid); log_root->root_key.offset = btrfs_root_id(root); inode_item = &log_root->root_item.inode; btrfs_set_stack_inode_generation(inode_item, 1); btrfs_set_stack_inode_size(inode_item, 3); btrfs_set_stack_inode_nlink(inode_item, 1); btrfs_set_stack_inode_nbytes(inode_item, fs_info->nodesize); btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755); btrfs_set_root_node(&log_root->root_item, log_root->node); WARN_ON(root->log_root); root->log_root = log_root; btrfs_set_root_log_transid(root, 0); root->log_transid_committed = -1; btrfs_set_root_last_log_commit(root, 0); return 0; } static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root, struct btrfs_path *path, const struct btrfs_key *key) { struct btrfs_root *root; struct btrfs_tree_parent_check check = { 0 }; struct btrfs_fs_info *fs_info = tree_root->fs_info; u64 generation; int ret; int level; root = btrfs_alloc_root(fs_info, key->objectid, GFP_NOFS); if (!root) return ERR_PTR(-ENOMEM); ret = btrfs_find_root(tree_root, key, path, &root->root_item, &root->root_key); if (ret) { if (ret > 0) ret = -ENOENT; goto fail; } generation = btrfs_root_generation(&root->root_item); level = btrfs_root_level(&root->root_item); check.level = level; check.transid = generation; check.owner_root = key->objectid; root->node = read_tree_block(fs_info, btrfs_root_bytenr(&root->root_item), &check); if (IS_ERR(root->node)) { ret = PTR_ERR(root->node); root->node = NULL; goto fail; } if (unlikely(!btrfs_buffer_uptodate(root->node, generation, false))) { ret = -EIO; goto fail; } /* * For real fs, and not log/reloc trees, root owner must * match its root node owner */ if (unlikely(!btrfs_is_testing(fs_info) && btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID && btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID && btrfs_root_id(root) != btrfs_header_owner(root->node))) { btrfs_crit(fs_info, "root=%llu block=%llu, tree root owner mismatch, have %llu expect %llu", btrfs_root_id(root), root->node->start, btrfs_header_owner(root->node), btrfs_root_id(root)); ret = -EUCLEAN; goto fail; } root->commit_root = btrfs_root_node(root); return root; fail: btrfs_put_root(root); return ERR_PTR(ret); } struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, const struct btrfs_key *key) { struct btrfs_root *root; BTRFS_PATH_AUTO_FREE(path); path = btrfs_alloc_path(); if (!path) return ERR_PTR(-ENOMEM); root = read_tree_root_path(tree_root, path, key); return root; } /* * Initialize subvolume root in-memory structure. * * @anon_dev: anonymous device to attach to the root, if zero, allocate new * * In case of failure the caller is responsible to call btrfs_free_fs_root() */ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) { int ret; btrfs_drew_lock_init(&root->snapshot_lock); if (btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID && !btrfs_is_data_reloc_root(root) && btrfs_is_fstree(btrfs_root_id(root))) { set_bit(BTRFS_ROOT_SHAREABLE, &root->state); btrfs_check_and_init_root_item(&root->root_item); } /* * Don't assign anonymous block device to roots that are not exposed to * userspace, the id pool is limited to 1M */ if (btrfs_is_fstree(btrfs_root_id(root)) && btrfs_root_refs(&root->root_item) > 0) { if (!anon_dev) { ret = get_anon_bdev(&root->anon_dev); if (ret) return ret; } else { root->anon_dev = anon_dev; } } mutex_lock(&root->objectid_mutex); ret = btrfs_init_root_free_objectid(root); if (ret) { mutex_unlock(&root->objectid_mutex); return ret; } ASSERT(root->free_objectid <= BTRFS_LAST_FREE_OBJECTID); mutex_unlock(&root->objectid_mutex); return 0; } static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, u64 root_id) { struct btrfs_root *root; spin_lock(&fs_info->fs_roots_radix_lock); root = radix_tree_lookup(&fs_info->fs_roots_radix, (unsigned long)root_id); root = btrfs_grab_root(root); spin_unlock(&fs_info->fs_roots_radix_lock); return root; } static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info, u64 objectid) { struct btrfs_key key = { .objectid = objectid, .type = BTRFS_ROOT_ITEM_KEY, .offset = 0, }; switch (objectid) { case BTRFS_ROOT_TREE_OBJECTID: return btrfs_grab_root(fs_info->tree_root); case BTRFS_EXTENT_TREE_OBJECTID: return btrfs_grab_root(btrfs_global_root(fs_info, &key)); case BTRFS_CHUNK_TREE_OBJECTID: return btrfs_grab_root(fs_info->chunk_root); case BTRFS_DEV_TREE_OBJECTID: return btrfs_grab_root(fs_info->dev_root); case BTRFS_CSUM_TREE_OBJECTID: return btrfs_grab_root(btrfs_global_root(fs_info, &key)); case BTRFS_QUOTA_TREE_OBJECTID: return btrfs_grab_root(fs_info->quota_root); case BTRFS_UUID_TREE_OBJECTID: return btrfs_grab_root(fs_info->uuid_root); case BTRFS_BLOCK_GROUP_TREE_OBJECTID: return btrfs_grab_root(fs_info->block_group_root); case BTRFS_FREE_SPACE_TREE_OBJECTID: return btrfs_grab_root(btrfs_global_root(fs_info, &key)); case BTRFS_RAID_STRIPE_TREE_OBJECTID: return btrfs_grab_root(fs_info->stripe_root); case BTRFS_REMAP_TREE_OBJECTID: return btrfs_grab_root(fs_info->remap_root); default: return NULL; } } int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) { int ret; ret = radix_tree_preload(GFP_NOFS); if (ret) return ret; spin_lock(&fs_info->fs_roots_radix_lock); ret = radix_tree_insert(&fs_info->fs_roots_radix, (unsigned long)btrfs_root_id(root), root); if (ret == 0) { btrfs_grab_root(root); set_bit(BTRFS_ROOT_IN_RADIX, &root->state); } spin_unlock(&fs_info->fs_roots_radix_lock); radix_tree_preload_end(); return ret; } void btrfs_check_leaked_roots(const struct btrfs_fs_info *fs_info) { #ifdef CONFIG_BTRFS_DEBUG struct btrfs_root *root; while (!list_empty(&fs_info->allocated_roots)) { char buf[BTRFS_ROOT_NAME_BUF_LEN]; root = list_first_entry(&fs_info->allocated_roots, struct btrfs_root, leak_list); btrfs_err(fs_info, "leaked root %s refcount %d", btrfs_root_name(&root->root_key, buf), refcount_read(&root->refs)); WARN_ON_ONCE(1); while (refcount_read(&root->refs) > 1) btrfs_put_root(root); btrfs_put_root(root); } #endif } static void free_global_roots(struct btrfs_fs_info *fs_info) { struct btrfs_root *root; struct rb_node *node; while ((node = rb_first_postorder(&fs_info->global_root_tree)) != NULL) { root = rb_entry(node, struct btrfs_root, rb_node); rb_erase(&root->rb_node, &fs_info->global_root_tree); btrfs_put_root(root); } } void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) { struct percpu_counter *em_counter = &fs_info->evictable_extent_maps; if (fs_info->fs_devices) btrfs_close_devices(fs_info->fs_devices); btrfs_free_compress_wsm(fs_info); percpu_counter_destroy(&fs_info->stats_read_blocks); percpu_counter_destroy(&fs_info->dirty_metadata_bytes); percpu_counter_destroy(&fs_info->delalloc_bytes); percpu_counter_destroy(&fs_info->ordered_bytes); if (percpu_counter_initialized(em_counter)) ASSERT(percpu_counter_sum_positive(em_counter) == 0); percpu_counter_destroy(em_counter); percpu_counter_destroy(&fs_info->dev_replace.bio_counter); btrfs_free_stripe_hash_table(fs_info); btrfs_free_ref_cache(fs_info); kfree(fs_info->balance_ctl); free_global_roots(fs_info); btrfs_put_root(fs_info->tree_root); btrfs_put_root(fs_info->chunk_root); btrfs_put_root(fs_info->dev_root); btrfs_put_root(fs_info->quota_root); btrfs_put_root(fs_info->uuid_root); btrfs_put_root(fs_info->fs_root); btrfs_put_root(fs_info->data_reloc_root); btrfs_put_root(fs_info->block_group_root); btrfs_put_root(fs_info->stripe_root); btrfs_put_root(fs_info->remap_root); btrfs_check_leaked_roots(fs_info); btrfs_extent_buffer_leak_debug_check(fs_info); kfree(fs_info->super_copy); kfree(fs_info->super_for_commit); kvfree(fs_info); } /* * Get an in-memory reference of a root structure. * * For essential trees like root/extent tree, we grab it from fs_info directly. * For subvolume trees, we check the cached filesystem roots first. If not * found, then read it from disk and add it to cached fs roots. * * Caller should release the root by calling btrfs_put_root() after the usage. * * NOTE: Reloc and log trees can't be read by this function as they share the * same root objectid. * * @objectid: root id * @anon_dev: preallocated anonymous block device number for new roots, * pass NULL for a new allocation. * @check_ref: whether to check root item references, If true, return -ENOENT * for orphan roots */ static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info, u64 objectid, dev_t *anon_dev, bool check_ref) { struct btrfs_root *root; struct btrfs_path *path; struct btrfs_key key; int ret; root = btrfs_get_global_root(fs_info, objectid); if (root) return root; /* * If we're called for non-subvolume trees, and above function didn't * find one, do not try to read it from disk. * * This is namely for free-space-tree and quota tree, which can change * at runtime and should only be grabbed from fs_info. */ if (!btrfs_is_fstree(objectid) && objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) return ERR_PTR(-ENOENT); again: root = btrfs_lookup_fs_root(fs_info, objectid); if (root) { /* * Some other caller may have read out the newly inserted * subvolume already (for things like backref walk etc). Not * that common but still possible. In that case, we just need * to free the anon_dev. */ if (unlikely(anon_dev && *anon_dev)) { free_anon_bdev(*anon_dev); *anon_dev = 0; } if (check_ref && btrfs_root_refs(&root->root_item) == 0) { btrfs_put_root(root); return ERR_PTR(-ENOENT); } return root; } key.objectid = objectid; key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; root = btrfs_read_tree_root(fs_info->tree_root, &key); if (IS_ERR(root)) return root; if (check_ref && btrfs_root_refs(&root->root_item) == 0) { ret = -ENOENT; goto fail; } ret = btrfs_init_fs_root(root, anon_dev ? *anon_dev : 0); if (ret) goto fail; path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto fail; } key.objectid = BTRFS_ORPHAN_OBJECTID; key.type = BTRFS_ORPHAN_ITEM_KEY; key.offset = objectid; ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); btrfs_free_path(path); if (ret < 0) goto fail; if (ret == 0) set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state); ret = btrfs_insert_fs_root(fs_info, root); if (ret) { if (ret == -EEXIST) { btrfs_put_root(root); goto again; } goto fail; } return root; fail: /* * If our caller provided us an anonymous device, then it's his * responsibility to free it in case we fail. So we have to set our * root's anon_dev to 0 to avoid a double free, once by btrfs_put_root() * and once again by our caller. */ if (anon_dev && *anon_dev) root->anon_dev = 0; btrfs_put_root(root); return ERR_PTR(ret); } /* * Get in-memory reference of a root structure * * @objectid: tree objectid * @check_ref: if set, verify that the tree exists and the item has at least * one reference */ struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, u64 objectid, bool check_ref) { return btrfs_get_root_ref(fs_info, objectid, NULL, check_ref); } /* * Get in-memory reference of a root structure, created as new, optionally pass * the anonymous block device id * * @objectid: tree objectid * @anon_dev: if NULL, allocate a new anonymous block device or use the * parameter value if not NULL */ struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info, u64 objectid, dev_t *anon_dev) { return btrfs_get_root_ref(fs_info, objectid, anon_dev, true); } /* * Return a root for the given objectid. * * @fs_info: the fs_info * @objectid: the objectid we need to lookup * * This is exclusively used for backref walking, and exists specifically because * of how qgroups does lookups. Qgroups will do a backref lookup at delayed ref * creation time, which means we may have to read the tree_root in order to look * up a fs root that is not in memory. If the root is not in memory we will * read the tree root commit root and look up the fs root from there. This is a * temporary root, it will not be inserted into the radix tree as it doesn't * have the most uptodate information, it'll simply be discarded once the * backref code is finished using the root. */ struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 objectid) { struct btrfs_root *root; struct btrfs_key key; ASSERT(path->search_commit_root && path->skip_locking); /* * This can return -ENOENT if we ask for a root that doesn't exist, but * since this is called via the backref walking code we won't be looking * up a root that doesn't exist, unless there's corruption. So if root * != NULL just return it. */ root = btrfs_get_global_root(fs_info, objectid); if (root) return root; root = btrfs_lookup_fs_root(fs_info, objectid); if (root) return root; key.objectid = objectid; key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; root = read_tree_root_path(fs_info->tree_root, path, &key); btrfs_release_path(path); return root; } static int cleaner_kthread(void *arg) { struct btrfs_fs_info *fs_info = arg; int again; while (1) { again = 0; set_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags); /* Make the cleaner go to sleep early. */ if (btrfs_need_cleaner_sleep(fs_info)) goto sleep; /* * Do not do anything if we might cause open_ctree() to block * before we have finished mounting the filesystem. */ if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) goto sleep; if (!mutex_trylock(&fs_info->cleaner_mutex)) goto sleep; /* * Avoid the problem that we change the status of the fs * during the above check and trylock. */ if (btrfs_need_cleaner_sleep(fs_info)) { mutex_unlock(&fs_info->cleaner_mutex); goto sleep; } if (test_and_clear_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags)) btrfs_sysfs_feature_update(fs_info); btrfs_run_delayed_iputs(fs_info); again = btrfs_clean_one_deleted_snapshot(fs_info); mutex_unlock(&fs_info->cleaner_mutex); /* * The defragger has dealt with the R/O remount and umount, * needn't do anything special here. */ btrfs_run_defrag_inodes(fs_info); if (btrfs_fs_incompat(fs_info, REMAP_TREE) && !btrfs_test_opt(fs_info, DISCARD_ASYNC)) btrfs_handle_fully_remapped_bgs(fs_info); /* * Acquires fs_info->reclaim_bgs_lock to avoid racing * with relocation (btrfs_relocate_chunk) and relocation * acquires fs_info->cleaner_mutex (btrfs_relocate_block_group) * after acquiring fs_info->reclaim_bgs_lock. So we * can't hold, nor need to, fs_info->cleaner_mutex when deleting * unused block groups. */ btrfs_delete_unused_bgs(fs_info); /* * Reclaim block groups in the reclaim_bgs list after we deleted * all unused block_groups. This possibly gives us some more free * space. */ btrfs_reclaim_bgs(fs_info); sleep: clear_and_wake_up_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags); if (kthread_should_park()) kthread_parkme(); if (kthread_should_stop()) return 0; if (!again) { set_current_state(TASK_INTERRUPTIBLE); schedule(); __set_current_state(TASK_RUNNING); } } } static int transaction_kthread(void *arg) { struct btrfs_root *root = arg; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; struct btrfs_transaction *cur; u64 transid; time64_t delta; unsigned long delay; bool cannot_commit; do { cannot_commit = false; delay = secs_to_jiffies(fs_info->commit_interval); mutex_lock(&fs_info->transaction_kthread_mutex); spin_lock(&fs_info->trans_lock); cur = fs_info->running_transaction; if (!cur) { spin_unlock(&fs_info->trans_lock); goto sleep; } delta = ktime_get_seconds() - cur->start_time; if (!test_and_clear_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags) && cur->state < TRANS_STATE_COMMIT_PREP && delta < fs_info->commit_interval) { spin_unlock(&fs_info->trans_lock); delay -= secs_to_jiffies(delta - 1); delay = min(delay, secs_to_jiffies(fs_info->commit_interval)); goto sleep; } transid = cur->transid; spin_unlock(&fs_info->trans_lock); /* If the file system is aborted, this will always fail. */ trans = btrfs_attach_transaction(root); if (IS_ERR(trans)) { if (PTR_ERR(trans) != -ENOENT) cannot_commit = true; goto sleep; } if (transid == trans->transid) { btrfs_commit_transaction(trans); } else { btrfs_end_transaction(trans); } sleep: wake_up_process(fs_info->cleaner_kthread); mutex_unlock(&fs_info->transaction_kthread_mutex); if (BTRFS_FS_ERROR(fs_info)) btrfs_cleanup_transaction(fs_info); if (!kthread_should_stop() && (!btrfs_transaction_blocked(fs_info) || cannot_commit)) schedule_timeout_interruptible(delay); } while (!kthread_should_stop()); return 0; } /* * This will find the highest generation in the array of root backups. The * index of the highest array is returned, or -EINVAL if we can't find * anything. * * We check to make sure the array is valid by comparing the * generation of the latest root in the array with the generation * in the super block. If they don't match we pitch it. */ static int find_newest_super_backup(struct btrfs_fs_info *info) { const u64 newest_gen = btrfs_super_generation(info->super_copy); u64 cur; struct btrfs_root_backup *root_backup; int i; for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) { root_backup = info->super_copy->super_roots + i; cur = btrfs_backup_tree_root_gen(root_backup); if (cur == newest_gen) return i; } return -EINVAL; } /* * copy all the root pointers into the super backup array. * this will bump the backup pointer by one when it is * done */ static int backup_super_roots(struct btrfs_fs_info *info) { const int next_backup = info->backup_root_index; struct btrfs_root_backup *root_backup; root_backup = info->super_for_commit->super_roots + next_backup; /* * make sure all of our padding and empty slots get zero filled * regardless of which ones we use today */ memset(root_backup, 0, sizeof(*root_backup)); info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS; btrfs_set_backup_tree_root(root_backup, info->tree_root->node->start); btrfs_set_backup_tree_root_gen(root_backup, btrfs_header_generation(info->tree_root->node)); btrfs_set_backup_tree_root_level(root_backup, btrfs_header_level(info->tree_root->node)); btrfs_set_backup_chunk_root(root_backup, info->chunk_root->node->start); btrfs_set_backup_chunk_root_gen(root_backup, btrfs_header_generation(info->chunk_root->node)); btrfs_set_backup_chunk_root_level(root_backup, btrfs_header_level(info->chunk_root->node)); if (!btrfs_fs_incompat(info, EXTENT_TREE_V2)) { struct btrfs_root *extent_root = btrfs_extent_root(info, 0); struct btrfs_root *csum_root = btrfs_csum_root(info, 0); if (unlikely(!extent_root)) { btrfs_err(info, "missing extent root for extent at bytenr 0"); return -EUCLEAN; } if (unlikely(!csum_root)) { btrfs_err(info, "missing csum root for extent at bytenr 0"); return -EUCLEAN; } btrfs_set_backup_extent_root(root_backup, extent_root->node->start); btrfs_set_backup_extent_root_gen(root_backup, btrfs_header_generation(extent_root->node)); btrfs_set_backup_extent_root_level(root_backup, btrfs_header_level(extent_root->node)); btrfs_set_backup_csum_root(root_backup, csum_root->node->start); btrfs_set_backup_csum_root_gen(root_backup, btrfs_header_generation(csum_root->node)); btrfs_set_backup_csum_root_level(root_backup, btrfs_header_level(csum_root->node)); } /* * we might commit during log recovery, which happens before we set * the fs_root. Make sure it is valid before we fill it in. */ if (info->fs_root && info->fs_root->node) { btrfs_set_backup_fs_root(root_backup, info->fs_root->node->start); btrfs_set_backup_fs_root_gen(root_backup, btrfs_header_generation(info->fs_root->node)); btrfs_set_backup_fs_root_level(root_backup, btrfs_header_level(info->fs_root->node)); } btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start); btrfs_set_backup_dev_root_gen(root_backup, btrfs_header_generation(info->dev_root->node)); btrfs_set_backup_dev_root_level(root_backup, btrfs_header_level(info->dev_root->node)); btrfs_set_backup_total_bytes(root_backup, btrfs_super_total_bytes(info->super_copy)); btrfs_set_backup_bytes_used(root_backup, btrfs_super_bytes_used(info->super_copy)); btrfs_set_backup_num_devices(root_backup, btrfs_super_num_devices(info->super_copy)); /* * if we don't copy this out to the super_copy, it won't get remembered * for the next commit */ memcpy(&info->super_copy->super_roots, &info->super_for_commit->super_roots, sizeof(*root_backup) * BTRFS_NUM_BACKUP_ROOTS); return 0; } /* * Reads a backup root based on the passed priority. Prio 0 is the newest, prio * 1/2/3 are 2nd newest/3rd newest/4th (oldest) backup roots * * @fs_info: filesystem whose backup roots need to be read * @priority: priority of backup root required * * Returns backup root index on success and -EINVAL otherwise. */ static int read_backup_root(struct btrfs_fs_info *fs_info, u8 priority) { int backup_index = find_newest_super_backup(fs_info); struct btrfs_super_block *super = fs_info->super_copy; struct btrfs_root_backup *root_backup; if (priority < BTRFS_NUM_BACKUP_ROOTS && backup_index >= 0) { if (priority == 0) return backup_index; backup_index = backup_index + BTRFS_NUM_BACKUP_ROOTS - priority; backup_index %= BTRFS_NUM_BACKUP_ROOTS; } else { return -EINVAL; } root_backup = super->super_roots + backup_index; btrfs_set_super_generation(super, btrfs_backup_tree_root_gen(root_backup)); btrfs_set_super_root(super, btrfs_backup_tree_root(root_backup)); btrfs_set_super_root_level(super, btrfs_backup_tree_root_level(root_backup)); btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup)); /* * Fixme: the total bytes and num_devices need to match or we should * need a fsck */ btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup)); btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup)); return backup_index; } /* helper to cleanup workers */ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) { btrfs_destroy_workqueue(fs_info->fixup_workers); btrfs_destroy_workqueue(fs_info->delalloc_workers); btrfs_destroy_workqueue(fs_info->workers); if (fs_info->endio_workers) destroy_workqueue(fs_info->endio_workers); if (fs_info->rmw_workers) destroy_workqueue(fs_info->rmw_workers); btrfs_destroy_workqueue(fs_info->endio_write_workers); btrfs_destroy_workqueue(fs_info->endio_freespace_worker); btrfs_destroy_workqueue(fs_info->delayed_workers); btrfs_destroy_workqueue(fs_info->caching_workers); btrfs_destroy_workqueue(fs_info->flush_workers); btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers); if (fs_info->discard_ctl.discard_workers) destroy_workqueue(fs_info->discard_ctl.discard_workers); /* * Now that all other work queues are destroyed, we can safely destroy * the queues used for metadata I/O, since tasks from those other work * queues can do metadata I/O operations. */ if (fs_info->endio_meta_workers) destroy_workqueue(fs_info->endio_meta_workers); } static void free_root_extent_buffers(struct btrfs_root *root) { if (root) { free_extent_buffer(root->node); free_extent_buffer(root->commit_root); root->node = NULL; root->commit_root = NULL; } } static void free_global_root_pointers(struct btrfs_fs_info *fs_info) { struct btrfs_root *root, *tmp; rbtree_postorder_for_each_entry_safe(root, tmp, &fs_info->global_root_tree, rb_node) free_root_extent_buffers(root); } /* helper to cleanup tree roots */ static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root) { free_root_extent_buffers(info->tree_root); free_global_root_pointers(info); free_root_extent_buffers(info->dev_root); free_root_extent_buffers(info->quota_root); free_root_extent_buffers(info->uuid_root); free_root_extent_buffers(info->fs_root); free_root_extent_buffers(info->data_reloc_root); free_root_extent_buffers(info->block_group_root); free_root_extent_buffers(info->stripe_root); free_root_extent_buffers(info->remap_root); if (free_chunk_root) free_root_extent_buffers(info->chunk_root); } void btrfs_put_root(struct btrfs_root *root) { if (!root) return; if (refcount_dec_and_test(&root->refs)) { if (WARN_ON(!xa_empty(&root->inodes))) xa_destroy(&root->inodes); if (WARN_ON(!xa_empty(&root->delayed_nodes))) xa_destroy(&root->delayed_nodes); WARN_ON(test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state)); if (root->anon_dev) free_anon_bdev(root->anon_dev); free_root_extent_buffers(root); #ifdef CONFIG_BTRFS_DEBUG spin_lock(&root->fs_info->fs_roots_radix_lock); list_del_init(&root->leak_list); spin_unlock(&root->fs_info->fs_roots_radix_lock); #endif kfree(root); } } void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info) { int ret; struct btrfs_root *gang[8]; int i; while (!list_empty(&fs_info->dead_roots)) { gang[0] = list_first_entry(&fs_info->dead_roots, struct btrfs_root, root_list); list_del(&gang[0]->root_list); if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state)) btrfs_drop_and_free_fs_root(fs_info, gang[0]); btrfs_put_root(gang[0]); } while (1) { ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, (void **)gang, 0, ARRAY_SIZE(gang)); if (!ret) break; for (i = 0; i < ret; i++) btrfs_drop_and_free_fs_root(fs_info, gang[i]); } } static void btrfs_init_scrub(struct btrfs_fs_info *fs_info) { mutex_init(&fs_info->scrub_lock); atomic_set(&fs_info->scrubs_running, 0); atomic_set(&fs_info->scrub_pause_req, 0); atomic_set(&fs_info->scrubs_paused, 0); atomic_set(&fs_info->scrub_cancel_req, 0); init_waitqueue_head(&fs_info->scrub_pause_wait); refcount_set(&fs_info->scrub_workers_refcnt, 0); } static void btrfs_init_balance(struct btrfs_fs_info *fs_info) { spin_lock_init(&fs_info->balance_lock); mutex_init(&fs_info->balance_mutex); atomic_set(&fs_info->balance_pause_req, 0); atomic_set(&fs_info->balance_cancel_req, 0); fs_info->balance_ctl = NULL; init_waitqueue_head(&fs_info->balance_wait_q); atomic_set(&fs_info->reloc_cancel_req, 0); } static int btrfs_init_btree_inode(struct super_block *sb) { struct btrfs_fs_info *fs_info = btrfs_sb(sb); unsigned long hash = btrfs_inode_hash(BTRFS_BTREE_INODE_OBJECTID, fs_info->tree_root); struct inode *inode; inode = new_inode(sb); if (!inode) return -ENOMEM; btrfs_set_inode_number(BTRFS_I(inode), BTRFS_BTREE_INODE_OBJECTID); set_nlink(inode, 1); /* * we set the i_size on the btree inode to the max possible int. * the real end of the address space is determined by all of * the devices in the system */ inode->i_size = OFFSET_MAX; inode->i_mapping->a_ops = &btree_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); btrfs_extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree, IO_TREE_BTREE_INODE_IO); btrfs_extent_map_tree_init(&BTRFS_I(inode)->extent_tree); BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root); set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); __insert_inode_hash(inode, hash); set_bit(AS_KERNEL_FILE, &inode->i_mapping->flags); fs_info->btree_inode = inode; return 0; } static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info) { mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount); init_rwsem(&fs_info->dev_replace.rwsem); init_waitqueue_head(&fs_info->dev_replace.replace_wait); } static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info) { spin_lock_init(&fs_info->qgroup_lock); mutex_init(&fs_info->qgroup_ioctl_lock); fs_info->qgroup_tree = RB_ROOT; INIT_LIST_HEAD(&fs_info->dirty_qgroups); fs_info->qgroup_seq = 1; fs_info->qgroup_rescan_running = false; fs_info->qgroup_drop_subtree_thres = BTRFS_QGROUP_DROP_SUBTREE_THRES_DEFAULT; mutex_init(&fs_info->qgroup_rescan_lock); } static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) { u32 max_active = fs_info->thread_pool_size; unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND; unsigned int ordered_flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_PERCPU; fs_info->workers = btrfs_alloc_workqueue(fs_info, "worker", flags, max_active, 16); fs_info->delalloc_workers = btrfs_alloc_workqueue(fs_info, "delalloc", flags, max_active, 2); fs_info->flush_workers = btrfs_alloc_workqueue(fs_info, "flush_delalloc", flags, max_active, 0); fs_info->caching_workers = btrfs_alloc_workqueue(fs_info, "cache", flags, max_active, 0); fs_info->fixup_workers = btrfs_alloc_ordered_workqueue(fs_info, "fixup", ordered_flags); fs_info->endio_workers = alloc_workqueue("btrfs-endio", flags, max_active); fs_info->endio_meta_workers = alloc_workqueue("btrfs-endio-meta", flags, max_active); fs_info->rmw_workers = alloc_workqueue("btrfs-rmw", flags, max_active); fs_info->endio_write_workers = btrfs_alloc_workqueue(fs_info, "endio-write", flags, max_active, 2); fs_info->endio_freespace_worker = btrfs_alloc_workqueue(fs_info, "freespace-write", flags, max_active, 0); fs_info->delayed_workers = btrfs_alloc_workqueue(fs_info, "delayed-meta", flags, max_active, 0); fs_info->qgroup_rescan_workers = btrfs_alloc_ordered_workqueue(fs_info, "qgroup-rescan", ordered_flags); fs_info->discard_ctl.discard_workers = alloc_ordered_workqueue("btrfs-discard", WQ_FREEZABLE); if (!(fs_info->workers && fs_info->delalloc_workers && fs_info->flush_workers && fs_info->endio_workers && fs_info->endio_meta_workers && fs_info->endio_write_workers && fs_info->endio_freespace_worker && fs_info->rmw_workers && fs_info->caching_workers && fs_info->fixup_workers && fs_info->delayed_workers && fs_info->qgroup_rescan_workers && fs_info->discard_ctl.discard_workers)) { return -ENOMEM; } return 0; } static void btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type) { /* Check if the checksum implementation is a fast accelerated one. */ switch (csum_type) { case BTRFS_CSUM_TYPE_CRC32: if (crc32_optimizations() & CRC32C_OPTIMIZATION) set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags); break; case BTRFS_CSUM_TYPE_XXHASH: set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags); break; default: break; } btrfs_info(fs_info, "using %s checksum algorithm", btrfs_super_csum_name(csum_type)); } static int btrfs_replay_log(struct btrfs_fs_info *fs_info, struct btrfs_fs_devices *fs_devices) { int ret; struct btrfs_tree_parent_check check = { 0 }; struct btrfs_root *log_tree_root; struct btrfs_super_block *disk_super = fs_info->super_copy; u64 bytenr = btrfs_super_log_root(disk_super); int level = btrfs_super_log_root_level(disk_super); if (unlikely(fs_devices->rw_devices == 0)) { btrfs_err(fs_info, "log replay required on RO media"); return -EIO; } log_tree_root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID, GFP_KERNEL); if (!log_tree_root) return -ENOMEM; check.level = level; check.transid = fs_info->generation + 1; check.owner_root = BTRFS_TREE_LOG_OBJECTID; log_tree_root->node = read_tree_block(fs_info, bytenr, &check); if (IS_ERR(log_tree_root->node)) { ret = PTR_ERR(log_tree_root->node); log_tree_root->node = NULL; btrfs_err(fs_info, "failed to read log tree with error: %d", ret); btrfs_put_root(log_tree_root); return ret; } if (unlikely(!extent_buffer_uptodate(log_tree_root->node))) { btrfs_err(fs_info, "failed to read log tree"); btrfs_put_root(log_tree_root); return -EIO; } /* returns with log_tree_root freed on success */ ret = btrfs_recover_log_trees(log_tree_root); btrfs_put_root(log_tree_root); if (unlikely(ret)) { ASSERT(BTRFS_FS_ERROR(fs_info) != 0); btrfs_err(fs_info, "failed to recover log trees with error: %d", ret); return ret; } if (sb_rdonly(fs_info->sb)) { ret = btrfs_commit_super(fs_info); if (ret) return ret; } return 0; } static int load_global_roots_objectid(struct btrfs_root *tree_root, struct btrfs_path *path, u64 objectid, const char *name) { struct btrfs_fs_info *fs_info = tree_root->fs_info; struct btrfs_root *root; u64 max_global_id = 0; int ret; struct btrfs_key key = { .objectid = objectid, .type = BTRFS_ROOT_ITEM_KEY, .offset = 0, }; bool found = false; /* If we have IGNOREDATACSUMS skip loading these roots. */ if (objectid == BTRFS_CSUM_TREE_OBJECTID && btrfs_test_opt(fs_info, IGNOREDATACSUMS)) { set_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state); return 0; } while (1) { ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); if (ret < 0) break; if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { ret = btrfs_next_leaf(tree_root, path); if (ret) { if (ret > 0) ret = 0; break; } } ret = 0; btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); if (key.objectid != objectid) break; btrfs_release_path(path); /* * Just worry about this for extent tree, it'll be the same for * everybody. */ if (objectid == BTRFS_EXTENT_TREE_OBJECTID) max_global_id = max(max_global_id, key.offset); found = true; root = read_tree_root_path(tree_root, path, &key); if (IS_ERR(root)) { ret = PTR_ERR(root); break; } set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); ret = btrfs_global_root_insert(root); if (ret) { btrfs_put_root(root); break; } key.offset++; } btrfs_release_path(path); if (objectid == BTRFS_EXTENT_TREE_OBJECTID) fs_info->nr_global_roots = max_global_id + 1; if (!found || ret) { if (objectid == BTRFS_CSUM_TREE_OBJECTID) set_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state); if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) ret = ret ? ret : -ENOENT; else ret = 0; btrfs_err(fs_info, "failed to load root %s", name); } return ret; } static int load_global_roots(struct btrfs_root *tree_root) { BTRFS_PATH_AUTO_FREE(path); int ret; path = btrfs_alloc_path(); if (!path) return -ENOMEM; ret = load_global_roots_objectid(tree_root, path, BTRFS_EXTENT_TREE_OBJECTID, "extent"); if (ret) return ret; ret = load_global_roots_objectid(tree_root, path, BTRFS_CSUM_TREE_OBJECTID, "csum"); if (ret) return ret; if (!btrfs_fs_compat_ro(tree_root->fs_info, FREE_SPACE_TREE)) return ret; return load_global_roots_objectid(tree_root, path, BTRFS_FREE_SPACE_TREE_OBJECTID, "free space"); } static int btrfs_read_roots(struct btrfs_fs_info *fs_info) { struct btrfs_root *tree_root = fs_info->tree_root; struct btrfs_root *root; struct btrfs_key location; int ret; ASSERT(fs_info->tree_root); ret = load_global_roots(tree_root); if (ret) return ret; location.type = BTRFS_ROOT_ITEM_KEY; location.offset = 0; if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE)) { location.objectid = BTRFS_BLOCK_GROUP_TREE_OBJECTID; root = btrfs_read_tree_root(tree_root, &location); if (IS_ERR(root)) { if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { ret = PTR_ERR(root); goto out; } } else { set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); fs_info->block_group_root = root; } } location.objectid = BTRFS_DEV_TREE_OBJECTID; root = btrfs_read_tree_root(tree_root, &location); if (IS_ERR(root)) { if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { ret = PTR_ERR(root); goto out; } } else { set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); fs_info->dev_root = root; } /* Initialize fs_info for all devices in any case */ ret = btrfs_init_devices_late(fs_info); if (ret) goto out; if (btrfs_fs_incompat(fs_info, REMAP_TREE)) { /* The remap_root has already been loaded in load_important_roots(). */ root = fs_info->remap_root; set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); root->root_key.objectid = BTRFS_REMAP_TREE_OBJECTID; root->root_key.type = BTRFS_ROOT_ITEM_KEY; root->root_key.offset = 0; /* Check that data reloc tree doesn't also exist. */ location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID; root = btrfs_read_tree_root(fs_info->tree_root, &location); if (!IS_ERR(root)) { btrfs_err(fs_info, "data reloc tree exists when remap-tree enabled"); btrfs_put_root(root); return -EIO; } else if (PTR_ERR(root) != -ENOENT) { btrfs_warn(fs_info, "error %ld when checking for data reloc tree", PTR_ERR(root)); } } else { /* * This tree can share blocks with some other fs tree during * relocation and we need a proper setup by btrfs_get_fs_root(). */ root = btrfs_get_fs_root(tree_root->fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID, true); if (IS_ERR(root)) { if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID; ret = PTR_ERR(root); goto out; } } else { set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); fs_info->data_reloc_root = root; } } location.objectid = BTRFS_QUOTA_TREE_OBJECTID; root = btrfs_read_tree_root(tree_root, &location); if (!IS_ERR(root)) { set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); fs_info->quota_root = root; } location.objectid = BTRFS_UUID_TREE_OBJECTID; root = btrfs_read_tree_root(tree_root, &location); if (IS_ERR(root)) { if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { ret = PTR_ERR(root); if (ret != -ENOENT) goto out; } } else { set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); fs_info->uuid_root = root; } if (btrfs_fs_incompat(fs_info, RAID_STRIPE_TREE)) { location.objectid = BTRFS_RAID_STRIPE_TREE_OBJECTID; root = btrfs_read_tree_root(tree_root, &location); if (IS_ERR(root)) { if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { ret = PTR_ERR(root); goto out; } } else { set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); fs_info->stripe_root = root; } } return 0; out: btrfs_warn(fs_info, "failed to read root (objectid=%llu): %d", location.objectid, ret); return ret; } static int validate_sys_chunk_array(const struct btrfs_fs_info *fs_info, const struct btrfs_super_block *sb) { unsigned int cur = 0; /* Offset inside the sys chunk array */ /* * At sb read time, fs_info is not fully initialized. Thus we have * to use super block sectorsize, which should have been validated. */ const u32 sectorsize = btrfs_super_sectorsize(sb); u32 sys_array_size = btrfs_super_sys_array_size(sb); if (unlikely(sys_array_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)) { btrfs_err(fs_info, "system chunk array too big %u > %u", sys_array_size, BTRFS_SYSTEM_CHUNK_ARRAY_SIZE); return -EUCLEAN; } while (cur < sys_array_size) { struct btrfs_disk_key *disk_key; struct btrfs_chunk *chunk; struct btrfs_key key; u64 type; u16 num_stripes; u32 len; int ret; disk_key = (struct btrfs_disk_key *)(sb->sys_chunk_array + cur); len = sizeof(*disk_key); if (unlikely(cur + len > sys_array_size)) goto short_read; cur += len; btrfs_disk_key_to_cpu(&key, disk_key); if (unlikely(key.type != BTRFS_CHUNK_ITEM_KEY)) { btrfs_err(fs_info, "unexpected item type %u in sys_array at offset %u", key.type, cur); return -EUCLEAN; } chunk = (struct btrfs_chunk *)(sb->sys_chunk_array + cur); num_stripes = btrfs_stack_chunk_num_stripes(chunk); if (unlikely(cur + btrfs_chunk_item_size(num_stripes) > sys_array_size)) goto short_read; type = btrfs_stack_chunk_type(chunk); if (unlikely(!(type & BTRFS_BLOCK_GROUP_SYSTEM))) { btrfs_err(fs_info, "invalid chunk type %llu in sys_array at offset %u", type, cur); return -EUCLEAN; } ret = btrfs_check_chunk_valid(fs_info, NULL, chunk, key.offset, sectorsize); if (ret < 0) return ret; cur += btrfs_chunk_item_size(num_stripes); } return 0; short_read: btrfs_err(fs_info, "super block sys chunk array short read, cur=%u sys_array_size=%u", cur, sys_array_size); return -EUCLEAN; } /* * Real super block validation * NOTE: super csum type and incompat features will not be checked here. * * @sb: super block to check * @mirror_num: the super block number to check its bytenr: * 0 the primary (1st) sb * 1, 2 2nd and 3rd backup copy * -1 skip bytenr check */ int btrfs_validate_super(const struct btrfs_fs_info *fs_info, const struct btrfs_super_block *sb, int mirror_num) { u64 nodesize = btrfs_super_nodesize(sb); u64 sectorsize = btrfs_super_sectorsize(sb); int ret = 0; const bool ignore_flags = btrfs_test_opt(fs_info, IGNORESUPERFLAGS); if (btrfs_super_magic(sb) != BTRFS_MAGIC) { btrfs_err(fs_info, "no valid FS found"); ret = -EINVAL; } if ((btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP)) { if (!ignore_flags) { btrfs_err(fs_info, "unrecognized or unsupported super flag 0x%llx", btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP); ret = -EINVAL; } else { btrfs_info(fs_info, "unrecognized or unsupported super flags: 0x%llx, ignored", btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP); } } if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) { btrfs_err(fs_info, "tree_root level too big: %d >= %d", btrfs_super_root_level(sb), BTRFS_MAX_LEVEL); ret = -EINVAL; } if (btrfs_super_chunk_root_level(sb) >= BTRFS_MAX_LEVEL) { btrfs_err(fs_info, "chunk_root level too big: %d >= %d", btrfs_super_chunk_root_level(sb), BTRFS_MAX_LEVEL); ret = -EINVAL; } if (btrfs_super_log_root_level(sb) >= BTRFS_MAX_LEVEL) { btrfs_err(fs_info, "log_root level too big: %d >= %d", btrfs_super_log_root_level(sb), BTRFS_MAX_LEVEL); ret = -EINVAL; } /* * Check sectorsize and nodesize first, other check will need it. * Check all possible sectorsize(4K, 8K, 16K, 32K, 64K) here. */ if (!is_power_of_2(sectorsize) || sectorsize < BTRFS_MIN_BLOCKSIZE || sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) { btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize); ret = -EINVAL; } if (!btrfs_supported_blocksize(sectorsize)) { btrfs_err(fs_info, "sectorsize %llu not yet supported for page size %lu", sectorsize, PAGE_SIZE); ret = -EINVAL; } if (!is_power_of_2(nodesize) || nodesize < sectorsize || nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) { btrfs_err(fs_info, "invalid nodesize %llu", nodesize); ret = -EINVAL; } if (nodesize != le32_to_cpu(sb->__unused_leafsize)) { btrfs_err(fs_info, "invalid leafsize %u, should be %llu", le32_to_cpu(sb->__unused_leafsize), nodesize); ret = -EINVAL; } /* Root alignment check */ if (!IS_ALIGNED(btrfs_super_root(sb), sectorsize)) { btrfs_err(fs_info, "tree_root block unaligned: %llu", btrfs_super_root(sb)); ret = -EINVAL; } if (!IS_ALIGNED(btrfs_super_chunk_root(sb), sectorsize)) { btrfs_err(fs_info, "chunk_root block unaligned: %llu", btrfs_super_chunk_root(sb)); ret = -EINVAL; } if (!IS_ALIGNED(btrfs_super_log_root(sb), sectorsize)) { btrfs_err(fs_info, "log_root block unaligned: %llu", btrfs_super_log_root(sb)); ret = -EINVAL; } if (!fs_info->fs_devices->temp_fsid && memcmp(fs_info->fs_devices->fsid, sb->fsid, BTRFS_FSID_SIZE) != 0) { btrfs_err(fs_info, "superblock fsid doesn't match fsid of fs_devices: %pU != %pU", sb->fsid, fs_info->fs_devices->fsid); ret = -EINVAL; } if (memcmp(fs_info->fs_devices->metadata_uuid, btrfs_sb_fsid_ptr(sb), BTRFS_FSID_SIZE) != 0) { btrfs_err(fs_info, "superblock metadata_uuid doesn't match metadata uuid of fs_devices: %pU != %pU", btrfs_sb_fsid_ptr(sb), fs_info->fs_devices->metadata_uuid); ret = -EINVAL; } if (memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid, BTRFS_FSID_SIZE) != 0) { btrfs_err(fs_info, "dev_item UUID does not match metadata fsid: %pU != %pU", fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid); ret = -EINVAL; } /* * Artificial requirement for block-group-tree to force newer features * (free-space-tree, no-holes) so the test matrix is smaller. */ if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE) && (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID) || !btrfs_fs_incompat(fs_info, NO_HOLES))) { btrfs_err(fs_info, "block-group-tree feature requires free-space-tree and no-holes"); ret = -EINVAL; } if (btrfs_fs_incompat(fs_info, REMAP_TREE)) { /* * Reduce test matrix for remap tree by requiring block-group-tree * and no-holes. Free-space-tree is a hard requirement. */ if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID) || !btrfs_fs_incompat(fs_info, NO_HOLES) || !btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE)) { btrfs_err(fs_info, "remap-tree feature requires free-space-tree, no-holes, and block-group-tree"); ret = -EINVAL; } if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) { btrfs_err(fs_info, "remap-tree not supported with mixed-bg"); ret = -EINVAL; } if (btrfs_fs_incompat(fs_info, ZONED)) { btrfs_err(fs_info, "remap-tree not supported with zoned devices"); ret = -EINVAL; } if (sectorsize > PAGE_SIZE) { btrfs_err(fs_info, "remap-tree not supported when block size > page size"); ret = -EINVAL; } } /* * Hint to catch really bogus numbers, bitflips or so, more exact checks are * done later */ if (btrfs_super_bytes_used(sb) < 6 * btrfs_super_nodesize(sb)) { btrfs_err(fs_info, "bytes_used is too small %llu", btrfs_super_bytes_used(sb)); ret = -EINVAL; } if (!is_power_of_2(btrfs_super_stripesize(sb))) { btrfs_err(fs_info, "invalid stripesize %u", btrfs_super_stripesize(sb)); ret = -EINVAL; } if (btrfs_super_num_devices(sb) > (1UL << 31)) btrfs_warn(fs_info, "suspicious number of devices: %llu", btrfs_super_num_devices(sb)); if (btrfs_super_num_devices(sb) == 0) { btrfs_err(fs_info, "number of devices is 0"); ret = -EINVAL; } if (mirror_num >= 0 && btrfs_super_bytenr(sb) != btrfs_sb_offset(mirror_num)) { btrfs_err(fs_info, "super offset mismatch %llu != %llu", btrfs_super_bytenr(sb), btrfs_sb_offset(mirror_num)); ret = -EINVAL; } if (ret) return ret; ret = validate_sys_chunk_array(fs_info, sb); /* * Obvious sys_chunk_array corruptions, it must hold at least one key * and one chunk */ if (btrfs_super_sys_array_size(sb) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) { btrfs_err(fs_info, "system chunk array too big %u > %u", btrfs_super_sys_array_size(sb), BTRFS_SYSTEM_CHUNK_ARRAY_SIZE); ret = -EINVAL; } if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key) + sizeof(struct btrfs_chunk)) { btrfs_err(fs_info, "system chunk array too small %u < %zu", btrfs_super_sys_array_size(sb), sizeof(struct btrfs_disk_key) + sizeof(struct btrfs_chunk)); ret = -EINVAL; } /* * The generation is a global counter, we'll trust it more than the others * but it's still possible that it's the one that's wrong. */ if (btrfs_super_generation(sb) < btrfs_super_chunk_root_generation(sb)) btrfs_warn(fs_info, "suspicious: generation < chunk_root_generation: %llu < %llu", btrfs_super_generation(sb), btrfs_super_chunk_root_generation(sb)); if (btrfs_super_generation(sb) < btrfs_super_cache_generation(sb) && btrfs_super_cache_generation(sb) != (u64)-1) btrfs_warn(fs_info, "suspicious: generation < cache_generation: %llu < %llu", btrfs_super_generation(sb), btrfs_super_cache_generation(sb)); return ret; } /* * Validation of super block at mount time. * Some checks already done early at mount time, like csum type and incompat * flags will be skipped. */ static int btrfs_validate_mount_super(struct btrfs_fs_info *fs_info) { return btrfs_validate_super(fs_info, fs_info->super_copy, 0); } /* * Validation of super block at write time. * Some checks like bytenr check will be skipped as their values will be * overwritten soon. * Extra checks like csum type and incompat flags will be done here. */ static int btrfs_validate_write_super(struct btrfs_fs_info *fs_info, struct btrfs_super_block *sb) { int ret; ret = btrfs_validate_super(fs_info, sb, -1); if (ret < 0) goto out; if (unlikely(!btrfs_supported_super_csum(btrfs_super_csum_type(sb)))) { ret = -EUCLEAN; btrfs_err(fs_info, "invalid csum type, has %u want %u", btrfs_super_csum_type(sb), BTRFS_CSUM_TYPE_CRC32); goto out; } if (unlikely(btrfs_super_incompat_flags(sb) & ~BTRFS_FEATURE_INCOMPAT_SUPP)) { ret = -EUCLEAN; btrfs_err(fs_info, "invalid incompat flags, has 0x%llx valid mask 0x%llx", btrfs_super_incompat_flags(sb), (unsigned long long)BTRFS_FEATURE_INCOMPAT_SUPP); goto out; } out: if (ret < 0) btrfs_err(fs_info, "super block corruption detected before writing it to disk"); return ret; } static int load_super_root(struct btrfs_root *root, u64 bytenr, u64 gen, int level) { struct btrfs_tree_parent_check check = { .level = level, .transid = gen, .owner_root = btrfs_root_id(root) }; int ret = 0; root->node = read_tree_block(root->fs_info, bytenr, &check); if (IS_ERR(root->node)) { ret = PTR_ERR(root->node); root->node = NULL; return ret; } if (unlikely(!extent_buffer_uptodate(root->node))) { free_extent_buffer(root->node); root->node = NULL; return -EIO; } btrfs_set_root_node(&root->root_item, root->node); root->commit_root = btrfs_root_node(root); btrfs_set_root_refs(&root->root_item, 1); return ret; } static int load_important_roots(struct btrfs_fs_info *fs_info) { struct btrfs_super_block *sb = fs_info->super_copy; u64 gen, bytenr; int level, ret; bytenr = btrfs_super_root(sb); gen = btrfs_super_generation(sb); level = btrfs_super_root_level(sb); ret = load_super_root(fs_info->tree_root, bytenr, gen, level); if (ret) { btrfs_warn(fs_info, "couldn't read tree root"); return ret; } if (btrfs_fs_incompat(fs_info, REMAP_TREE)) { bytenr = btrfs_super_remap_root(sb); gen = btrfs_super_remap_root_generation(sb); level = btrfs_super_remap_root_level(sb); ret = load_super_root(fs_info->remap_root, bytenr, gen, level); if (ret) { btrfs_warn(fs_info, "couldn't read remap root"); return ret; } } return 0; } static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) { int backup_index = find_newest_super_backup(fs_info); struct btrfs_super_block *sb = fs_info->super_copy; struct btrfs_root *tree_root = fs_info->tree_root; bool handle_error = false; int ret = 0; int i; for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) { if (handle_error) { if (!IS_ERR(tree_root->node)) free_extent_buffer(tree_root->node); tree_root->node = NULL; if (!btrfs_test_opt(fs_info, USEBACKUPROOT)) break; free_root_pointers(fs_info, 0); /* * Don't use the log in recovery mode, it won't be * valid */ btrfs_set_super_log_root(sb, 0); btrfs_warn(fs_info, "try to load backup roots slot %d", i); ret = read_backup_root(fs_info, i); backup_index = ret; if (ret < 0) return ret; } ret = load_important_roots(fs_info); if (ret) { handle_error = true; continue; } /* * No need to hold btrfs_root::objectid_mutex since the fs * hasn't been fully initialised and we are the only user */ ret = btrfs_init_root_free_objectid(tree_root); if (ret < 0) { handle_error = true; continue; } ASSERT(tree_root->free_objectid <= BTRFS_LAST_FREE_OBJECTID); ret = btrfs_read_roots(fs_info); if (ret < 0) { handle_error = true; continue; } /* All successful */ fs_info->generation = btrfs_header_generation(tree_root->node); btrfs_set_last_trans_committed(fs_info, fs_info->generation); fs_info->last_reloc_trans = 0; /* Always begin writing backup roots after the one being used */ if (backup_index < 0) { fs_info->backup_root_index = 0; } else { fs_info->backup_root_index = backup_index + 1; fs_info->backup_root_index %= BTRFS_NUM_BACKUP_ROOTS; } break; } return ret; } /* * Lockdep gets confused between our buffer_tree which requires IRQ locking because * we modify marks in the IRQ context, and our delayed inode xarray which doesn't * have these requirements. Use a class key so lockdep doesn't get them mixed up. */ static struct lock_class_key buffer_xa_class; void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) { INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); /* Use the same flags as mapping->i_pages. */ xa_init_flags(&fs_info->buffer_tree, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT); lockdep_set_class(&fs_info->buffer_tree.xa_lock, &buffer_xa_class); INIT_LIST_HEAD(&fs_info->trans_list); INIT_LIST_HEAD(&fs_info->dead_roots); INIT_LIST_HEAD(&fs_info->delayed_iputs); INIT_LIST_HEAD(&fs_info->delalloc_roots); INIT_LIST_HEAD(&fs_info->caching_block_groups); spin_lock_init(&fs_info->delalloc_root_lock); spin_lock_init(&fs_info->trans_lock); spin_lock_init(&fs_info->fs_roots_radix_lock); spin_lock_init(&fs_info->delayed_iput_lock); spin_lock_init(&fs_info->defrag_inodes_lock); spin_lock_init(&fs_info->super_lock); spin_lock_init(&fs_info->unused_bgs_lock); spin_lock_init(&fs_info->treelog_bg_lock); spin_lock_init(&fs_info->zone_active_bgs_lock); spin_lock_init(&fs_info->relocation_bg_lock); rwlock_init(&fs_info->tree_mod_log_lock); rwlock_init(&fs_info->global_root_lock); mutex_init(&fs_info->unused_bg_unpin_mutex); mutex_init(&fs_info->reclaim_bgs_lock); mutex_init(&fs_info->reloc_mutex); mutex_init(&fs_info->delalloc_root_mutex); mutex_init(&fs_info->zoned_meta_io_lock); mutex_init(&fs_info->zoned_data_reloc_io_lock); seqlock_init(&fs_info->profiles_lock); btrfs_loc